package de.datexis.retrieval.preprocess;

import de.datexis.common.Resource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/datexis/retrieval/preprocess/WikipediaIndex.class */
public class WikipediaIndex {
    protected final Logger log = LoggerFactory.getLogger(getClass());
    protected final String PAGE_LINE = "INSERT INTO `page` VALUES ";
    protected final String REDIRECT_LINE = "INSERT INTO `redirect` VALUES ";
    protected Map<Long, String> pageTitles = new ConcurrentHashMap(5000000);
    protected Map<String, Long> pageIndex = new ConcurrentHashMap(15000000);
    protected Map<Long, String> pageRedirects = new ConcurrentHashMap(15000000);
    Map<String, String> pageURIs = null;
    protected long matched = 0;
    protected long unmatched = 0;

    public void readPages(Resource resource) throws IOException {
        CSVFormat withNullString = CSVFormat.DEFAULT.withAllowMissingColumnNames().withDelimiter(',').withRecordSeparator('\n').withQuote('\'').withQuoteMode(QuoteMode.NON_NUMERIC).withEscape('\\').withNullString("NULL");
        this.log.info("Reading Wikipedia pages from {}...", resource.toString());
        AtomicLong atomicLong = new AtomicLong();
        InputStream inputStream = resource.getInputStream();
        Throwable th = null;
        try {
            try {
                ((Stream) new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8.newDecoder())).lines().parallel()).forEach(str -> {
                    if (str.startsWith("INSERT INTO `page` VALUES ")) {
                        try {
                            CSVParser parse = withNullString.parse(new StringReader(str.substring("INSERT INTO `page` VALUES ".length() + 1, str.length() - 2).replace("),(", "\n")));
                            Throwable th2 = null;
                            try {
                                for (CSVRecord cSVRecord : parse.getRecords()) {
                                    long parseLong = Long.parseLong(cSVRecord.get(0));
                                    int parseInt = Integer.parseInt(cSVRecord.get(1));
                                    String str = cSVRecord.get(2);
                                    boolean equals = cSVRecord.get(5).equals("1");
                                    if (parseInt == 0) {
                                        if (str == null) {
                                            if (parseLong == 81447) {
                                                str = "NULL";
                                            } else {
                                                this.log.warn("title is null: {}", cSVRecord.toString());
                                            }
                                        }
                                        if (!equals && !str.endsWith("(disambiguation)")) {
                                            this.pageTitles.putIfAbsent(Long.valueOf(parseLong), str);
                                        }
                                        this.pageIndex.putIfAbsent(str, Long.valueOf(parseLong));
                                        long incrementAndGet = atomicLong.incrementAndGet();
                                        if (incrementAndGet % 1000000 == 0) {
                                            double freeMemory = Runtime.getRuntime().freeMemory() / 1.073741824E9d;
                                            this.log.debug("read {}M rows, memory usage {} GB", Long.valueOf(incrementAndGet / 1000000), Double.valueOf(((int) (((Runtime.getRuntime().totalMemory() / 1.073741824E9d) - freeMemory) * 10.0d)) / 10.0d));
                                        }
                                    }
                                }
                                if (parse != null) {
                                    if (0 != 0) {
                                        try {
                                            parse.close();
                                        } catch (Throwable th3) {
                                            th2.addSuppressed(th3);
                                        }
                                    } else {
                                        parse.close();
                                    }
                                }
                            } finally {
                            }
                        } catch (IOException e) {
                            this.log.error(e.toString());
                        }
                    }
                });
                if (inputStream != null) {
                    if (0 != 0) {
                        try {
                            inputStream.close();
                        } catch (Throwable th2) {
                            th.addSuppressed(th2);
                        }
                    } else {
                        inputStream.close();
                    }
                }
                this.log.info("Read {} entities out of total {} pages", Integer.valueOf(this.pageTitles.size()), Integer.valueOf(this.pageIndex.size()));
            } finally {
            }
        } catch (Throwable th3) {
            if (inputStream != null) {
                if (th != null) {
                    try {
                        inputStream.close();
                    } catch (Throwable th4) {
                        th.addSuppressed(th4);
                    }
                } else {
                    inputStream.close();
                }
            }
            throw th3;
        }
    }

    public void readIDMapping(Resource resource) throws IOException {
        List readLines = FileUtils.readLines(resource.toFile(), "UTF-8");
        this.pageURIs = new ConcurrentHashMap(readLines.size());
        readLines.stream().map(str -> {
            return str.split("\\t");
        }).forEach(strArr -> {
            this.pageURIs.put(WikipediaUrlPreprocessor.cleanWikiPageTitle(strArr[0]), strArr[1]);
        });
    }

    public void readRedirects(Resource resource) throws IOException {
        CSVFormat withNullString = CSVFormat.DEFAULT.withAllowMissingColumnNames().withDelimiter(',').withRecordSeparator('\n').withQuote('\'').withEscape('\\').withNullString("NULL");
        this.log.info("Reading Wikipedia redirects from {}...", resource.toString());
        AtomicLong atomicLong = new AtomicLong();
        InputStream inputStream = resource.getInputStream();
        Throwable th = null;
        try {
            try {
                ((Stream) new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8.newDecoder())).lines().parallel()).forEach(str -> {
                    if (str.startsWith("INSERT INTO `redirect` VALUES ")) {
                        try {
                            CSVParser parse = withNullString.parse(new StringReader(str.substring("INSERT INTO `redirect` VALUES ".length() + 1, str.length() - 2).replace("),(", "\n")));
                            Throwable th2 = null;
                            try {
                                try {
                                    for (CSVRecord cSVRecord : parse.getRecords()) {
                                        long parseLong = Long.parseLong(cSVRecord.get(0));
                                        int parseInt = Integer.parseInt(cSVRecord.get(1));
                                        String str = cSVRecord.get(2);
                                        if (parseInt == 0) {
                                            this.pageRedirects.putIfAbsent(Long.valueOf(parseLong), str);
                                            long incrementAndGet = atomicLong.incrementAndGet();
                                            if (incrementAndGet % 1000000 == 0) {
                                                double freeMemory = Runtime.getRuntime().freeMemory() / 1.073741824E9d;
                                                this.log.debug("read {}M rows, memory usage {} GB", Long.valueOf(incrementAndGet / 1000000), Double.valueOf(((int) (((Runtime.getRuntime().totalMemory() / 1.073741824E9d) - freeMemory) * 10.0d)) / 10.0d));
                                            }
                                        }
                                    }
                                    if (parse != null) {
                                        if (0 != 0) {
                                            try {
                                                parse.close();
                                            } catch (Throwable th3) {
                                                th2.addSuppressed(th3);
                                            }
                                        } else {
                                            parse.close();
                                        }
                                    }
                                } finally {
                                }
                            } catch (Throwable th4) {
                                th2 = th4;
                                throw th4;
                            }
                        } catch (IOException e) {
                            this.log.error(e.toString());
                        }
                    }
                });
                if (inputStream != null) {
                    if (0 != 0) {
                        try {
                            inputStream.close();
                        } catch (Throwable th2) {
                            th.addSuppressed(th2);
                        }
                    } else {
                        inputStream.close();
                    }
                }
                this.log.info("Read {} redirects", Integer.valueOf(this.pageRedirects.size()));
            } finally {
            }
        } catch (Throwable th3) {
            if (inputStream != null) {
                if (th != null) {
                    try {
                        inputStream.close();
                    } catch (Throwable th4) {
                        th.addSuppressed(th4);
                    }
                } else {
                    inputStream.close();
                }
            }
            throw th3;
        }
    }

    public long countPages() {
        return this.pageTitles.size();
    }

    public String getTitleForId(long j) {
        return this.pageTitles.get(Long.valueOf(j));
    }

    public String getTitleFromRedirect(String str) {
        Long idForTitle = getIdForTitle(str);
        if (idForTitle == null) {
            return null;
        }
        return getTitleForId(idForTitle.longValue());
    }

    public Long getIdForTitle(String str) {
        Long l;
        if (str == null) {
            return null;
        }
        String str2 = str;
        int i = 0;
        do {
            Long l2 = this.pageIndex.get(str2);
            if (l2 == null) {
                if (str2.length() > 0) {
                    l = this.pageIndex.get(str2.substring(0, 1).toUpperCase() + (str2.length() > 1 ? str2.substring(1) : ""));
                } else {
                    l = null;
                }
                l2 = l;
            }
            if (l2 == null) {
                this.unmatched++;
                return null;
            }
            if (!this.pageRedirects.containsKey(l2)) {
                this.matched++;
                return l2;
            }
            str2 = this.pageRedirects.get(l2);
            if (str2 != null && str2.equals(str)) {
                this.matched++;
                return l2;
            }
            i++;
        } while (i < 32);
        this.log.error("Page id not found for '{}' after {} redirects", str2, Integer.valueOf(i));
        return null;
    }

    public String getURIForTitle(String str) {
        return this.pageURIs == null ? getTitleFromRedirect(str) : this.pageURIs.get(str);
    }

    public String getStats() {
        return "WikipediaIndex: " + this.matched + " matched, " + this.unmatched + " unmatched.";
    }

    public void filterPages(Resource resource) throws IOException {
        filterPages(FileUtils.readLines(resource.toFile(), "UTF-8"));
    }

    public void filterPages(List<String> list) {
        String str;
        HashMap hashMap = new HashMap(list.size());
        Iterator<String> it = list.iterator();
        while (it.hasNext()) {
            String cleanWikiPageTitle = WikipediaUrlPreprocessor.cleanWikiPageTitle(it.next());
            Long l = this.pageIndex.get(cleanWikiPageTitle);
            if (l == null) {
                this.log.info("Page '{}' not found in index", cleanWikiPageTitle);
            } else if (this.pageRedirects.containsKey(l)) {
                String titleFromRedirect = getTitleFromRedirect(cleanWikiPageTitle);
                Long l2 = this.pageIndex.get(titleFromRedirect);
                this.log.trace("Page '{}' is a redirect to {}", cleanWikiPageTitle, titleFromRedirect);
                hashMap.putIfAbsent(l2, titleFromRedirect);
                if (this.pageURIs != null && (str = this.pageURIs.get(cleanWikiPageTitle)) != null) {
                    this.pageURIs.putIfAbsent(titleFromRedirect, str);
                }
            } else {
                hashMap.putIfAbsent(l, cleanWikiPageTitle);
            }
        }
        this.pageTitles = hashMap;
        this.log.info("Filtered {} pages from given list of {} URLs", Integer.valueOf(this.pageTitles.size()), Integer.valueOf(list.size()));
    }
}
