package de.julielab.genemapper.resources.uima;

import de.julielab.genemapper.WikipediaCategoryManager;
import de.julielab.genemapper.resources.MultiStreamBZip2InputStream;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.jcore.types.Header;
import de.julielab.jcore.types.wikipedia.Title;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.io.LineIterator;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ResourceMetaData(name = "JCoRe GeneMapper Wikipedia Reader", description = "Reads the XML Wikipedia dump. Extracts a portion of the first text line of each page to capture the term definition.")
/* loaded from: input_file:de/julielab/genemapper/resources/uima/WikipediaReader.class */
public class WikipediaReader extends JCasCollectionReader_ImplBase {
    public static final String PARAM_WIKIPEDIA_XML = "WikipediaXML";
    public static final String PARAM_EXCERPT_LENGTH = "ExcerptLength";
    public static final String PARAM_TITLE_WHITELIST = "TitleWhitelist";
    public static final String PARAM_WIKIPEDIA_CATEGORY_TREE_PATH = "WikipediaCategoryTreePath";
    private static final Logger log = LoggerFactory.getLogger(WikipediaReader.class);
    private static final Pattern WIKI_LINK_PATTERN = Pattern.compile("\\[\\[[^]|]+\\|([^]]+)\\]\\]");
    private static final Pattern WIKI_MARKUP_ELEMENTS = Pattern.compile("[]\\[{}']+");
    private static final Pattern XML_REF_ELEMENT_PATTERN = Pattern.compile("<ref[^<]+</ref>");
    private static final Pattern XML_MARKUP_ELEMENTS = Pattern.compile("<[^>]+>");
    private static final Pattern NON_WS_PATTERN = Pattern.compile("[^\\s]");
    private static final Set<Character> NON_TEXT_CHARS = Set.of('{', '}', '#', '|', '<', '[', '*');
    private static WikipediaCategoryManager wikipediaCategoryManager;
    private final XMLInputFactory factory = XMLInputFactory.newInstance();

    @ConfigurationParameter(name = PARAM_WIKIPEDIA_XML)
    private String wikipediaXml;

    @ConfigurationParameter(name = PARAM_EXCERPT_LENGTH, description = "Maximum number of characters to be kept from the first line of each page. Defaults to 1000.", mandatory = false, defaultValue = {"1000"})
    private int excerptLength;

    @ConfigurationParameter(name = PARAM_TITLE_WHITELIST, description = "Path to a file. If given, only pages that have a title on the list will be returned as a CAS.", mandatory = false)
    private String titleWhiteListFilePath;

    @ConfigurationParameter(name = "WikipediaCategoryTreePath", mandatory = false, description = "Optional. File created by GeNo's 'WikipediaCategoryTreeAndRedirectsExtractor' class that represents a map from page and category titles to categories they belong to. Will be used to filter for pages that are in some way related to the Molecular Biology category.")
    private String wikipediaCategoryTreePath;
    private Set<String> titleWhitelist;
    private XMLStreamReader parser;
    private ParsingStatus currentPage;
    private int processedPages;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:de/julielab/genemapper/resources/uima/WikipediaReader$ParsingStatus.class */
    public class ParsingStatus {
        private String title;
        private String text;
        private String namespace;
        private String pageId;
        private boolean skip;

        private ParsingStatus() {
        }

        public boolean isSkip() {
            return this.skip;
        }

        public String getText() {
            return this.text;
        }

        public void setText(String str) {
            this.text = str;
        }

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String str) {
            this.title = str;
        }

        public String getNamespace() {
            return this.namespace;
        }

        public void setNamespace(String str) {
            this.namespace = str;
        }

        public String getPageId() {
            return this.pageId;
        }

        public void setPageId(String str) {
            this.pageId = str;
        }

        public void skip() {
            this.skip = true;
        }
    }

    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);
        try {
            this.wikipediaXml = (String) uimaContext.getConfigParameterValue(PARAM_WIKIPEDIA_XML);
            this.excerptLength = ((Integer) Optional.ofNullable((Integer) uimaContext.getConfigParameterValue(PARAM_EXCERPT_LENGTH)).orElse(1000)).intValue();
            this.titleWhiteListFilePath = (String) uimaContext.getConfigParameterValue(PARAM_TITLE_WHITELIST);
            this.wikipediaCategoryTreePath = (String) uimaContext.getConfigParameterValue("WikipediaCategoryTreePath");
            log.info("Reading Wikipedia dump from {}.", this.wikipediaXml);
            log.info("Maximum excerpt length: {}", Integer.valueOf(this.excerptLength));
            if (this.titleWhiteListFilePath != null) {
                BufferedReader readerFromFile = FileUtilities.getReaderFromFile(new File(this.titleWhiteListFilePath));
                try {
                    this.titleWhitelist = (Set) readerFromFile.lines().filter(Predicate.not(str -> {
                        return str.startsWith("#") || str.isBlank();
                    })).collect(Collectors.toSet());
                    log.info("Received Wikipedia title whitelist from {} with {} entries.", this.titleWhiteListFilePath, Integer.valueOf(this.titleWhitelist.size()));
                    if (readerFromFile != null) {
                        readerFromFile.close();
                    }
                } finally {
                }
            }
            CompressorInputStream bufferedInputStream = new BufferedInputStream(new FileInputStream(this.wikipediaXml));
            this.parser = this.factory.createXMLStreamReader(new BufferedReader(new InputStreamReader(this.wikipediaXml.endsWith(".bz2") ? new MultiStreamBZip2InputStream(bufferedInputStream) : bufferedInputStream)));
            this.currentPage = getNextPage();
            this.processedPages = 0;
            synchronized (WikipediaReader.class) {
                if (this.wikipediaCategoryTreePath != null && wikipediaCategoryManager == null) {
                    log.info("Creating Dijkstra tree for Category:Biology");
                    wikipediaCategoryManager = new WikipediaCategoryManager(this.wikipediaCategoryTreePath, true);
                    wikipediaCategoryManager.buildDijkstraTree("Category:Biology");
                }
            }
        } catch (IOException | XMLStreamException e) {
            log.error("Exception while initializing WikipediaReader", e);
            throw new ResourceInitializationException(e);
        }
    }

    public void getNext(JCas jCas) throws CollectionException {
        try {
            if (this.currentPage != null) {
                String lowerCase = this.currentPage.getText().toLowerCase();
                String lowerCase2 = this.currentPage.getTitle().toLowerCase();
                int indexOf = lowerCase.indexOf(lowerCase2);
                int length = lowerCase2.length();
                if (indexOf < 0 && lowerCase2.endsWith("s")) {
                    indexOf = lowerCase.indexOf(lowerCase2.substring(0, lowerCase2.length() - 1));
                    length = lowerCase2.length() - 1;
                }
                Title title = null;
                if (indexOf >= 0) {
                    title = new Title(jCas, indexOf, indexOf + length);
                    title.addToIndexes();
                }
                int begin = title != null ? title.getBegin() : 0;
                int end = title != null ? title.getEnd() : lowerCase.indexOf(" ");
                Header header = (begin < 0 || end <= begin) ? new Header(jCas) : new Header(jCas, begin, end);
                header.setDocId(this.currentPage.getPageId());
                header.setTitle(this.currentPage.getTitle());
                header.addToIndexes();
                jCas.setDocumentText(this.currentPage.getText());
                this.currentPage = getNextPage();
                this.processedPages++;
                if (this.processedPages % 100000 == 0) {
                    log.info("Processed {} pages.", Integer.valueOf(this.processedPages));
                }
            }
        } catch (Throwable th) {
            log.error("Error while reading Wikipedia", th);
            throw new CollectionException(th);
        }
    }

    @Nullable
    private ParsingStatus getNextPage() throws XMLStreamException {
        ParsingStatus parsingStatus = null;
        boolean z = false;
        while (this.parser.hasNext() && (!z || (this.currentPage != null && this.currentPage.isSkip()))) {
            int next = this.parser.next();
            if (next == 1) {
                if (this.parser.getLocalName().equalsIgnoreCase("page")) {
                    parsingStatus = new ParsingStatus();
                }
                if (parsingStatus != null && !parsingStatus.isSkip()) {
                    if (this.parser.getLocalName().equalsIgnoreCase("title")) {
                        String elementText = this.parser.getElementText();
                        if (wikipediaCategoryManager != null && wikipediaCategoryManager.getShortestPathToDijkstraTreeRoot(elementText, (Set) null).isEmpty()) {
                            parsingStatus.skip();
                        }
                        parsingStatus.setTitle(elementText);
                    } else if (this.parser.getLocalName().equalsIgnoreCase("ns")) {
                        parsingStatus.setNamespace(this.parser.getElementText());
                    } else if (this.parser.getLocalName().equalsIgnoreCase("text") && parsingStatus.getNamespace().equals("0")) {
                        if (this.titleWhitelist == null || this.titleWhitelist.isEmpty() || this.titleWhitelist.contains(parsingStatus.getTitle())) {
                            parseText(this.parser.getElementText(), parsingStatus);
                        }
                    } else if (parsingStatus != null && parsingStatus.getPageId() == null && this.parser.getLocalName().equalsIgnoreCase("id")) {
                        parsingStatus.setPageId(this.parser.getElementText());
                    }
                }
            } else if (next == 2 && this.parser.getLocalName().equalsIgnoreCase("page") && parsingStatus != null) {
                z = true;
                if (parsingStatus.getText() == null || parsingStatus.getText().isBlank()) {
                    z = false;
                    parsingStatus = null;
                }
            }
        }
        return parsingStatus;
    }

    private void parseText(String str, ParsingStatus parsingStatus) {
        LineIterator lineIterator = new LineIterator(new StringReader(str));
        while (lineIterator.hasNext()) {
            String next = lineIterator.next();
            Matcher matcher = NON_WS_PATTERN.matcher(next);
            if (matcher.find() && !next.isBlank() && !NON_TEXT_CHARS.contains(Character.valueOf(next.charAt(matcher.start())))) {
                String replaceAll = XML_MARKUP_ELEMENTS.matcher(XML_REF_ELEMENT_PATTERN.matcher(WIKI_MARKUP_ELEMENTS.matcher(WIKI_LINK_PATTERN.matcher(next).replaceAll("$1")).replaceAll("")).replaceAll("")).replaceAll("");
                parsingStatus.setText(replaceAll.substring(0, Math.min(replaceAll.length(), this.excerptLength)));
                return;
            }
        }
    }

    public boolean hasNext() throws IOException, CollectionException {
        return this.currentPage != null;
    }

    public Progress[] getProgress() {
        return new Progress[]{new ProgressImpl(this.processedPages, 0, "pages")};
    }
}
