package de.julielab.genemapper.resources;

import de.julielab.java.utilities.FileUtilities;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/genemapper/resources/WikipediaCategoryTreeAndRedirectsExtractor.class */
public class WikipediaCategoryTreeAndRedirectsExtractor {
    private static final Logger log = LoggerFactory.getLogger(WikipediaCategoryTreeAndRedirectsExtractor.class);
    private static final Pattern CATEGORY_PATTERN = Pattern.compile("\\[\\[Category:([^]]+)\\]\\]");
    private static final Pattern XML_COMMENT_PATTERN = Pattern.compile("(?s)<!--.*?-->");
    private static final Pattern WIKI_LINK_PATTERN = Pattern.compile("\\[\\[(.*?)[]|]");
    private static final Set<String> ACCEPTED_NAMESPACES = Set.of("0", "14");
    private final File wikipediaXml;
    private final File categoriesOutputFile;
    private final File redirectsOutputFile;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:de/julielab/genemapper/resources/WikipediaCategoryTreeAndRedirectsExtractor$ParsingStatus.class */
    public class ParsingStatus {
        private String title;
        private String text;
        private String namespace;
        private String id;
        private final List<String> categories = new ArrayList();
        private List<String> referredToTitles = Collections.emptyList();
        private String redirectTitle;
        private boolean isDisambiguationPage;

        private ParsingStatus() {
        }

        public List<String> getCategories() {
            return this.categories;
        }

        public String getText() {
            return this.text;
        }

        public void setText(String str) {
            this.text = str;
        }

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String str) {
            this.title = str;
        }

        public String getNamespace() {
            return this.namespace;
        }

        public void setNamespace(String str) {
            this.namespace = str;
        }

        public String getId() {
            return this.id;
        }

        public void setId(String str) {
            this.id = str;
        }

        public void addCategory(String str) {
            this.categories.add(str);
        }

        public String getRedirectTitle() {
            return this.redirectTitle;
        }

        public boolean isRedirect() {
            return this.redirectTitle != null;
        }

        public void setRedirectTitle(String str) {
            this.redirectTitle = str;
        }

        public boolean isDisambiguationPage() {
            return this.isDisambiguationPage;
        }

        public void setIsDisambiguationPage(boolean z) {
            this.isDisambiguationPage = z;
        }

        public List<String> getReferredToTitles() {
            return this.referredToTitles;
        }

        public void addReferredToTitle(String str) {
            if (this.referredToTitles.isEmpty()) {
                this.referredToTitles = new ArrayList();
            }
            this.referredToTitles.add(str);
        }
    }

    public WikipediaCategoryTreeAndRedirectsExtractor(File file, File file2, File file3) {
        this.wikipediaXml = file;
        this.categoriesOutputFile = file2;
        this.redirectsOutputFile = file3;
    }

    public static void main(String[] strArr) throws IOException, XMLStreamException {
        if (strArr.length < 3) {
            System.err.println("Usage: " + WikipediaCategoryTreeAndRedirectsExtractor.class.getSimpleName() + " <wikipedia XML dump> <category tree file output path> <redirect map file output path>");
            System.exit(0);
        }
        File file = new File(strArr[0]);
        File file2 = new File(strArr[1]);
        File file3 = new File(strArr[2]);
        WikipediaCategoryTreeAndRedirectsExtractor wikipediaCategoryTreeAndRedirectsExtractor = new WikipediaCategoryTreeAndRedirectsExtractor(file, file2, file3);
        log.info("Reading Wikipedia dump from {}. Writing category map to {} and redirect map to {}.", new Object[]{file, file2, file3});
        wikipediaCategoryTreeAndRedirectsExtractor.create();
        log.info("Finished the creation of the Wikipedia category and redirect maps. They are stored at {} and {}, respectively.", file2, file3);
    }

    /* JADX WARN: Type inference failed for: r0v12, types: [de.julielab.genemapper.resources.MultiStreamBZip2InputStream, java.io.InputStream] */
    private void create() throws IOException, XMLStreamException {
        XMLInputFactory newInstance = XMLInputFactory.newInstance();
        log.debug("Creating input and output streams.");
        try {
            FileInputStream fileInputStream = new FileInputStream(this.wikipediaXml);
            try {
                BufferedInputStream bufferedInputStream = new BufferedInputStream(fileInputStream);
                try {
                    ?? multiStreamBZip2InputStream = new MultiStreamBZip2InputStream(bufferedInputStream);
                    try {
                        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(multiStreamBZip2InputStream));
                        try {
                            BufferedWriter writerToFile = FileUtilities.getWriterToFile(this.categoriesOutputFile);
                            try {
                                writerToFile = FileUtilities.getWriterToFile(this.redirectsOutputFile);
                                try {
                                    writerToFile.write("page title");
                                    writerToFile.write("\t");
                                    writerToFile.write("category");
                                    writerToFile.newLine();
                                    writerToFile.write("target title");
                                    writerToFile.write("\t");
                                    writerToFile.write("redirected title");
                                    writerToFile.newLine();
                                    log.debug("Starting to parse Wikipedia XML.");
                                    XMLStreamReader createXMLStreamReader = newInstance.createXMLStreamReader(bufferedReader);
                                    ParsingStatus parsingStatus = null;
                                    int i = 0;
                                    while (createXMLStreamReader.hasNext()) {
                                        int next = createXMLStreamReader.next();
                                        if (next == 1) {
                                            if (createXMLStreamReader.getLocalName().equalsIgnoreCase("page")) {
                                                parsingStatus = new ParsingStatus();
                                            } else if (createXMLStreamReader.getLocalName().equalsIgnoreCase("title")) {
                                                parsingStatus.setTitle(createXMLStreamReader.getElementText());
                                            } else if (createXMLStreamReader.getLocalName().equalsIgnoreCase("ns")) {
                                                parsingStatus.setNamespace(createXMLStreamReader.getElementText());
                                            } else if (createXMLStreamReader.getLocalName().equalsIgnoreCase("text") && ACCEPTED_NAMESPACES.contains(parsingStatus.getNamespace())) {
                                                parseText(createXMLStreamReader.getElementText(), parsingStatus);
                                            } else if (createXMLStreamReader.getLocalName().equalsIgnoreCase("id")) {
                                                parsingStatus.setId(createXMLStreamReader.getElementText());
                                            } else if (createXMLStreamReader.getLocalName().equals("redirect")) {
                                                parsingStatus.setRedirectTitle(createXMLStreamReader.getAttributeValue(0));
                                            }
                                        } else if (next == 2 && createXMLStreamReader.getLocalName().equalsIgnoreCase("page") && ACCEPTED_NAMESPACES.contains(parsingStatus.getNamespace())) {
                                            if (parsingStatus.isRedirect() || parsingStatus.isDisambiguationPage()) {
                                                writeRedirect(parsingStatus, writerToFile);
                                            } else {
                                                writeCategories(parsingStatus, writerToFile);
                                            }
                                            i++;
                                            if (i % 100000 == 0) {
                                                log.info("{} lines written.", Integer.valueOf(i));
                                            }
                                        }
                                    }
                                    if (writerToFile != null) {
                                        writerToFile.close();
                                    }
                                    if (writerToFile != null) {
                                        writerToFile.close();
                                    }
                                    bufferedReader.close();
                                    multiStreamBZip2InputStream.close();
                                    bufferedInputStream.close();
                                    fileInputStream.close();
                                } finally {
                                    if (writerToFile != null) {
                                        try {
                                            writerToFile.close();
                                        } catch (Throwable th) {
                                            th.addSuppressed(th);
                                        }
                                    }
                                }
                            } catch (Throwable th2) {
                                throw th2;
                            }
                        } catch (Throwable th3) {
                            try {
                                bufferedReader.close();
                            } catch (Throwable th4) {
                                th3.addSuppressed(th4);
                            }
                            throw th3;
                        }
                    } catch (Throwable th5) {
                        try {
                            multiStreamBZip2InputStream.close();
                        } catch (Throwable th6) {
                            th5.addSuppressed(th6);
                        }
                        throw th5;
                    }
                } catch (Throwable th7) {
                    try {
                        bufferedInputStream.close();
                    } catch (Throwable th8) {
                        th7.addSuppressed(th8);
                    }
                    throw th7;
                }
            } finally {
            }
        } catch (IOException | XMLStreamException e) {
            log.error("XML parsing error", e);
            throw e;
        }
    }

    private void writeRedirect(ParsingStatus parsingStatus, BufferedWriter bufferedWriter) throws IOException {
        if (parsingStatus.isRedirect()) {
            bufferedWriter.write(parsingStatus.getRedirectTitle());
            bufferedWriter.write("\t");
            bufferedWriter.write(parsingStatus.getTitle());
            bufferedWriter.newLine();
            return;
        }
        Iterator<String> it = parsingStatus.getReferredToTitles().iterator();
        while (it.hasNext()) {
            bufferedWriter.write(it.next());
            bufferedWriter.write("\t");
            bufferedWriter.write(parsingStatus.getTitle());
            bufferedWriter.newLine();
        }
    }

    private void writeCategories(ParsingStatus parsingStatus, BufferedWriter bufferedWriter) throws IOException {
        for (String str : parsingStatus.getCategories()) {
            bufferedWriter.write(parsingStatus.getTitle());
            bufferedWriter.write("\t");
            bufferedWriter.write(str);
            bufferedWriter.newLine();
        }
    }

    private void parseText(String str, ParsingStatus parsingStatus) {
        String replaceAll = XML_COMMENT_PATTERN.matcher(str).replaceAll("");
        LineIterator lineIterator = new LineIterator(new StringReader(replaceAll));
        Matcher matcher = CATEGORY_PATTERN.matcher("");
        while (lineIterator.hasNext()) {
            String next = lineIterator.next();
            if (next.startsWith("[[Category:")) {
                matcher.reset(next);
                if (matcher.find()) {
                    String group = matcher.group(1);
                    if (group.contains("|")) {
                        group = group.substring(0, group.lastIndexOf(124));
                    }
                    parsingStatus.addCategory(group.trim());
                }
            } else if (next.contains("{{disambig}}")) {
                parsingStatus.setIsDisambiguationPage(true);
                Matcher matcher2 = WIKI_LINK_PATTERN.matcher(replaceAll);
                while (matcher2.find()) {
                    parsingStatus.addReferredToTitle(matcher2.group(1));
                }
            }
        }
    }
}
