package de.julielab.genemapper.resources;

import de.julielab.java.utilities.FileUtilities;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/genemapper/resources/WikipediaTitleDictionaryCreator.class */
public class WikipediaTitleDictionaryCreator {
    private static final Set<Character> NON_TEXT_CHARS = Set.of('{', '}', '#', '|', '<', '[', '*');
    private static final Logger log = LoggerFactory.getLogger(WikipediaTitleDictionaryCreator.class);
    private final File wikipediaXml;
    private final File outputFile;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:de/julielab/genemapper/resources/WikipediaTitleDictionaryCreator$ParsingStatus.class */
    public class ParsingStatus {
        private boolean inPage;
        private String nameSpace;
        private String title;
        private String pageId;
        private String text;

        private ParsingStatus() {
        }

        public boolean isInPage() {
            return this.inPage;
        }

        public void setInPage(boolean z) {
            this.inPage = z;
        }

        public String getNameSpace() {
            return this.nameSpace;
        }

        public void setCurrentNameSpace(String str) {
            this.nameSpace = str;
        }

        public String getTitle() {
            return this.title;
        }

        public void setTitle(String str) {
            this.title = str;
        }

        public String getPageId() {
            return this.pageId;
        }

        public void setPageId(String str) {
            this.pageId = str;
        }

        public String getText() {
            return this.text;
        }

        public void setText(String str) {
            this.text = str;
        }
    }

    public WikipediaTitleDictionaryCreator(File file, File file2) {
        this.wikipediaXml = file;
        this.outputFile = file2;
    }

    public static void main(String[] strArr) throws IOException, XMLStreamException {
        File file = new File(strArr[0]);
        File file2 = new File(strArr[1]);
        WikipediaTitleDictionaryCreator wikipediaTitleDictionaryCreator = new WikipediaTitleDictionaryCreator(file, file2);
        log.info("Reading Wikipedia dump from {}. Writing dictionary to {}.", file, file2);
        wikipediaTitleDictionaryCreator.create();
    }

    /* JADX WARN: Type inference failed for: r0v9, types: [de.julielab.genemapper.resources.MultiStreamBZip2InputStream, java.io.InputStream] */
    private void create() throws IOException, XMLStreamException {
        XMLInputFactory newInstance = XMLInputFactory.newInstance();
        FileInputStream fileInputStream = new FileInputStream(this.wikipediaXml);
        try {
            BufferedInputStream bufferedInputStream = new BufferedInputStream(fileInputStream);
            try {
                ?? multiStreamBZip2InputStream = new MultiStreamBZip2InputStream(bufferedInputStream);
                try {
                    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(multiStreamBZip2InputStream));
                    try {
                        BufferedWriter writerToFile = FileUtilities.getWriterToFile(this.outputFile);
                        try {
                            XMLStreamReader createXMLStreamReader = newInstance.createXMLStreamReader(bufferedReader);
                            ParsingStatus parsingStatus = new ParsingStatus();
                            int i = 0;
                            int i2 = 0;
                            while (createXMLStreamReader.hasNext()) {
                                int next = createXMLStreamReader.next();
                                if (next == 1) {
                                    if (createXMLStreamReader.getLocalName().equalsIgnoreCase("page")) {
                                        parsingStatus.setInPage(true);
                                    }
                                    if (parsingStatus.isInPage()) {
                                        if (createXMLStreamReader.getLocalName().equalsIgnoreCase("ns")) {
                                            parsingStatus.setCurrentNameSpace(createXMLStreamReader.getElementText());
                                        } else if (createXMLStreamReader.getLocalName().equalsIgnoreCase("title")) {
                                            parsingStatus.setTitle(createXMLStreamReader.getElementText());
                                        } else if (createXMLStreamReader.getLocalName().equalsIgnoreCase("id") && parsingStatus.getPageId() == null) {
                                            parsingStatus.setPageId(createXMLStreamReader.getElementText());
                                        } else if (createXMLStreamReader.getLocalName().equalsIgnoreCase("text") && "0".equals(parsingStatus.getNameSpace())) {
                                            parseText(createXMLStreamReader.getElementText(), parsingStatus);
                                        }
                                    }
                                } else if (next == 2 && createXMLStreamReader.getLocalName().equalsIgnoreCase("page")) {
                                    if ("0".equals(parsingStatus.getNameSpace()) && parsingStatus.getText() != null && !parsingStatus.getText().isBlank()) {
                                        writerToFile.write(parsingStatus.getTitle());
                                        writerToFile.write("\t");
                                        writerToFile.write(parsingStatus.getPageId());
                                        writerToFile.newLine();
                                        i2++;
                                        parsingStatus.setCurrentNameSpace(null);
                                        parsingStatus.setText(null);
                                        parsingStatus.setPageId(null);
                                    }
                                    i++;
                                    if (i % 100000 == 0) {
                                        log.info("{} pages processed, {} pages added to dictionary.", Integer.valueOf(i), Integer.valueOf(i2));
                                    }
                                    parsingStatus.setInPage(false);
                                }
                            }
                            if (writerToFile != null) {
                                writerToFile.close();
                            }
                            bufferedReader.close();
                            multiStreamBZip2InputStream.close();
                            bufferedInputStream.close();
                            fileInputStream.close();
                        } catch (Throwable th) {
                            if (writerToFile != null) {
                                try {
                                    writerToFile.close();
                                } catch (Throwable th2) {
                                    th.addSuppressed(th2);
                                }
                            }
                            throw th;
                        }
                    } catch (Throwable th3) {
                        try {
                            bufferedReader.close();
                        } catch (Throwable th4) {
                            th3.addSuppressed(th4);
                        }
                        throw th3;
                    }
                } catch (Throwable th5) {
                    try {
                        multiStreamBZip2InputStream.close();
                    } catch (Throwable th6) {
                        th5.addSuppressed(th6);
                    }
                    throw th5;
                }
            } finally {
            }
        } catch (Throwable th7) {
            try {
                fileInputStream.close();
            } catch (Throwable th8) {
                th7.addSuppressed(th8);
            }
            throw th7;
        }
    }

    private void parseText(String str, ParsingStatus parsingStatus) {
        LineIterator lineIterator = new LineIterator(new StringReader(str));
        while (lineIterator.hasNext()) {
            String next = lineIterator.next();
            int findFirstNonWhitespaceCharacter = findFirstNonWhitespaceCharacter(next);
            if (findFirstNonWhitespaceCharacter != 0) {
                char c = (char) findFirstNonWhitespaceCharacter;
                if (!next.isBlank() && !NON_TEXT_CHARS.contains(Character.valueOf(c))) {
                    parsingStatus.setText(next.substring(0, Math.min(next.length(), 1000)));
                    return;
                }
            }
        }
    }

    private int findFirstNonWhitespaceCharacter(String str) {
        for (int i = 0; i < str.length(); i++) {
            char charAt = str.charAt(i);
            if (!Character.isWhitespace(charAt)) {
                return charAt;
            }
        }
        return -1;
    }
}
