package de.datexis.nel.reader;

import de.datexis.common.Resource;
import de.datexis.index.ArticleRef;
import de.datexis.index.impl.LuceneArticleIndex;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.nel.NamedEntityAnnotation;
import de.datexis.ner.MentionAnnotation;
import de.datexis.preprocess.DocumentFactory;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/datexis/nel/reader/WNEDDataset.class */
/**
 * Reads an entity-linking dataset that consists of one XML annotation file and a set of
 * raw text files.
 *
 * <p>The XML file is expected to contain {@code <document docName="...">} elements. Every
 * element child of such a document is treated as one annotation record and is queried for
 * the tags {@code mention}, {@code wikiName}, {@code offset} and {@code length}. The raw text
 * of a document is loaded by resolving its {@code docName} against the raw text resource.
 *
 * <p>Which annotation types are attached to the documents is controlled by
 * {@link #annotations}; use {@link Reader#withAnnotations(Class)} to configure it.
 */
public class WNEDDataset {

    protected static final Logger log = LoggerFactory.getLogger(WNEDDataset.class);

    /** Reference ID that marks an annotation as not linkable. */
    private static final String NIL = "NIL";

    /** Annotation types that are created for every annotation record found in the XML file. */
    protected List<Class<? extends Annotation>> annotations = new ArrayList<>();

    /**
     * Fluent configuration wrapper around a {@link WNEDDataset} instance.
     */
    public static class Reader {
        Resource xmlFile;
        Resource rawTextPath;
        LuceneArticleIndex search;
        boolean wikidata = false;
        WNEDDataset reader = new WNEDDataset();

        /**
         * @param resource  the XML annotation file
         * @param resource2 the resource against which document names are resolved to raw text files
         */
        public Reader(Resource resource, Resource resource2) {
            this.xmlFile = resource;
            this.rawTextPath = resource2;
        }

        /**
         * Registers an annotation type that should be created for every annotation record.
         *
         * @param cls the annotation type; only {@link NamedEntityAnnotation} and
         *            {@link MentionAnnotation} can actually be created
         * @return this reader, for chaining
         */
        public Reader withAnnotations(Class<? extends Annotation> cls) {
            this.reader.annotations.add(cls);
            return this;
        }

        /**
         * Enables the lookup of every named entity reference in the given index when reading.
         *
         * @param luceneArticleIndex the index used to resolve references
         * @return this reader, for chaining
         */
        public Reader withWikidataIDs(LuceneArticleIndex luceneArticleIndex) {
            this.wikidata = true;
            this.search = luceneArticleIndex;
            return this;
        }

        /**
         * Reads the dataset with the current configuration.
         *
         * @return the dataset that was read
         * @throws IOException if one of the underlying resources cannot be read
         */
        public Dataset read() throws IOException {
            if (this.wikidata) {
                return this.reader.readDataSet(this.xmlFile, this.rawTextPath, this.search);
            }
            return this.reader.readDataSet(this.xmlFile, this.rawTextPath);
        }
    }

    /**
     * Reads all documents referenced by the XML file into a new dataset. The dataset is named
     * after the XML file name with a trailing {@code .xml} removed.
     *
     * @param resource  the XML annotation file
     * @param resource2 the resource against which document names are resolved to raw text files
     * @return a dataset containing one document per {@code <document>} element that was read
     * @throws IOException if the XML file or one of the raw text files cannot be read
     */
    public Dataset readDataSet(Resource resource, Resource resource2) throws IOException {
        String datasetName = resource.getFileName().replaceFirst("\\.xml$", "");
        log.info("Reading Dataset \"{}\" from {}", datasetName, resource);
        List<Document> documents = readDocuments(resource, resource2);
        Dataset dataset = new Dataset(datasetName);
        for (Document document : documents) {
            dataset.addDocument(document);
        }
        return dataset;
    }

    /**
     * Reads the dataset like {@link #readDataSet(Resource, Resource)} and afterwards looks up the
     * reference ID of every {@link NamedEntityAnnotation} in the given index. On a hit, the
     * reference name, ID and URL of the annotation are replaced by the values of the article that
     * was found; on a miss, the reference ID is set to {@code "NIL"}. Annotations whose reference
     * ID already is {@code "NIL"} are left untouched.
     *
     * @param resource           the XML annotation file
     * @param resource2          the resource against which document names are resolved
     * @param luceneArticleIndex the index used to resolve references
     * @return the dataset with resolved references
     * @throws IOException if the XML file or one of the raw text files cannot be read
     */
    public Dataset readDataSet(Resource resource, Resource resource2, LuceneArticleIndex luceneArticleIndex) throws IOException {
        Dataset dataset = readDataSet(resource, resource2);
        for (Document document : dataset.getDocuments()) {
            for (NamedEntityAnnotation annotation : document.getAnnotations(NamedEntityAnnotation.class)) {
                String refId = annotation.getRefId();
                if (NIL.equals(refId)) {
                    continue;
                }
                // NOTE(review): a null reference ID is passed to the index unchanged, as before.
                Optional<ArticleRef> article = luceneArticleIndex.queryWikipediaPage(refId);
                if (article.isPresent()) {
                    ArticleRef ref = article.get();
                    annotation.setRefName(ref.getTitle());
                    annotation.setRefId(ref.getId());
                    annotation.setRefUrl(ref.getUrl());
                } else {
                    log.warn("Could not find Wikidata ID for '{}', setting NIL", refId);
                    annotation.setRefId(NIL);
                }
            }
        }
        return dataset;
    }

    /**
     * Parses the XML file and creates one document per {@code <document>} element, including the
     * configured annotations.
     *
     * <p>XML parser errors are logged and not rethrown; the documents that were completed before
     * the error are returned in that case.
     *
     * @param resource  the XML annotation file
     * @param resource2 the resource against which document names are resolved to raw text files
     * @return the documents that were read, in file order
     * @throws IOException if the XML file or one of the raw text files cannot be read
     */
    protected List<Document> readDocuments(Resource resource, Resource resource2) throws IOException {
        List<Document> documents = new ArrayList<>();
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // The stream is owned by this method, so it is closed here once parsing is done.
        try (InputStream xml = resource.getInputStream()) {
            NodeList documentNodes = factory.newDocumentBuilder().parse(xml).getElementsByTagName("document");
            for (int i = 0; i < documentNodes.getLength(); i++) {
                Node documentNode = documentNodes.item(i);
                String docName = documentNode.getAttributes().getNamedItem("docName").getNodeValue();
                Document document = createDocument(resource2.resolve(docName), docName);
                documents.add(document);
                NodeList children = documentNode.getChildNodes();
                for (int j = 0; j < children.getLength(); j++) {
                    Node child = children.item(j);
                    // Skip whitespace text nodes and comments between the annotation records.
                    if (child.getNodeType() == Node.ELEMENT_NODE) {
                        addAnnotations(document, (Element) child);
                    }
                }
            }
        } catch (ParserConfigurationException | SAXException e) {
            // Pass the exception as last argument so the cause ends up in the log.
            log.error("Error parsing file: {}", resource, e);
        }
        return documents;
    }

    /**
     * Loads a raw text file as UTF-8 and turns it into a document.
     *
     * @param resource the raw text file
     * @param str      the ID that is assigned to the document
     * @return the created document with ID set and language set to {@code "en"}
     * @throws IOException if the file cannot be opened or closed
     */
    @NotNull
    private Document createDocument(Resource resource, String str) throws IOException {
        String text;
        try (InputStream in = resource.getInputStream();
             BufferedReader lines = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8.newDecoder()))) {
            text = lines.lines().collect(Collectors.joining("\n"));
        }
        // The replacement has the same length as the match, so character positions in the text
        // are not shifted (presumably to keep the annotation offsets valid).
        Document document = DocumentFactory.fromText(text.replace("\n\n", " \n"), DocumentFactory.Newlines.DISCARD);
        document.setId(str);
        document.setLanguage("en");
        return document;
    }

    /**
     * Creates one annotation per configured annotation type from the given record and adds it to
     * the document. Unsupported annotation types are reported as errors in the log.
     *
     * @param document the document that receives the annotations
     * @param element  the annotation record from the XML file
     */
    private void addAnnotations(Document document, Element element) {
        for (Class<? extends Annotation> type : this.annotations) {
            if (type.equals(NamedEntityAnnotation.class)) {
                document.addAnnotation(createNamedEntityAnnotation(element));
            } else if (type.equals(MentionAnnotation.class)) {
                document.addAnnotation(createMentionAnnotation(element));
            } else {
                log.error("Annotation type {} cannot be created.", type.getCanonicalName());
            }
        }
    }

    /**
     * Builds a gold named entity annotation from the {@code wikiName}, {@code mention},
     * {@code offset} and {@code length} tags of the given record.
     *
     * @param element the annotation record from the XML file
     * @return the annotation with confidence 1.0 and source {@code GOLD}
     */
    private NamedEntityAnnotation createNamedEntityAnnotation(Element element) {
        NamedEntityAnnotation annotation = new NamedEntityAnnotation();
        annotation.setRefId(readText(element, "wikiName"));
        String mention = readText(element, "mention");
        annotation.setText(mention);
        annotation.setBegin(readInt(element, "offset"));
        annotation.setLength(verifiedLength(mention, readInt(element, "length")));
        annotation.setConfidence(1.0d);
        annotation.setSource(Annotation.Source.GOLD);
        return annotation;
    }

    /**
     * Builds a gold mention annotation from the {@code mention}, {@code offset} and
     * {@code length} tags of the given record.
     *
     * @param element the annotation record from the XML file
     * @return the annotation with confidence 1.0
     */
    private MentionAnnotation createMentionAnnotation(Element element) {
        String mention = readText(element, "mention");
        int begin = readInt(element, "offset");
        int length = verifiedLength(mention, readInt(element, "length"));
        MentionAnnotation annotation = new MentionAnnotation(Annotation.Source.GOLD, mention, begin, begin + length);
        annotation.setConfidence(1.0d);
        return annotation;
    }

    /**
     * Compares the length declared in the source file with the actual length of the mention
     * text. A mismatch is logged and the actual length wins.
     *
     * @param mention        the mention text
     * @param declaredLength the length given in the source file
     * @return the length to use for the annotation
     */
    private static int verifiedLength(String mention, int declaredLength) {
        int actualLength = mention.length();
        if (declaredLength != actualLength) {
            log.warn("Error in source file: length differs for \"{}\" ({}!={})", mention, declaredLength, actualLength);
            return actualLength;
        }
        return declaredLength;
    }

    /** Returns the text content of the first descendant of {@code element} with the given tag. */
    private static String readText(Element element, String tag) {
        return getElementByTagName(element, tag).getTextContent();
    }

    /**
     * Returns the integer value of the first descendant of {@code element} with the given tag.
     * Surrounding whitespace is ignored so that pretty-printed XML does not break parsing.
     */
    private static int readInt(Element element, String tag) {
        return Integer.parseInt(readText(element, tag).trim());
    }

    /**
     * Returns the first descendant of {@code element} with the given tag name, or {@code null}
     * if there is none.
     */
    private static Node getElementByTagName(Element element, String str) {
        return element.getElementsByTagName(str).item(0);
    }
}
