package de.julielab.jcore.reader.nlmgene;

import com.pengyifan.bioc.BioCAnnotation;
import com.pengyifan.bioc.BioCCollection;
import com.pengyifan.bioc.BioCDocument;
import com.pengyifan.bioc.BioCPassage;
import com.pengyifan.bioc.io.BioCCollectionReader;
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.types.ResourceEntry;
import de.julielab.jcore.types.Title;
import de.julielab.jcore.types.pubmed.AbstractText;
import de.julielab.jcore.types.pubmed.Header;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.xml.stream.XMLStreamException;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@ResourceMetaData(name = "JCoRe NLM-Gene Reader", description = "Collection reader for the BioC format of the NLM-Gene corpus.", vendor = "JULIE Lab Jena, Germany")
@TypeCapability(inputs = {}, outputs = {"de.julielab.jcore.types.Gene", "de.julielab.jcore.types.ResourceEntry"})
/* loaded from: input_file:de/julielab/jcore/reader/nlmgene/NLMGeneReader.class */
public class NLMGeneReader extends JCasCollectionReader_ImplBase {
    public static final String PARAM_INPUT_DIR = "InputDirectory";
    public static final String PARAM_ID_LIST_PATH = "IdList";
    private static final Logger log = LoggerFactory.getLogger(NLMGeneReader.class);

    @ConfigurationParameter(name = PARAM_INPUT_DIR, description = "Path to the directory that contains the BioC XML files of the NLM-Gene corpus.")
    private String inputDir;

    @ConfigurationParameter(name = PARAM_ID_LIST_PATH, mandatory = false, description = "Path to a file with a list of IDs to restrict the read files to. This will typically be the list with IDs for the training or for the test set of the corpus. When no list is specified, the whole corpus is read.")
    private String idList;
    private Iterator<Path> corpusFileIterator;
    private int numRead;

    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);
        this.inputDir = (String) uimaContext.getConfigParameterValue(PARAM_INPUT_DIR);
        this.idList = (String) uimaContext.getConfigParameterValue(PARAM_ID_LIST_PATH);
        try {
            this.corpusFileIterator = readInputFiles(this.inputDir, this.idList);
            this.numRead = 0;
        } catch (IOException e) {
            log.error("Could not read NLM-Gene corpus input files.", e);
            throw new ResourceInitializationException(e);
        }
    }

    private Iterator<Path> readInputFiles(String str, String str2) throws IOException {
        Path of = Path.of(str, new String[0]);
        Path of2 = str2 != null ? Path.of(str2, new String[0]) : null;
        Set emptySet = (of2 == null || !Files.exists(of2, new LinkOption[0])) ? Collections.emptySet() : (Set) Files.readAllLines(of2).stream().collect(Collectors.toSet());
        return Files.list(of).filter(path -> {
            return path.toString().toLowerCase().endsWith(".xml") || path.toString().toLowerCase().endsWith(".xml.gz");
        }).filter(path2 -> {
            if (emptySet.isEmpty()) {
                return true;
            }
            return emptySet.contains(path2.getFileName().toString().replaceAll("(?i)\\.bioc\\.xml(\\.gz)?", ""));
        }).iterator();
    }

    public void getNext(JCas jCas) throws CollectionException {
        Path next = this.corpusFileIterator.next();
        try {
            BioCCollection readCollection = new BioCCollectionReader(next).readCollection();
            if (readCollection.getDocmentCount() > 1) {
                throw new IllegalArgumentException("A single document per BioC collection is expected but the collection of file " + next + " has " + readCollection.getDocmentCount() + " documents. This case is not supported.");
            }
            BioCDocument document = readCollection.getDocument(0);
            handleHeader(jCas, document);
            StringBuilder sb = new StringBuilder();
            for (BioCPassage bioCPassage : document.getPassages()) {
                int length = sb.length();
                sb.append((String) bioCPassage.getText().get());
                handlePassageStructureType(jCas, sb, bioCPassage, length);
                handleAnnotation(jCas, document, bioCPassage, sb);
                sb.append(System.getProperty("line.separator"));
            }
            jCas.setDocumentText(sb.toString());
        } catch (XMLStreamException | IOException e) {
            log.error("Could not read NLM-Gene corpus file {}", next, e);
            throw new CollectionException(e);
        }
    }

    private void handleHeader(JCas jCas, BioCDocument bioCDocument) {
        Header header = new Header(jCas);
        header.setDocId(bioCDocument.getID());
        header.setComponentId(getClass().getSimpleName());
        header.setSource("NLM-Gene");
        header.addToIndexes();
    }

    private void handleAnnotation(JCas jCas, BioCDocument bioCDocument, BioCPassage bioCPassage, StringBuilder sb) {
        for (BioCAnnotation bioCAnnotation : bioCPassage.getAnnotations()) {
            Gene gene = new Gene(jCas, bioCAnnotation.getTotalLocation().getOffset(), bioCAnnotation.getTotalLocation().getOffset() + bioCAnnotation.getTotalLocation().getLength());
            gene.setComponentId(getClass().getSimpleName());
            Optional<String> infon = bioCAnnotation.getInfon("type");
            Optional<String> infon2 = bioCAnnotation.getInfon("code");
            handleErrors(bioCDocument, bioCPassage, bioCAnnotation, gene, infon, sb);
            handleGeneId(jCas, bioCAnnotation, gene);
            handleSpecificType(gene, infon, infon2);
            gene.addToIndexes();
        }
    }

    private void handleSpecificType(Gene gene, Optional<String> optional, Optional<String> optional2) {
        gene.setSpecificType(optional.get());
        if (optional2.isPresent()) {
            gene.setSpecificType(optional.get() + "-" + optional2.get());
        }
    }

    private void handleErrors(BioCDocument bioCDocument, BioCPassage bioCPassage, BioCAnnotation bioCAnnotation, Gene gene, Optional<String> optional, StringBuilder sb) {
        if (!optional.isPresent()) {
            throw new IllegalStateException("The annotation " + bioCAnnotation.getID() + " of passage " + ((String) bioCPassage.getInfon("type").get()) + " of document " + bioCDocument.getID() + " does not specify a type.");
        }
    }

    private void handleGeneId(JCas jCas, BioCAnnotation bioCAnnotation, Gene gene) {
        Optional infon = bioCAnnotation.getInfon("NCBI Gene identifier");
        if (infon.isPresent()) {
            ResourceEntry resourceEntry = new ResourceEntry(jCas, gene.getBegin(), gene.getEnd());
            resourceEntry.setEntryId((String) infon.get());
            if (((String) infon.get()).contains("|")) {
                resourceEntry.setEntryId(((String) infon.get()).split("\\|")[0]);
            }
            resourceEntry.setComponentId(getClass().getSimpleName());
            FSArray fSArray = new FSArray(jCas, 1);
            fSArray.set(0, resourceEntry);
            gene.setResourceEntryList(fSArray);
        }
    }

    private void handlePassageStructureType(JCas jCas, StringBuilder sb, BioCPassage bioCPassage, int i) {
        Optional infon = bioCPassage.getInfon("type");
        if (infon.isPresent() && ((String) infon.get()).equals("title")) {
            Title title = new Title(jCas, i, sb.length());
            title.setTitleType("document");
            title.setComponentId(getClass().getSimpleName());
            title.addToIndexes();
            return;
        }
        if (infon.isPresent() && ((String) infon.get()).equals("abstract")) {
            AbstractText abstractText = new AbstractText(jCas, i, sb.length());
            abstractText.setComponentId(getClass().getSimpleName());
            abstractText.addToIndexes();
        }
    }

    public Progress[] getProgress() {
        return new Progress[]{new ProgressImpl(this.numRead, 0, "documents")};
    }

    public boolean hasNext() {
        return this.corpusFileIterator.hasNext();
    }
}
