package de.julielab.genemapper.resources.uima;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import de.julielab.costosys.configuration.FieldConfig;
import de.julielab.costosys.dbconnection.CoStoSysConnection;
import de.julielab.costosys.dbconnection.DataBaseConnector;
import de.julielab.costosys.dbconnection.util.CoStoSysSQLRuntimeException;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.IOStreamUtilities;
import de.julielab.jcore.types.ChunkNP;
import de.julielab.jcore.types.EntityMention;
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.types.MeshHeading;
import de.julielab.jcore.types.Organism;
import de.julielab.jcore.types.Sentence;
import de.julielab.jcore.utility.JCoReTools;
import de.julielab.jcore.utility.index.JCoReOverlapAnnotationIndex;
import de.julielab.xml.JulieXMLTools;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Stream;
import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * UIMA consumer that counts how often normalized gene synonyms occur together with species
 * taxonomy IDs and writes these counts to a database table.
 *
 * <p>Only gene mentions with the specific type {@code GazetteerGene} that overlap with at least
 * one gene annotation of a different (or no) specific type are counted. For each such mention,
 * species are collected on four levels, see {@link OccurrenceScope}. The collected occurrences
 * are buffered in memory and flushed to the database every {@value #BATCH_SIZE} processed
 * documents as well as on {@link #batchProcessComplete()} and
 * {@link #collectionProcessComplete()}.</p>
 *
 * <p>The two lookup maps are static and shared between all instances of this class. They are
 * loaded once, guarded by a {@code synchronized} block, and only read afterwards.</p>
 */
@ResourceMetaData(name = "JCoRe Synonym Species Occurrences DB Writer", description = "Counts common occurrences of gene synonyms and species mentions and stores the counts in a Postgres database. The idea is to create an a priori distribution of species for each synonym to use for disambiguation. For this purpose, two sources of gene mentions are used: A gazetteer based on the synonym dictionary created by the _makeDictionary.sh script and a higher-precision machine learning-based approach for false positive filtering. The gazetteer gene mentions must have the specific type 'GazetteerGene' to be counted. Apart from the ML-based gene mentions for FP filtering there should not any other gene mentions in the CAS to avoid counting mistakes.")
@TypeCapability(inputs = {"de.julielab.jcore.types.Organism", "de.julielab.jcore.types.Gene"})
public class SynonymSpeciesOccurrencesConsumer extends JCasAnnotator_ImplBase {
    public static final String PARAM_COSTOSYS_FILE = "CostosysConfiguration";
    public static final String PARAM_TABLE_NAME = "TableName";

    /** Table name used when {@link #PARAM_TABLE_NAME} is not set. */
    private static final String DEFAULT_TABLE_NAME = "occurrences";
    /** Name under which the field configuration of the occurrences table is registered. */
    private static final String FIELD_CONFIG_NAME = "occurrences";
    /** Specific type of the gene mentions that are counted. */
    private static final String GAZETTEER_GENE_TYPE = "GazetteerGene";
    /** Number of processed documents after which the buffered occurrences are flushed. */
    private static final long BATCH_SIZE = 500;

    private static final Logger log = LoggerFactory.getLogger(SynonymSpeciesOccurrencesConsumer.class);
    /** Maps MeSH descriptor names to taxonomy IDs; loaded from the {@code desc2tax} resource. */
    private static final Multimap<String, String> desc2TaxId = HashMultimap.create();
    /**
     * Maps the second column of {@code /reference_species.txt} to its first column. It is used
     * to add a reference species taxonomy ID for each taxonomy ID found in a document.
     */
    private static final Map<String, String> referenceSpeciesTaxIds = new HashMap<>();

    @ConfigurationParameter(name = PARAM_COSTOSYS_FILE, description = "Path to the CoStoSys configuration file the specifies the database to write to.")
    private String costosysConfiguration;

    @ConfigurationParameter(name = PARAM_TABLE_NAME, mandatory = false, description = "Optional. The name of the database table in which the species-genesynonym occurrence counts will be stored. Defaults to 'occurrences'.")
    private String tableName;
    private DataBaseConnector dbc;
    /** Buffer of (synonym, taxonomy ID, scope) triples that have not been written to the database yet. */
    private List<Triple<String, String, OccurrenceScope>> occurrences;
    private final TermNormalizer normalizer = new TermNormalizer();
    /** Number of documents passed to {@link #process(JCas)} since initialization. */
    private long processed;

    /** The level on which a gene synonym and a species were found together. */
    public enum OccurrenceScope {
        /** The species mention overlaps a sentence that overlaps the gene mention. */
        SENTENCE,
        /** The species mention overlaps a noun phrase chunk that overlaps the gene mention. */
        NP,
        /** The species was derived from a MeSH heading of the document. */
        MESH,
        /** The species is mentioned somewhere in the document but was not found on NP or sentence level. */
        DOCUMENT
    }

    /**
     * Reads the configuration parameters, connects to the database, creates the occurrences
     * table if it does not exist yet and loads the shared lookup maps.
     *
     * @param uimaContext the context providing the configuration parameter values
     * @throws ResourceInitializationException if the CoStoSys configuration file cannot be found,
     *                                         the table cannot be created for a reason other than
     *                                         already existing, or a lookup resource cannot be read
     */
    @Override
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        // Let the base class store the context so that getContext() works as expected.
        super.initialize(uimaContext);
        this.costosysConfiguration = (String) uimaContext.getConfigParameterValue(PARAM_COSTOSYS_FILE);
        this.tableName = Optional.ofNullable((String) uimaContext.getConfigParameterValue(PARAM_TABLE_NAME)).orElse(DEFAULT_TABLE_NAME);
        try {
            this.dbc = new DataBaseConnector(this.costosysConfiguration);
            addOccurrencesTableFieldConfiguration();
            this.dbc.createTable(this.tableName, FIELD_CONFIG_NAME, "Created by " + SynonymSpeciesOccurrencesConsumer.class.getCanonicalName() + " on " + new Date() + ". This table collects counts of common occurrences of gene synonyms and species taxonomy IDs. Its aim is to serve as a source of disambiguation for the jcore-gene-mapper-ae when it is not clear to which species a gene might belong and there are multiple (or even no) candidates to chose from.");
        } catch (FileNotFoundException e) {
            log.error("Could not instantiate DatabaseConnector", e);
            throw new ResourceInitializationException(e);
        } catch (CoStoSysSQLRuntimeException e) {
            final String message = e.getMessage();
            // Only an already existing table is tolerated; everything else is a real error.
            if (message == null || !message.contains("already exists")) {
                log.error("Could not create the occurrences table {}", this.tableName, e);
                throw new ResourceInitializationException(e);
            }
            log.info("The table {} already exists. The exception raised on its creation is expected and is no cause for alarm.", this.tableName);
        }
        this.occurrences = new ArrayList<>();
        this.processed = 0L;
        readDesc2TaxMap();
        try {
            readReferenceMap();
        } catch (IOException e) {
            log.error("Could not read the reference species taxonomy ID map", e);
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Loads {@code /reference_species.txt} into {@link #referenceSpeciesTaxIds} unless the map has
     * already been filled. Each non-blank line must consist of at least two tab-separated
     * columns; the second column becomes the key and the first column the value.
     *
     * @throws IOException if the resource cannot be found or read or contains a malformed line
     */
    private void readReferenceMap() throws IOException {
        synchronized (referenceSpeciesTaxIds) {
            if (!referenceSpeciesTaxIds.isEmpty()) {
                return;
            }
            // Load into a temporary map first so that a failure does not leave the shared map
            // partially filled, which would prevent any later attempt to load it.
            final Map<String, String> loaded = new HashMap<>();
            try (InputStream resource = FileUtilities.findResource("/reference_species.txt")) {
                if (resource == null) {
                    throw new FileNotFoundException("Could not find the reference species mapping file expected on the classpath as /reference_species.txt.");
                }
                try (BufferedReader reader = IOStreamUtilities.getReaderFromInputStream(resource)) {
                    String line;
                    int lineNumber = 0;
                    while ((line = reader.readLine()) != null) {
                        ++lineNumber;
                        if (line.trim().isEmpty()) {
                            continue;
                        }
                        final String[] columns = line.split("\t");
                        if (columns.length < 2) {
                            throw new IOException("Malformed line " + lineNumber + " in /reference_species.txt, expected two tab-separated columns: " + line);
                        }
                        loaded.put(columns[1], columns[0]);
                    }
                }
            }
            referenceSpeciesTaxIds.putAll(loaded);
        }
    }

    /**
     * Registers the field configuration of the occurrences table with the database connector.
     * The columns {@code synonym}, {@code tax_id} and {@code scope} form the primary key,
     * {@code count} holds the number of observations.
     */
    private void addOccurrencesTableFieldConfiguration() {
        final List<Map<String, String>> fields = new ArrayList<>();
        fields.add(JulieXMLTools.createField(new String[]{"name", "synonym", "type", "text", "retrieve", "true", "primaryKey", "true"}));
        fields.add(JulieXMLTools.createField(new String[]{"name", "tax_id", "type", "integer", "retrieve", "true", "primaryKey", "true"}));
        fields.add(JulieXMLTools.createField(new String[]{"name", "scope", "type", "text", "retrieve", "true", "primaryKey", "true"}));
        fields.add(JulieXMLTools.createField(new String[]{"name", "count", "type", "integer", "retrieve", "true"}));
        this.dbc.addFieldConfiguration(new FieldConfig(fields, (String) null, FIELD_CONFIG_NAME));
    }

    /**
     * Loads the {@code /desc2tax} resource (or {@code /desc2tax.gz} if the former is not found)
     * into {@link #desc2TaxId} unless the map has already been filled. Each non-blank line must
     * consist of at least two tab-separated columns: the descriptor name and the taxonomy ID.
     *
     * @throws ResourceInitializationException if the resource cannot be found or read or contains
     *                                         a malformed line
     */
    private void readDesc2TaxMap() throws ResourceInitializationException {
        synchronized (desc2TaxId) {
            if (!desc2TaxId.isEmpty()) {
                return;
            }
            // Load into a temporary map first so that a failure does not leave the shared map
            // partially filled, which would prevent any later attempt to load it.
            final Multimap<String, String> loaded = HashMultimap.create();
            try {
                InputStream resource = FileUtilities.findResource("/desc2tax");
                if (resource == null) {
                    resource = FileUtilities.findResource("/desc2tax.gz");
                }
                if (resource == null) {
                    throw new ResourceInitializationException(new FileNotFoundException("Could not find the desc2tax file that maps MeSH descriptor names to taxonomy IDs."));
                }
                try (BufferedReader reader = IOStreamUtilities.getReaderFromInputStream(resource)) {
                    String line;
                    int lineNumber = 0;
                    while ((line = reader.readLine()) != null) {
                        ++lineNumber;
                        if (line.trim().isEmpty()) {
                            continue;
                        }
                        final String[] columns = line.split("\t");
                        if (columns.length < 2) {
                            throw new IOException("Malformed line " + lineNumber + " in the desc2tax file, expected two tab-separated columns: " + line);
                        }
                        // Taxonomy IDs repeat often, interning avoids storing duplicates.
                        loaded.put(columns[0].trim(), columns[1].trim().intern());
                    }
                }
            } catch (IOException e) {
                log.error("IOException while searching for the descriptor to taxonomy ID mapping file", e);
                throw new ResourceInitializationException(e);
            }
            desc2TaxId.putAll(loaded);
        }
    }

    /**
     * Collects the synonym-species occurrences of the given document into the in-memory buffer
     * and flushes the buffer to the database after every {@value #BATCH_SIZE} documents.
     *
     * @param jCas the document to process
     * @throws AnalysisEngineProcessException if flushing the buffer to the database fails
     */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        final FSIterator<Annotation> geneIt = jCas.getAnnotationIndex(Gene.type).iterator();
        if (geneIt.hasNext()) {
            collectOccurrences(jCas, geneIt);
        }
        this.processed++;
        if (this.processed % BATCH_SIZE == 0) {
            try {
                log.debug("Triggering batchProcessComplete after 500 processed documents.");
                batchProcessComplete();
            } catch (AnalysisEngineProcessException e) {
                log.error("Exception while executing batchProcessComplete", e);
                throw e;
            }
        }
    }

    /**
     * Adds the occurrences of all counted gene mentions of a document to the buffer. Sentence
     * level species are only collected for a mention when no NP level species were found for it.
     * MeSH species are added for every counted mention. Document level species are added once per
     * synonym for those taxonomy IDs that were not found on NP or sentence level for the synonym.
     */
    private void collectOccurrences(JCas jCas, FSIterator<Annotation> geneIt) {
        final Set<String> documentTaxIds = getAllTaxIdsInDocument(jCas);
        final Set<String> meshTaxIds = getMeshTaxIds(jCas);
        // Taxonomy IDs found on NP or sentence level, by synonym.
        final Multimap<String, String> localTaxIdsBySynonym = HashMultimap.create();
        final Set<String> synonyms = new HashSet<>();
        final JCoReOverlapAnnotationIndex<Sentence> sentenceIndex = new JCoReOverlapAnnotationIndex<>(jCas, Sentence.type);
        final JCoReOverlapAnnotationIndex<Organism> organismIndex = new JCoReOverlapAnnotationIndex<>(jCas, Organism.type);
        final JCoReOverlapAnnotationIndex<ChunkNP> chunkIndex = new JCoReOverlapAnnotationIndex<>(jCas, ChunkNP.type);
        final JCoReOverlapAnnotationIndex<Gene> geneIndex = new JCoReOverlapAnnotationIndex<>(jCas, Gene.type);

        while (geneIt.hasNext()) {
            final EntityMention mention = (EntityMention) geneIt.next();
            if (!isConfirmedGazetteerGene(mention, geneIndex)) {
                continue;
            }
            final String synonym = getSynonym(mention);
            synonyms.add(synonym);

            final int sizeBeforeLocalSearch = this.occurrences.size();
            final Optional<ChunkNP> chunk = chunkIndex.search(mention).stream().findAny();
            if (chunk.isPresent()) {
                addSpeciesOccurrences(chunk.get(), organismIndex, synonym, OccurrenceScope.NP, localTaxIdsBySynonym);
            }
            // Fall back to the sentence only when the noun phrase did not yield any species.
            if (this.occurrences.size() == sizeBeforeLocalSearch) {
                final Optional<Sentence> sentence = sentenceIndex.search(mention).stream().findAny();
                if (sentence.isPresent()) {
                    addSpeciesOccurrences(sentence.get(), organismIndex, synonym, OccurrenceScope.SENTENCE, localTaxIdsBySynonym);
                }
            }
            for (String meshTaxId : meshTaxIds) {
                this.occurrences.add(new ImmutableTriple<>(synonym, meshTaxId, OccurrenceScope.MESH));
            }
        }

        for (String synonym : synonyms) {
            final Collection<String> localTaxIds = localTaxIdsBySynonym.get(synonym);
            for (String taxId : documentTaxIds) {
                if (!localTaxIds.contains(taxId)) {
                    this.occurrences.add(new ImmutableTriple<>(synonym, taxId, OccurrenceScope.DOCUMENT));
                }
            }
        }
    }

    /**
     * Tells whether the mention is a gazetteer gene that overlaps with at least one gene
     * annotation whose specific type is not {@code GazetteerGene}.
     */
    private boolean isConfirmedGazetteerGene(EntityMention mention, JCoReOverlapAnnotationIndex<Gene> geneIndex) {
        return GAZETTEER_GENE_TYPE.equals(mention.getSpecificType())
                && geneIndex.search(mention).stream().anyMatch(gene -> !GAZETTEER_GENE_TYPE.equals(gene.getSpecificType()));
    }

    /**
     * Buffers one occurrence for each taxonomy ID, and its reference species taxonomy ID if there
     * is one, of the organisms overlapping the given annotation. The taxonomy IDs are also
     * recorded in {@code localTaxIdsBySynonym}.
     */
    private void addSpeciesOccurrences(Annotation scopeAnnotation, JCoReOverlapAnnotationIndex<Organism> organismIndex, String synonym, OccurrenceScope scope, Multimap<String, String> localTaxIdsBySynonym) {
        organismIndex.search(scopeAnnotation).stream()
                .map(organism -> organism.getResourceEntryList(0).getEntryId())
                .flatMap(taxId -> Stream.of(taxId, referenceSpeciesTaxIds.get(taxId)))
                .filter(Objects::nonNull)
                .forEach(taxId -> {
                    localTaxIdsBySynonym.put(synonym, taxId);
                    this.occurrences.add(new ImmutableTriple<>(synonym, taxId, scope));
                });
    }

    /**
     * Returns the normalized covered text of the mention.
     *
     * @throws StringIndexOutOfBoundsException if the offsets of the mention are invalid; the
     *                                         offending offsets are logged before rethrowing
     */
    private String getSynonym(EntityMention entityMention) {
        try {
            return this.normalizer.normalize(entityMention.getCoveredText());
        } catch (StringIndexOutOfBoundsException e) {
            try {
                log.error("Gene mention in document {} has invalid offsets: {}-{} (document text length: {})", JCoReTools.getDocId(entityMention.getCAS().getJCas()), entityMention.getBegin(), entityMention.getEnd(), entityMention.getCAS().getDocumentText().length());
            } catch (CASException casException) {
                log.error("Gene mention has invalid offsets. Cannot output more details due to CASException when trying to obtain more information.", casException);
            }
            throw e;
        }
    }

    /**
     * Returns the taxonomy IDs mapped to the descriptor names of the MeSH headings of the
     * document, together with their reference species taxonomy IDs where available.
     */
    private Set<String> getMeshTaxIds(JCas jCas) {
        final Set<String> taxIds = new HashSet<>();
        final FSIterator<Annotation> it = jCas.getAnnotationIndex(MeshHeading.type).iterator();
        while (it.hasNext()) {
            final MeshHeading heading = (MeshHeading) it.next();
            // Multimap.get() returns an empty collection for unknown keys, never null.
            for (String taxId : desc2TaxId.get(heading.getDescriptorName())) {
                taxIds.add(taxId);
                final String referenceTaxId = referenceSpeciesTaxIds.get(taxId);
                if (referenceTaxId != null) {
                    taxIds.add(referenceTaxId);
                }
            }
        }
        return taxIds;
    }

    /**
     * Returns the entry ID of the first resource entry of every organism annotation in the
     * document, together with the reference species taxonomy IDs where available.
     */
    private Set<String> getAllTaxIdsInDocument(JCas jCas) {
        final Set<String> taxIds = new HashSet<>();
        final FSIterator<Annotation> it = jCas.getAnnotationIndex(Organism.type).iterator();
        while (it.hasNext()) {
            final Organism organism = (Organism) it.next();
            final String taxId = organism.getResourceEntryList(0).getEntryId();
            taxIds.add(taxId);
            final String referenceTaxId = referenceSpeciesTaxIds.get(taxId);
            if (referenceTaxId != null) {
                taxIds.add(referenceTaxId);
            }
        }
        return taxIds;
    }

    /**
     * Writes the buffered occurrences to the database in one batch and transaction. For each
     * occurrence, a row with count 1 is inserted or the count of the existing row is incremented.
     * The buffer is cleared only after a successful commit.
     *
     * @throws SQLException if the database rejects the batch or the connection cannot be used
     */
    private void sendOccurrencesToDatabase() throws SQLException {
        if (this.occurrences.isEmpty()) {
            log.debug("There are no gene synonym species occurrences to send to the database.");
            return;
        }
        log.debug("Sending {} gene synonym species occurrences to the database", this.occurrences.size());
        // The table name cannot be bound as a statement parameter. It stems from the component
        // configuration, not from document content.
        final String sql = "INSERT INTO " + this.tableName + "(synonym,tax_id,scope,count) values(?,?,?,1) ON CONFLICT ON CONSTRAINT " + this.tableName + "_pkey DO UPDATE SET count = " + this.tableName + ".count + 1";
        try (CoStoSysConnection connection = this.dbc.obtainOrReserveConnection()) {
            final boolean previousAutoCommit = connection.getAutoCommit();
            connection.setAutoCommit(false);
            Exception failure = null;
            try (PreparedStatement statement = connection.prepareStatement(sql)) {
                for (Triple<String, String, OccurrenceScope> occurrence : this.occurrences) {
                    statement.setString(1, occurrence.getLeft());
                    statement.setInt(2, Integer.parseInt(occurrence.getMiddle()));
                    statement.setString(3, occurrence.getRight().name());
                    statement.addBatch();
                }
                statement.executeBatch();
                connection.commit();
            } catch (SQLException | RuntimeException e) {
                failure = e;
                throw e;
            } finally {
                // The connection may be reused by others, so always restore its auto commit mode.
                // NOTE(review): no explicit rollback is issued on failure because the rollback
                // API of CoStoSysConnection is not visible from this file - confirm.
                try {
                    connection.setAutoCommit(previousAutoCommit);
                } catch (SQLException e) {
                    if (failure == null) {
                        throw e;
                    }
                    failure.addSuppressed(e);
                }
            }
            this.occurrences.clear();
        }
        log.debug("Finished sending gene synonym species occurrences to the database.");
    }

    /**
     * Flushes the buffered occurrences to the database.
     *
     * @throws AnalysisEngineProcessException if the occurrences cannot be written
     */
    @Override
    public void batchProcessComplete() throws AnalysisEngineProcessException {
        log.debug("Batch processing complete.");
        flushOccurrences();
    }

    /**
     * Flushes the remaining buffered occurrences to the database.
     *
     * @throws AnalysisEngineProcessException if the occurrences cannot be written
     */
    @Override
    public void collectionProcessComplete() throws AnalysisEngineProcessException {
        log.debug("Collection processing complete.");
        flushOccurrences();
    }

    /** Sends the buffered occurrences to the database and converts SQL failures for UIMA. */
    private void flushOccurrences() throws AnalysisEngineProcessException {
        try {
            sendOccurrencesToDatabase();
        } catch (SQLException e) {
            log.error("Could not send the collected gene synonym species occurrences to the database", e);
            throw new AnalysisEngineProcessException(e);
        }
    }
}
