package de.julielab.geneexpbase.data;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import de.julielab.geneexpbase.TermNormalizer;
import de.julielab.geneexpbase.candidateretrieval.CandidateRetrieval;
import de.julielab.geneexpbase.candidateretrieval.QueryGenerator;
import de.julielab.geneexpbase.candidateretrieval.SynHit;
import de.julielab.geneexpbase.configuration.Configuration;
import de.julielab.geneexpbase.genemodel.Acronym;
import de.julielab.geneexpbase.genemodel.Apposition;
import de.julielab.geneexpbase.genemodel.CoreferenceSet;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneMention;
import de.julielab.geneexpbase.genemodel.GeneOrthologs;
import de.julielab.geneexpbase.genemodel.MeshHeading;
import de.julielab.geneexpbase.genemodel.PosTag;
import de.julielab.geneexpbase.genemodel.SpeciesCandidates;
import de.julielab.geneexpbase.genemodel.SpeciesMention;
import de.julielab.java.utilities.spanutils.OffsetMap;
import de.julielab.java.utilities.spanutils.OffsetSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.inject.Inject;
import javax.inject.Named;
import org.apache.commons.lang3.Range;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/geneexpbase/data/DocumentLoader.class */
/**
 * Builds {@link GeneDocument} instances from the corpus files described by a {@code DocumentSourceFiles}
 * object. The individual files are parsed through {@code CorpusReader}; this class wires the parsed
 * annotations (gold and predicted gene mentions, acronyms, coreferences, appositions, sentences, chunks,
 * POS tags, species, ontology class mentions, MeSH headings) into one document object per document ID.
 */
public class DocumentLoader {
    private static final Logger log = LoggerFactory.getLogger(DocumentLoader.class);

    /** Maximum number of gold IDs handed to a single candidate retrieval call. */
    private static final int GOLD_ID_BATCH_SIZE = 1024;

    /**
     * Entity types named in the "no predicted genes" error message.
     * NOTE(review): this list is hard-coded while the actual filter passed to the reader is
     * {@code documentSourceFiles.getAllowedGeneTypes()}; the two may disagree - confirm and align.
     */
    private static final List<String> REPORTED_ENTITY_TYPES = Arrays.asList("Gene", "protein", "protein_complex", "protein_enum", "protein_familiy_or_group");

    private final CandidateRetrieval candidateRetrieval;
    private final QueryGenerator queryGeneratorForMentionInference;
    private final GeneOrthologs geneOrthologs;
    private final Configuration configuration;
    private final TermNormalizer normalizer;

    /**
     * Creates a loader with its injected collaborators.
     *
     * @param candidateRetrieval used to map gene IDs to taxonomy IDs and to retrieve candidates when gold
     *                           mentions are inferred from document-level labels
     * @param queryGenerator     the query generator bound to the name {@code "IdInference"}; used for the
     *                           candidate retrieval during gold mention inference
     * @param geneOrthologs      passed through to {@link #getGeneDocument}
     * @param termNormalizer     set on every created document
     * @param configuration      stored but not read within this class
     */
    @Inject
    public DocumentLoader(CandidateRetrieval candidateRetrieval, @Named("IdInference") QueryGenerator queryGenerator, GeneOrthologs geneOrthologs, TermNormalizer termNormalizer, Configuration configuration) {
        this.candidateRetrieval = candidateRetrieval;
        this.queryGeneratorForMentionInference = queryGenerator;
        this.geneOrthologs = geneOrthologs;
        this.normalizer = termNormalizer;
        this.configuration = configuration;
    }

    /**
     * Sets, on every predicted mention, the list of gold mentions that overlap it.
     * Only predicted mentions whose document ID is contained in {@code docIds} and for whose document
     * at least one gold mention was given are touched.
     *
     * @param goldMentionLists  gold mentions, grouped in lists
     * @param predictedMentions predicted mentions; all values are inspected
     * @param docIds            the document IDs for which the assignment should be made
     */
    private static void setGoldMentionsToPredictions(Collection<List<GeneMention>> goldMentionLists, Multimap<String, GeneMention> predictedMentions, Set<String> docIds) {
        // docId -> (offsets -> gold mentions at exactly these offsets)
        Map<String, OffsetMap<List<GeneMention>>> goldByDocument = goldMentionLists.stream()
                .flatMap(mentions -> mentions.stream())
                .collect(Collectors.groupingBy(
                        gold -> gold.getDocId(),
                        HashMap::new,
                        Collectors.toMap(
                                gold -> gold.getOffsets(),
                                gold -> {
                                    List<GeneMention> mentionsAtOffsets = new ArrayList<>();
                                    mentionsAtOffsets.add(gold);
                                    return mentionsAtOffsets;
                                },
                                (left, right) -> {
                                    left.addAll(right);
                                    return left;
                                },
                                OffsetMap::new)));
        for (GeneMention predicted : predictedMentions.values()) {
            if (!docIds.contains(predicted.getDocId())) {
                continue;
            }
            OffsetMap<List<GeneMention>> goldInDocument = goldByDocument.get(predicted.getDocId());
            if (goldInDocument == null) {
                continue;
            }
            List<GeneMention> overlappingGold = goldInDocument.getOverlapping(predicted).values().stream()
                    .flatMap(mentions -> mentions.stream())
                    .collect(Collectors.toList());
            predicted.setOverlappingGoldMentions(new ArrayList<>(overlappingGold));
        }
    }

    /**
     * Reads all corpus files referenced by {@code documentSourceFiles} and returns a lazily evaluated
     * stream with one {@link GeneDocument} per document ID found in the document text file.
     * <p>
     * The gold gene list is first read as mentions with offsets. If that fails for any reason, it is read
     * as document-level gold IDs instead; in that case gold mentions are inferred from the predicted
     * mentions when {@code documentSourceFiles.getInferDocumentLevelLabelsToMentions()} is set.
     * Gold mentions without an ID, or whose first ID is {@link GeneMention#NOID}, are discarded.
     *
     * @param documentSourceFiles description of the input files and reading options
     * @return a stream of documents; the documents are built when the stream is consumed
     * @throws DocumentLoadingException if reading one of the files raises an {@link IOException}
     * @throws IllegalArgumentException if no predicted gene mention could be read
     */
    public Stream<GeneDocument> getDocuments(DocumentSourceFiles documentSourceFiles) throws DocumentLoadingException {
        try {
            Multimap<String, GeneMention> loadedGold;
            boolean offsetsAvailable = false;
            boolean inferMentions = false;
            try {
                loadedGold = CorpusReader.readMentionsWithOffsets(documentSourceFiles.getGoldGeneList());
                offsetsAvailable = true;
            } catch (Exception e) {
                // Deliberate best-effort fallback, but leave a trace of why the first format was rejected.
                log.debug("Could not read gold gene list {} as mentions with offsets, falling back to document-level gold IDs.", documentSourceFiles.getGoldGeneList(), e);
                loadedGold = CorpusReader.readGoldIds(documentSourceFiles.getGoldGeneList());
                inferMentions = documentSourceFiles.getInferDocumentLevelLabelsToMentions();
            }
            // Final copies so that the values can be captured by the lambda below.
            final Multimap<String, GeneMention> goldMentions = loadedGold;
            final boolean goldHasOffsets = offsetsAvailable;
            final boolean inferGoldMentions = inferMentions;

            // Explicit iterator removal: only calls remove() for elements that actually match.
            Iterator<GeneMention> goldIterator = goldMentions.values().iterator();
            while (goldIterator.hasNext()) {
                GeneMention gold = goldIterator.next();
                if (gold.getIds().isEmpty() || gold.getIds().get(0).equals(GeneMention.NOID)) {
                    goldIterator.remove();
                }
            }
            for (GeneMention gold : goldMentions.values()) {
                gold.setTagger(GeneMention.GeneTagger.GOLD);
                gold.setSpecificType(gold.getSpecificType() != null ? gold.getSpecificType() : documentSourceFiles.getDefaultEntityType());
            }

            final Multimap<String, GeneMention> predictedGenes = CorpusReader.readMixedFileForGenesWithOffsets(documentSourceFiles.getPredictedGenesPath(), documentSourceFiles.getAllowedGeneTypes(), documentSourceFiles.getTaggersToUse());
            final Multimap<String, Acronym> acronyms = CorpusReader.readAcronymAnnotations(documentSourceFiles.getAcronymsPath());
            final Multimap<String, CoreferenceSet> coreferences = CorpusReader.readCoreferenceAnnotations(documentSourceFiles.getCorefPath());
            final Multimap<String, Apposition> appositions = CorpusReader.readAppositionAnnotations(documentSourceFiles.getAppositionsPath());
            final Map<String, String> documentTexts = CorpusReader.readGeneContexts(documentSourceFiles.getDocTextPath());
            final Multimap<String, Range<Integer>> sentenceOffsets = CorpusReader.readMixedFileForSentenceOffsets(documentSourceFiles.getSentencesPath());
            final Multimap<String, Range<Integer>> nonGenePhraseOffsets = CorpusReader.readMixedFileForNonGenePhraseOffsets(documentSourceFiles.getSentencesPath());
            final Map<String, OffsetMap<SpeciesMention>> textSpecies = CorpusReader.readMixedFileForTextSpecies(documentSourceFiles.getSpeciesPath());
            final Map<String, OffsetMap<String>> chunks = CorpusReader.readMixedFileForChunkOffsets(documentSourceFiles.getChunksPath());
            final Map<String, OffsetMap<String>> ontologyClassMentions = CorpusReader.readMixedFileForOntologyClassMentions(documentSourceFiles.getOntologyMentionsPath());
            final Multimap<String, PosTag> posTags = CorpusReader.readMixedFileForPosTags(documentSourceFiles.getPosPath());

            Multimap<String, MeshHeading> loadedMesh = HashMultimap.create();
            if (documentSourceFiles.hashMesh()) {
                loadedMesh = CorpusReader.readMeshHeadings(documentSourceFiles.getMeshPath());
            }
            if (documentSourceFiles.hasSubstances()) {
                // Substances are read with the MeSH heading reader and merged into the same multimap.
                loadedMesh.putAll(CorpusReader.readMeshHeadings(documentSourceFiles.getSubstancesPath()));
            }
            if (predictedGenes.isEmpty()) {
                throw new IllegalArgumentException("Could not find any entity of types '" + REPORTED_ENTITY_TYPES + "' of tagger '" + documentSourceFiles.getTaggersToUse() + "' in " + documentSourceFiles.getPredictedGenesPath() + ".");
            }
            final Multimap<String, MeshHeading> meshHeadings = loadedMesh;
            final boolean speciesCorpus = documentSourceFiles.isSpeciesCorpus();
            final Set<String> documentIds = documentTexts.keySet();

            if (documentSourceFiles.isHasGeneIds()) {
                for (GeneMention gold : goldMentions.values()) {
                    if (!documentIds.contains(gold.getDocId())) {
                        continue;
                    }
                    // Replace IDs that have an entry in GeneInformation.REPLACED; keep all others.
                    gold.setIds(gold.getIds().stream()
                            .map(id -> GeneInformation.REPLACED.getOrDefault(id, id))
                            .collect(Collectors.toList()));
                    gold.setTaxonomyId(this.candidateRetrieval.mapGeneIdToTaxId(gold.getGoldMentionId()));
                    String taxonomyId = gold.getTaxonomyId();
                    // A missing taxonomy must produce the warning, not a NullPointerException.
                    if (taxonomyId == null || taxonomyId.isBlank()) {
                        log.warn("Could not retrieve the taxonomy of the gold gene ID {}", gold.getIds());
                    }
                }
            } else if (speciesCorpus) {
                // In a species corpus the gold "IDs" are used as the taxonomy IDs.
                goldMentions.values().forEach(gold -> gold.setTaxonomyIds(gold.getIds()));
            }

            return documentIds.stream().map(docId -> {
                GeneDocument document = getGeneDocument(this.normalizer, this.candidateRetrieval, goldMentions, predictedGenes, acronyms, coreferences, appositions, documentTexts, sentenceOffsets, nonGenePhraseOffsets, textSpecies, chunks, posTags, ontologyClassMentions, meshHeadings, speciesCorpus, this.geneOrthologs, goldHasOffsets, inferGoldMentions, docId);
                if (goldHasOffsets || inferGoldMentions) {
                    setGoldMentionsToPredictions(document.getGoldGenes().values(), predictedGenes, documentIds);
                }
                document.setCompletelyAnnotated(documentSourceFiles.isCompletelyAnnotated());
                return document;
            });
        } catch (IOException e) {
            throw new DocumentLoadingException(e);
        }
    }

    /**
     * Creates the {@link GeneDocument} for a single document ID from already parsed annotation data.
     * <p>
     * The document text is split at line breaks; blank lines are dropped. With two or more remaining
     * lines the first is the title and the second the abstract. A single line is treated as the abstract
     * if it is longer than 300 characters and as the title otherwise.
     * <p>
     * All mentions set as the document's genes get their ID reset to {@link GeneMention#NOID} and their
     * taxonomy ID reset to {@code null}.
     *
     * @param termNormalizer     set on the created document
     * @param candidateRetrieval used to map gold gene IDs to taxonomy IDs and for gold mention inference
     * @param multimap           gold mentions by document ID
     * @param multimap2          mentions by document ID that become the document's genes
     * @param multimap3          acronyms by document ID
     * @param multimap4          coreference sets by document ID
     * @param multimap5          appositions by document ID
     * @param map                document text by document ID; must contain an entry for {@code str}
     * @param multimap6          sentence offsets by document ID
     * @param multimap7          non-gene phrase offsets by document ID
     * @param map2               species mentions by document ID
     * @param map3               chunks by document ID
     * @param multimap8          POS tags by document ID
     * @param map4               ontology class mentions by document ID
     * @param multimap9          MeSH headings by document ID
     * @param z                  if {@code true}, the IDs of the document's gold mentions are copied to their
     *                           taxonomy IDs, their ID is set to {@code null} and the gold taxonomy IDs of the
     *                           document are set to its gold IDs
     * @param geneOrthologs      passed on to {@link #inferDocumentLevelLabelsToMentions}
     * @param z2                 if {@code true}, the gold mentions of {@code multimap} are added to the document
     *                           as gold genes
     * @param z3                 only read when {@code z2} is {@code false}: if {@code true}, gold mentions are
     *                           inferred via {@link #inferDocumentLevelLabelsToMentions}
     * @param str                the ID of the document to create
     * @return the populated document
     */
    public GeneDocument getGeneDocument(TermNormalizer termNormalizer, CandidateRetrieval candidateRetrieval, Multimap<String, GeneMention> multimap, Multimap<String, GeneMention> multimap2, Multimap<String, Acronym> multimap3, Multimap<String, CoreferenceSet> multimap4, Multimap<String, Apposition> multimap5, Map<String, String> map, Multimap<String, Range<Integer>> multimap6, Multimap<String, Range<Integer>> multimap7, Map<String, OffsetMap<SpeciesMention>> map2, Map<String, OffsetMap<String>> map3, Multimap<String, PosTag> multimap8, Map<String, OffsetMap<String>> map4, Multimap<String, MeshHeading> multimap9, boolean z, GeneOrthologs geneOrthologs, boolean z2, boolean z3, String str) {
        final String documentText = map.get(str);

        GeneDocument geneDocument = new GeneDocument(str);
        geneDocument.setTermNormalizer(termNormalizer);
        geneDocument.setAcronyms(new HashSet<Acronym>(multimap3.get(str)));
        geneDocument.setCoreferenceRelations(multimap4.get(str));
        geneDocument.setAppositions(multimap5.get(str));

        String[] paragraphs = Stream.of(documentText.split("\\n"))
                .filter(Predicate.not(String::isBlank))
                .toArray(String[]::new);
        String title = null;
        String abstractText = null;
        if (paragraphs.length > 1) {
            title = paragraphs[0];
            abstractText = paragraphs[1];
        } else if (paragraphs.length == 1) {
            // Heuristic: a lone paragraph of more than 300 characters is taken to be the abstract.
            if (paragraphs[0].length() > 300) {
                abstractText = paragraphs[0];
            } else {
                title = paragraphs[0];
            }
        }
        if (title != null) {
            geneDocument.setTitleOffsets(Range.between(0, title.length()));
        }
        // Without a title, setTitleOffsets was not called above. Guard against the document not having
        // title offsets in that case instead of dereferencing them unconditionally.
        Range<Integer> titleOffsets = geneDocument.getTitleOffsets();
        final int titleBegin = titleOffsets != null ? titleOffsets.getMinimum() : 0;
        final int titleEnd = titleOffsets != null ? titleOffsets.getMaximum() : 0;
        final int abstractBegin = titleOffsets != null ? titleEnd + 1 : 0;
        geneDocument.setAbstractOffsets(Range.between(abstractBegin, documentText.length()));

        geneDocument.setDocumentTitle(title);
        geneDocument.setDocumentAbstract(abstractText);
        geneDocument.setDocumentText(documentText);
        geneDocument.setChunks(map3.get(str));
        geneDocument.setOntologyClassMentions(map4.get(str));
        geneDocument.setPosTags(multimap8.get(str));
        geneDocument.setSpecies(new SpeciesCandidates(titleBegin, titleEnd, Collections.emptySet(), map2.get(str)));
        geneDocument.setSentences(new OffsetSet(multimap6.get(str)));
        geneDocument.setNonGenePhrases(new OffsetSet(multimap7.get(str)));
        geneDocument.setMeshHeadings(multimap9.get(str));
        geneDocument.setGenes(new HashSet<GeneMention>(multimap2.get(str)));
        geneDocument.getAllGenes().forEach(gene -> {
            gene.setDocumentContext(geneDocument.getDocumentText());
            gene.setId(GeneMention.NOID);
            gene.setTaxonomyId(null);
        });
        geneDocument.selectAllGenes();

        if (z2) {
            multimap.get(str).forEach(geneDocument::putGoldGene);
            geneDocument.setGoldIds(collectIds(multimap.get(str)));
        } else {
            Set<String> goldIds = collectIds(multimap.get(str));
            if (z3) {
                inferDocumentLevelLabelsToMentions(geneDocument, goldIds, candidateRetrieval, geneOrthologs, z);
            }
            geneDocument.setGoldIds(goldIds);
            geneDocument.setGoldTaxonomyIds(goldIds.stream()
                    .map(goldId -> candidateRetrieval.mapGeneIdToTaxId(goldId))
                    .collect(Collectors.toSet()));
        }
        if (z) {
            geneDocument.getGoldGenes().values().stream()
                    .flatMap(mentions -> mentions.stream())
                    .forEach(gold -> {
                        gold.setTaxonomyIds(gold.getIds());
                        gold.setId(null);
                    });
            geneDocument.setGoldTaxonomyIds(new HashSet<String>(geneDocument.getGoldIds()));
        }
        geneDocument.addState(GeneDocument.State.REFERENCE_SPECIES_ADDED);
        geneDocument.setGoldMentionsWithOffsets(z2 || z3);
        return geneDocument;
    }

    /** Returns the set of all IDs carried by the given mentions. */
    private static Set<String> collectIds(Collection<GeneMention> mentions) {
        return mentions.stream()
                .flatMap(mention -> mention.getIds().stream())
                .collect(Collectors.toSet());
    }

    /**
     * Derives mention-level gold annotations from document-level gold IDs.
     * <p>
     * For every gene mention of the document, candidates are retrieved restricted to the gold IDs (in
     * batches of at most {@value #GOLD_ID_BATCH_SIZE} IDs per retrieval call). If candidates are found, a
     * copy of the mention carrying the ID of the first candidate is added to the document as gold gene and
     * set as the mention's only overlapping gold mention. Mentions without any candidate are removed from
     * the document. Finally the document is flagged as having gold mentions with inferred offsets.
     *
     * @param geneDocument       the document whose mentions are processed; modified in place
     * @param set                the document-level gold IDs
     * @param candidateRetrieval used for candidate retrieval and gene ID to taxonomy ID mapping
     * @param geneOrthologs      not used by this method
     * @param z                  not used by this method
     */
    public void inferDocumentLevelLabelsToMentions(GeneDocument geneDocument, Set<String> set, CandidateRetrieval candidateRetrieval, GeneOrthologs geneOrthologs, boolean z) {
        final boolean needsBatching = set.size() > GOLD_ID_BATCH_SIZE;
        if (needsBatching) {
            log.debug("Document {} has {} goldIds", geneDocument.getId(), set.size());
        }
        // Collected first and removed after the loop so the gene collection is not modified while iterating.
        List<GeneMention> mentionsWithoutCandidates = new ArrayList<>();
        for (GeneMention geneMention : geneDocument.getGenesIterable()) {
            List<SynHit> candidates = new ArrayList<>();
            Iterator<String> goldIdIterator = set.iterator();
            while (goldIdIterator.hasNext()) {
                List<String> goldIdBatch = new ArrayList<>(GOLD_ID_BATCH_SIZE);
                for (int i = 0; i < GOLD_ID_BATCH_SIZE && goldIdIterator.hasNext(); i++) {
                    goldIdBatch.add(goldIdIterator.next());
                }
                try {
                    candidates.addAll(candidateRetrieval.getCandidates(geneMention, goldIdBatch, Collections.emptySet(), this.queryGeneratorForMentionInference));
                    if (needsBatching) {
                        log.debug("Retrieved {} candidates for gene {} in document {}", candidates.size(), geneMention.getText(), geneDocument.getId());
                    }
                } catch (Exception e) {
                    log.error("Could not retrieve candidates for gene {} and goldIds {}.", geneMention, set);
                    throw e;
                }
            }
            if (candidates.isEmpty()) {
                mentionsWithoutCandidates.add(geneMention);
                continue;
            }
            // The first candidate determines the inferred gold ID.
            // NOTE(review): an ortholog lookup used to be computed here, but its result (including a
            // Sets.intersection with the gold IDs) was never used; it has been removed.
            SynHit bestHit = candidates.get(0);
            GeneMention inferredGold = new GeneMention(geneMention);
            inferredGold.setDocId(geneDocument.getId());
            inferredGold.setIds(List.of(bestHit.getId()));
            inferredGold.setTaxonomyIds(inferredGold.getIds().stream()
                    .map(geneId -> candidateRetrieval.mapGeneIdToTaxId(geneId))
                    .collect(Collectors.toList()));
            geneDocument.putGoldGene(inferredGold);
            geneMention.setOverlappingGoldMentions(Collections.singletonList(inferredGold));
        }
        mentionsWithoutCandidates.forEach(mention -> geneDocument.removeGene(mention));
        geneDocument.setGoldMentionsWithOffsets(true);
        geneDocument.setGoldOffsetsInferred(true);
    }
}
