package de.julielab.jules.ae.genemapping;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import de.julielab.jules.ae.genemapping.SynHit;
import de.julielab.jules.ae.genemapping.genemodel.GeneMention;
import de.julielab.jules.ae.genemapping.genemodel.GeneName;
import de.julielab.jules.ae.genemapping.index.SynonymIndexFieldNames;
import de.julielab.jules.ae.genemapping.scoring.JaroWinklerScorer;
import de.julielab.jules.ae.genemapping.scoring.LevenshteinScorer;
import de.julielab.jules.ae.genemapping.scoring.LuceneScorer;
import de.julielab.jules.ae.genemapping.scoring.MaxEntScorer;
import de.julielab.jules.ae.genemapping.scoring.Scorer;
import de.julielab.jules.ae.genemapping.scoring.SimpleScorer;
import de.julielab.jules.ae.genemapping.scoring.TokenJaroSimilarityScorer;
import de.julielab.jules.ae.genemapping.utils.GeneCandidateRetrievalException;
import de.julielab.jules.ae.genemapping.utils.GeneMappingException;
import de.julielab.jules.ae.genemapping.utils.norm.TermNormalizer;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/jules/ae/genemapping/LuceneCandidateRetrieval.class */
public class LuceneCandidateRetrieval implements CandidateRetrieval {
    /**
     * Separator used inside the values of the index' ID field: {@code scoreHits} splits each value on it and
     * reads the gene ID from the first part and an integer synonym priority from the second part.
     */
    public static final String NAME_PRIO_DELIMITER = "__";
    // Scorer type codes understood by setScorerType(int).
    public static final int SIMPLE_SCORER = 0;
    public static final int TOKEN_JAROWINKLER_SCORER = 1;
    public static final int MAXENT_SCORER = 2;
    public static final int JAROWINKLER_SCORER = 3;
    public static final int LEVENSHTEIN_SCORER = 4;
    // NOTE(review): setScorerType(int) has no branch for this value and rejects it as an unknown scorer type.
    public static final int TFIDF = 5;
    public static final int LUCENE_SCORER = 10;
    /** Default MaxEnt model; when unchanged, the model is loaded as a class path resource of this name. */
    public static final String MAXENT_SCORER_MODEL = "/genemapper_jules_mallet.mod";
    /** Maximum number of index hits requested per candidate query. */
    private static final int LUCENE_MAX_HITS = 20;
    /** MaxEnt model location; either {@link #MAXENT_SCORER_MODEL} or the configured "maxent_model" value. */
    private String maxEntModel;
    /** Normalizer handed to the {@link GeneMention} instances created from plain strings. */
    private TermNormalizer normalizer;
    /** Searcher over the mention (synonym) index. */
    private IndexSearcher mentionIndexSearcher;
    /** Scorer applied to hits whose index synonym equals the normalized mention text. */
    private Scorer exactScorer;
    /** Scorer applied to all other hits. */
    private Scorer approxScorer;
    /** Candidate cache of this instance; only assigned by the configuration-based constructor. */
    private LoadingCache<CandidateCacheKey, List<SynHit>> candidateCache;
    /** Optional spell checker passed on to the query generation; null when no spelling index is available. */
    private SpellChecker spellingChecker;
    public static final String LOGGER_NAME_CANDIDATES = "de.julielab.jules.ae.genemapper.candidates";
    /** Separate logger that receives the trace output of the candidate search terms. */
    public static final Logger candidateLog = LoggerFactory.getLogger(LOGGER_NAME_CANDIDATES);
    private static final Logger log = LoggerFactory.getLogger(LuceneCandidateRetrieval.class);
    /** Candidate caches shared by all instances, keyed by the configured mention index location. */
    private static ConcurrentHashMap<String, LoadingCache<CandidateCacheKey, List<SynHit>>> caches = new ConcurrentHashMap<>();

    /**
     * Creates a retrieval component on top of an already opened index searcher.
     * <p>
     * The given scorer is used for exact as well as for approximate matches. No spell checker and no
     * candidate cache are set up by this constructor.
     *
     * @param indexSearcher searcher over the mention index
     * @param scorer        scorer used to score all index hits
     * @throws IOException declared for backward compatibility; not thrown by the current implementation
     * @deprecated Presumably superseded by {@link #LuceneCandidateRetrieval(GeneMappingConfiguration)},
     *             which additionally sets up the spell checker and the candidate cache.
     */
    @Deprecated
    public LuceneCandidateRetrieval(IndexSearcher indexSearcher, Scorer scorer) throws IOException {
        this.maxEntModel = MAXENT_SCORER_MODEL;
        this.mentionIndexSearcher = indexSearcher;
        this.exactScorer = scorer;
        // scoreHits() uses approxScorer for every hit that is not an exact match; leaving it unset made
        // those hits fail with a NullPointerException.
        this.approxScorer = scorer;
        this.normalizer = new TermNormalizer();
    }

    /**
     * Creates a retrieval component from a gene mapping configuration.
     * <p>
     * Opens the mention index, optionally opens the spelling index, creates the exact and approximate
     * scorers and attaches the candidate cache that is shared by all instances configured with the same
     * mention index location.
     *
     * @param geneMappingConfiguration configuration providing the mention index location, the optional
     *                                 spelling index location, both scorer types and the optional MaxEnt model
     * @throws GeneMappingException  if the mention index or one of the scorer types is not configured, a
     *                               scorer type is unknown, or an index cannot be opened
     * @throws NumberFormatException if a configured scorer type is not an integer
     */
    public LuceneCandidateRetrieval(GeneMappingConfiguration geneMappingConfiguration) throws GeneMappingException {
        this.maxEntModel = MAXENT_SCORER_MODEL;
        String mentionIndex = geneMappingConfiguration.getProperty("mention_index");
        if (mentionIndex == null) {
            throw new GeneMappingException("mention index not specified in configuration file (critical).");
        }
        try {
            this.mentionIndexSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get(mentionIndex))));
            this.mentionIndexSearcher.setSimilarity(new ClassicSimilarity());
            log.debug("mention index loaded.");

            String spellingIndex = geneMappingConfiguration.getProperty(GeneMappingConfiguration.SPELLING_INDEX);
            if (spellingIndex != null) {
                File spellingIndexFile = new File(spellingIndex);
                if (spellingIndexFile.exists()) {
                    this.spellingChecker = new SpellChecker(FSDirectory.open(spellingIndexFile.toPath()));
                }
            }
            if (this.spellingChecker == null) {
                log.warn("Spelling index was not given or file does not exist. No spelling correction can be done. Specified spelling index: {}", spellingIndex);
            }

            // The model location must be known BEFORE the scorers are created: setScorerType() reads
            // this.maxEntModel when it builds a MaxEnt scorer. Reading it afterwards meant a configured
            // model was never used.
            String configuredMaxEntModel = geneMappingConfiguration.getProperty("maxent_model");
            if (configuredMaxEntModel != null) {
                this.maxEntModel = configuredMaxEntModel;
            }

            String exactScorerType = geneMappingConfiguration.getProperty(GeneMappingConfiguration.EXACT_SCORER_TYPE);
            if (exactScorerType == null) {
                throw new GeneMappingException("No configuration value given for exact_scorer_type");
            }
            this.exactScorer = setScorerType(Integer.parseInt(exactScorerType));

            String approxScorerType = geneMappingConfiguration.getProperty(GeneMappingConfiguration.APPROX_SCORER_TYPE);
            if (approxScorerType == null) {
                throw new GeneMappingException("No configuration value given for approx_scorer_type");
            }
            this.approxScorer = setScorerType(Integer.parseInt(approxScorerType));

            this.normalizer = new TermNormalizer();
            log.info("Mention index: {}", mentionIndex);
            log.info("Exact scorer: {}", this.exactScorer);
            log.info("Approx scorer: {}", this.approxScorer);

            // One cache per mention index location, shared by all instances. The lock makes the
            // lookup-or-create step atomic.
            synchronized (caches) {
                this.candidateCache = caches.get(mentionIndex);
                if (null == this.candidateCache) {
                    log.info("Creating new gene candidate cache for index {}", mentionIndex);
                    // NOTE(review): the loader is bound to THIS instance, so every instance sharing the
                    // cache gets candidates retrieved and scored with the searcher and scorers of the
                    // instance that created the cache.
                    this.candidateCache = CacheBuilder.newBuilder()
                            .maximumSize(1000000L)
                            .expireAfterWrite(60L, TimeUnit.MINUTES)
                            .build(new CacheLoader<CandidateCacheKey, List<SynHit>>() {
                                @Override
                                public List<SynHit> load(CandidateCacheKey candidateCacheKey) throws IOException, BooleanQuery.TooManyClauses {
                                    return Collections.unmodifiableList(LuceneCandidateRetrieval.this.getCandidatesFromIndexWithoutCache(candidateCacheKey));
                                }
                            });
                    if (null != caches.put(mentionIndex, this.candidateCache)) {
                        throw new IllegalStateException("There already is a candidate index for " + mentionIndex + " which points to a faulty concurrency implementation");
                    }
                } else {
                    log.info("Using existing gene candidate cache for index {}", mentionIndex);
                }
            }
        } catch (IOException e) {
            throw new GeneMappingException(e);
        }
    }

    /**
     * @return the term normalizer handed to gene mentions that are created from plain strings
     */
    public TermNormalizer getNormalizer() {
        return normalizer;
    }

    /**
     * Replaces the term normalizer handed to gene mentions that are created from plain strings.
     *
     * @param termNormalizer the normalizer to use from now on
     */
    public void setNormalizer(TermNormalizer termNormalizer) {
        normalizer = termNormalizer;
    }

    /**
     * @return the scorer used for exact matches; the approximate scorer is not exposed by this accessor
     */
    public Scorer getScorer() {
        return exactScorer;
    }

    /**
     * @return the searcher over the mention index
     */
    public IndexSearcher getMentionIndexSearcher() {
        return mentionIndexSearcher;
    }

    /**
     * @return the spell checker, or {@code null} if no spelling index was opened
     */
    public SpellChecker getSpellingChecker() {
        return spellingChecker;
    }

    /**
     * Creates the scorer that belongs to a scorer type code.
     * <p>
     * Despite its name this method does not change any state of this object; the new scorer is only
     * returned.
     *
     * @param i one of the scorer type constants of this class
     * @return a new scorer of the requested type
     * @throws GeneMappingException if the type code is not supported
     */
    public Scorer setScorerType(int i) throws GeneMappingException {
        switch (i) {
            case SIMPLE_SCORER:
                return new SimpleScorer();
            case TOKEN_JAROWINKLER_SCORER:
                return new TokenJaroSimilarityScorer();
            case MAXENT_SCORER:
                // The default model ships as a class path resource, any other value is taken as a file.
                if (this.maxEntModel.equals(MAXENT_SCORER_MODEL)) {
                    return new MaxEntScorer(getClass().getResourceAsStream(MAXENT_SCORER_MODEL));
                }
                return new MaxEntScorer(new File(this.maxEntModel));
            case JAROWINKLER_SCORER:
                return new JaroWinklerScorer();
            case LUCENE_SCORER:
                return new LuceneScorer();
            case LEVENSHTEIN_SCORER:
                return new LevenshteinScorer();
            default:
                throw new GeneMappingException("Unknown mention scorer type: " + i);
        }
    }

    /**
     * @return the info string of the exact scorer, or a fixed description of the raw Lucene score if no
     *         exact scorer is set
     */
    public String getScorerInfo() {
        if (this.exactScorer != null) {
            return this.exactScorer.info();
        }
        return "Lucene Score (unnormalized)";
    }

    /**
     * @return the type code reported by the exact scorer
     * @throws NullPointerException if no exact scorer is set
     */
    public int getScorerType() {
        Scorer scorer = this.exactScorer;
        return scorer.getScorerType();
    }

    /**
     * Retrieves candidates for a plain mention string by wrapping it into a {@link GeneMention} that uses
     * the normalizer of this object.
     *
     * @param str the mention text
     * @return the candidates as returned by {@link #getCandidates(GeneMention)}
     * @throws GeneCandidateRetrievalException if the candidate lookup fails
     */
    @Override
    public List<SynHit> getCandidates(String str) throws GeneCandidateRetrievalException {
        GeneMention mention = new GeneMention(str, this.normalizer);
        return getCandidates(mention);
    }

    /**
     * Retrieves candidates for a gene mention, restricted to the taxonomy IDs stored on the mention.
     *
     * @param geneMention the mention to find candidates for
     * @return the candidates as returned by {@link #getCandidates(GeneMention, Collection)}
     * @throws GeneCandidateRetrievalException if the candidate lookup fails
     */
    @Override
    public List<SynHit> getCandidates(GeneMention geneMention) throws GeneCandidateRetrievalException {
        Collection<String> taxonomyIds = geneMention.getTaxonomyIds();
        return getCandidates(geneMention, taxonomyIds);
    }

    /**
     * Retrieves the candidates for a gene mention, optionally restricted to a set of taxonomy IDs.
     * <p>
     * Without taxonomy IDs a single lookup without taxonomy restriction is done. Otherwise one lookup per
     * taxonomy ID is done and the results are concatenated. All hits are switched to score comparison and
     * returned in their natural order.
     *
     * @param geneMention the mention to find candidates for
     * @param collection  taxonomy IDs to restrict the lookup to; {@code null} or empty for no restriction
     * @return a new, sorted list of candidates
     * @throws GeneCandidateRetrievalException if loading candidates fails
     */
    @Override
    public List<SynHit> getCandidates(GeneMention geneMention, Collection<String> collection) throws GeneCandidateRetrievalException {
        try {
            GeneName geneName = geneMention.getGeneName();
            boolean debug = log.isDebugEnabled();
            int begin = -1;
            int end = -1;
            if (debug && geneMention.getOffsets() != null) {
                begin = geneMention.getBegin();
                end = geneMention.getEnd();
            }

            List<SynHit> candidates = new ArrayList<>();
            if (collection == null || collection.isEmpty()) {
                CandidateCacheKey key = new CandidateCacheKey(geneName);
                candidates.addAll(getCandidatesFromIndex(key));
                if (debug) {
                    log.debug("Returning {} candidates for gene mention {}[{}-{}]", candidates.size(), key.geneName.getText(), begin, end);
                }
            } else {
                for (String taxId : collection) {
                    // A fresh key per lookup: the key object may end up stored inside the cache, so it
                    // must not be modified afterwards. Re-using one key and overwriting its taxId for
                    // the next taxonomy ID would alter a key that is already in the cache.
                    CandidateCacheKey key = new CandidateCacheKey(geneName);
                    key.taxId = taxId;
                    candidates.addAll(getCandidatesFromIndex(key));
                    if (debug) {
                        log.debug("Returning {} candidates for gene mention {}[{}-{}] for taxonomy ID {}", candidates.size(), key.geneName.getText(), begin, end, taxId);
                    }
                }
            }

            for (SynHit hit : candidates) {
                hit.setCompareType(SynHit.CompareType.SCORE);
            }
            return candidates.stream().sorted().collect(Collectors.toList());
        } catch (ExecutionException e) {
            throw new GeneCandidateRetrievalException(e);
        }
    }

    /**
     * Returns the candidates for a cache key, using the candidate cache when one is available.
     * <p>
     * Cached hits are shared, so each one is cloned before it is handed out. When this object has no
     * cache, which is the case for instances created with the deprecated searcher/scorer constructor, the
     * index is queried directly instead of failing with a NullPointerException.
     *
     * @param candidateCacheKey gene name and optional taxonomy ID to look up
     * @return a new list of hits that the caller may modify
     * @throws ExecutionException if loading the candidates fails
     */
    private List<SynHit> getCandidatesFromIndex(CandidateCacheKey candidateCacheKey) throws ExecutionException {
        if (this.candidateCache == null) {
            try {
                // Freshly created hits, nothing shared: no cloning required.
                return getCandidatesFromIndexWithoutCache(candidateCacheKey);
            } catch (IOException e) {
                // Same exception type the cache uses to report a failed load.
                throw new ExecutionException(e);
            }
        }
        List<SynHit> cachedHits = this.candidateCache.get(candidateCacheKey);
        List<SynHit> copies = new ArrayList<>(cachedHits.size());
        for (SynHit cachedHit : cachedHits) {
            try {
                copies.add(cachedHit.m6clone());
            } catch (CloneNotSupportedException e) {
                log.error("Could not clone a cached SynHit: {}", cachedHit, e);
                throw new RuntimeException(e);
            }
        }
        return copies;
    }

    /**
     * Queries the mention index directly for the given key and scores the hits.
     *
     * @param candidateCacheKey gene name and optional taxonomy ID to build the query from
     * @return the scored hits, at most {@link #LUCENE_MAX_HITS}
     * @throws IOException                    if the index cannot be read
     * @throws BooleanQuery.TooManyClauses if the generated query has too many clauses
     */
    private ArrayList<SynHit> getCandidatesFromIndexWithoutCache(CandidateCacheKey candidateCacheKey) throws IOException, BooleanQuery.TooManyClauses {
        Query query = QueryGenerator.makeDisjunctionMaxQuery(candidateCacheKey, this.spellingChecker);
        TopDocs foundDocs = this.mentionIndexSearcher.search(query, LUCENE_MAX_HITS);
        // Parameterized so that the query is only rendered to a string when debug logging is enabled.
        log.debug("searching with query: {}; found hits: {}", query, foundDocs.totalHits);
        return scoreHits(foundDocs, candidateCacheKey.geneName);
    }

    /**
     * Converts the documents of a search result into scored {@link SynHit}s.
     * <p>
     * For each document the looked-up synonym, the gene IDs with their priorities (both parsed from the
     * ID field values) and the taxonomy IDs are read. A synonym equal to the normalized mention text is
     * scored with the exact scorer, any other synonym with the approximate scorer. If the selected scorer
     * reports the Lucene scorer type, the score is 9999 for exact matches and the Lucene document score
     * otherwise.
     *
     * @param topDocs  the search result to convert
     * @param geneName the gene name the search was done for
     * @return one hit per document, in the order of the search result
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           if a document cannot be read
     */
    private ArrayList<SynHit> scoreHits(TopDocs topDocs, GeneName geneName) throws CorruptIndexException, IOException {
        ArrayList<SynHit> scoredHits = new ArrayList<>();
        final String mentionText = geneName.getText().toLowerCase();
        final String normalizedMention = geneName.getNormalizedText();
        final ScoreDoc[] foundDocs = topDocs.scoreDocs;
        log.debug("ordering candidates for best match to this reference term: " + mentionText + " for top " + foundDocs.length + " candidates");
        candidateLog.trace("Search term: " + normalizedMention);

        for (ScoreDoc foundDoc : foundDocs) {
            Document document = this.mentionIndexSearcher.doc(foundDoc.doc);
            String indexSynonym = document.getField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD).stringValue();

            // Each ID field value has the form <gene id><delimiter><priority>.
            List<String> geneIds = new ArrayList<>();
            List<Integer> priorities = new ArrayList<>();
            Arrays.stream(document.getFields(SynonymIndexFieldNames.ID_FIELD))
                    .map(field -> field.stringValue().split(NAME_PRIO_DELIMITER))
                    .forEach(idAndPriority -> {
                        geneIds.add(idAndPriority[0]);
                        priorities.add(Integer.valueOf(idAndPriority[1]));
                    });
            List<String> taxonomyIds = Arrays.stream(document.getFields(SynonymIndexFieldNames.TAX_ID_FIELD))
                    .map(field -> field.stringValue())
                    .collect(Collectors.toList());

            boolean exactMatch = indexSynonym.equals(normalizedMention);
            Scorer scorer = exactMatch ? this.exactScorer : this.approxScorer;
            double score;
            if (scorer.getScorerType() != LUCENE_SCORER) {
                score = scorer.getScore(normalizedMention, indexSynonym);
            } else if (exactMatch) {
                score = 9999.0d;
            } else {
                score = foundDoc.score;
            }

            SynHit hit = new SynHit(indexSynonym, score, geneIds, GeneMapping.SOURCE_DEFINITION, taxonomyIds);
            hit.setMappedMention(mentionText);
            hit.setMappedGeneName(geneName);
            hit.setSynonymPriorities(priorities);
            scoredHits.add(hit);
        }
        return scoredHits;
    }

    /**
     * Retrieves candidates for a gene mention, restricted to a single taxonomy ID.
     *
     * @param geneMention the mention to find candidates for
     * @param str         the taxonomy ID to restrict the lookup to
     * @return the candidates as returned by {@link #getCandidates(GeneMention, Collection)}
     * @throws GeneCandidateRetrievalException if the candidate lookup fails
     */
    @Override
    public List<SynHit> getCandidates(GeneMention geneMention, String str) throws GeneCandidateRetrievalException {
        List<String> singleTaxonomyId = Arrays.asList(str);
        return getCandidates(geneMention, singleTaxonomyId);
    }

    /**
     * Retrieves candidates for a plain mention string, restricted to a single taxonomy ID.
     *
     * @param str  the mention text
     * @param str2 the taxonomy ID to restrict the lookup to
     * @return the candidates as returned by {@link #getCandidates(GeneMention, Collection)}
     * @throws GeneCandidateRetrievalException if the candidate lookup fails
     */
    @Override
    public List<SynHit> getCandidates(String str, String str2) throws GeneCandidateRetrievalException {
        GeneMention mention = new GeneMention(str, this.normalizer);
        List<String> singleTaxonomyId = Arrays.asList(str2);
        return getCandidates(mention, singleTaxonomyId);
    }

    /**
     * Retrieves candidates for a plain mention string, restricted to the given taxonomy IDs.
     *
     * @param str        the mention text
     * @param collection the taxonomy IDs to restrict the lookup to
     * @return the candidates as returned by {@link #getCandidates(GeneMention, Collection)}
     * @throws GeneCandidateRetrievalException if the candidate lookup fails
     */
    @Override
    public List<SynHit> getCandidates(String str, Collection<String> collection) throws GeneCandidateRetrievalException {
        GeneMention mention = new GeneMention(str, this.normalizer);
        return getCandidates(mention, collection);
    }

    /**
     * Looks up the taxonomy ID stored for a gene ID.
     * <p>
     * The index is searched for a document whose ID field contains the gene ID with priority -1. Within
     * that document the gene IDs and the taxonomy IDs are read as two lists and matched by position; if
     * the gene ID occurs several times, the last occurrence wins.
     *
     * @param str the gene ID
     * @return the taxonomy ID, or the empty string if no document or no matching entry was found
     * @throws IOException if the index cannot be read
     */
    @Override
    public String mapGeneIdToTaxId(String str) throws IOException {
        TermQuery idQuery = new TermQuery(new Term(SynonymIndexFieldNames.ID_FIELD, str + "__-1"));
        TopDocs foundDocs = this.mentionIndexSearcher.search(idQuery, 1);
        if (foundDocs.totalHits <= 0) {
            return "";
        }

        Document document = this.mentionIndexSearcher.doc(foundDocs.scoreDocs[0].doc);
        List<String> geneIds = Arrays.stream(document.getFields(SynonymIndexFieldNames.ID_FIELD))
                .map(field -> field.stringValue().split(NAME_PRIO_DELIMITER)[0])
                .collect(Collectors.toList());
        List<String> taxonomyIds = Arrays.stream(document.getFields(SynonymIndexFieldNames.TAX_ID_FIELD))
                .map(field -> field.stringValue())
                .collect(Collectors.toList());

        String taxonomyId = "";
        int position = 0;
        for (String geneId : geneIds) {
            if (geneId.equals(str)) {
                taxonomyId = taxonomyIds.get(position);
            }
            position++;
        }
        if (taxonomyId.equals("")) {
            log.warn("GeneID: " + str + " has no TaxId assigned.");
        }
        return taxonomyId;
    }

    /**
     * Creates one index entry per given gene ID.
     * <p>
     * For every ID the index is searched for a document whose ID field contains the ID with priority -1.
     * If one is found, a placeholder hit carrying the ID and the non-blank taxonomy IDs of that document
     * is added to the result, otherwise {@code null} is added. The result therefore has exactly one
     * element per input ID, at the same position.
     * <p>
     * A warning is logged on every call because, according to that warning, the method does not work as
     * intended with the synonym-centric index.
     *
     * @param list the gene IDs to look up
     * @return a list parallel to {@code list}; {@code null} elements mark IDs that were not found
     * @throws IOException if the index cannot be read
     */
    @Override
    public List<SynHit> getIndexEntries(List<String> list) throws IOException {
        log.warn("LuceneCandidateRetrieval.getIndexEntries(): This method currently does not work as intended since the synonym index is now synonym-centric instead of id-centric. The ID field values have the form id_priority, thus at this place a wildcard query for all priorities would be needed");
        List<SynHit> entries = new ArrayList<>(list.size());
        for (String geneId : list) {
            BooleanQuery query = new BooleanQuery.Builder()
                    .add(new BooleanClause(new TermQuery(new Term(SynonymIndexFieldNames.ID_FIELD, geneId + "__-1")), BooleanClause.Occur.FILTER))
                    .build();
            TopDocs foundDocs = this.mentionIndexSearcher.search(query, 1);
            if (foundDocs.totalHits > 0) {
                Document document = this.mentionIndexSearcher.doc(foundDocs.scoreDocs[0].doc);
                List<String> taxonomyIds = Arrays.stream(document.getFields(SynonymIndexFieldNames.TAX_ID_FIELD))
                        .map(field -> field.stringValue())
                        .filter(taxonomyId -> !StringUtils.isBlank(taxonomyId))
                        .collect(Collectors.toList());
                if (taxonomyIds.isEmpty()) {
                    log.warn("GeneID: {} has no TaxId assigned.", geneId);
                }
                entries.add(new SynHit("<none>", 0.0d, Arrays.asList(geneId), GeneMapping.SOURCE_DEFINITION, taxonomyIds));
            } else {
                // Placeholder ONLY for IDs without a hit. Adding it unconditionally produced two
                // elements for every found ID and broke the positional match with the input list.
                entries.add(null);
            }
        }
        return entries;
    }

    /**
     * Collects the synonyms stored for a gene ID, regardless of their priority.
     * <p>
     * At most 200 documents are considered.
     *
     * @param str the gene ID
     * @return the synonyms of the found documents, or an empty list if there is no hit
     * @throws IOException if the index cannot be read
     */
    @Override
    public List<String> getSynonyms(String str) throws IOException {
        WildcardQuery anyPriority = new WildcardQuery(new Term(SynonymIndexFieldNames.ID_FIELD, str + "__*"));
        BooleanQuery query = new BooleanQuery.Builder()
                .add(new BooleanClause(anyPriority, BooleanClause.Occur.FILTER))
                .build();
        TopDocs foundDocs = this.mentionIndexSearcher.search(query, 200);
        if (foundDocs.totalHits <= 0) {
            return Collections.emptyList();
        }
        List<String> synonyms = new ArrayList<>(200);
        for (ScoreDoc foundDoc : foundDocs.scoreDocs) {
            Document document = this.mentionIndexSearcher.doc(foundDoc.doc);
            synonyms.add(document.getField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD).stringValue());
        }
        return synonyms;
    }

    /**
     * Looks up the synonym stored for a gene ID with a specific priority.
     * <p>
     * Only the first matching document is requested from the index.
     *
     * @param str the gene ID
     * @param i   the synonym priority
     * @return the synonym of the found document, or an empty list if there is no hit
     * @throws IOException if the index cannot be read
     */
    public List<String> getPriorityNames(String str, int i) throws IOException {
        TermQuery idWithPriority = new TermQuery(new Term(SynonymIndexFieldNames.ID_FIELD, str + "__" + i));
        BooleanQuery query = new BooleanQuery.Builder()
                .add(new BooleanClause(idWithPriority, BooleanClause.Occur.FILTER))
                .build();
        TopDocs foundDocs = this.mentionIndexSearcher.search(query, 1);
        if (foundDocs.totalHits <= 0) {
            return Collections.emptyList();
        }
        List<String> names = new ArrayList<>(1);
        for (ScoreDoc foundDoc : foundDocs.scoreDocs) {
            Document document = this.mentionIndexSearcher.doc(foundDoc.doc);
            names.add(document.getField(SynonymIndexFieldNames.LOOKUP_SYN_FIELD).stringValue());
        }
        return names;
    }

    /**
     * Looks up the synonyms with a specific priority for several gene IDs.
     *
     * @param list the gene IDs, processed in iteration order
     * @param i    the synonym priority
     * @return a new list with the concatenated results of {@link #getPriorityNames(String, int)}
     * @throws IOException if the index cannot be read
     */
    public List<String> getPriorityNames(List<String> list, int i) throws IOException {
        List<String> allNames = new ArrayList<>();
        for (String geneId : list) {
            allNames.addAll(getPriorityNames(geneId, i));
        }
        return allNames;
    }
}
