package pitt.search.semanticvectors;

import cern.colt.matrix.AbstractFormatter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.BaseCompositeReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.packed.PackedInts;
import pitt.search.semanticvectors.utils.StringUtils;
import pitt.search.semanticvectors.utils.VerbatimLogger;

/* loaded from: input_file:pitt/search/semanticvectors/LuceneUtils.class */
public class LuceneUtils {
    /** Lucene version constant published by this class. */
    public static final Version LUCENE_VERSION = Version.LUCENE_46;
    // NOTE(review): the logger is named after DocVectors rather than LuceneUtils; this looks like a
    // copy/paste slip, but renaming it would change the log category, so confirm before changing.
    private static final Logger logger = Logger.getLogger(DocVectors.class.getCanonicalName());
    // Configuration supplied to the constructor; read for term weighting and term filtering.
    private FlagConfig flagConfig;
    // Reader opened on the index directory in the constructor.
    private BaseCompositeReader<AtomicReader> compositeReader;
    // Single-reader view wrapped around compositeReader in the constructor.
    private AtomicReader atomicReader;
    // Cache of entropy weights, filled by getEntropy.
    private Hashtable<Term, Float> termEntropy = new Hashtable<>();
    // Cache of IDF weights, filled by getIDF.
    private Hashtable<Term, Float> termIDF = new Hashtable<>();
    // Stopword set; stays null until loadStopWords is called.
    private TreeSet<String> stopwords = null;
    // Startword set; stays null until loadStartWords is called. Not read anywhere in this file.
    private TreeSet<String> startwords = null;

    /* loaded from: input_file:pitt/search/semanticvectors/LuceneUtils$TermWeight.class */
    /**
     * Term weighting schemes selectable through the flag configuration. The per-constant notes
     * describe how {@code getGlobalTermWeight} and {@code getLocalTermWeight} treat each value.
     */
    public enum TermWeight {
        /** Global weight 1, local weight 1. */
        NONE,
        /** Global weight is the IDF of the term; local weight is the raw frequency. */
        IDF,
        /** Global weight is the entropy-based weight of the term; local weight is log10(1 + frequency). */
        LOGENTROPY,
        /** Global weight 1; local weight is the square root of the frequency. */
        SQRT
    }

    public LuceneUtils(FlagConfig flagConfig) throws IOException {
        if (flagConfig.luceneindexpath().isEmpty()) {
            throw new IllegalArgumentException("-luceneindexpath is a required argument for initializing LuceneUtils instance.");
        }
        this.compositeReader = DirectoryReader.open(FSDirectory.open(new File(flagConfig.luceneindexpath())));
        this.atomicReader = SlowCompositeReaderWrapper.wrap(this.compositeReader);
        MultiFields.getFields(this.compositeReader);
        this.flagConfig = flagConfig;
        if (!flagConfig.stoplistfile().isEmpty()) {
            loadStopWords(flagConfig.stoplistfile());
        }
        VerbatimLogger.info("Initialized LuceneUtils from Lucene index in directory: " + flagConfig.luceneindexpath() + AbstractFormatter.DEFAULT_ROW_SEPARATOR);
    }

    /**
     * Loads a stopword list from a text file, one word per line, replacing any list
     * loaded earlier. Lines are stored verbatim (no trimming or case folding).
     *
     * @param str path of the stopword file
     * @throws IOException if the file cannot be opened or read; the underlying failure
     *     is attached as the cause
     */
    public void loadStopWords(String str) throws IOException {
        logger.info("Using stopword file: " + str);
        this.stopwords = new TreeSet<>();
        // try-with-resources closes the reader even when a read fails part-way through.
        try (BufferedReader reader = new BufferedReader(new FileReader(str))) {
            String line;
            while ((line = reader.readLine()) != null) {
                this.stopwords.add(line);
            }
        } catch (IOException e) {
            // Keep the original message but chain the cause so the real failure is not lost.
            throw new IOException("Couldn't open file " + str, e);
        }
    }

    /**
     * Loads a startword list from a text file, one word per line, replacing any list
     * loaded earlier. Lines are stored verbatim (no trimming or case folding).
     *
     * @param str path of the startword file
     * @throws IOException if the file cannot be opened or read; the underlying failure
     *     is attached as the cause
     */
    public void loadStartWords(String str) throws IOException {
        System.err.println("Using startword file: " + str);
        this.startwords = new TreeSet<>();
        // try-with-resources closes the reader even when a read fails part-way through.
        try (BufferedReader reader = new BufferedReader(new FileReader(str))) {
            String line;
            while ((line = reader.readLine()) != null) {
                this.startwords.add(line);
            }
        } catch (IOException e) {
            // Keep the original message but chain the cause so the real failure is not lost.
            throw new IOException("Couldn't open file " + str, e);
        }
    }

    /**
     * Tells whether the given string is in the loaded stopword list.
     *
     * @param str candidate term text
     * @return {@code true} only if a stopword list has been loaded and contains {@code str}
     */
    public boolean stoplistContains(String str) {
        return this.stopwords != null && this.stopwords.contains(str);
    }

    /**
     * Fetches a document from the index by delegating to the atomic reader.
     *
     * @param i identifier passed straight to {@code atomicReader.document}
     * @return the document returned by the reader
     * @throws IOException if the reader fails
     */
    public Document getDoc(int i) throws IOException {
        return this.atomicReader.document(i);
    }

    /**
     * Returns the terms of the given field.
     *
     * @param str field name
     * @return the non-null terms of the field
     * @throws NullPointerException if the reader has no terms for the field; the message
     *     lists the known field names
     * @throws IOException if the reader fails
     */
    public Terms getTermsForField(String str) throws IOException {
        // Look the field up once and reuse the result instead of querying the reader twice.
        final Terms terms = this.atomicReader.terms(str);
        if (terms == null) {
            throw new NullPointerException(String.format("No terms for field: '%s'.\nKnown fields are: '%s'.", str, StringUtils.join(getFieldNames())));
        }
        return terms;
    }

    /**
     * Returns the document enumeration for a term by delegating to
     * {@code atomicReader.termDocsEnum}.
     *
     * @param term term to look up
     * @return whatever the reader returns
     *     (NOTE(review): presumably null when the term is not in the index; confirm against the Lucene docs)
     * @throws IOException if the reader fails
     */
    public DocsEnum getDocsForTerm(Term term) throws IOException {
        return this.atomicReader.termDocsEnum(term);
    }

    /**
     * Returns the term vector for one field of one document by delegating to
     * {@code atomicReader.getTermVector}.
     *
     * @param i document identifier passed to the reader
     * @param str field name
     * @return whatever the reader returns for that document and field
     * @throws IOException if the reader fails
     */
    public Terms getTermVector(int i, String str) throws IOException {
        return this.atomicReader.getTermVector(i, str);
    }

    /**
     * Returns the field metadata reported by the atomic reader.
     *
     * @return the reader's {@code FieldInfos}
     */
    public FieldInfos getFieldInfos() {
        return this.atomicReader.getFieldInfos();
    }

    /**
     * Collects the names of all fields reported by the atomic reader.
     *
     * @return a new mutable list of field names, in the reader's iteration order
     */
    public List<String> getFieldNames() {
        // Typed list instead of the raw ArrayList, so the method carries no unchecked conversion.
        final List<String> fieldNames = new ArrayList<>();
        for (FieldInfo fieldInfo : this.atomicReader.getFieldInfos()) {
            fieldNames.add(fieldInfo.name);
        }
        return fieldNames;
    }

    /**
     * Returns the total number of occurrences of a term across the index.
     *
     * <p>The reader reports the count as a {@code long}. The "-1" sentinel is checked on that
     * {@code long} before narrowing, and counts above {@link Integer#MAX_VALUE} are clamped
     * rather than allowed to wrap around to a bogus (possibly negative) value.
     *
     * @param term term to look up
     * @return the total frequency; 0 if the reader returned -1; 1 if the reader threw an
     *     {@code IOException} (the failure is logged, not propagated)
     */
    public int getGlobalTermFreq(Term term) {
        final long totalFreq;
        try {
            totalFreq = this.compositeReader.totalTermFreq(term);
        } catch (IOException e) {
            logger.info("Couldn't get term frequency for term " + term.text());
            return 1;
        }
        if (totalFreq == -1L) {
            logger.warning("Lucene StandardDirectoryReader returned -1 for term: '" + term.text() + "' in field: '" + term.field() + "'. Changing to 0.\nThis may be due to a version-mismatch and might be solved by rebuilding your Lucene index.");
            return 0;
        }
        return (int) Math.min(totalFreq, (long) Integer.MAX_VALUE);
    }

    /**
     * Sums the global weight of a term text over every configured contents field.
     *
     * @param str term text
     * @return the sum of {@code getGlobalTermWeight} for {@code str} in each field of
     *     {@code flagConfig.contentsfields()}
     */
    public float getGlobalTermWeightFromString(String str) {
        final String[] fields = this.flagConfig.contentsfields();
        float total = 0.0f;
        for (int idx = 0; idx < fields.length; idx++) {
            total += getGlobalTermWeight(new Term(fields[idx], str));
        }
        return total;
    }

    /**
     * Returns the global (index-wide) weight of a term under the configured weighting scheme.
     *
     * @param term term to weigh
     * @return the IDF for {@code IDF}, the entropy weight for {@code LOGENTROPY}, and 1 for
     *     {@code NONE}, {@code SQRT} and any unrecognized option (the latter is logged)
     */
    public float getGlobalTermWeight(Term term) {
        switch (this.flagConfig.termweight()) {
            case LOGENTROPY:
                return getEntropy(term);
            case IDF:
                return getIDF(term);
            case SQRT:
            case NONE:
                // These schemes apply no global component.
                return 1.0f;
            default:
                VerbatimLogger.severe("Unrecognized termweight option: " + this.flagConfig.termweight() + ". Returning 1.\n");
                return 1.0f;
        }
    }

    /**
     * Returns the local (per-document) weight for a term frequency under the configured
     * weighting scheme.
     *
     * @param i frequency of the term within a document
     * @return 1 for {@code NONE}, {@code sqrt(i)} for {@code SQRT}, {@code i} for {@code IDF},
     *     {@code log10(1 + i)} for {@code LOGENTROPY}, and 1 for any unrecognized option
     *     (which is logged)
     */
    public float getLocalTermWeight(int i) {
        switch (this.flagConfig.termweight()) {
            case LOGENTROPY:
                return (float) Math.log10(1 + i);
            case IDF:
                return (float) i;
            case SQRT:
                return (float) Math.sqrt(i);
            case NONE:
                return 1.0f;
            default:
                VerbatimLogger.severe("Unrecognized termweight option: " + this.flagConfig.termweight() + ". Returning 1.");
                return 1.0f;
        }
    }

    /**
     * Returns the document count reported by the composite reader's {@code numDocs()}.
     *
     * @return number of documents according to the reader
     */
    public int getNumDocs() {
        return this.compositeReader.numDocs();
    }

    /**
     * Returns the inverse document frequency of a term, {@code log10(numDocs / docFreq)},
     * caching each computed value in {@code termIDF}.
     *
     * <p>The document frequency is held in a named local (the previous text referred to an
     * undefined variable), and the ratio is computed in floating point so that it is not
     * truncated to a whole number before the logarithm is taken.
     *
     * @param term term to weigh
     * @return the IDF; 0 if no document contains the term (not cached); 1 if the reader
     *     threw an {@code IOException} (logged, not cached)
     */
    private float getIDF(Term term) {
        // Hashtable never stores null values, so a null result means "not cached yet".
        final Float cached = this.termIDF.get(term);
        if (cached != null) {
            return cached.floatValue();
        }
        try {
            final int docFreq = this.compositeReader.docFreq(term);
            if (docFreq == 0) {
                return 0.0f;
            }
            final float idf = (float) Math.log10((double) this.compositeReader.numDocs() / docFreq);
            this.termIDF.put(term, Float.valueOf(idf));
            return idf;
        } catch (IOException e) {
            logger.log(java.util.logging.Level.WARNING, "Couldn't get document frequency for term " + term.text(), e);
            return 1.0f;
        }
    }

    /**
     * Returns the entropy-based global weight of a term:
     * {@code 1 + sum_d(p_d * log2(p_d)) / log2(numDocs)}, where {@code p_d} is the term's
     * frequency in document {@code d} divided by its total frequency in the index.
     * Computed values are cached in {@code termEntropy}.
     *
     * <p>Edge cases:
     * <ul>
     *   <li>Term absent from the index (no document enumeration): returns 0, matching
     *       {@code getIDF}, so the term contributes nothing when weights are summed over
     *       fields. Not cached.</li>
     *   <li>Total frequency unavailable (0): the distribution cannot be formed, so the sum is
     *       skipped and the weight is 1.</li>
     *   <li>Index with a single document: {@code log2(numDocs)} is 0, so the normalization
     *       step is skipped instead of dividing by zero.</li>
     *   <li>{@code IOException} while enumerating: logged; the weight built from what was
     *       accumulated so far is cached and returned, as before.</li>
     * </ul>
     *
     * @param term term to weigh
     * @return the entropy weight as described above
     */
    private float getEntropy(Term term) {
        // Hashtable never stores null values, so a null result means "not cached yet".
        final Float cached = this.termEntropy.get(term);
        if (cached != null) {
            return cached.floatValue();
        }

        final int globalTermFreq = getGlobalTermFreq(term);
        final double log2 = Math.log(2.0d);
        double entropy = 0.0d;
        try {
            final DocsEnum docs = getDocsForTerm(term);
            if (docs == null) {
                return 0.0f;
            }
            if (globalTermFreq > 0) {
                while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                    // Floating-point division: an int/int quotient would collapse to 0 or 1.
                    final double p = (double) docs.freq() / globalTermFreq;
                    entropy += p * (Math.log(p) / log2);
                }
            }
            final double log2NumDocs = Math.log(getNumDocs()) / log2;
            if (log2NumDocs > 0.0d) {
                entropy /= log2NumDocs;
            }
        } catch (IOException e) {
            logger.info("Couldn't get term entropy for term " + term.text());
        }

        // Compute the weight once so the cached value and the returned value are identical.
        final float weight = (float) (1.0d + entropy);
        this.termEntropy.put(term, Float.valueOf(weight));
        return weight;
    }

    /**
     * Applies the term filter using the thresholds from the flag configuration: contents
     * fields, minimum and maximum frequency, maximum non-alphabet characters, the
     * filter-out-numbers switch and the minimum term length.
     *
     * @param term term to test
     * @return the result of the fully parameterized filter for this term
     */
    public boolean termFilter(Term term) {
        return termFilter(term, this.flagConfig.contentsfields(), this.flagConfig.minfrequency(), this.flagConfig.maxfrequency(), this.flagConfig.maxnonalphabetchars(), this.flagConfig.filteroutnumbers(), this.flagConfig.mintermlength());
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Decides whether a term passes the field, stoplist, character and frequency filters.
     *
     * @param term term to test
     * @param strArr accepted field names, compared case-insensitively with the term's field
     * @param i minimum global frequency (inclusive)
     * @param i2 maximum global frequency (inclusive)
     * @param i3 maximum number of non-letter characters allowed; {@code -1} skips the
     *     character checks entirely, including the minimum-length check
     * @param i4 minimum term length, enforced only when {@code i3 != -1}
     * @return {@code true} if the term passes every filter
     */
    public boolean termFilter(Term term, String[] strArr, int i, int i2, int i3, int i4) {
        // Field filter: the whole array is scanned, exactly as before.
        boolean inAcceptedField = false;
        for (int idx = 0; idx < strArr.length; idx++) {
            inAcceptedField |= term.field().compareToIgnoreCase(strArr[idx]) == 0;
        }

        // Stoplist is consulted first, then the field result.
        if (stoplistContains(term.text())) {
            return false;
        }
        if (!inAcceptedField) {
            return false;
        }

        // Character filter.
        if (i3 != -1) {
            final String text = term.text();
            if (text.length() < i4) {
                return false;
            }
            int nonLetters = 0;
            for (char ch : text.toCharArray()) {
                if (!Character.isLetter(ch)) {
                    nonLetters++;
                }
                if (nonLetters > i3) {
                    return false;
                }
            }
        }

        // Frequency filter.
        final int frequency = getGlobalTermFreq(term);
        return frequency >= i && frequency <= i2;
    }

    /**
     * Variant of the term filter that can additionally reject numeric terms.
     *
     * @param term term to test
     * @param strArr accepted field names
     * @param i minimum global frequency
     * @param i2 maximum global frequency
     * @param i3 maximum number of non-letter characters ({@code -1} disables the character checks)
     * @param z when {@code true}, a term whose text parses as a {@code double} is rejected
     * @param i4 minimum term length
     * @return {@code false} for rejected numeric terms, otherwise the result of the
     *     six-argument filter
     */
    private boolean termFilter(Term term, String[] strArr, int i, int i2, int i3, boolean z, int i4) {
        if (z) {
            boolean parsesAsNumber;
            try {
                Double.parseDouble(term.text());
                parsesAsNumber = true;
            } catch (Exception ignored) {
                // Not a number: fall through to the remaining filters.
                parsesAsNumber = false;
            }
            if (parsesAsNumber) {
                return false;
            }
        }
        return termFilter(term, strArr, i, i2, i3, i4);
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Placeholder with an empty body: calling it has no effect in this version of the file.
     *
     * @param str unused
     */
    public static void compressIndex(String str) {
    }
}
