package pitt.search.semanticvectors;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.logging.Logger;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import pitt.search.semanticvectors.utils.VerbatimLogger;
import pitt.search.semanticvectors.vectors.Vector;
import pitt.search.semanticvectors.vectors.VectorFactory;

/* loaded from: input_file:pitt/search/semanticvectors/IncrementalDocVectors.class */
/**
 * Builds document vectors one document at a time and streams each one straight to a
 * vector store file, instead of holding all document vectors in memory.
 *
 * <p>For every document in the Lucene index, the vectors of the terms found in the
 * configured contents fields are superposed (weighted by local and global term weight,
 * and optionally by a per-field weight), the result is normalized, and the document
 * name plus vector are appended to the output file.
 */
public class IncrementalDocVectors {
    private static final Logger logger = Logger.getLogger(IncrementalDocVectors.class.getCanonicalName());

    /** Progress is logged every this many documents for the whole run. */
    private static final int PROGRESS_INTERVAL = 10000;

    /** While fewer than {@link #PROGRESS_INTERVAL} documents are done, progress is logged this often. */
    private static final int EARLY_PROGRESS_INTERVAL = 1000;

    private FlagConfig flagConfig;
    private VectorStore termVectorData;
    private LuceneUtils luceneUtils;

    private IncrementalDocVectors() {
    }

    /**
     * Creates document vectors from the given term vectors and writes them to the
     * document vector file named by {@code flagConfig}.
     *
     * @param vectorStore store that supplies the term vectors to be superposed
     * @param flagConfig  configuration (output file, dimension, vector type, fields, ...)
     * @param luceneUtils accessor for the Lucene index that is being processed
     * @throws IOException if reading the index or writing the output file fails
     */
    public static void createIncrementalDocVectors(VectorStore vectorStore, FlagConfig flagConfig, LuceneUtils luceneUtils) throws IOException {
        IncrementalDocVectors incrementalDocVectors = new IncrementalDocVectors();
        incrementalDocVectors.flagConfig = flagConfig;
        incrementalDocVectors.termVectorData = vectorStore;
        incrementalDocVectors.luceneUtils = luceneUtils;
        incrementalDocVectors.trainIncrementalDocVectors();
    }

    /**
     * Iterates over all documents, builds each document vector and appends it to the
     * output file. The output stream and directory are closed even when processing fails.
     */
    private void trainIncrementalDocVectors() throws IOException {
        int numDocs = this.luceneUtils.getNumDocs();
        File vectorFile = new File(VectorStoreUtils.getStoreFileName(this.flagConfig.docvectorsfile(), this.flagConfig));
        String parentPath = vectorFile.getParent();
        if (parentPath == null) {
            // A bare file name has no parent; fall back to the empty (relative) path.
            parentPath = "";
        }
        FSDirectory directory = FSDirectory.open(new File(parentPath));
        try {
            IndexOutput output = directory.createOutput(vectorFile.getName(), IOContext.DEFAULT);
            try {
                VerbatimLogger.info("Writing vectors incrementally to file " + vectorFile + " ... ");
                output.writeString(VectorStoreWriter.generateHeaderString(this.flagConfig));
                for (int docId = 0; docId < numDocs; docId++) {
                    logProgress(docId);
                    String docName = resolveDocName(docId);
                    Vector docVector = buildDocVector(docId);
                    if (docVector.isZeroVector()) {
                        logger.severe(String.format("Document vector is zero for document '%s'. This probably means that none of the -contentsfields were populated. this is a bad sign and should be investigated.", docName));
                    }
                    docVector.normalize();
                    output.writeString(docName);
                    docVector.writeToLuceneStream(output);
                }
                VerbatimLogger.info("Finished writing vectors.\n");
                output.flush();
            } finally {
                output.close();
            }
        } finally {
            directory.close();
        }
    }

    /** Logs a progress message every 1000 documents up to 10000, then every 10000. */
    private static void logProgress(int docId) {
        if (docId <= 0) {
            return;
        }
        boolean atInterval = docId % PROGRESS_INTERVAL == 0;
        boolean atEarlyInterval = docId < PROGRESS_INTERVAL && docId % EARLY_PROGRESS_INTERVAL == 0;
        if (atInterval || atEarlyInterval) {
            VerbatimLogger.info("Processed " + docId + " documents ... ");
        }
    }

    /**
     * Returns the name under which the document is written: the value of the configured
     * docid field when the document has one, otherwise the decimal document number.
     */
    private String resolveDocName(int docId) throws IOException {
        String docName = Integer.toString(docId);
        if (this.luceneUtils.getDoc(docId).getField(this.flagConfig.docidfield()) != null) {
            docName = this.luceneUtils.getDoc(docId).getField(this.flagConfig.docidfield()).stringValue();
            if (docName.length() == 0) {
                logger.severe("Empty document name!!! This will cause problems ...");
                logger.severe("Please set -docidfield to a nonempty field in your Lucene index.");
            }
        }
        return docName;
    }

    /** Builds the (not yet normalized) vector for one document from all contents fields. */
    private Vector buildDocVector(int docId) throws IOException {
        Vector docVector = VectorFactory.createZeroVector(this.flagConfig.vectortype(), this.flagConfig.dimension());
        for (String fieldName : this.flagConfig.contentsfields()) {
            Terms fieldTerms = this.luceneUtils.getTermVector(docId, fieldName);
            if (fieldTerms == null) {
                VerbatimLogger.fine(String.format("When building document vectors, no term vector for field: '%s' in document %d.", fieldName, Integer.valueOf(docId)));
                continue;
            }
            superposeFieldTerms(docVector, fieldName, fieldTerms);
        }
        return docVector;
    }

    /**
     * Adds the weighted vector of every term of one field to {@code docVector}.
     * Terms that have no usable vector in the term vector store are skipped.
     */
    private void superposeFieldTerms(Vector docVector, String fieldName, Terms fieldTerms) throws IOException {
        TermsEnum termsEnum = fieldTerms.iterator(null);
        BytesRef termBytes;
        // next() returns null once the enumeration is exhausted; that ends the loop.
        while ((termBytes = termsEnum.next()) != null) {
            String termText = new Term(fieldName, termBytes).text();
            DocsEnum docsEnum = termsEnum.docs(null, null);
            // Position on the first posting so that freq() can be read.
            docsEnum.nextDoc();
            int freq = docsEnum.freq();
            try {
                Vector termVector = this.termVectorData.getVector(termText);
                if (termVector != null && termVector.getDimension() > 0) {
                    float localWeight = this.luceneUtils.getLocalTermWeight(freq);
                    float globalWeight = this.luceneUtils.getGlobalTermWeight(new Term(fieldName, termText));
                    // With -fieldweight, scale by 1/sqrt(number of terms in the field).
                    float fieldWeight = this.flagConfig.fieldweight() ? (float) (1.0d / Math.sqrt(fieldTerms.size())) : 1.0f;
                    docVector.superpose(termVector, localWeight * globalWeight * fieldWeight, null);
                }
            } catch (NullPointerException e) {
                // Deliberate best effort: a term the stores cannot resolve is skipped, not fatal.
                logger.finest("term " + termText + " not represented");
            }
        }
    }

    /**
     * Command line entry point. After flag parsing exactly two arguments must remain;
     * the first is the term vector file that is loaded into memory.
     *
     * @param strArr command line flags followed by the two positional arguments
     * @throws IllegalArgumentException if the number of remaining arguments is not 2
     * @throws Exception if loading the term vectors or building the document vectors fails
     */
    public static void main(String[] strArr) throws Exception {
        FlagConfig flagConfig = FlagConfig.getFlagConfig(strArr);
        String[] remainingArgs = flagConfig.remainingArgs;
        if (remainingArgs.length != 2) {
            throw new IllegalArgumentException("After parsing command line flags, there were " + remainingArgs.length + " arguments, instead of the expected 2.");
        }
        VectorStoreRAM vectorStoreRAM = new VectorStoreRAM(flagConfig);
        vectorStoreRAM.initFromFile(remainingArgs[0]);
        logger.info("Minimum frequency = " + flagConfig.minfrequency());
        logger.info("Maximum frequency = " + flagConfig.maxfrequency());
        logger.info("Number non-alphabet characters = " + flagConfig.maxnonalphabetchars());
        logger.info("Contents fields are: " + Arrays.toString(flagConfig.contentsfields()));
        createIncrementalDocVectors(vectorStoreRAM, flagConfig, new LuceneUtils(flagConfig));
    }
}
