package pitt.search.semanticvectors;

import cern.colt.matrix.AbstractFormatter;
import ch.akuhn.edu.mit.tedlab.DMat;
import ch.akuhn.edu.mit.tedlab.SMat;
import ch.akuhn.edu.mit.tedlab.SVDRec;
import ch.akuhn.edu.mit.tedlab.Svdlib;
import java.io.File;
import java.io.IOException;
import java.util.logging.Logger;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import pitt.search.semanticvectors.utils.VerbatimLogger;
import pitt.search.semanticvectors.vectors.RealVector;
import pitt.search.semanticvectors.vectors.VectorType;

/* loaded from: input_file:pitt/search/semanticvectors/LSA.class */
/**
 * Command-line tool that builds term and document vectors from a Lucene index using
 * singular value decomposition (SVD).
 *
 * <p>The pipeline, as driven by {@link #main(String[])}, is: parse flags, build a sparse
 * document-by-term matrix from one contents field of the index, run the LAS2 SVD
 * algorithm from SVDLIBJ, and write the resulting term and document vectors to the
 * vector store files named in the flag configuration.
 */
public class LSA {
    private static final Logger logger = Logger.getLogger(LSA.class.getCanonicalName());

    /** Usage text printed by {@link #main(String[])} when argument validation fails. */
    public static String usageMessage = "\nLSA class in package pitt.search.semanticvectors\nUsage: java pitt.search.semanticvectors.LSA [other flags] -luceneindexpath PATH_TO_LUCENE_INDEX\nUse flags to configure dimension, min term frequency, etc. See online documentation for other available flags";

    private final FlagConfig flagConfig;

    /**
     * Text of every term accepted by the term filter, in matrix column order. Allocated
     * for the total number of terms in the field, so trailing entries stay {@code null}
     * when the filter rejects some terms.
     */
    private String[] termList;

    /** The single Lucene field whose terms are indexed. */
    private final String contentsField;

    private final LuceneUtils luceneUtils;

    /**
     * Sets up the indexer and logs the effective configuration.
     *
     * <p>If the configured dimension exceeds the number of documents it is reduced to
     * the number of documents (with a warning), and the change is written back into
     * {@code flagConfig}.
     *
     * @param str unused by this constructor; {@link #main(String[])} passes the Lucene
     *     index path here
     * @param flagConfig configuration; only the first of its contents fields is used
     * @throws IOException if the Lucene index cannot be opened or read
     */
    private LSA(String str, FlagConfig flagConfig) throws IOException {
        this.flagConfig = flagConfig;
        this.luceneUtils = new LuceneUtils(flagConfig);
        if (flagConfig.contentsfields().length > 1) {
            logger.warning("LSA implementation only supports a single -contentsfield. Only '" + flagConfig.contentsfields()[0] + "' will be indexed.");
        }
        this.contentsField = flagConfig.contentsfields()[0];
        if (flagConfig.dimension() > this.luceneUtils.getNumDocs()) {
            logger.warning("Dimension for SVD cannot be greater than number of documents ... Setting dimension to " + this.luceneUtils.getNumDocs());
            flagConfig.setDimension(this.luceneUtils.getNumDocs());
        }
        if (flagConfig.termweight().equals("logentropy")) {
            VerbatimLogger.info("Term weighting: log-entropy.\n");
        }
        VerbatimLogger.info("Set up LSA indexer.\nDimension: " + flagConfig.dimension() + " Lucene index contents field: '" + this.contentsField + "' Minimum frequency = " + flagConfig.minfrequency() + " Maximum frequency = " + flagConfig.maxfrequency() + " Number non-alphabet characters = " + flagConfig.maxnonalphabetchars() + AbstractFormatter.DEFAULT_ROW_SEPARATOR);
    }

    /**
     * Builds the sparse matrix that is handed to the SVD.
     *
     * <p>Rows are Lucene document ids and columns are the terms accepted by
     * {@code luceneUtils.termFilter}. Each stored value is the term's in-document
     * frequency multiplied by its global term weight. As a side effect,
     * {@link #termList} is populated with the accepted terms in column order.
     *
     * @return the populated sparse matrix
     * @throws IOException if reading the Lucene index fails
     */
    private SMat smatFromIndex() throws IOException {
        Terms terms = this.luceneUtils.getTermsForField(this.contentsField);

        // Count every term in the field; this is an upper bound on the accepted terms
        // and is used to size termList. A single enum must be advanced here: asking
        // for a new enum on each iteration would restart at the first term forever.
        int totalTerms = 0;
        TermsEnum countingEnum = terms.iterator(null);
        while (countingEnum.next() != null) {
            totalTerms++;
        }
        VerbatimLogger.info(String.format("There are %d terms (and %d docs).\n", Integer.valueOf(totalTerms), Integer.valueOf(this.luceneUtils.getNumDocs())));
        this.termList = new String[totalTerms];

        // First pass: record accepted terms and count the non-zero cells, because the
        // sparse matrix has to be allocated with its exact number of values.
        int acceptedTerms = 0;
        int nonZeroEntries = 0;
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef termBytes;
        while ((termBytes = termsEnum.next()) != null) {
            Term term = new Term(this.contentsField, termBytes);
            if (!this.luceneUtils.termFilter(term)) {
                continue;
            }
            this.termList[acceptedTerms] = term.text();
            acceptedTerms++;
            // Only the number of postings is needed in this pass; the document ids
            // themselves are read in the second pass below.
            DocsEnum postings = this.luceneUtils.getDocsForTerm(term);
            while (postings.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                nonZeroEntries++;
            }
        }

        SMat matrix = new SMat(this.luceneUtils.getNumDocs(), acceptedTerms, nonZeroEntries);

        // Second pass: fill the matrix column by column (one column per accepted term).
        termsEnum = terms.iterator(termsEnum);
        int column = 0;
        int entry = 0;
        while ((termBytes = termsEnum.next()) != null) {
            Term term = new Term(this.contentsField, termBytes);
            if (!this.luceneUtils.termFilter(term)) {
                continue;
            }
            DocsEnum postings = this.luceneUtils.getDocsForTerm(term);
            // pointr[column] is the offset of the column's first stored value.
            matrix.pointr[column] = entry;
            while (postings.nextDoc() != DocsEnum.NO_MORE_DOCS) {
                matrix.rowind[entry] = postings.docID();
                matrix.value[entry] = postings.freq() * this.luceneUtils.getGlobalTermWeight(term);
                entry++;
            }
            column++;
        }
        // Closing sentinel: one past the last column points at the total value count.
        matrix.pointr[matrix.cols] = matrix.vals;
        return matrix;
    }

    /**
     * Writes the term vectors and then the document vectors to their store files in the
     * current working directory.
     *
     * <p>Each column of a matrix becomes one vector, truncated to the configured
     * dimension and normalized before it is written. Streams and the directory are
     * closed even if a write fails.
     *
     * @param dMat matrix whose columns are term vectors, keyed by {@link #termList}
     * @param dMat2 matrix whose columns are document vectors, keyed by the document's
     *     {@code docidfield} value
     * @throws IOException if an output file cannot be created or written
     */
    private void writeOutput(DMat dMat, DMat dMat2) throws IOException {
        FSDirectory directory = FSDirectory.open(new File("."));
        try {
            int termsWritten = 0;
            IndexOutput termOutput = directory.createOutput(VectorStoreUtils.getStoreFileName(this.flagConfig.termvectorsfile(), this.flagConfig), IOContext.DEFAULT);
            try {
                termOutput.writeString(VectorStoreWriter.generateHeaderString(this.flagConfig));
                for (; termsWritten < dMat.cols; termsWritten++) {
                    termOutput.writeString(this.termList[termsWritten]);
                    writeColumnVector(termOutput, dMat, termsWritten);
                }
                termOutput.flush();
            } finally {
                termOutput.close();
            }
            VerbatimLogger.info("Wrote " + termsWritten + " term vectors incrementally to file " + this.flagConfig.termvectorsfile() + ".\n");

            int docsWritten = 0;
            IndexOutput docOutput = directory.createOutput(VectorStoreUtils.getStoreFileName(this.flagConfig.docvectorsfile(), this.flagConfig), IOContext.DEFAULT);
            try {
                docOutput.writeString(VectorStoreWriter.generateHeaderString(this.flagConfig));
                for (; docsWritten < dMat2.cols; docsWritten++) {
                    docOutput.writeString(this.luceneUtils.getDoc(docsWritten).get(this.flagConfig.docidfield()));
                    writeColumnVector(docOutput, dMat2, docsWritten);
                }
                docOutput.flush();
            } finally {
                docOutput.close();
            }
            VerbatimLogger.info("Wrote " + docsWritten + " document vectors incrementally to file " + this.flagConfig.docvectorsfile() + ". Done.\n");
        } finally {
            directory.close();
        }
    }

    /** Copies one matrix column into a normalized real vector and writes it to the stream. */
    private void writeColumnVector(IndexOutput output, DMat matrix, int column) throws IOException {
        int dimension = this.flagConfig.dimension();
        float[] coordinates = new float[dimension];
        for (int row = 0; row < dimension; row++) {
            coordinates[row] = (float) matrix.value[row][column];
        }
        RealVector vector = new RealVector(coordinates);
        vector.normalize();
        vector.writeToLuceneStream(output);
    }

    /**
     * Command-line entry point: builds the matrix from the Lucene index, runs the SVD
     * and writes the term and document vector files.
     *
     * @param strArr command-line flags; {@code -luceneindexpath} is required and exactly
     *     one contents field must be configured
     * @throws IllegalArgumentException if the flags are invalid; the usage message is
     *     printed to standard output before the exception is rethrown
     * @throws IOException if the index cannot be read or the output cannot be written
     */
    public static void main(String[] strArr) throws IllegalArgumentException, IOException {
        try {
            FlagConfig flagConfig = FlagConfig.getFlagConfig(strArr);
            if (flagConfig.vectortype() != VectorType.REAL) {
                // NOTE(review): the message says the vector type is being set to 'real',
                // but nothing here changes flagConfig. Confirm whether a setter call is
                // missing.
                logger.warning("LSA is only supported for real vectors ... setting vectortype to 'real'.");
            }
            if (flagConfig.luceneindexpath().isEmpty()) {
                throw new IllegalArgumentException("-luceneindexpath must be set.");
            }
            if (flagConfig.contentsfields().length != 1) {
                throw new IllegalArgumentException("LSA only supports one -contentsfield, more than this may cause a corrupt matrix.");
            }
            LSA lsa = new LSA(flagConfig.luceneindexpath(), flagConfig);
            SMat termDocMatrix = lsa.smatFromIndex();
            Svdlib svdlib = new Svdlib();
            VerbatimLogger.info("Starting SVD using algorithm LAS2 ...\n");
            SVDRec svdResult = svdlib.svdLAS2A(termDocMatrix, flagConfig.dimension());
            // The matrix is document-by-term, so Vt holds the term vectors and Ut the
            // document vectors.
            lsa.writeOutput(svdResult.Vt, svdResult.Ut);
        } catch (IllegalArgumentException e) {
            System.out.println(usageMessage);
            throw e;
        }
    }
}
