package ivory.core.preprocess;

import edu.umd.cloud9.io.map.HMapIFW;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.map.MapIF;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.IntDocVector;
import ivory.core.data.document.WeightedIntDocVector;
import ivory.core.data.stat.DfTableArray;
import ivory.core.data.stat.DocLengthTable;
import ivory.core.data.stat.DocLengthTable2B;
import ivory.core.data.stat.DocLengthTable4B;
import ivory.pwsim.score.ScoringModel;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/core/preprocess/BuildWeightedIntDocVectors.class */
public class BuildWeightedIntDocVectors extends PowerTool {
    /** Class-wide log4j logger; its level is set to WARN in the static initializer of this class. */
    private static final Logger sLogger = Logger.getLogger(BuildWeightedIntDocVectors.class);
    /**
     * Configuration keys handed back by {@link #getRequiredParameters()}. The array is assigned in
     * the static initializer of this class.
     */
    public static final String[] RequiredParameters;

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/BuildWeightedIntDocVectors$Docs.class */
    /** Counter keys reported by the mapper of this job. */
    public enum Docs {
        /** Incremented by one for every document the mapper emits. */
        Total;
    }

    /* loaded from: input_file:ivory/core/preprocess/BuildWeightedIntDocVectors$MyMapper.class */
    private static class MyMapper extends MapReduceBase implements Mapper<IntWritable, IntDocVector, IntWritable, WeightedIntDocVector> {
        static IntWritable mDocno = new IntWritable();
        private static DocLengthTable mDLTable;
        private static ScoringModel mScoreFn;
        private static DfTableArray mDFTable;
        private boolean normalize = false;
        private boolean shortDocLengths = false;
        HMapIFW vectorWeights = new HMapIFW();
        int term;
        float wt;
        float sum2;

        private MyMapper() {
        }

        public void configure(JobConf jobConf) {
            Path[] localCacheFiles;
            BuildWeightedIntDocVectors.sLogger.setLevel(Level.WARN);
            this.normalize = jobConf.getBoolean("Ivory.Normalize", false);
            this.shortDocLengths = jobConf.getBoolean("Ivory.ShortDocLengths", false);
            try {
                if (jobConf.get("mapred.job.tracker").equals("local")) {
                    RetrievalEnvironment retrievalEnvironment = new RetrievalEnvironment(jobConf.get(Constants.IndexPath), FileSystem.get(jobConf));
                    localCacheFiles = new Path[]{new Path(retrievalEnvironment.getDfByIntData()), new Path(retrievalEnvironment.getCfByIntData()), retrievalEnvironment.getDoclengthsData()};
                } else {
                    localCacheFiles = DistributedCache.getLocalCacheFiles(jobConf);
                }
                BuildWeightedIntDocVectors.sLogger.info("localFiles: " + localCacheFiles);
                BuildWeightedIntDocVectors.sLogger.info("localFiles [0]: " + localCacheFiles[0]);
                BuildWeightedIntDocVectors.sLogger.info("localFiles [1]: " + localCacheFiles[1]);
                BuildWeightedIntDocVectors.sLogger.info("localFiles [2]: " + localCacheFiles[2]);
                try {
                    mDFTable = new DfTableArray(localCacheFiles[0], FileSystem.getLocal(jobConf));
                    try {
                        if (this.shortDocLengths) {
                            mDLTable = new DocLengthTable2B(localCacheFiles[2], FileSystem.getLocal(jobConf));
                        } else {
                            mDLTable = new DocLengthTable4B(localCacheFiles[2], FileSystem.getLocal(jobConf));
                        }
                        try {
                            mScoreFn = (ScoringModel) Class.forName(jobConf.get("Ivory.ScoringModel")).newInstance();
                            mScoreFn.setDocCount(mDLTable.getDocCount());
                            mScoreFn.setAvgDocLength(mDLTable.getAvgDocLength());
                        } catch (Exception e) {
                            throw new RuntimeException("Error initializing Ivory.ScoringModel from " + jobConf.get("Ivory.ScoringModel"));
                        }
                    } catch (IOException e2) {
                        throw new RuntimeException("Error loading dl table from " + localCacheFiles[2]);
                    }
                } catch (IOException e3) {
                    throw new RuntimeException("Error loading df table from " + localCacheFiles[0]);
                }
            } catch (IOException e4) {
                throw new RuntimeException("Local cache files not read properly.");
            }
        }

        public void map(IntWritable intWritable, IntDocVector intDocVector, OutputCollector<IntWritable, WeightedIntDocVector> outputCollector, Reporter reporter) throws IOException {
            mDocno.set(intWritable.get());
            int docLength = mDLTable.getDocLength(mDocno.get());
            this.vectorWeights.clear();
            IntDocVector.Reader reader = intDocVector.getReader();
            BuildWeightedIntDocVectors.sLogger.debug("===================================BEGIN READ DOC");
            this.sum2 = 0.0f;
            while (reader.hasMoreTerms()) {
                this.term = reader.nextTerm();
                mScoreFn.setDF(mDFTable.getDf(this.term));
                this.wt = mScoreFn.computeDocumentWeight(reader.getTf(), docLength);
                this.vectorWeights.put(this.term, this.wt);
                this.sum2 += this.wt * this.wt;
            }
            BuildWeightedIntDocVectors.sLogger.debug("===================================END READ DOC");
            if (this.normalize) {
                this.sum2 = (float) Math.sqrt(this.sum2);
                for (MapIF.Entry entry : this.vectorWeights.entrySet()) {
                    this.vectorWeights.put(entry.getKey(), this.vectorWeights.get(entry.getKey()) / this.sum2);
                }
            }
            outputCollector.collect(mDocno, new WeightedIntDocVector(docLength, this.vectorWeights));
            reporter.incrCounter(Docs.Total, 1L);
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((IntWritable) obj, (IntDocVector) obj2, (OutputCollector<IntWritable, WeightedIntDocVector>) outputCollector, reporter);
        }
    }

    /**
     * Returns the configuration keys this tool declares as required.
     *
     * @return the shared {@link #RequiredParameters} array itself, not a copy
     */
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    /**
     * Creates the tool, passing the configuration on to the {@code PowerTool} superclass.
     *
     * @param configuration Hadoop configuration for the job this tool runs
     */
    public BuildWeightedIntDocVectors(Configuration configuration) {
        super(configuration);
    }

    /**
     * Configures and runs the map-only job that writes the weighted int doc vectors of an index.
     *
     * <p>The df, cf and doc-length files of the index are checked for existence and registered
     * with the distributed cache. If the output directory already exists the job is not run.
     *
     * @return 0, both when the job has run and when it was skipped because the output exists
     * @throws RuntimeException if one of the df, cf or doc-length files does not exist
     * @throws Exception if the file system cannot be accessed or the job fails
     */
    public int runTool() throws Exception {
        sLogger.setLevel(Level.WARN);
        sLogger.info("PowerTool: GetWeightedIntDocVectors");

        JobConf jobConf = new JobConf(getConf(), BuildWeightedIntDocVectors.class);
        FileSystem fileSystem = FileSystem.get(jobConf);
        RetrievalEnvironment env = new RetrievalEnvironment(jobConf.get(Constants.IndexPath), fileSystem);
        String outputDirectory = env.getWeightedIntDocVectorsDirectory();

        int numMapTasks = jobConf.getInt(Constants.NumMapTasks, 0);
        int minSplitSize = jobConf.getInt(Constants.MinSplitSize, 0);
        String collectionName = jobConf.get(Constants.CollectionName);

        sLogger.info("Characteristics of the collection:");
        sLogger.info(" - CollectionName: " + collectionName);
        sLogger.info("Characteristics of the job:");
        sLogger.info(" - NumMapTasks: " + numMapTasks);
        sLogger.info(" - MinSplitSize: " + minSplitSize);

        // Registration order matters: the mapper reads the df table from cache slot 0 and the
        // doc-length table from cache slot 2.
        String dfByIntData = env.getDfByIntData();
        String cfByIntData = env.getCfByIntData();
        addRequiredCacheFile(fileSystem, new Path(dfByIntData), "df data file", jobConf);
        addRequiredCacheFile(fileSystem, new Path(cfByIntData), "cf data file", jobConf);
        addRequiredCacheFile(fileSystem, env.getDoclengthsData(), "doc-length data file", jobConf);

        Path inputPath = new Path(env.getIntDocVectorsDirectory());
        Path outputPath = new Path(outputDirectory);
        if (fileSystem.exists(outputPath)) {
            sLogger.info("Output path already exists!");
            return 0;
        }

        jobConf.setJobName("GetWeightedIntDocVectors:" + collectionName);
        jobConf.setNumMapTasks(numMapTasks);
        jobConf.setNumReduceTasks(0);
        jobConf.setInt("mapred.min.split.size", minSplitSize);
        jobConf.set("mapred.child.java.opts", "-Xmx2048m");

        FileInputFormat.setInputPaths(jobConf, inputPath);
        FileOutputFormat.setOutputPath(jobConf, outputPath);

        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setMapOutputKeyClass(IntWritable.class);
        jobConf.setMapOutputValueClass(WeightedIntDocVector.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);
        jobConf.setOutputKeyClass(IntWritable.class);
        jobConf.setOutputValueClass(WeightedIntDocVector.class);
        jobConf.setMapperClass(MyMapper.class);

        long startMillis = System.currentTimeMillis();
        JobClient.runJob(jobConf);
        sLogger.info("Job Finished in " + ((System.currentTimeMillis() - startMillis) / 1000.0d) + " seconds");
        return 0;
    }

    /**
     * Checks that a side-data file exists and registers it with the distributed cache.
     *
     * @param fileSystem file system used for the existence check
     * @param file file to register
     * @param description human-readable name of the file, used in the error message
     * @param jobConf job configuration the cache entry is added to
     * @throws RuntimeException if {@code file} does not exist
     * @throws IOException if the existence check itself fails
     */
    private static void addRequiredCacheFile(FileSystem fileSystem, Path file, String description, JobConf jobConf) throws IOException {
        if (!fileSystem.exists(file)) {
            throw new RuntimeException("Error, " + description + " " + file + " doesn't exist!");
        }
        DistributedCache.addCacheFile(file.toUri(), jobConf);
    }

    static {
        sLogger.setLevel(Level.WARN);
        RequiredParameters = new String[]{Constants.NumMapTasks, Constants.IndexPath, "Ivory.ScoringModel", "Ivory.Normalize"};
    }
}
