package ivory.core.preprocess;

import com.google.common.collect.Maps;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.map.MapKF;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.DefaultFrequencySortedDictionary;
import ivory.core.data.document.LazyTermDocVector;
import ivory.core.data.stat.DfTableArray;
import ivory.core.data.stat.DocLengthTable;
import ivory.core.data.stat.DocLengthTable2B;
import ivory.core.data.stat.DocLengthTable4B;
import ivory.pwsim.score.ScoringModel;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/core/preprocess/BuildWeightedTermDocVectors.class */
public class BuildWeightedTermDocVectors extends PowerTool {
    /** Logger used by both the driver ({@code runTool}) and the nested mapper. */
    private static final Logger LOG = Logger.getLogger(BuildWeightedTermDocVectors.class);
    /** Configuration keys returned by {@link #getRequiredParameters()}. */
    public static final String[] RequiredParameters = {Constants.NumMapTasks, Constants.IndexPath, "Ivory.ScoringModel", "Ivory.Normalize"};

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/BuildWeightedTermDocVectors$Docs.class */
    /**
     * Job counters incremented by the mapper, one per input document.
     * Constant names and order are kept stable because they identify the counters.
     */
    public enum Docs {
        /** Documents whose weighted vector was written to the output. */
        Total,

        /** Documents whose weighted vector ended up empty. */
        ZERO,

        /** Documents with a non-empty vector holding fewer terms than the configured minimum. */
        SHORT
    }

    /* loaded from: input_file:ivory/core/preprocess/BuildWeightedTermDocVectors$MyMapper.class */
    private static class MyMapper extends MapReduceBase implements Mapper<IntWritable, LazyTermDocVector, IntWritable, HMapSFW> {
        static IntWritable mDocno = new IntWritable();
        private static DocLengthTable mDLTable;
        private static ScoringModel mScoreFn;
        DefaultFrequencySortedDictionary dict;
        DfTableArray dfTable;
        String term;
        float wt;
        float sum2;
        int MIN_SIZE = 0;
        boolean shortDocLengths = false;
        private boolean normalize = false;
        HMapSFW weightedVector = new HMapSFW();

        private MyMapper() {
        }

        public void configure(JobConf jobConf) {
            this.normalize = jobConf.getBoolean("Ivory.Normalize", false);
            this.shortDocLengths = jobConf.getBoolean("Ivory.ShortDocLengths", true);
            this.MIN_SIZE = jobConf.getInt("Ivory.MinNumTerms", 0);
            HashMap newHashMap = Maps.newHashMap();
            try {
                if (jobConf.get("mapred.job.tracker").equals("local")) {
                    throw new RuntimeException("Local mode not supported!");
                }
                RetrievalEnvironment retrievalEnvironment = new RetrievalEnvironment(jobConf.get(Constants.IndexPath), FileSystem.get(jobConf));
                String indexTermsData = retrievalEnvironment.getIndexTermsData();
                String indexTermIdsData = retrievalEnvironment.getIndexTermIdsData();
                String indexTermIdMappingData = retrievalEnvironment.getIndexTermIdMappingData();
                String dfByIntData = retrievalEnvironment.getDfByIntData();
                String path = retrievalEnvironment.getDoclengthsData().toString();
                String substring = indexTermsData.substring(indexTermsData.lastIndexOf("/") + 1);
                String substring2 = indexTermIdsData.substring(indexTermIdsData.lastIndexOf("/") + 1);
                String substring3 = indexTermIdMappingData.substring(indexTermIdMappingData.lastIndexOf("/") + 1);
                String substring4 = dfByIntData.substring(dfByIntData.lastIndexOf("/") + 1);
                String substring5 = path.substring(path.lastIndexOf("/") + 1);
                Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(jobConf);
                for (Path path2 : localCacheFiles) {
                    BuildWeightedTermDocVectors.LOG.info("In DistributedCache: " + path2);
                    if (path2.toString().contains(substring)) {
                        newHashMap.put(substring, path2);
                    } else if (path2.toString().contains(substring2)) {
                        newHashMap.put(substring2, path2);
                    } else if (path2.toString().contains(substring3)) {
                        newHashMap.put(substring3, path2);
                    } else if (path2.toString().contains(substring4)) {
                        newHashMap.put(substring4, path2);
                    } else if (path2.toString().contains(substring5)) {
                        newHashMap.put(substring5, path2);
                    }
                }
                BuildWeightedTermDocVectors.LOG.info(" - terms: " + newHashMap.get(substring));
                BuildWeightedTermDocVectors.LOG.info(" - id: " + newHashMap.get(substring2));
                BuildWeightedTermDocVectors.LOG.info(" - idToTerms: " + newHashMap.get(substring3));
                BuildWeightedTermDocVectors.LOG.info(" - df data: " + newHashMap.get(substring4));
                BuildWeightedTermDocVectors.LOG.info(" - dl data: " + newHashMap.get(substring5));
                try {
                    this.dict = new DefaultFrequencySortedDictionary((Path) newHashMap.get(substring), (Path) newHashMap.get(substring2), (Path) newHashMap.get(substring3), FileSystem.getLocal(jobConf));
                    this.dfTable = new DfTableArray((Path) newHashMap.get(substring4), FileSystem.getLocal(jobConf));
                    BuildWeightedTermDocVectors.LOG.info("Global Stats table loaded successfully.");
                    try {
                        if (this.shortDocLengths) {
                            mDLTable = new DocLengthTable2B((Path) newHashMap.get(substring5), FileSystem.getLocal(jobConf));
                        } else {
                            mDLTable = new DocLengthTable4B((Path) newHashMap.get(substring5), FileSystem.getLocal(jobConf));
                        }
                        try {
                            mScoreFn = (ScoringModel) Class.forName(jobConf.get("Ivory.ScoringModel")).newInstance();
                            mScoreFn.setDocCount(mDLTable.getDocCount());
                            mScoreFn.setAvgDocLength(mDLTable.getAvgDocLength());
                        } catch (Exception e) {
                            throw new RuntimeException("Error initializing Ivory.ScoringModel from " + jobConf.get("Ivory.ScoringModel"));
                        }
                    } catch (IOException e2) {
                        throw new RuntimeException("Error loading dl table from " + localCacheFiles[4]);
                    }
                } catch (Exception e3) {
                    e3.printStackTrace();
                    throw new RuntimeException("Error loading Terms File for dictionary from " + localCacheFiles[0]);
                }
            } catch (IOException e4) {
                throw new RuntimeException("Local cache files not read properly.");
            }
        }

        public void map(IntWritable intWritable, LazyTermDocVector lazyTermDocVector, OutputCollector<IntWritable, HMapSFW> outputCollector, Reporter reporter) throws IOException {
            mDocno.set(intWritable.get());
            int docLength = mDLTable.getDocLength(mDocno.get());
            this.weightedVector.clear();
            LazyTermDocVector.Reader reader = lazyTermDocVector.getReader();
            this.sum2 = 0.0f;
            while (reader.hasMoreTerms()) {
                this.term = reader.nextTerm();
                int id = this.dict.getId(this.term);
                if (id != -1) {
                    mScoreFn.setDF(this.dfTable.getDf(id));
                    this.wt = mScoreFn.computeDocumentWeight(reader.getTf(), docLength);
                    this.weightedVector.put(this.term, this.wt);
                    this.sum2 += this.wt * this.wt;
                }
            }
            if (this.normalize) {
                this.sum2 = (float) Math.sqrt(this.sum2);
                for (MapKF.Entry entry : this.weightedVector.entrySet()) {
                    this.weightedVector.put((Comparable) entry.getKey(), this.weightedVector.get((Comparable) entry.getKey()) / this.sum2);
                }
            }
            if (this.weightedVector.size() == 0) {
                reporter.incrCounter(Docs.ZERO, 1L);
            } else if (this.weightedVector.size() < this.MIN_SIZE) {
                reporter.incrCounter(Docs.SHORT, 1L);
            } else {
                outputCollector.collect(mDocno, this.weightedVector);
                reporter.incrCounter(Docs.Total, 1L);
            }
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((IntWritable) obj, (LazyTermDocVector) obj2, (OutputCollector<IntWritable, HMapSFW>) outputCollector, reporter);
        }
    }

    /**
     * Lists the configuration keys this tool declares as required.
     *
     * @return the shared {@link #RequiredParameters} array itself, not a copy
     */
    public String[] getRequiredParameters() {
        final String[] required = BuildWeightedTermDocVectors.RequiredParameters;
        return required;
    }

    /**
     * Creates the tool around the given Hadoop configuration.
     *
     * @param configuration handed unchanged to the {@code PowerTool} constructor
     */
    public BuildWeightedTermDocVectors(Configuration configuration) {
        super(configuration);
    }

    /**
     * Configures and runs the map-only job that writes weighted term-document vectors.
     *
     * <p>The dictionary files, the df table and the document length table are registered
     * with the DistributedCache so the mapper can load them locally. If the output
     * directory already exists the job is not run.
     *
     * @return 0, both when the job has run and when it was skipped because the output exists
     * @throws RuntimeException if one of the dictionary, df or document length files is missing
     * @throws Exception if file system access or the job itself fails
     */
    public int runTool() throws Exception {
        LOG.info("PowerTool: " + BuildWeightedTermDocVectors.class.getName());
        JobConf jobConf = new JobConf(getConf(), BuildWeightedTermDocVectors.class);
        FileSystem fileSystem = FileSystem.get(jobConf);
        RetrievalEnvironment retrievalEnvironment = new RetrievalEnvironment(getConf().get(Constants.IndexPath), fileSystem);

        String outputDirectory = retrievalEnvironment.getWeightedTermDocVectorsDirectory();
        int numMapTasks = getConf().getInt(Constants.NumMapTasks, 0);
        int minSplitSize = getConf().getInt(Constants.MinSplitSize, 0);
        String collectionName = getConf().get(Constants.CollectionName);
        String indexTermsData = retrievalEnvironment.getIndexTermsData();
        String indexTermIdsData = retrievalEnvironment.getIndexTermIdsData();
        String indexTermIdMappingData = retrievalEnvironment.getIndexTermIdMappingData();
        String dfByIntData = retrievalEnvironment.getDfByIntData();
        Path inputPath = new Path(retrievalEnvironment.getTermDocVectorsDirectory());
        Path outputPath = new Path(outputDirectory);

        if (fileSystem.exists(outputPath)) {
            LOG.info("Output path already exists!");
            return 0;
        }

        // Every side file is checked before it is registered with the DistributedCache.
        if (!fileSystem.exists(new Path(indexTermsData)) || !fileSystem.exists(new Path(indexTermIdsData)) || !fileSystem.exists(new Path(indexTermIdMappingData))) {
            throw new RuntimeException("Error, terms file " + indexTermsData + "/" + indexTermIdsData + "/" + indexTermIdMappingData + " doesn't exist!");
        }
        DistributedCache.addCacheFile(new URI(indexTermsData), jobConf);
        DistributedCache.addCacheFile(new URI(indexTermIdsData), jobConf);
        DistributedCache.addCacheFile(new URI(indexTermIdMappingData), jobConf);

        if (!fileSystem.exists(new Path(dfByIntData))) {
            throw new RuntimeException("Error, df data file " + dfByIntData + " doesn't exist!");
        }
        DistributedCache.addCacheFile(new URI(dfByIntData), jobConf);

        Path doclengthsData = retrievalEnvironment.getDoclengthsData();
        if (!fileSystem.exists(doclengthsData)) {
            throw new RuntimeException("Error, doc-length data file " + doclengthsData + " doesn't exist!");
        }
        DistributedCache.addCacheFile(doclengthsData.toUri(), jobConf);

        jobConf.setJobName(BuildWeightedTermDocVectors.class.getSimpleName() + ":" + collectionName);
        jobConf.setMapperClass(MyMapper.class);
        jobConf.setNumMapTasks(numMapTasks);
        jobConf.setNumReduceTasks(0);
        jobConf.setInt("mapred.min.split.size", minSplitSize);
        jobConf.set("mapred.child.java.opts", "-Xmx2048m");
        // Default of 0 matches the mapper's own default; a larger default would make the
        // mapper count every non-empty document as SHORT when the parameter is unset.
        jobConf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", 0));
        jobConf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));
        // Only forwarded when set, so the mapper's own default applies otherwise.
        if (getConf().get("Ivory.ShortDocLengths") != null) {
            jobConf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths"));
        }
        jobConf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel"));

        FileInputFormat.setInputPaths(jobConf, new Path[]{inputPath});
        FileOutputFormat.setOutputPath(jobConf, outputPath);
        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setMapOutputKeyClass(IntWritable.class);
        jobConf.setMapOutputValueClass(HMapSFW.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);
        jobConf.setOutputKeyClass(IntWritable.class);
        jobConf.setOutputValueClass(HMapSFW.class);

        LOG.info("Running job: " + jobConf.getJobName());
        long startTime = System.currentTimeMillis();
        JobClient.runJob(jobConf);
        LOG.info("Job Finished in " + ((System.currentTimeMillis() - startTime) / 1000.0d) + " seconds");
        return 0;
    }
}
