package ivory.core.preprocess;

import com.google.common.collect.Maps;
import edu.umd.cloud9.util.PowerTool;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.DefaultFrequencySortedDictionary;
import ivory.core.data.document.IntDocVector;
import ivory.core.data.document.LazyIntDocVector;
import ivory.core.data.document.TermDocVector;
import ivory.core.tokenize.DocumentProcessingUtils;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.SortedMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/core/preprocess/BuildIntDocVectors.class */
public class BuildIntDocVectors extends PowerTool {
    // Log4j logger for this tool; the nested mapper logs through it as well.
    private static final Logger LOG = Logger.getLogger(BuildIntDocVectors.class);
    // Configuration keys handed back by getRequiredParameters(): only the index path.
    public static final String[] RequiredParameters = {Constants.IndexPath};

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/BuildIntDocVectors$Docs.class */
    /**
     * Keys for document-level job counters. The mapper in this file increments
     * {@code Total} once per input record; {@code Skipped} is not incremented
     * anywhere in this file.
     */
    public enum Docs {
        Skipped,
        Total;

        /**
         * Returns the constants of this enum in declaration order.
         *
         * @return a newly allocated array; modifying it does not affect later calls
         */
        public static Docs[] valuesCustom() {
            // values() already hands out a fresh array; the extra copy is kept so the
            // method still returns an array distinct from the one values() produced.
            return values().clone();
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/BuildIntDocVectors$MapTime.class */
    /**
     * Keys for the timing counters updated by the mapper in this file. Each counter
     * is incremented by elapsed wall-clock milliseconds: {@code DecodingAndIdMapping}
     * around the integerization call, {@code EncodingAndSpilling} around filling and
     * writing the output vector.
     */
    public enum MapTime {
        DecodingAndIdMapping,
        EncodingAndSpilling;

        /**
         * Returns the constants of this enum in declaration order.
         *
         * @return a newly allocated array; modifying it does not affect later calls
         */
        public static MapTime[] valuesCustom() {
            // values() already hands out a fresh array; the extra copy is kept so the
            // method still returns an array distinct from the one values() produced.
            return values().clone();
        }
    }

    /* loaded from: input_file:ivory/core/preprocess/BuildIntDocVectors$MyMapper.class */
    private static class MyMapper extends Mapper<IntWritable, TermDocVector, IntWritable, IntDocVector> {
        private DefaultFrequencySortedDictionary dictionary = null;
        private static final LazyIntDocVector docVector = new LazyIntDocVector();

        private MyMapper() {
        }

        public void setup(Mapper<IntWritable, TermDocVector, IntWritable, IntDocVector>.Context context) {
            try {
                Configuration configuration = context.getConfiguration();
                RetrievalEnvironment retrievalEnvironment = new RetrievalEnvironment(configuration.get(Constants.IndexPath), FileSystem.get(configuration));
                String indexTermsData = retrievalEnvironment.getIndexTermsData();
                String indexTermIdsData = retrievalEnvironment.getIndexTermIdsData();
                String indexTermIdMappingData = retrievalEnvironment.getIndexTermIdMappingData();
                String substring = indexTermsData.substring(indexTermsData.lastIndexOf("/") + 1);
                String substring2 = indexTermIdsData.substring(indexTermIdsData.lastIndexOf("/") + 1);
                String substring3 = indexTermIdMappingData.substring(indexTermIdMappingData.lastIndexOf("/") + 1);
                BuildIntDocVectors.LOG.info("Looking for the following files in dcache: " + substring + ", " + substring2 + ", " + substring3);
                if (configuration.get("mapred.job.tracker").equals("local")) {
                    this.dictionary = new DefaultFrequencySortedDictionary(new Path(substring), new Path(substring2), new Path(substring3), FileSystem.getLocal(configuration));
                    return;
                }
                HashMap newHashMap = Maps.newHashMap();
                Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
                for (Path path : localCacheFiles) {
                    BuildIntDocVectors.LOG.info("In DistributedCache: " + path);
                    if (path.toString().contains(substring)) {
                        newHashMap.put(substring, path);
                    } else if (path.toString().contains(substring2)) {
                        newHashMap.put(substring2, path);
                    } else if (path.toString().contains(substring3)) {
                        newHashMap.put(substring3, path);
                    }
                }
                BuildIntDocVectors.LOG.info(" - terms: " + newHashMap.get(substring));
                BuildIntDocVectors.LOG.info(" - id: " + newHashMap.get(substring2));
                BuildIntDocVectors.LOG.info(" - idToTerms: " + newHashMap.get(substring3));
                String str = String.valueOf(localCacheFiles.length) + " " + localCacheFiles[0].toString() + " " + localCacheFiles[1].toString() + " " + localCacheFiles[2].toString();
                if (newHashMap.get(substring) == null) {
                    throw new RuntimeException(str);
                }
                this.dictionary = new DefaultFrequencySortedDictionary((Path) newHashMap.get(substring), (Path) newHashMap.get(substring2), (Path) newHashMap.get(substring3), FileSystem.getLocal(configuration));
            } catch (Exception e) {
                e.printStackTrace();
                throw new RuntimeException("Error initializing data!", e);
            }
        }

        public void map(IntWritable intWritable, TermDocVector termDocVector, Mapper<IntWritable, TermDocVector, IntWritable, IntDocVector>.Context context) throws IOException, InterruptedException {
            long currentTimeMillis = System.currentTimeMillis();
            SortedMap<Integer, int[]> integerizeTermDocVector = DocumentProcessingUtils.integerizeTermDocVector(termDocVector, this.dictionary);
            context.getCounter(MapTime.DecodingAndIdMapping).increment(System.currentTimeMillis() - currentTimeMillis);
            long currentTimeMillis2 = System.currentTimeMillis();
            docVector.setTermPositionsMap(integerizeTermDocVector);
            context.write(intWritable, docVector);
            context.getCounter(MapTime.EncodingAndSpilling).increment(System.currentTimeMillis() - currentTimeMillis2);
            context.getCounter(Docs.Total).increment(1L);
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, Mapper.Context context) throws IOException, InterruptedException {
            map((IntWritable) obj, (TermDocVector) obj2, (Mapper<IntWritable, TermDocVector, IntWritable, IntDocVector>.Context) context);
        }
    }

    /**
     * Returns the configuration keys this tool declares as required.
     *
     * @return the shared {@link #RequiredParameters} array itself, not a copy
     */
    public String[] getRequiredParameters() {
        return BuildIntDocVectors.RequiredParameters;
    }

    /**
     * Creates the tool, passing the given Hadoop configuration to the
     * {@link PowerTool} superclass constructor.
     *
     * @param configuration job configuration; {@link #runTool()} reads the index path from it
     */
    public BuildIntDocVectors(Configuration configuration) {
        super(configuration);
    }

    /**
     * Configures and runs the map-only job that reads the term doc vectors directory
     * and writes {@link LazyIntDocVector} records to the int doc vectors directory
     * of the index named by {@link Constants#IndexPath}.
     *
     * <p>The three dictionary files are registered with the DistributedCache, and
     * map/reduce memory settings are written into the tool's configuration before
     * the job is created.
     *
     * @return {@code 0} when the job succeeds, when the terms or term-ids file is
     *         missing (logged as an error), or when the output directory already
     *         exists (skipped); {@code -1} when the job itself fails
     * @throws Exception on file system, URI, or job submission errors
     */
    public int runTool() throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        String indexPath = conf.get(Constants.IndexPath);
        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
        String collectionName = env.readCollectionName();

        LOG.info("PowerTool: " + BuildIntDocVectors.class.getSimpleName());
        LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
        LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

        String termsFile = env.getIndexTermsData();
        String termIdsFile = env.getIndexTermIdsData();
        String mappingFile = env.getIndexTermIdMappingData();

        // NOTE(review): this error path returns 0, same as success — kept as-is because
        // callers may treat it as a skip; confirm before changing.
        if (!fs.exists(new Path(termsFile)) || !fs.exists(new Path(termIdsFile))) {
            LOG.error("Error, terms files don't exist!");
            return 0;
        }

        Path outputPath = new Path(env.getIntDocVectorsDirectory());
        if (fs.exists(outputPath)) {
            LOG.info("IntDocVectors already exist: skipping!");
            return 0;
        }

        // The mapper looks these files up by name in the DistributedCache.
        DistributedCache.addCacheFile(new URI(termsFile), conf);
        DistributedCache.addCacheFile(new URI(termIdsFile), conf);
        DistributedCache.addCacheFile(new URI(mappingFile), conf);

        conf.set("mapreduce.map.memory.mb", "2048");
        conf.set("mapreduce.map.java.opts", "-Xmx2048m");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

        Job job = Job.getInstance(conf, BuildIntDocVectors.class.getSimpleName() + ":" + collectionName);
        job.setJarByClass(BuildIntDocVectors.class);
        job.setNumReduceTasks(0);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, env.getTermDocVectorsDirectory());
        FileOutputFormat.setOutputPath(job, outputPath);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(LazyIntDocVector.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LazyIntDocVector.class);
        job.setMapperClass(MyMapper.class);

        long startTime = System.currentTimeMillis();
        boolean succeeded = job.waitForCompletion(true);
        double elapsedSeconds = (System.currentTimeMillis() - startTime) / 1000.0d;

        // A failed job must not be reported as a normal finish.
        if (!succeeded) {
            LOG.error("Job failed after " + elapsedSeconds + " seconds");
            return -1;
        }
        LOG.info("Job Finished in " + elapsedSeconds + " seconds");
        return 0;
    }
}
