package ivory.core.preprocess;

import edu.umd.cloud9.io.pair.PairOfIntLong;
import edu.umd.cloud9.util.PowerTool;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.TermDocVector;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/core/preprocess/ComputeGlobalTermStatistics.class */
public class ComputeGlobalTermStatistics extends PowerTool {
    /** Class-wide logger for progress and skip messages. */
    private static final Logger LOG = Logger.getLogger(ComputeGlobalTermStatistics.class);

    /** Configuration keys handed back by {@link #getRequiredParameters()}. */
    public static final String[] RequiredParameters = {
        Constants.CollectionName,
        Constants.IndexPath,
        Constants.MinDf,
        Constants.MaxDf
    };

    /* loaded from: input_file:ivory/core/preprocess/ComputeGlobalTermStatistics$MyCombiner.class */
    /**
     * Combiner that sums the partial (left, right) pairs emitted for a term so that
     * fewer records reach the reducer. The left element carries a document count and
     * the right element a term-occurrence count (the mapper emits {@code (1, tf)}).
     *
     * <p>No filtering happens here: the combiner only sees a subset of a term's
     * values, so the min/max document-frequency cut-off is left to the reducer.
     */
    private static class MyCombiner extends Reducer<Text, PairOfIntLong, Text, PairOfIntLong> {
        // Reused for every emitted record to avoid per-record allocation. Kept as an
        // instance field so that instances sharing a JVM never share this buffer.
        private final PairOfIntLong output = new PairOfIntLong();

        /**
         * Adds up all partial pairs for {@code text} and emits a single combined pair.
         *
         * @param text     the term being aggregated
         * @param iterable partial (document count, occurrence count) pairs for the term
         * @param context  task context the combined pair is written to
         * @throws IOException          if writing the output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        public void reduce(Text text, Iterable<PairOfIntLong> iterable, Reducer<Text, PairOfIntLong, Text, PairOfIntLong>.Context context) throws IOException, InterruptedException {
            int df = 0;
            long cf = 0L;
            for (PairOfIntLong partial : iterable) {
                df += partial.getLeftElement();
                cf += partial.getRightElement();
            }
            output.set(df, cf);
            context.write(text, output);
        }

        // The erased reduce(Object, Iterable, Reducer.Context) bridge is generated by
        // the compiler; declaring it by hand clashes with that generated method.
    }

    /* loaded from: input_file:ivory/core/preprocess/ComputeGlobalTermStatistics$MyMapper.class */
    /**
     * Mapper over term document vectors. For every term of a document it emits
     * {@code (term, (1, tf))}: one document containing the term, and the term's
     * frequency within that document. It also maintains the {@link Statistics#Docs}
     * and {@link Statistics#SumOfDocLengths} counters.
     */
    private static class MyMapper extends Mapper<IntWritable, TermDocVector, Text, PairOfIntLong> {
        // Reused for every emitted record to avoid per-record allocation. Kept as
        // instance fields so that instances sharing a JVM never share these buffers.
        private final Text term = new Text();
        private final PairOfIntLong pair = new PairOfIntLong();

        /**
         * Emits one record per term of the document and updates the document counters.
         *
         * @param intWritable   the input key (not used by this method)
         * @param termDocVector the document's term vector
         * @param context       task context used for output records and counters
         * @throws IOException          if reading the vector or writing a record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        public void map(IntWritable intWritable, TermDocVector termDocVector, Mapper<IntWritable, TermDocVector, Text, PairOfIntLong>.Context context) throws IOException, InterruptedException {
            TermDocVector.Reader reader = termDocVector.getReader();
            // Accumulate in a long: a short accumulator silently wraps once a document
            // has more than Short.MAX_VALUE tokens, corrupting SumOfDocLengths.
            long docLength = 0L;
            while (reader.hasMoreTerms()) {
                term.set(reader.nextTerm());
                int tf = reader.getTf();
                docLength += tf;
                pair.set(1, tf);
                context.write(term, pair);
            }
            context.getCounter(Statistics.Docs).increment(1L);
            context.getCounter(Statistics.SumOfDocLengths).increment(docLength);
        }

        // The erased map(Object, Object, Mapper.Context) bridge is generated by the
        // compiler; declaring it by hand clashes with that generated method.
    }

    /* loaded from: input_file:ivory/core/preprocess/ComputeGlobalTermStatistics$MyReducer.class */
    /**
     * Reducer that produces the final (document frequency, collection frequency)
     * pair for each term and drops terms whose document frequency lies outside the
     * configured {@code [minDf, maxDf]} range. Each retained term increments the
     * {@link Statistics#Terms} counter.
     */
    private static class MyReducer extends Reducer<Text, PairOfIntLong, Text, PairOfIntLong> {
        // Reused for every emitted record to avoid per-record allocation. Kept as an
        // instance field so that instances sharing a JVM never share this buffer.
        private final PairOfIntLong output = new PairOfIntLong();
        private int minDf;
        private int maxDf;

        /**
         * Reads the document-frequency bounds from the job configuration.
         * {@link Constants#MinDf} defaults to 2 and {@link Constants#MaxDf} to
         * {@link Integer#MAX_VALUE} when unset.
         *
         * @param context task context providing the job configuration
         */
        public void setup(Reducer<Text, PairOfIntLong, Text, PairOfIntLong>.Context context) {
            Configuration conf = context.getConfiguration();
            this.minDf = conf.getInt(Constants.MinDf, 2);
            this.maxDf = conf.getInt(Constants.MaxDf, Integer.MAX_VALUE);
        }

        /**
         * Sums all pairs for {@code text}; emits the totals only when the summed
         * document frequency is within {@code [minDf, maxDf]} (both inclusive).
         *
         * @param text     the term being aggregated
         * @param iterable (document count, occurrence count) pairs for the term
         * @param context  task context used for the output record and counter
         * @throws IOException          if writing the output record fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        public void reduce(Text text, Iterable<PairOfIntLong> iterable, Reducer<Text, PairOfIntLong, Text, PairOfIntLong>.Context context) throws IOException, InterruptedException {
            int df = 0;
            long cf = 0L;
            for (PairOfIntLong partial : iterable) {
                df += partial.getLeftElement();
                cf += partial.getRightElement();
            }
            if (df < this.minDf || df > this.maxDf) {
                // Filtered terms are neither written nor counted.
                return;
            }
            context.getCounter(Statistics.Terms).increment(1L);
            output.set(df, cf);
            context.write(text, output);
        }

        // The erased reduce(Object, Iterable, Reducer.Context) bridge is generated by
        // the compiler; declaring it by hand clashes with that generated method.
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/ComputeGlobalTermStatistics$Statistics.class */
    /**
     * Job counters maintained by this tool. The constant names double as the
     * counter names, so they must not be renamed or reordered casually.
     */
    public enum Statistics {
        /** Incremented once for every document vector passed to the mapper. */
        Docs,

        /** Incremented once for every term the reducer writes out. */
        Terms,

        /** Incremented per document by the sum of its term frequencies. */
        SumOfDocLengths;
    }

    /**
     * Lists the configuration keys this tool declares as required.
     *
     * @return the shared {@code RequiredParameters} array (not a copy)
     */
    public String[] getRequiredParameters() {
        final String[] required = RequiredParameters;
        return required;
    }

    /**
     * Creates the tool around the given Hadoop configuration.
     *
     * @param configuration configuration forwarded unchanged to the superclass
     */
    public ComputeGlobalTermStatistics(Configuration configuration) {
        super(configuration);
    }

    /**
     * Runs the MapReduce job that computes per-term document frequency and
     * collection frequency from the term document vectors, then records the number
     * of retained terms and the total collection length in the retrieval
     * environment.
     *
     * <p>The job is skipped (returning 0) when the index path or the term document
     * vectors directory is missing, or when the output directory already exists.
     *
     * @return 0 when the job succeeded or was skipped; -1 when the job failed, in
     *         which case no collection statistics are written
     * @throws Exception if file-system access, job setup or job execution fails
     */
    public int runTool() throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        String indexPath = conf.get(Constants.IndexPath);
        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
        String collectionName = env.readCollectionName();
        String termDocVectorsPath = env.getTermDocVectorsDirectory();
        String termDfCfPath = env.getTermDfCfDirectory();

        // NOTE(review): this guard runs after the environment has been constructed
        // and the collection name read; if either needs the index path to exist the
        // guard can never trigger. Confirm against RetrievalEnvironment.
        if (!fs.exists(new Path(indexPath))) {
            LOG.info("index path doesn't existing: skipping!");
            return 0;
        }
        if (!fs.exists(new Path(termDocVectorsPath))) {
            LOG.info("term doc vectors path doesn't existing: skipping!");
            return 0;
        }

        // Fixed reducer count, named once instead of repeating the literal.
        final int numReduceTasks = 10;

        LOG.info("PowerTool: " + ComputeGlobalTermStatistics.class.getSimpleName());
        LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
        LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
        LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, numReduceTasks));

        Path outputPath = new Path(termDfCfPath);
        if (fs.exists(outputPath)) {
            LOG.info("TermDfCf directory exist: skipping!");
            return 0;
        }

        Job job = Job.getInstance(conf, ComputeGlobalTermStatistics.class.getSimpleName() + ":" + collectionName);
        job.setJarByClass(ComputeGlobalTermStatistics.class);
        job.setNumReduceTasks(numReduceTasks);

        FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PairOfIntLong.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(PairOfIntLong.class);

        job.setMapperClass(MyMapper.class);
        job.setCombinerClass(MyCombiner.class);
        job.setReducerClass(MyReducer.class);

        long startTime = System.currentTimeMillis();
        boolean succeeded = job.waitForCompletion(true);
        LOG.info("Job Finished in " + ((System.currentTimeMillis() - startTime) / 1000.0d) + " seconds");

        if (!succeeded) {
            // Counters of a failed job are incomplete; persisting them would record
            // wrong collection statistics while still signalling success.
            LOG.error("Job failed: collection term count and collection length were not written.");
            return -1;
        }

        Counters counters = job.getCounters();
        // The term counter is a long; it is narrowed to int for this call.
        env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());
        env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());
        return 0;
    }
}
