package ivory.core.preprocess;

import com.google.common.collect.Lists;
import edu.umd.cloud9.io.pair.PairOfIntLong;
import edu.umd.cloud9.util.PowerTool;
import it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.FrontCodedStringList;
import it.unimi.dsi.util.ShiftAddXorSignedStringMap;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.DictionaryTransformationStrategy;
import ivory.core.util.QuickSort;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/core/preprocess/BuildDictionary.class */
public class BuildDictionary extends PowerTool {
    // Shared log4j logger for this tool; the nested reducer logs through it as well.
    private static final Logger LOG = Logger.getLogger(BuildDictionary.class);
    // Configuration keys (collection name and index path) handed back by getRequiredParameters().
    public static final String[] RequiredParameters = {Constants.CollectionName, Constants.IndexPath};

    /* loaded from: input_file:ivory/core/preprocess/BuildDictionary$MyReducer.class */
    private static class MyReducer extends Reducer<Text, PairOfIntLong, NullWritable, NullWritable> {
        private FSDataOutputStream termsOut;
        private FSDataOutputStream idsOut;
        private FSDataOutputStream idsToTermOut;
        private FSDataOutputStream dfByTermOut;
        private FSDataOutputStream cfByTermOut;
        private FSDataOutputStream dfByIntOut;
        private FSDataOutputStream cfByIntOut;
        private int numTerms;
        private int[] seqNums = null;
        private int[] dfs = null;
        private long[] cfs = null;
        private int curKeyIndex = 0;
        private String[] terms;

        private MyReducer() {
        }

        public void setup(Reducer<Text, PairOfIntLong, NullWritable, NullWritable>.Context context) throws IOException {
            BuildDictionary.LOG.info("Starting setup.");
            Configuration configuration = context.getConfiguration();
            FileSystem fileSystem = FileSystem.get(configuration);
            RetrievalEnvironment retrievalEnvironment = new RetrievalEnvironment(configuration.get(Constants.IndexPath), fileSystem);
            this.numTerms = configuration.getInt(Constants.CollectionTermCount, 0);
            this.terms = new String[this.numTerms];
            this.seqNums = new int[this.numTerms];
            this.dfs = new int[this.numTerms];
            this.cfs = new long[this.numTerms];
            this.termsOut = fileSystem.create(new Path(retrievalEnvironment.getIndexTermsData()), true);
            this.idsOut = fileSystem.create(new Path(retrievalEnvironment.getIndexTermIdsData()), true);
            this.idsOut.writeInt(this.numTerms);
            this.idsToTermOut = fileSystem.create(new Path(retrievalEnvironment.getIndexTermIdMappingData()), true);
            this.idsToTermOut.writeInt(this.numTerms);
            this.dfByTermOut = fileSystem.create(new Path(retrievalEnvironment.getDfByTermData()), true);
            this.dfByTermOut.writeInt(this.numTerms);
            this.cfByTermOut = fileSystem.create(new Path(retrievalEnvironment.getCfByTermData()), true);
            this.cfByTermOut.writeInt(this.numTerms);
            this.dfByIntOut = fileSystem.create(new Path(retrievalEnvironment.getDfByIntData()), true);
            this.dfByIntOut.writeInt(this.numTerms);
            this.cfByIntOut = fileSystem.create(new Path(retrievalEnvironment.getCfByIntData()), true);
            this.cfByIntOut.writeInt(this.numTerms);
            BuildDictionary.LOG.info("Finished setup.");
        }

        public void reduce(Text text, Iterable<PairOfIntLong> iterable, Reducer<Text, PairOfIntLong, NullWritable, NullWritable>.Context context) throws IOException, InterruptedException {
            String text2 = text.toString();
            Iterator<PairOfIntLong> it = iterable.iterator();
            PairOfIntLong next = it.next();
            int leftElement = next.getLeftElement();
            long rightElement = next.getRightElement();
            WritableUtils.writeVInt(this.dfByTermOut, leftElement);
            WritableUtils.writeVLong(this.cfByTermOut, rightElement);
            if (it.hasNext()) {
                throw new RuntimeException("More than one record for term: " + text2);
            }
            this.terms[this.curKeyIndex] = text2;
            this.seqNums[this.curKeyIndex] = this.curKeyIndex;
            this.dfs[this.curKeyIndex] = -leftElement;
            this.cfs[this.curKeyIndex] = rightElement;
            this.curKeyIndex++;
            context.getCounter(Terms.Total).increment(1L);
        }

        public void cleanup(Reducer<Text, PairOfIntLong, NullWritable, NullWritable>.Context context) throws IOException {
            BuildDictionary.LOG.info("Starting cleanup.");
            if (this.curKeyIndex != this.numTerms) {
                throw new RuntimeException("Total expected Terms: " + this.numTerms + ", Total observed terms: " + this.curKeyIndex + "!");
            }
            QuickSort.quicksortWithSecondary(this.seqNums, this.dfs, this.cfs, 0, this.numTerms - 1);
            for (int i = 0; i < this.numTerms; i++) {
                WritableUtils.writeVInt(this.dfByIntOut, -this.dfs[i]);
                WritableUtils.writeVLong(this.cfByIntOut, this.cfs[i]);
            }
            this.cfs = null;
            for (int i2 = 0; i2 < this.numTerms; i2++) {
                this.dfs[i2] = i2 + 1;
            }
            for (int i3 = 0; i3 < this.numTerms; i3++) {
                this.idsToTermOut.writeInt(this.seqNums[i3]);
            }
            QuickSort.quicksort(this.dfs, this.seqNums, 0, this.numTerms - 1);
            for (int i4 = 0; i4 < this.numTerms; i4++) {
                this.idsOut.writeInt(this.dfs[i4]);
            }
            ArrayList newArrayList = Lists.newArrayList(this.terms);
            FrontCodedStringList frontCodedStringList = new FrontCodedStringList(newArrayList, 8, true);
            ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
            ObjectOutputStream objectOutputStream = new ObjectOutputStream(byteArrayOutputStream);
            objectOutputStream.writeObject(frontCodedStringList);
            objectOutputStream.close();
            byte[] byteArray = byteArrayOutputStream.toByteArray();
            this.termsOut.writeInt(byteArray.length);
            this.termsOut.write(byteArray);
            ShiftAddXorSignedStringMap shiftAddXorSignedStringMap = new ShiftAddXorSignedStringMap(newArrayList.iterator(), new TwoStepsLcpMonotoneMinimalPerfectHashFunction(newArrayList, DictionaryTransformationStrategy.getStrategy()));
            ByteArrayOutputStream byteArrayOutputStream2 = new ByteArrayOutputStream();
            ObjectOutputStream objectOutputStream2 = new ObjectOutputStream(byteArrayOutputStream2);
            objectOutputStream2.writeObject(shiftAddXorSignedStringMap);
            objectOutputStream2.close();
            byte[] byteArray2 = byteArrayOutputStream2.toByteArray();
            this.termsOut.writeInt(byteArray2.length);
            this.termsOut.write(byteArray2);
            this.termsOut.close();
            this.idsOut.close();
            this.idsToTermOut.close();
            this.dfByTermOut.close();
            this.cfByTermOut.close();
            this.dfByIntOut.close();
            this.cfByIntOut.close();
            BuildDictionary.LOG.info("Finished cleanup.");
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterable iterable, Reducer.Context context) throws IOException, InterruptedException {
            reduce((Text) obj, (Iterable<PairOfIntLong>) iterable, (Reducer<Text, PairOfIntLong, NullWritable, NullWritable>.Context) context);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/BuildDictionary$Terms.class */
    public enum Terms {
        Total;

        /**
         * Decompiler-introduced stand-in for {@code values()}.
         *
         * @return a newly allocated array holding every constant in declaration order;
         *         modifying it does not affect later calls
         */
        public static Terms[] valuesCustom() {
            return values().clone();
        }
    }

    /**
     * Lists the configuration keys this tool needs.
     *
     * @return the shared {@code RequiredParameters} array itself (not a copy)
     */
    public String[] getRequiredParameters() {
        return BuildDictionary.RequiredParameters;
    }

    /**
     * Creates the tool around the given configuration.
     *
     * @param configuration Hadoop configuration, forwarded unchanged to the superclass constructor
     */
    public BuildDictionary(Configuration configuration) {
        super(configuration);
    }

    /**
     * Configures and runs the single-reducer job that builds the dictionary files.
     *
     * <p>The run is skipped (returning 0) when the index path does not exist or when all
     * seven dictionary files are already present. Otherwise the collection term count is
     * copied into the job configuration, the temporary output directory is cleared, and
     * the job is run with an identity mapper feeding {@code MyReducer}. The temporary
     * directory is removed again once the job has finished.
     *
     * @return 0 when the job succeeded or was skipped, -1 when the job reported failure
     * @throws Exception if file system access or job submission fails
     */
    public int runTool() throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);
        String indexPath = conf.get(Constants.IndexPath);
        String collectionName = conf.get(Constants.CollectionName);

        LOG.info("PowerTool: " + BuildDictionary.class.getSimpleName());
        LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
        LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
        if (!fs.exists(new Path(indexPath))) {
            LOG.error("index path doesn't existing: skipping!");
            return 0;
        }
        if (dictionaryDataExists(fs, env)) {
            LOG.info("term and term id data exist: skipping!");
            return 0;
        }

        // The reducer sizes its in-memory buffers from this value.
        conf.setInt(Constants.CollectionTermCount, env.readCollectionTermCount());
        conf.set("mapreduce.map.memory.mb", "2048");
        conf.set("mapreduce.map.java.opts", "-Xmx2048m");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

        // The job's formal output directory; the reducer writes the real files itself.
        Path tempDir = new Path(env.getTempDirectory());
        fs.delete(tempDir, true);

        Job job = Job.getInstance(conf, BuildDictionary.class.getSimpleName() + ":" + collectionName);
        job.setJarByClass(BuildDictionary.class);
        // A single reducer sees every term, which the reducer's global numbering relies on.
        job.setNumReduceTasks(1);
        FileInputFormat.setInputPaths(job, new Path(env.getTermDfCfDirectory()));
        FileOutputFormat.setOutputPath(job, tempDir);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PairOfIntLong.class);
        job.setOutputKeyClass(Text.class);
        job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class);
        job.setMapperClass(Mapper.class);
        job.setReducerClass(MyReducer.class);

        long startTime = System.currentTimeMillis();
        boolean succeeded = job.waitForCompletion(true);
        LOG.info("Job Finished in " + ((System.currentTimeMillis() - startTime) / 1000.0d) + " seconds");

        fs.delete(tempDir, true);

        if (!succeeded) {
            // Previously a failed job was reported as success (0).
            LOG.error("Job failed: dictionary data may be missing or incomplete.");
            return -1;
        }
        return 0;
    }

    /** Returns true only when all seven dictionary files already exist on the file system. */
    private static boolean dictionaryDataExists(FileSystem fs, RetrievalEnvironment env) throws IOException {
        return fs.exists(new Path(env.getIndexTermsData()))
                && fs.exists(new Path(env.getIndexTermIdsData()))
                && fs.exists(new Path(env.getIndexTermIdMappingData()))
                && fs.exists(new Path(env.getDfByTermData()))
                && fs.exists(new Path(env.getCfByTermData()))
                && fs.exists(new Path(env.getDfByIntData()))
                && fs.exists(new Path(env.getCfByIntData()));
    }
}
