package ivory.lsh.eval;

import edu.umd.cloud9.io.SequenceFileUtils;
import edu.umd.cloud9.io.map.HMapIIW;
import edu.umd.cloud9.util.map.HMapII;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.WeightedIntDocVector;
import ivory.lsh.driver.PwsimEnvironment;
import ivory.lsh.eval.SampleSignatures;
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/lsh/eval/SampleIntDocVectors.class */
/**
 * Hadoop tool that copies a sample of (docno, {@link WeightedIntDocVector}) pairs from a
 * sequence file into a new sequence file.
 *
 * <p>Two sampling modes are supported:
 * <ul>
 *   <li><b>docno list</b>: when the file given with {@code -docnos} exists, only vectors whose
 *       docno is listed in that file (one integer per line) are kept;</li>
 *   <li><b>random</b>: otherwise a vector is kept when {@code (int) (Math.random() * f) == 0},
 *       where {@code f} is the collection document count divided by the value of {@code -size}.
 *       If {@code -docnos} names a file that does not exist yet, the docnos of the sampled
 *       vectors are written to it after the job finishes.</li>
 * </ul>
 */
public class SampleIntDocVectors extends Configured implements Tool {
    // Raw Class references: they are handed directly to the JobConf setters in run().
    static Class keyClass = IntWritable.class;
    static Class valueClass = WeightedIntDocVector.class;
    static Class inputFormat = SequenceFileInputFormat.class;
    private static final Logger sLogger = Logger.getLogger(SampleIntDocVectors.class);

    /** Job configuration key holding the integer sampling frequency (random mode). */
    private static final String SAMPLE_FREQUENCY_KEY = "SampleFrequency";
    /** Job configuration key holding the path of the sample docnos file (docno-list mode). */
    private static final String SAMPLE_FILE_KEY = "Ivory.SampleFile";

    private Options options;
    private String sampleDocnosFile;
    private String inputPath;
    private String outputPath;
    private String workDir;
    private int sampleSize;
    private float sampleFreq;
    private static final String WORKDIR_PATH_OPTION = "index";
    private static final String INPUT_PATH_OPTION = "input";
    private static final String OUTPUT_PATH_OPTION = "output";
    private static final String SAMPLEDOCNOS_OPTION = "docnos";
    private static final String SAMPLESIZE_OPTION = "size";
    private static final String LIBJARS_OPTION = "libjars";

    /**
     * Emits an input pair either when its docno is in the sample set read from the
     * distributed cache, or, when no sample file is configured, at random.
     */
    private static class MyMapper extends MapReduceBase
            implements Mapper<IntWritable, WeightedIntDocVector, IntWritable, WeightedIntDocVector> {
        private int sampleFreq;
        private HMapII samplesMap = null;

        /** Returns the part of {@code path} after the last {@code '/'}. */
        private static String getFilename(String path) {
            return path.substring(path.lastIndexOf("/") + 1);
        }

        /**
         * Loads the sample docnos from every local cache file whose path contains
         * {@code fileName}; if several files match, the last one wins.
         *
         * @param fileName name to look for among the local cache file paths
         * @param jobConf job configuration used to locate the cache and the local file system
         * @return map whose keys are the sampled docnos
         * @throws IOException if a matching file cannot be read
         * @throws NumberFormatException if a line is not a valid integer
         * @throws RuntimeException if no local cache file matches {@code fileName}
         */
        private HMapIIW readSamplesFromCache(String fileName, JobConf jobConf) throws IOException {
            HMapIIW samples = null;
            Path[] localFiles = DistributedCache.getLocalCacheFiles(jobConf);
            // getLocalCacheFiles may return null when nothing was added to the cache.
            if (localFiles != null) {
                for (Path localFile : localFiles) {
                    if (!localFile.toString().contains(fileName)) {
                        continue;
                    }
                    samples = new HMapIIW();
                    LineReader reader = new LineReader(FileSystem.getLocal(jobConf).open(localFile));
                    try {
                        Text line = new Text();
                        while (reader.readLine(line) != 0) {
                            int docno = Integer.parseInt(line.toString());
                            sLogger.info(docno + " --> sample");
                            samples.put(docno, 1);
                        }
                    } finally {
                        // Close even when a malformed line aborts the loop.
                        reader.close();
                    }
                    sLogger.info(samples.size() + " sampled");
                }
            }
            if (samples == null) {
                throw new RuntimeException("Not found in local cache: " + fileName);
            }
            return samples;
        }

        /**
         * Reads the sampling frequency and, if a sample file is configured, the sample docnos.
         *
         * @throws RuntimeException if the sample file cannot be found, read or parsed; the
         *         underlying exception is attached as the cause
         */
        @Override
        public void configure(JobConf jobConf) {
            sLogger.setLevel(Level.INFO);
            this.sampleFreq = jobConf.getInt(SAMPLE_FREQUENCY_KEY, -1);
            String sampleFile = jobConf.get(SAMPLE_FILE_KEY);
            if (sampleFile == null) {
                return;
            }
            try {
                this.samplesMap = readSamplesFromCache(getFilename(sampleFile), jobConf);
            } catch (IOException e) {
                throw new RuntimeException("I/O error in " + sampleFile, e);
            } catch (NumberFormatException e) {
                throw new RuntimeException("Incorrect format in " + sampleFile, e);
            } catch (Exception e) {
                throw new RuntimeException("Error reading sample file!", e);
            }
        }

        /**
         * Forwards the pair unchanged when it is selected by the active sampling mode.
         * In docno-list mode a counter is incremented for every emitted pair.
         */
        @Override
        public void map(IntWritable intWritable, WeightedIntDocVector weightedIntDocVector,
                OutputCollector<IntWritable, WeightedIntDocVector> outputCollector, Reporter reporter)
                throws IOException {
            if (this.samplesMap == null) {
                // Random mode: the truncated product is 0 with probability ~1/sampleFreq.
                if (((int) (Math.random() * this.sampleFreq)) == 0) {
                    outputCollector.collect(intWritable, weightedIntDocVector);
                }
            } else if (this.samplesMap.containsKey(intWritable.get())) {
                reporter.incrCounter(SampleSignatures.mapoutput.count, 1L);
                outputCollector.collect(intWritable, weightedIntDocVector);
            }
        }
    }

    /** Emits only the first value seen for each docno. */
    public static class MyReducer extends MapReduceBase
            implements Reducer<IntWritable, WeightedIntDocVector, IntWritable, WeightedIntDocVector> {
        @Override
        public void reduce(IntWritable intWritable, Iterator<WeightedIntDocVector> it,
                OutputCollector<IntWritable, WeightedIntDocVector> outputCollector, Reporter reporter)
                throws IOException {
            outputCollector.collect(intWritable, it.next());
        }
    }

    /** Prints the command-line usage for the options built by {@link #parseArgs(String[])}. */
    private void printUsage() {
        new HelpFormatter().printHelp(getClass().getCanonicalName(), this.options);
    }

    /**
     * Configures and runs the sampling job.
     *
     * @param strArr command-line arguments (see {@link #parseArgs(String[])})
     * @return 0 on success, -1 if the arguments could not be parsed
     * @throws RuntimeException if the input path does not exist, if neither a usable
     *         {@code -docnos} file nor a positive {@code -size} was given, or if the sampled
     *         docnos cannot be written
     * @throws Exception on any job submission or file system failure
     */
    @Override
    public int run(String[] strArr) throws Exception {
        sLogger.setLevel(Level.INFO);
        if (parseArgs(strArr) < 0) {
            printUsage();
            // Report failure to the caller instead of terminating the JVM from inside a Tool.
            return -1;
        }
        JobConf jobConf = new JobConf(getConf(), SampleIntDocVectors.class);
        FileSystem fileSystem = FileSystem.get(jobConf);
        if (this.inputPath == null) {
            this.inputPath = PwsimEnvironment.getIntDocvectorsFile(this.workDir, fileSystem);
        }
        if (this.outputPath == null) {
            this.outputPath = PwsimEnvironment.getIntDocvectorsFile(this.workDir, fileSystem, this.sampleSize);
        }
        if (!fileSystem.exists(new Path(this.inputPath))) {
            throw new RuntimeException("Error, input path does not exist!");
        }
        jobConf.setJobName(getClass().getName());

        boolean docnosFileExists =
                this.sampleDocnosFile != null && fileSystem.exists(new Path(this.sampleDocnosFile));
        if (docnosFileExists) {
            jobConf.set(SAMPLE_FILE_KEY, this.sampleDocnosFile);
            DistributedCache.addCacheFile(new URI(this.sampleDocnosFile), jobConf);
        } else {
            if (this.sampleSize == -1) {
                throw new RuntimeException("Either provide sample frequency with option -size or existing sample docnos with option -docnos");
            }
            if (this.sampleSize <= 0) {
                // Guards the division below against zero and rejects meaningless negative sizes.
                throw new RuntimeException("Sample size must be a positive integer, got " + this.sampleSize);
            }
            // Integer division: the frequency is passed to the mappers as an int anyway.
            this.sampleFreq = new RetrievalEnvironment(this.workDir, fileSystem).readCollectionDocumentCount() / this.sampleSize;
            jobConf.setInt(SAMPLE_FREQUENCY_KEY, (int) this.sampleFreq);
        }

        fileSystem.delete(new Path(this.outputPath), true);
        FileInputFormat.setInputPaths(jobConf, new Path[]{new Path(this.inputPath)});
        FileOutputFormat.setOutputPath(jobConf, new Path(this.outputPath));
        FileOutputFormat.setCompressOutput(jobConf, false);
        jobConf.setJarByClass(SampleIntDocVectors.class);
        jobConf.set("mapred.child.java.opts", "-Xmx2048m");
        jobConf.setInt("mapred.map.max.attempts", 100);
        jobConf.setInt("mapred.reduce.max.attempts", 100);
        jobConf.setInt("mapred.task.timeout", 600000000);
        sLogger.info("Running job " + jobConf.getJobName());
        sLogger.info("Input directory: " + this.inputPath);
        sLogger.info("Output directory: " + this.outputPath);
        sLogger.info("Sample frequency: " + this.sampleFreq);
        sLogger.info("Sample docnos: " + jobConf.get(SAMPLE_FILE_KEY));
        jobConf.setNumMapTasks(100);
        // A single reducer so that all sampled vectors end up in part-00000.
        jobConf.setNumReduceTasks(1);
        jobConf.setInputFormat(inputFormat);
        jobConf.setMapOutputKeyClass(keyClass);
        jobConf.setMapOutputValueClass(valueClass);
        jobConf.setOutputKeyClass(keyClass);
        jobConf.setOutputValueClass(valueClass);
        jobConf.setMapperClass(MyMapper.class);
        jobConf.setReducerClass(MyReducer.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);
        JobClient.runJob(jobConf);

        // Only when a docnos file was requested but did not exist: record what was sampled.
        if (this.sampleDocnosFile != null && !fileSystem.exists(new Path(this.sampleDocnosFile))) {
            writeSampleDocnos(fileSystem);
        }
        return 0;
    }

    /**
     * Writes the docno of every vector in the job output ({@code part-00000}) to
     * {@link #sampleDocnosFile}, one per line, in ascending key order.
     *
     * @throws RuntimeException wrapping any failure while reading the output or writing the file
     */
    private void writeSampleDocnos(FileSystem fileSystem) {
        sLogger.info("Extracting sample docnos from sampled vectors...");
        try {
            SortedMap sampledVectors = SequenceFileUtils.readFileIntoMap(new Path(this.outputPath + "/part-00000"));
            FSDataOutputStream out = fileSystem.create(new Path(this.sampleDocnosFile));
            try {
                for (Object docno : sampledVectors.keySet()) {
                    out.writeBytes(((IntWritable) docno).get() + "\n");
                }
            } finally {
                out.close();
            }
        } catch (Exception e) {
            throw new RuntimeException(e.toString(), e);
        }
    }

    /**
     * Registers an option that takes exactly one argument.
     * OptionBuilder keeps its state in static fields and resets it on {@code create}.
     */
    private void addArgOption(String name, String argName, String description, boolean required) {
        OptionBuilder.withDescription(description);
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArg();
        if (required) {
            OptionBuilder.isRequired();
        }
        this.options.addOption(OptionBuilder.create(name));
    }

    /**
     * Builds the option set and parses the command line into the instance fields.
     * Options that were not supplied leave their field {@code null} ({@code -1} for the size).
     *
     * @return 0 on success, -1 if the command line is malformed
     */
    private int parseArgs(String[] strArr) {
        this.options = new Options();
        addArgOption(WORKDIR_PATH_OPTION, "path", "path to directory with weighted integer doc vectors", true);
        addArgOption(INPUT_PATH_OPTION, "path", "path to weighted integer doc vectors", false);
        addArgOption(OUTPUT_PATH_OPTION, "path", "path to sampled weighted integer doc vectors", false);
        addArgOption(SAMPLEDOCNOS_OPTION, "path to sample docnos file", "only keep pairs that match these docnos", false);
        addArgOption(SAMPLESIZE_OPTION, "N", "sample a document with probability = number-of-docs/N", false);
        addArgOption(LIBJARS_OPTION, "jar packages", "Hadoop option to load external jars", false);
        try {
            CommandLine commandLine = new GnuParser().parse(this.options, strArr);
            // getOptionValue returns null for an option that was not supplied.
            this.workDir = commandLine.getOptionValue(WORKDIR_PATH_OPTION);
            this.inputPath = commandLine.getOptionValue(INPUT_PATH_OPTION);
            this.outputPath = commandLine.getOptionValue(OUTPUT_PATH_OPTION);
            this.sampleDocnosFile = commandLine.getOptionValue(SAMPLEDOCNOS_OPTION);
            this.sampleSize = commandLine.hasOption(SAMPLESIZE_OPTION)
                    ? Integer.parseInt(commandLine.getOptionValue(SAMPLESIZE_OPTION))
                    : -1;
            return 0;
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            return -1;
        } catch (NumberFormatException e) {
            System.err.println("Error parsing command line: -" + SAMPLESIZE_OPTION
                    + " must be an integer (" + e.getMessage() + ")");
            return -1;
        }
    }

    /**
     * Command-line entry point; exits with the tool's return code when it is non-zero.
     */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(new SampleIntDocVectors(), strArr);
        if (exitCode != 0) {
            System.exit(exitCode);
        }
    }
}
