package ivory.lsh.pwsim.cl;

import edu.umd.cloud9.io.map.HMapIIW;
import edu.umd.cloud9.io.pair.PairOfInts;
import ivory.core.Constants;
import ivory.lsh.data.BitsSignatureTable;
import ivory.lsh.data.Signature;
import ivory.lsh.driver.PwsimEnvironment;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/lsh/pwsim/cl/CLSlidingWindowPwsim.class */
public class CLSlidingWindowPwsim extends Configured implements Tool {
    // Logger used by the driver and by the nested mapper/reducer classes.
    private static final Logger sLogger = Logger.getLogger(CLSlidingWindowPwsim.class);
    // Names of the command-line options registered in parseArgs.
    // -index: path to index directory
    private static final String WORKDIR_PATH_OPTION = "index";
    // -input: path to input (permuted tables)
    private static final String INPUT_PATH_OPTION = "input";
    // -output: path to output
    private static final String OUTPUT_PATH_OPTION = "output";
    // -T: hamming distance threshold for similar pairs (required)
    private static final String THRESHOLD_OPTION = "T";
    // -B: sliding window size (required)
    private static final String WINDOWSIZE_OPTION = "B";
    // -num_bits: length of signature
    private static final String SIGNLENG_OPTION = "num_bits";
    // -Q: number of permutations (tables)
    private static final String NUMPERMS_OPTION = "Q";
    // -overlap: size of overlap between chunks
    private static final String OVERLAPSIZE_OPTION = "overlap";
    // -type: type of signature
    private static final String SIGNTYPE_OPTION = "type";
    // -docnos: path to a file of sample docnos
    private static final String SAMPLEDOCNOS_OPTION = "docnos";
    // -reduce: number of reducers
    private static final String NUMREDUCERS_OPTION = "reduce";
    // -topN: keep only N results for each source document
    private static final String TOPN_OPTION = "topN";
    // -libjars: Hadoop option to load external jars
    private static final String LIBJARS_OPTION = "libjars";
    // Option definitions; built by parseArgs and printed by printUsage.
    private Options options;
    // The fields below are filled in by parseArgs from the command line.
    // Numeric options that were not supplied are stored as -1, except numReducers (default 100).
    private int numOfPermutations;
    private int chunkOverlapSize;
    private int numReducers;
    private int windowSize;
    private int maxDist;
    // -1 means "emit all results" (run() then selects MyReducer instead of MyReducerTopN).
    private int numResults;
    private int numOfBits;
    private String signatureType;
    // null when no sample file was given.
    private String sampleDocnosFile;
    private String workDir;
    // inputPath/outputPath may be null after parsing; run() then derives them via PwsimEnvironment.
    private String inputPath;
    private String outputPath;

    /* loaded from: input_file:ivory/lsh/pwsim/cl/CLSlidingWindowPwsim$MyMapper.class */
    public static class MyMapper extends MapReduceBase implements Mapper<IntWritable, BitsSignatureTable, PairOfInts, IntWritable> {
        static int slidingWindowSize;
        static int maxDist;
        private int hammingDistance;
        private Signature[] signatures = null;
        private int[] docNos = null;
        private HMapIIW samplesMap = null;
        private PairOfInts outKey = new PairOfInts();
        private IntWritable outValue = new IntWritable();
        private int nSignatures = -1;

        private String getFilename(String str) {
            return str.substring(str.lastIndexOf("/") + 1);
        }

        public void configure(JobConf jobConf) {
            CLSlidingWindowPwsim.sLogger.setLevel(Level.INFO);
            slidingWindowSize = jobConf.getInt("Ivory.SlidingWindowSize", -1);
            maxDist = jobConf.getInt("Ivory.MaxHammingDistance", -1);
            String str = jobConf.get("Ivory.SampleFile");
            if (str != null) {
                try {
                    this.samplesMap = readSamplesFromCache(getFilename(str), jobConf);
                } catch (IOException e) {
                    e.printStackTrace();
                    throw new RuntimeException("I/O error in " + str);
                } catch (NumberFormatException e2) {
                    e2.printStackTrace();
                    throw new RuntimeException("Incorrect format in " + str);
                } catch (Exception e3) {
                    e3.printStackTrace();
                    throw new RuntimeException("Error reading sample file: " + str);
                }
            }
        }

        private HMapIIW readSamplesFromCache(String str, JobConf jobConf) throws IOException {
            HMapIIW hMapIIW = null;
            for (Path path : DistributedCache.getLocalCacheFiles(jobConf)) {
                if (path.toString().contains(str)) {
                    hMapIIW = new HMapIIW();
                    LineReader lineReader = new LineReader(FileSystem.getLocal(jobConf).open(path));
                    Text text = new Text();
                    while (lineReader.readLine(text) != 0) {
                        int parseInt = Integer.parseInt(text.toString());
                        CLSlidingWindowPwsim.sLogger.info(String.valueOf(parseInt) + " --> sample");
                        hMapIIW.put(parseInt, 1);
                    }
                    lineReader.close();
                    CLSlidingWindowPwsim.sLogger.info(String.valueOf(hMapIIW.size()) + " sampled");
                }
            }
            if (hMapIIW == null) {
                throw new RuntimeException("Not found in local cache: " + str);
            }
            return hMapIIW;
        }

        public void map(IntWritable intWritable, BitsSignatureTable bitsSignatureTable, OutputCollector<PairOfInts, IntWritable> outputCollector, Reporter reporter) throws IOException {
            this.signatures = bitsSignatureTable.getSignatures();
            this.docNos = bitsSignatureTable.getDocNos();
            this.nSignatures = bitsSignatureTable.getNumOfSignatures();
            for (int i = 0; i < this.nSignatures; i++) {
                if ((this.docNos[i] > 1000000000 && this.samplesMap == null) || (this.samplesMap != null && this.samplesMap.containsKey(this.docNos[i]))) {
                    for (int i2 = i - 1; i2 > i - slidingWindowSize && i2 >= 0; i2--) {
                        if (this.docNos[i2] <= 1000000000) {
                            reporter.incrCounter(Pairs.PrefixSum, this.signatures[i].getLongestPrefix(this.signatures[i2]));
                            reporter.incrCounter(Pairs.Processed, 1L);
                            this.hammingDistance = this.signatures[i].hammingDistance(this.signatures[i2], maxDist);
                            CLSlidingWindowPwsim.sLogger.debug(Integer.valueOf(this.hammingDistance));
                            if (this.hammingDistance <= maxDist) {
                                reporter.incrCounter(Pairs.Emitted, 1L);
                                this.outValue.set(this.hammingDistance);
                                this.outKey.set(this.docNos[i2], this.docNos[i]);
                                outputCollector.collect(this.outKey, this.outValue);
                            }
                        }
                    }
                    for (int i3 = i + 1; i3 < i + slidingWindowSize && i3 < this.nSignatures; i3++) {
                        if (this.docNos[i3] <= 1000000000) {
                            reporter.incrCounter(Pairs.PrefixSum, this.signatures[i].getLongestPrefix(this.signatures[i3]));
                            reporter.incrCounter(Pairs.Processed, 1L);
                            this.hammingDistance = this.signatures[i].hammingDistance(this.signatures[i3], maxDist);
                            CLSlidingWindowPwsim.sLogger.debug(Integer.valueOf(this.hammingDistance));
                            if (this.hammingDistance <= maxDist) {
                                reporter.incrCounter(Pairs.Emitted, 1L);
                                this.outValue.set(this.hammingDistance);
                                this.outKey.set(this.docNos[i3], this.docNos[i]);
                                outputCollector.collect(this.outKey, this.outValue);
                            }
                        }
                    }
                }
            }
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((IntWritable) obj, (BitsSignatureTable) obj2, (OutputCollector<PairOfInts, IntWritable>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:ivory/lsh/pwsim/cl/CLSlidingWindowPwsim$MyReducer.class */
    public static class MyReducer extends MapReduceBase implements Reducer<PairOfInts, IntWritable, PairOfInts, IntWritable> {
        IntWritable outValue = new IntWritable();
        HashMap<String, Integer> map = new HashMap<>();

        public void reduce(PairOfInts pairOfInts, Iterator<IntWritable> it, OutputCollector<PairOfInts, IntWritable> outputCollector, Reporter reporter) throws IOException {
            outputCollector.collect(pairOfInts, it.next());
            reporter.incrCounter(Pairs.Emitted, 1L);
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((PairOfInts) obj, (Iterator<IntWritable>) it, (OutputCollector<PairOfInts, IntWritable>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:ivory/lsh/pwsim/cl/CLSlidingWindowPwsim$MyReducerTopN.class */
    public static class MyReducerTopN extends MapReduceBase implements Reducer<IntWritable, PairOfInts, IntWritable, PairOfInts> {
        int numResults;
        TreeSet<PairOfInts> list = new TreeSet<>();

        public void configure(JobConf jobConf) {
            this.numResults = jobConf.getInt("Ivory.NumResults", -1);
            CLSlidingWindowPwsim.sLogger.info("numResults");
        }

        public void reduce(IntWritable intWritable, Iterator<PairOfInts> it, OutputCollector<IntWritable, PairOfInts> outputCollector, Reporter reporter) throws IOException {
            this.list.clear();
            while (it.hasNext()) {
                PairOfInts next = it.next();
                this.list.add(new PairOfInts(next.getLeftElement(), next.getRightElement()));
            }
            for (int i = 0; !this.list.isEmpty() && i < this.numResults; i++) {
                outputCollector.collect(intWritable, this.list.pollFirst());
            }
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((IntWritable) obj, (Iterator<PairOfInts>) it, (OutputCollector<IntWritable, PairOfInts>) outputCollector, reporter);
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:ivory/lsh/pwsim/cl/CLSlidingWindowPwsim$Pairs.class */
    /** Job counters updated by the mapper and reducer and read back by the driver. */
    public enum Pairs {
        /** Incremented once per signature comparison in the mapper. */
        Processed,
        /** Incremented once per emitted record. */
        Emitted,
        /** Accumulates the longest-common-prefix length of every compared signature pair. */
        PrefixSum;

        /**
         * Returns a fresh array holding all constants in declaration order.
         * Equivalent to {@link #values()}; kept under this name for existing callers.
         */
        public static Pairs[] valuesCustom() {
            return values().clone();
        }
    }

    /**
     * Parses the command line, configures the sliding-window job and runs it to completion.
     *
     * <p>When no input/output path was given they are derived from the index directory
     * through {@code PwsimEnvironment}. If the output path already exists the job is
     * skipped.
     *
     * @param strArr command-line arguments (see {@code parseArgs})
     * @return -1 if the arguments are invalid (usage is printed); 0 otherwise, including
     *         the case where the output already exists
     * @throws Exception if job submission or execution fails
     */
    @Override
    public int run(String[] strArr) throws Exception {
        if (parseArgs(strArr) < 0) {
            printUsage();
            return -1;
        }
        JobConf jobConf = new JobConf(getConf(), CLSlidingWindowPwsim.class);
        FileSystem fileSystem = FileSystem.get(jobConf);
        if (this.inputPath == null) {
            this.inputPath = PwsimEnvironment.getTablesDir(this.workDir, fileSystem, this.signatureType, this.numOfBits, this.chunkOverlapSize, this.numOfPermutations);
        }
        if (this.outputPath == null) {
            this.outputPath = PwsimEnvironment.getPwsimDir(this.workDir, this.signatureType, this.maxDist, this.numOfBits, this.numOfPermutations, this.windowSize);
        }
        if (fileSystem.exists(new Path(this.outputPath))) {
            sLogger.info("SlidingWindowPwsim output already exists! Quitting...\nPath: " + this.outputPath);
            return 0;
        }
        if (this.sampleDocnosFile != null) {
            DistributedCache.addCacheFile(new URI(this.sampleDocnosFile), jobConf);
            jobConf.set("Ivory.SampleFile", this.sampleDocnosFile);
        }
        // workDir is null when the job is driven purely by -input/-output.
        String workDirTag = this.workDir == null ? "" : this.workDir.replaceFirst("tables", "");
        jobConf.setJobName("SlidingWindowPwsim:" + jobConf.get(Constants.CollectionName) + workDirTag + "_B=" + this.windowSize + "_" + this.numResults);
        FileInputFormat.setInputPaths(jobConf, new Path(this.inputPath));
        FileOutputFormat.setOutputPath(jobConf, new Path(this.outputPath));
        FileOutputFormat.setCompressOutput(jobConf, false);
        jobConf.setJarByClass(CLSlidingWindowPwsim.class);
        jobConf.set("mapreduce.map.java.opts", "-Xmx2000m");
        jobConf.setInt("mapred.task.timeout", 60000000);
        jobConf.setInt("Ivory.SlidingWindowSize", this.windowSize);
        jobConf.setInt("Ivory.MaxHammingDistance", this.maxDist);
        jobConf.setNumMapTasks(100);
        jobConf.setNumReduceTasks(this.numReducers);
        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setMapOutputKeyClass(PairOfInts.class);
        jobConf.setMapOutputValueClass(IntWritable.class);
        jobConf.setOutputKeyClass(PairOfInts.class);
        jobConf.setOutputValueClass(IntWritable.class);
        jobConf.setMapperClass(MyMapper.class);
        if (this.numResults == -1) {
            jobConf.setReducerClass(MyReducer.class);
        } else {
            // MyReducerTopN reads its limit from this key; without it the reducer sees -1
            // and emits nothing.
            jobConf.setInt("Ivory.NumResults", this.numResults);
            // NOTE(review): MyReducerTopN is declared over <IntWritable, PairOfInts> while the
            // map output configured above is <PairOfInts, IntWritable>; confirm this
            // combination is actually usable.
            jobConf.setReducerClass(MyReducerTopN.class);
        }
        if (this.sampleDocnosFile == null) {
            jobConf.setOutputFormat(SequenceFileOutputFormat.class);
        } else {
            jobConf.setOutputFormat(TextOutputFormat.class);
        }
        sLogger.info("Running job " + jobConf.getJobName() + "...");
        sLogger.info("Input path: " + this.inputPath);
        sLogger.info("Output path: " + this.outputPath);
        sLogger.info("Window size: " + this.windowSize);
        sLogger.info("Threshold: " + this.maxDist);
        sLogger.info("Sample file?: " + (this.sampleDocnosFile != null ? this.sampleDocnosFile : "none"));
        sLogger.info("Number of results: " + (this.numResults == -1 ? "all" : Integer.valueOf(this.numResults)));
        long startTime = System.currentTimeMillis();
        RunningJob runningJob = JobClient.runJob(jobConf);
        System.out.println("Job finished in " + ((System.currentTimeMillis() - startTime) / 1000.0d) + " seconds");
        Counters counters = runningJob.getCounters();
        float prefixSum = (float) counters.findCounter(Pairs.PrefixSum).getCounter();
        float processed = (float) counters.findCounter(Pairs.Processed).getCounter();
        // Float division: prints NaN when no pairs were processed.
        System.out.println("Avg prefix length = " + (prefixSum / processed));
        return 0;
    }

    /** Prints the command-line help for the options built by parseArgs, titled with this class's canonical name. */
    private void printUsage() {
        HelpFormatter formatter = new HelpFormatter();
        String commandName = getClass().getCanonicalName();
        formatter.printHelp(commandName, this.options);
    }

    /**
     * Builds the option definitions, parses {@code strArr} and stores the values in the
     * instance fields.
     *
     * <p>Either the full set {@code -index -num_bits -type -Q -overlap} (all numeric
     * values positive) or both {@code -input} and {@code -output} must be present;
     * {@code -T} and {@code -B} are always required.
     *
     * @param strArr command-line arguments
     * @return 0 on success; -1 if the arguments are missing, inconsistent or not numeric
     *         where a number is expected (a message is written to stderr)
     */
    private int parseArgs(String[] strArr) {
        this.options = new Options();
        addArgOption(WORKDIR_PATH_OPTION, "path", "path to index directory", false);
        addArgOption(INPUT_PATH_OPTION, "path", "path to input (permuted tables)", false);
        addArgOption(OUTPUT_PATH_OPTION, "path", "path to output", false);
        addArgOption(THRESHOLD_OPTION, "threshold", "hamming distance threshold for similar pairs", true);
        addArgOption(SAMPLEDOCNOS_OPTION, "path to sample docnos file", "only keep pairs that match these docnos", false);
        addArgOption(NUMREDUCERS_OPTION, "number", "number of reducers", false);
        addArgOption(SIGNLENG_OPTION, "number of bits", "length of signature", false);
        addArgOption(WINDOWSIZE_OPTION, "window", "sliding window size", true);
        addArgOption(SIGNTYPE_OPTION, "random|minhash|simhash", "type of signature", false);
        addArgOption(NUMPERMS_OPTION, "permutations", "number of permutations (tables)", false);
        addArgOption(OVERLAPSIZE_OPTION, "overlap size", "size of overlap between chunks (default: window size)", false);
        addArgOption(TOPN_OPTION, "N", "keep only N results for each source document", false);
        addArgOption(LIBJARS_OPTION, "jar packages", "Hadoop option to load external jars", false);
        try {
            CommandLine parse = new GnuParser().parse(this.options, strArr);
            // getOptionValue yields null for an option that was not supplied.
            this.workDir = parse.getOptionValue(WORKDIR_PATH_OPTION);
            this.inputPath = parse.getOptionValue(INPUT_PATH_OPTION);
            this.outputPath = parse.getOptionValue(OUTPUT_PATH_OPTION);
            this.numOfBits = intOption(parse, SIGNLENG_OPTION, -1);
            this.signatureType = parse.getOptionValue(SIGNTYPE_OPTION);
            this.numOfPermutations = intOption(parse, NUMPERMS_OPTION, -1);
            this.chunkOverlapSize = intOption(parse, OVERLAPSIZE_OPTION, -1);
            boolean indexSetComplete = this.workDir != null && this.numOfBits > 0 && this.numOfPermutations > 0
                    && this.chunkOverlapSize > 0 && this.signatureType != null;
            boolean explicitPaths = this.inputPath != null && this.outputPath != null;
            if (!indexSetComplete && !explicitPaths) {
                System.err.println("Either options -index and -num_bits and -type and -Q and -overlap or options -input and -output should be specified!");
                return -1;
            }
            this.numReducers = intOption(parse, NUMREDUCERS_OPTION, 100);
            this.windowSize = Integer.parseInt(parse.getOptionValue(WINDOWSIZE_OPTION));
            this.maxDist = Integer.parseInt(parse.getOptionValue(THRESHOLD_OPTION));
            this.numResults = intOption(parse, TOPN_OPTION, -1);
            this.sampleDocnosFile = parse.getOptionValue(SAMPLEDOCNOS_OPTION);
            return 0;
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            return -1;
        } catch (NumberFormatException e) {
            // A non-numeric value for a numeric option is a usage error, not a crash.
            System.err.println("Error parsing command line: " + e.getMessage());
            return -1;
        }
    }

    /** Registers a single-argument option with the given name, argument label and description. */
    private void addArgOption(String name, String argName, String description, boolean required) {
        OptionBuilder.withDescription(description);
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArg();
        if (required) {
            OptionBuilder.isRequired();
        }
        this.options.addOption(OptionBuilder.create(name));
    }

    /** Returns the integer value of option {@code name}, or {@code defaultValue} when it was not supplied. */
    private static int intOption(CommandLine commandLine, String name, int defaultValue) {
        return commandLine.hasOption(name) ? Integer.parseInt(commandLine.getOptionValue(name)) : defaultValue;
    }

    /**
     * Command-line entry point: runs this tool through {@code ToolRunner}.
     * The tool's return code is not propagated to the process exit status.
     */
    public static void main(String[] strArr) throws Exception {
        CLSlidingWindowPwsim tool = new CLSlidingWindowPwsim();
        ToolRunner.run(tool, strArr);
    }
}
