package ivory.lsh.bitext;

import edu.umd.cloud9.io.array.ArrayListOfIntsWritable;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.util.map.HMapIV;
import ivory.core.util.CLIRUtils;
import ivory.lsh.data.WikiSentenceInfo;
import ivory.lsh.driver.PwsimEnvironment;
import java.io.IOException;
import java.util.Iterator;
import opennlp.model.RealValueFileEventStream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairs.class */
public class FindParallelSentencePairs extends Configured implements Tool {
    // Logger shared by the driver and by the nested mapper/reducer classes.
    private static final Logger sLogger = Logger.getLogger(FindParallelSentencePairs.class);

    // Names of the command-line options that setupConf registers and parses.
    private static final String FCOLLECTION_OPTION = "f_collection";
    private static final String ECOLLECTION_OPTION = "e_collection";
    private static final String FLANG_OPTION = "f_lang";
    private static final String ELANG_OPTION = "e_lang";
    private static final String FINDEX_OPTION = "f_index";
    private static final String EINDEX_OPTION = "e_index";
    private static final String BITEXTNAME_OPTION = "name";
    private static final String SENTENCES_OPTION = "sentences";
    private static final String BITEXT_OPTION = "bitext";
    private static final String DATADIR_OPTION = "data";
    private static final String PWSIM_OPTION = "pwsim_output";
    private static final String CLASSIFIERID_OPTION = "classifier_id";
    private static final String CLASSIFIERTHRESHOLD_OPTION = "threshold";
    private static final String LIBJARS_OPTION = "libjars";

    // Option set; (re)built by setupConf and read by printUsage, so it is null until setupConf has run.
    private static Options options;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairs$Docs.class */
    /**
     * Hadoop counters kept at document-pair granularity.
     *
     * <p>Within this file the reducer increments {@code pairs} for every document pair that has
     * sentences on both sides, and {@code pairsIncompleteE} / {@code pairsIncompleteF} when one
     * side is missing.
     */
    public enum Docs {
        pairsE,
        pairsF,
        pairs,
        pairsIncompleteF,
        pairsIncompleteE,
        dbg;

        /**
         * Returns a freshly allocated array holding every constant in declaration order.
         *
         * @return a copy of {@link #values()}; callers may modify it freely
         */
        public static Docs[] valuesCustom() {
            return values().clone();
        }
    }

    /* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairs$MyMapper.class */
    /**
     * Mapper that fans each sentence out to every document pair its document takes part in.
     *
     * <p>Input: (docno, sentence id) keys with {@link WikiSentenceInfo} values. Output: one
     * record per similar document, keyed by a {@link PairOfInts} whose left element is the
     * docno coming from the language-ID-1 side and whose right element is the docno coming
     * from the language-ID-(-1) side, so that the reducer sees all sentences of a candidate
     * document pair together.
     *
     * <p>The docno-to-similar-docnos table is read lazily from the distributed cache on the
     * first call to {@code map}, because the orientation of the table depends on the language
     * ID of the records this task processes.
     */
    private static class MyMapper extends MapReduceBase implements Mapper<PairOfInts, WikiSentenceInfo, PairOfInts, WikiSentenceInfo> {
        /** Language ID for which the {@code Sentences.E} counters are incremented. */
        private static final int LANG_E = -1;
        /** Language ID whose docnos are shifted by {@link #F_DOCNO_OFFSET}. */
        private static final int LANG_F = 1;
        /**
         * Offset added to docnos of {@link #LANG_F} sentences before the lookup.
         * NOTE(review): presumably mirrors the docno offset used for that side in the pwsim
         * output -- confirm against the job that produces it.
         */
        private static final int F_DOCNO_OFFSET = 1000000000;

        private HMapIV<ArrayListOfIntsWritable> pwsimMapping;
        private PairOfInts keyOut;
        private JobConf mJob;
        private ArrayListOfIntsWritable similarDocnos;
        /** Set once the cache file has been read, so an empty table is not re-read per record. */
        private boolean pairsLoaded;

        private MyMapper() {
        }

        /** Stores the job configuration and allocates the (still empty) lookup structures. */
        public void configure(JobConf jobConf) {
            this.mJob = jobConf;
            this.pwsimMapping = new HMapIV<ArrayListOfIntsWritable>();
            this.keyOut = new PairOfInts();
            this.pairsLoaded = false;
        }

        /** Returns the part of {@code str} after its last {@code '/'} (all of it if there is none). */
        private static String getFilename(String str) {
            return str.substring(str.lastIndexOf("/") + 1);
        }

        /**
         * Fills {@code mapping} from the cached sequence file(s) whose local path contains the
         * file name of the {@code PwsimPairs} job property.
         *
         * @param mapping  table to fill: docno -> docnos of similar documents
         * @param langID   language ID of the sentences this task processes; for {@link #LANG_E}
         *                 the table is keyed by the left element of each pair, otherwise by the
         *                 right element
         * @param conf     job configuration giving access to the distributed cache
         * @param reporter unused; kept so the signature stays as it was
         * @throws RuntimeException wrapping any failure to locate or read the pairs
         */
        private static void loadPairs(HMapIV<ArrayListOfIntsWritable> mapping, int langID, JobConf conf, Reporter reporter) {
            try {
                Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(conf);
                if (localCacheFiles == null) {
                    throw new IOException("No local files available in the distributed cache");
                }
                String pwsimPath = conf.get("PwsimPairs");
                if (pwsimPath == null) {
                    throw new IOException("Job property PwsimPairs is not set");
                }
                String pwsimFilename = getFilename(pwsimPath);

                for (Path path : localCacheFiles) {
                    if (!path.toString().contains(pwsimFilename)) {
                        continue;
                    }
                    int numPairs = 0;
                    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(conf), path, conf);
                    try {
                        // Only the int contents are copied out of key/value, so a single
                        // instance of each can be reused for every record.
                        PairOfInts pair = (PairOfInts) reader.getKeyClass().newInstance();
                        IntWritable value = (IntWritable) reader.getValueClass().newInstance();
                        while (reader.next(pair, value)) {
                            int left = pair.getLeftElement();
                            int right = pair.getRightElement();
                            int from = (langID == LANG_E) ? left : right;
                            int to = (langID == LANG_E) ? right : left;
                            if (!mapping.containsKey(from)) {
                                mapping.put(from, new ArrayListOfIntsWritable());
                            }
                            mapping.get(from).add(to);
                            numPairs++;
                        }
                    } finally {
                        // Release the file handle even when reading fails midway.
                        reader.close();
                    }
                    FindParallelSentencePairs.sLogger.info(mapping.size() + "," + numPairs + " pairs loaded from " + path);
                }
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        /**
         * Emits the sentence once for every document that pwsim paired with its document.
         *
         * @param pairOfInts       input key; its left element is the docno of the sentence
         * @param wikiSentenceInfo the sentence, including its language ID
         * @param outputCollector  receives (document pair, sentence) records
         * @param reporter         used to update the {@code Sentences} counters
         */
        public void map(PairOfInts pairOfInts, WikiSentenceInfo wikiSentenceInfo, OutputCollector<PairOfInts, WikiSentenceInfo> outputCollector, Reporter reporter) throws IOException {
            int docno = pairOfInts.getLeftElement();
            int langID = wikiSentenceInfo.getLangID();

            if (!this.pairsLoaded) {
                loadPairs(this.pwsimMapping, langID, this.mJob, reporter);
                this.pairsLoaded = true;
                FindParallelSentencePairs.sLogger.info("Mapping loaded: " + this.pwsimMapping.size());
            }

            if (langID == LANG_F) {
                docno += F_DOCNO_OFFSET;
            }
            if (!this.pwsimMapping.containsKey(docno)) {
                return;
            }

            this.similarDocnos = this.pwsimMapping.get(docno);
            if (langID == LANG_E) {
                reporter.incrCounter(Sentences.E, 1L);
                reporter.incrCounter(Sentences.pairsE, this.similarDocnos.size());
            } else {
                reporter.incrCounter(Sentences.F, 1L);
                reporter.incrCounter(Sentences.pairsF, this.similarDocnos.size());
            }

            Iterator<Integer> it = this.similarDocnos.iterator();
            while (it.hasNext()) {
                int similarDocno = it.next();
                // The docno of the LANG_E side always goes on the right of the output key.
                if (langID == LANG_E) {
                    this.keyOut.set(similarDocno, docno);
                } else {
                    this.keyOut.set(docno, similarDocno);
                }
                outputCollector.collect(this.keyOut, wikiSentenceInfo);
            }
        }
    }

    /* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairs$MyReducer.class */
    /**
     * Reducer that classifies every cross-lingual sentence pair of one candidate document pair.
     *
     * <p>For a key (fDocno, eDocno) it gathers the sentences of both sides, skips pairs whose
     * lengths differ by more than a factor of two, computes features for the rest and emits
     * {@code fSentence + CLIRUtils.BitextSeparator + eSentence} (with an empty value) whenever
     * the classifier score for the configured label exceeds the configured threshold.
     */
    private static class MyReducer extends MapReduceBase implements Reducer<PairOfInts, WikiSentenceInfo, Text, Text> {
        private int fDocno;
        private int eDocno;
        private int classifierPositiveId;
        private ArrayListWritable<HMapSFW> fVectors;
        private ArrayListWritable<HMapSFW> eVectors;
        private ArrayListWritable<Text> fSentences;
        private ArrayListWritable<Text> eSentences;
        private PreprocessHelper helper;
        private float classifierThreshold;
        private Text emptyValue = new Text();

        private MyReducer() {
        }

        /**
         * Creates the preprocessing helper and reads the classifier settings from the job.
         *
         * @throws RuntimeException if the helper cannot be created, if {@code ClassifierId} is
         *         neither 0 nor 1, or if {@code ClassifierThreshold} is missing or above 1
         */
        public void configure(JobConf jobConf) {
            try {
                this.helper = new PreprocessHelper(3, 5, jobConf);
            } catch (Exception e) {
                // Without the helper there is no classifier; fail the task here rather than
                // with a NullPointerException at the first classified pair in reduce().
                throw new RuntimeException("Unable to initialize PreprocessHelper", e);
            }
            this.classifierPositiveId = jobConf.getInt("ClassifierId", -1);
            if (this.classifierPositiveId != 0 && this.classifierPositiveId != 1) {
                throw new RuntimeException("Id of parallel label in MaxEnt classifier not specified properly: " + this.classifierPositiveId);
            }
            // The default of 2 is out of range on purpose, so a missing setting is rejected below.
            this.classifierThreshold = jobConf.getFloat("ClassifierThreshold", 2.0f);
            if (this.classifierThreshold > 1.0f) {
                throw new RuntimeException("Classifier confidence threshold > 1, provide value in [0,1]: " + this.classifierThreshold);
            }
            this.eVectors = new ArrayListWritable<HMapSFW>();
            this.fVectors = new ArrayListWritable<HMapSFW>();
            this.eSentences = new ArrayListWritable<Text>();
            this.fSentences = new ArrayListWritable<Text>();
        }

        /**
         * Classifies all sentence pairs of one document pair and emits the accepted ones.
         *
         * @param pairOfInts      key: left element is the f-side docno, right the e-side docno
         * @param it              sentences of both documents, told apart by their language ID
         * @param outputCollector receives accepted sentence pairs as key, with an empty value
         * @param reporter        used to update the {@code Docs} and {@code Sentences} counters
         * @throws RuntimeException on a language ID other than 1 or -1, or if no features
         *         could be computed for a pair
         */
        public void reduce(PairOfInts pairOfInts, Iterator<WikiSentenceInfo> it, OutputCollector<Text, Text> outputCollector, Reporter reporter) throws IOException {
            this.eVectors.clear();
            this.fVectors.clear();
            this.eSentences.clear();
            this.fSentences.clear();
            this.fDocno = pairOfInts.getLeftElement();
            this.eDocno = pairOfInts.getRightElement();

            int eCount = 0;
            int fCount = 0;
            while (it.hasNext()) {
                WikiSentenceInfo sentenceInfo = it.next();
                int langID = sentenceInfo.getLangID();
                if (langID == 1) {
                    fCount++;
                    this.fVectors.add(sentenceInfo.getVector());
                    this.fSentences.add(sentenceInfo.getSentence());
                    reporter.incrCounter(Sentences.F, 1L);
                } else if (langID == -1) {
                    eCount++;
                    this.eVectors.add(sentenceInfo.getVector());
                    this.eSentences.add(sentenceInfo.getSentence());
                    reporter.incrCounter(Sentences.E, 1L);
                } else {
                    throw new RuntimeException("Unknown language ID -- should not happen!");
                }
            }

            // A document pair is usable only if sentences arrived from both sides.
            if (eCount == 0 || fCount == 0) {
                FindParallelSentencePairs.sLogger.debug("Read " + eCount + "," + fCount + " sentences: =" + this.eDocno + "," + this.fDocno);
                reporter.incrCounter(eCount == 0 ? Docs.pairsIncompleteE : Docs.pairsIncompleteF, 1L);
                return;
            }

            reporter.incrCounter(Docs.pairs, 1L);
            // Widen before multiplying so that large documents cannot overflow int.
            reporter.incrCounter(Sentences.pairsCandidate, (long) this.fVectors.size() * this.eVectors.size());
            FindParallelSentencePairs.sLogger.debug(this.fSentences.size() + "," + this.eSentences.size());

            for (int f = 0; f < this.fVectors.size(); f++) {
                HMapSFW fVector = this.fVectors.get(f);
                int fLength = this.fSentences.get(f).getLength();
                for (int e = 0; e < this.eVectors.size(); e++) {
                    HMapSFW eVector = this.eVectors.get(e);
                    int eLength = this.eSentences.get(e).getLength();

                    // Cheap filter: drop pairs where one sentence is over twice as long as the other.
                    if (eLength > 2 * fLength || fLength > 2 * eLength) {
                        reporter.incrCounter(Sentences.pairsFilteredBySentRatio, 1L);
                        continue;
                    }
                    reporter.incrCounter(Sentences.pairsProcessed, 1L);

                    String[] features = CLIRUtils.computeFeaturesF1(eVector, fVector, eLength, fLength);
                    if (features == null) {
                        throw new RuntimeException("SHOULD NOT HAPPEN!");
                    }
                    if (this.helper.getClassifier().eval(features, RealValueFileEventStream.parseContexts(features))[this.classifierPositiveId] > this.classifierThreshold) {
                        reporter.incrCounter(Sentences.parallel, 1L);
                        outputCollector.collect(new Text(this.fSentences.get(f) + CLIRUtils.BitextSeparator + this.eSentences.get(e)), this.emptyValue);
                    }
                }
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairs$Sentences.class */
    /**
     * Hadoop counters kept at sentence / sentence-pair granularity.
     *
     * <p>Within this file the mapper increments {@code E}, {@code F}, {@code pairsE} and
     * {@code pairsF}; the reducer increments {@code E}, {@code F}, {@code pairsCandidate},
     * {@code pairsFilteredBySentRatio}, {@code pairsProcessed} and {@code parallel}.
     */
    public enum Sentences {
        E,
        F,
        pairsE,
        pairsF,
        pairsProcessed,
        pairsCandidate,
        pairsFilteredByVectorSize,
        pairsFilteredBySentRatio,
        parallel;

        /**
         * Returns a freshly allocated array holding every constant in declaration order.
         *
         * @return a copy of {@link #values()}; callers may modify it freely
         */
        public static Sentences[] valuesCustom() {
            return values().clone();
        }
    }

    /** Prints the command-line help for the current option set, then terminates the JVM with status -1. */
    private static void printUsage() {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("FindParallelSentencePairs", options);
        System.exit(-1);
    }

    /**
     * Configures and runs the MapReduce job that extracts parallel sentence pairs.
     *
     * @param strArr command-line arguments, parsed by {@code setupConf}
     * @return 0 once the job has finished; -1 if the arguments could not be turned into a job
     *         configuration (usage is printed first, which also exits the JVM)
     * @throws Exception if configuring or running the job fails
     */
    public int run(String[] strArr) throws Exception {
        JobConf baseConf = new JobConf(getConf(), FindParallelSentencePairs.class);
        JobConf job = setupConf(baseConf, strArr);
        if (job == null) {
            printUsage();
            return -1;
        }

        // Resource limits and scheduling.
        job.setInt("mapred.task.timeout", 60000000);
        job.set("mapreduce.map.memory.mb", "3000");
        job.set("mapreduce.map.java.opts", "-Xmx3000m");
        job.set("mapreduce.reduce.memory.mb", "3000");
        job.set("mapreduce.reduce.java.opts", "-Xmx3000m");
        job.setBoolean("mapred.map.tasks.speculative.execution", false);
        job.setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.setNumMapTasks(100);
        job.setNumReduceTasks(50);
        job.setInt("mapred.min.split.size", 2000000000);
        job.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f);

        // Input/output formats and the types flowing through the job.
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputKeyClass(PairOfInts.class);
        job.setMapOutputValueClass(WikiSentenceInfo.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        long startMillis = System.currentTimeMillis();
        JobClient.runJob(job);
        double elapsedSeconds = (System.currentTimeMillis() - startMillis) / 1000.0d;
        sLogger.info("Job finished in " + elapsedSeconds + " seconds");
        return 0;
    }

    /**
     * Registers an option that takes exactly one argument in the shared {@code options} set.
     *
     * @param name        option name as typed on the command line
     * @param description help text for the option
     * @param argName     placeholder shown for the argument in the help output
     * @param required    whether parsing fails when the option is absent
     */
    private static void addArgOption(String name, String description, String argName, boolean required) {
        OptionBuilder.withDescription(description);
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArg();
        OptionBuilder.isRequired(required);
        options.addOption(OptionBuilder.create(name));
    }

    /**
     * Builds the option set, parses the command line and applies it to the job configuration.
     *
     * @param jobConf configuration to populate (input/output paths, job name, bitext paths)
     * @param strArr  command-line arguments
     * @return the configured job, or {@code null} if the arguments are missing or malformed,
     *         the sentences path does not exist, or the bitext paths could not be set up;
     *         an explanation is printed to standard error in each of those cases
     * @throws Exception if the file system cannot be accessed
     */
    protected static JobConf setupConf(JobConf jobConf, String[] strArr) throws Exception {
        options = new Options();
        addArgOption(FCOLLECTION_OPTION, "source-side raw collection path", "path", true);
        addArgOption(ECOLLECTION_OPTION, "target-side raw collection path", "path", true);
        addArgOption(FLANG_OPTION, "two-letter code for f-language", "en|de|tr|cs|zh|ar|es", true);
        addArgOption(ELANG_OPTION, "two-letter code for e-language", "en|de|tr|cs|zh|ar|es", true);
        addArgOption(FINDEX_OPTION, "source-side index path", "path", true);
        addArgOption(EINDEX_OPTION, "target-side index path", "path", true);
        addArgOption(BITEXTNAME_OPTION, "name of bitext", "string", true);
        addArgOption(DATADIR_OPTION, "path to data files on HDFS", "path", true);
        addArgOption(PWSIM_OPTION, "path to output of pwsim algorithm", "path", true);
        addArgOption(CLASSIFIERID_OPTION, "classifier id to retrieve P('PARALLEL'|instance)", "0 or 1", true);
        addArgOption(CLASSIFIERTHRESHOLD_OPTION, "classifier confidence threshold for accepting a sentence pair as parallel", "0-1", true);
        addArgOption(SENTENCES_OPTION, "path to collection sentences", "path", true);
        addArgOption(BITEXT_OPTION, "path to output bitext", "path", true);
        addArgOption(LIBJARS_OPTION, "Hadoop option to load external jars", "jar packages", false);

        CommandLine cmdline;
        try {
            cmdline = new GnuParser().parse(options, strArr);
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            return null;
        }

        String pwsimPairsPath = cmdline.getOptionValue(PWSIM_OPTION);
        String eDir = cmdline.getOptionValue(EINDEX_OPTION);
        String fDir = cmdline.getOptionValue(FINDEX_OPTION);
        String dataDir = cmdline.getOptionValue(DATADIR_OPTION);
        String eLang = cmdline.getOptionValue(ELANG_OPTION);
        String fLang = cmdline.getOptionValue(FLANG_OPTION);
        String bitextName = cmdline.hasOption(BITEXTNAME_OPTION) ? cmdline.getOptionValue(BITEXTNAME_OPTION) : "";
        String sentencesPath = cmdline.getOptionValue(SENTENCES_OPTION);
        String bitextPath = cmdline.getOptionValue(BITEXT_OPTION);

        float classifierThreshold;
        int classifierId;
        try {
            classifierThreshold = Float.parseFloat(cmdline.getOptionValue(CLASSIFIERTHRESHOLD_OPTION));
            classifierId = Integer.parseInt(cmdline.getOptionValue(CLASSIFIERID_OPTION));
        } catch (NumberFormatException e) {
            // Treat a malformed number like any other bad argument instead of letting it escape.
            System.err.println("Error parsing command line: " + e.getMessage());
            return null;
        }

        if (!FileSystem.get(jobConf).exists(new Path(sentencesPath))) {
            System.err.println("Input sentences does not exist at: " + sentencesPath + ". Exiting...");
            return null;
        }
        FileInputFormat.addInputPaths(jobConf, sentencesPath);
        FileOutputFormat.setOutputPath(jobConf, new Path(bitextPath));
        jobConf.setJobName("FindParallelSentences_" + fLang + "-" + eLang + "_F1=" + classifierThreshold + "[" + classifierId + "]");

        try {
            JobConf configured = PwsimEnvironment.setBitextPaths(jobConf, dataDir, eLang, fLang, bitextName, eDir, fDir, classifierThreshold, classifierId, pwsimPairsPath, "simple");
            sLogger.info("Running job " + configured.getJobName());
            sLogger.info("Pwsim output path: " + pwsimPairsPath);
            sLogger.info("Sentences path: " + sentencesPath);
            sLogger.info("Output path: " + bitextPath);
            return configured;
        } catch (Exception e) {
            sLogger.error("Error configuring paths", e);
            System.err.println("Error configuring paths: " + e.getMessage());
            return null;
        }
    }

    /**
     * Command-line entry point: runs the tool through {@link ToolRunner} and exits the JVM
     * with the status it returns.
     *
     * @param strArr command-line arguments
     * @throws Exception if the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        FindParallelSentencePairs tool = new FindParallelSentencePairs();
        int exitCode = ToolRunner.run(tool, strArr);
        System.exit(exitCode);
    }
}
