package ivory.lsh.bitext;

import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.io.map.HMapSIW;
import ivory.core.tokenize.Tokenizer;
import ivory.core.util.CLIRUtils;
import ivory.lsh.driver.PwsimEnvironment;
import java.io.IOException;
import opennlp.model.RealValueFileEventStream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/lsh/bitext/FilterSentencePairs.class */
public class FilterSentencePairs extends Configured implements Tool {
    /** Logger shared by the driver and by the nested mapper. */
    private static final Logger sLogger = Logger.getLogger(FilterSentencePairs.class);
    /** Command-line option set; rebuilt by setupConf and read by printUsage. */
    private static Options options;
    // Names of the command-line options registered in setupConf.
    /** Two-letter code of the f-language. */
    private static final String FLANG_OPTION = "f_lang";
    /** Two-letter code of the e-language. */
    private static final String ELANG_OPTION = "e_lang";
    /** Source-side index path. */
    private static final String FINDEX_OPTION = "f_index";
    /** Target-side index path. */
    private static final String EINDEX_OPTION = "e_index";
    /** Name of the bitext. */
    private static final String BITEXTNAME_OPTION = "name";
    /** Path to the input bitext. */
    private static final String INPUT_OPTION = "input";
    /** Path to the filtered output bitext. */
    private static final String OUTPUT_OPTION = "output";
    /** Path to data files on HDFS. */
    private static final String DATADIR_OPTION = "data";
    /** Index (0 or 1) of the "parallel" outcome in the classifier output. */
    private static final String CLASSIFIERID_OPTION = "classifier_id";
    /** Classifier threshold; parsed as a float in setupConf. */
    private static final String CLASSIFIERTHRESHOLD_OPTION = "threshold";
    /** Hadoop option to load external jars (optional). */
    private static final String LIBJARS_OPTION = "libjars";

    /* loaded from: input_file:ivory/lsh/bitext/FilterSentencePairs$MyMapper.class */
    /**
     * Map-side filter for candidate sentence pairs.
     *
     * <p>Each input value is one line holding an f-side sentence and an e-side sentence
     * separated by {@code CLIRUtils.BitextSeparator} (f first, e second). The mapper computes
     * classifier features for the pair, evaluates the classifier and emits the pair only when
     * the probability of the configured "parallel" outcome is strictly greater than the
     * configured threshold.
     *
     * <p>Output key: {@code fSent SEP eSent SEP features SEP score}; output value: an empty
     * {@code Text}.
     */
    private static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {
        private PreprocessHelper helper;
        private Tokenizer eTok;
        private Tokenizer fTok;
        // Reused across map() calls to avoid allocating a Text per record.
        private Text outSent1;
        private Text outSent2;
        private float classifierThreshold;
        private int classifierPositiveId;

        private MyMapper() {
        }

        /**
         * Loads the preprocessing helper and reads the classifier settings from the job
         * configuration.
         *
         * @param jobConf job configuration; must carry "ClassifierId" (0 or 1) and may carry
         *     "ClassifierThreshold" (defaults to 0.0)
         * @throws RuntimeException if the helper cannot be created or the classifier id is
         *     neither 0 nor 1
         */
        public void configure(JobConf jobConf) {
            FilterSentencePairs.sLogger.setLevel(Level.INFO);
            try {
                // NOTE(review): 3 and 5 are passed through unchanged; presumably minimum
                // vector-term / sentence-length limits -- confirm against PreprocessHelper.
                this.helper = new PreprocessHelper(3, 5, jobConf);
            } catch (Exception e) {
                // Fail fast with the real cause; continuing would only produce a
                // NullPointerException when the tokenizers are fetched below.
                throw new RuntimeException("Unable to initialize PreprocessHelper", e);
            }
            this.classifierThreshold = jobConf.getFloat("ClassifierThreshold", 0.0f);
            this.classifierPositiveId = jobConf.getInt("ClassifierId", -1);
            if (this.classifierPositiveId != 0 && this.classifierPositiveId != 1) {
                throw new RuntimeException("Id of parallel label in MaxEnt classifier not specified properly: " + this.classifierPositiveId);
            }
            FilterSentencePairs.sLogger.info(Float.valueOf(this.classifierThreshold));
            this.eTok = this.helper.getETokenizer();
            this.fTok = this.helper.getFTokenizer();
            this.outSent1 = new Text();
            this.outSent2 = new Text();
        }

        /**
         * Scores one sentence pair and emits it if the classifier considers it parallel.
         *
         * <p>Lines with fewer than two separator-delimited fields, and pairs for which either
         * document vector cannot be built, are counted under {@code Sentences.ignored} and
         * skipped. Emitted pairs are counted under {@code Sentences.parallel}.
         *
         * @param longWritable input key supplied by the input format (unused)
         * @param text one line of the candidate bitext
         * @param outputCollector receives the accepted pairs
         * @param reporter used to update the {@code Sentences} counters
         * @throws IOException if collecting the output fails
         * @throws RuntimeException if feature computation returns {@code null}
         */
        public void map(LongWritable longWritable, Text text, OutputCollector<Text, Text> outputCollector, Reporter reporter) throws IOException {
            String[] fields = text.toString().split(CLIRUtils.BitextSeparator);
            if (fields.length < 2) {
                reporter.incrCounter(Sentences.ignored, 1L);
                return;
            }
            String fSent = fields[0];
            String eSent = fields[1];
            int eLen = this.eTok.getNumberTokens(eSent);
            int fLen = this.fTok.getNumberTokens(fSent);

            // The HMapSIW instances are handed to the helper alongside each sentence and are
            // later passed to feature computation together with the resulting vectors.
            HMapSIW eTermCounts = new HMapSIW();
            HMapSFW eVector = this.helper.createEDocVector(eSent, eTermCounts);
            HMapSIW fTermCounts = new HMapSIW();
            HMapSFW fVector = this.helper.createFDocVector(fSent, fTermCounts);
            if (eVector == null || fVector == null) {
                reporter.incrCounter(Sentences.ignored, 1L);
                return;
            }

            // Only pay for the (large) message when debug logging is actually on.
            if (FilterSentencePairs.sLogger.isDebugEnabled()) {
                FilterSentencePairs.sLogger.debug("-------------\n" + fSent + "\n" + eSent + "\n----\n" + fVector + "\n" + fTermCounts + "\n" + eVector + "\n" + fLen + "," + eLen + "\n------------");
            }

            String[] features = CLIRUtils.computeFeaturesF2(eTermCounts, eVector, fTermCounts, fVector, eLen, fLen, this.helper.getESrc(), this.helper.getETrg(), this.helper.getFSrc(), this.helper.getFTrg(), this.helper.getE2F(), this.helper.getF2E(), 0.1f);
            // Must be checked before the array is iterated, otherwise a null result would
            // surface as a bare NullPointerException instead of this explicit failure.
            if (features == null) {
                throw new RuntimeException("SHOULD NOT HAPPEN!");
            }

            // Render the features before evaluation.
            // NOTE(review): parseContexts appears to rewrite the array entries in place
            // (stripping the "=value" part) -- confirm; keeping this order preserves the
            // original output either way.
            String featureString = joinFeatures(features);

            double score = this.helper.getClassifier().eval(features, RealValueFileEventStream.parseContexts(features))[this.classifierPositiveId];
            if (score > this.classifierThreshold) {
                reporter.incrCounter(Sentences.parallel, 1L);
                this.outSent1.set(fSent + CLIRUtils.BitextSeparator + eSent + CLIRUtils.BitextSeparator + featureString + CLIRUtils.BitextSeparator + score);
                outputCollector.collect(this.outSent1, this.outSent2);
            }
        }

        /** Concatenates the features, each followed by a single space (trailing space kept). */
        private static String joinFeatures(String[] features) {
            StringBuilder joined = new StringBuilder();
            for (String feature : features) {
                joined.append(feature).append(' ');
            }
            return joined.toString();
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:ivory/lsh/bitext/FilterSentencePairs$Sentences.class */
    /**
     * Counter names used with the Hadoop {@code Reporter}.
     */
    public enum Sentences {
        parallel,
        ignored,
        dbg,
        OOV;

        /**
         * Returns a freshly allocated array holding all constants in declaration order.
         * Callers may modify the returned array without affecting later calls.
         *
         * @return a new array containing every {@code Sentences} constant
         */
        public static Sentences[] valuesCustom() {
            return values().clone();
        }
    }

    /**
     * Prints command-line help for the currently registered {@code options}.
     *
     * <p>This method only prints; it does not terminate the JVM. The caller reports the
     * failure through its own return value, so code that embeds this tool (for example via
     * {@code ToolRunner.run}) is not killed by a usage error.
     */
    private static void printUsage() {
        new HelpFormatter().printHelp("FilterSentencePairs", options);
    }

    /**
     * Builds the job configuration from the command line, applies the fixed job settings and
     * runs the job synchronously.
     *
     * @param strArr command-line arguments
     * @return 0 after the job has run, or -1 if no configuration could be built from the
     *     arguments (usage is printed in that case)
     * @throws Exception if configuration setup or job execution fails
     */
    public int run(String[] strArr) throws Exception {
        JobConf conf = setupConf(new JobConf(getConf(), FilterSentencePairs.class), strArr);
        if (conf == null) {
            printUsage();
            return -1;
        }

        // Resource and scheduling settings.
        conf.setInt("mapred.task.timeout", 60000000);
        conf.setInt("mapred.min.split.size", 2000000000);
        conf.set("mapreduce.map.memory.mb", "3000");
        conf.set("mapreduce.map.java.opts", "-Xmx3000m");
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
        conf.setNumMapTasks(100);
        conf.setNumReduceTasks(1);

        // Text in, text out; both map output and final output are (Text, Text).
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(MyMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        long startMillis = System.currentTimeMillis();
        JobClient.runJob(conf);
        long elapsedMillis = System.currentTimeMillis() - startMillis;
        sLogger.info("Job finished in " + (elapsedMillis / 1000.0d) + " seconds");
        return 0;
    }

    /**
     * Registers the command-line options, parses the arguments and fills in the job
     * configuration (bitext paths, classifier settings, input and output paths).
     *
     * <p>All options except {@code libjars} are required. The classifier id is validated here
     * (it must be 0 or 1) so that a bad value is reported before the job is submitted rather
     * than by every map task.
     *
     * @param jobConf configuration to populate
     * @param strArr command-line arguments
     * @return the populated configuration, or {@code null} if the arguments are missing,
     *     malformed or contain an invalid numeric value (an error is printed to stderr)
     * @throws Exception if preparing the bitext paths fails
     */
    private JobConf setupConf(JobConf jobConf, String[] strArr) throws Exception {
        options = new Options();
        addOption(INPUT_OPTION, "path", "path to bitext (input)", true);
        addOption(OUTPUT_OPTION, "path", "path to filtered bitext (output)", true);
        addOption(FINDEX_OPTION, "path", "source-side index path", true);
        addOption(EINDEX_OPTION, "path", "target-side index path", true);
        addOption(FLANG_OPTION, "en|de|tr|cs|zh|ar|es", "two-letter code for f-language", true);
        addOption(ELANG_OPTION, "en|de|tr|cs|zh|ar|es", "two-letter code for e-language", true);
        addOption(BITEXTNAME_OPTION, "string", "name of bitext", true);
        addOption(DATADIR_OPTION, "path", "path to data files on HDFS", true);
        addOption(CLASSIFIERID_OPTION, "0 or 1", "classifier id to retrieve P('PARALLEL'|instance)", true);
        // The previous description was a copy-paste leftover unrelated to this option.
        addOption(CLASSIFIERTHRESHOLD_OPTION, "0-1", "classifier threshold; pairs scoring above it are kept", true);
        addOption(LIBJARS_OPTION, "jar packages", "Hadoop option to load external jars", false);

        try {
            CommandLine cmdline = new GnuParser().parse(options, strArr);
            String inputPath = cmdline.getOptionValue(INPUT_OPTION);
            String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
            String bitextName = cmdline.hasOption(BITEXTNAME_OPTION) ? cmdline.getOptionValue(BITEXTNAME_OPTION) : "";
            float threshold = Float.parseFloat(cmdline.getOptionValue(CLASSIFIERTHRESHOLD_OPTION));
            int classifierId = Integer.parseInt(cmdline.getOptionValue(CLASSIFIERID_OPTION));
            // The mapper rejects any other id in configure(); report it before submitting.
            if (classifierId != 0 && classifierId != 1) {
                System.err.println("Error parsing command line: " + CLASSIFIERID_OPTION + " must be 0 or 1, got " + classifierId);
                return null;
            }

            JobConf conf = PwsimEnvironment.setBitextPaths(jobConf, cmdline.getOptionValue(DATADIR_OPTION), cmdline.getOptionValue(ELANG_OPTION), cmdline.getOptionValue(FLANG_OPTION), bitextName, cmdline.getOptionValue(EINDEX_OPTION), cmdline.getOptionValue(FINDEX_OPTION), threshold, classifierId, null, "complex");
            FileInputFormat.setInputPaths(conf, inputPath);
            FileOutputFormat.setOutputPath(conf, new Path(outputPath));
            sLogger.info("Running job " + conf.getJobName());
            sLogger.info("Input directory: " + inputPath);
            sLogger.info("Output directory: " + outputPath);
            return conf;
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            return null;
        } catch (NumberFormatException e) {
            // Non-numeric threshold / classifier id: treat like any other usage error.
            System.err.println("Error parsing command line: invalid numeric value (" + e.getMessage() + ")");
            return null;
        }
    }

    /** Registers a single-argument option on the shared {@code options} set. */
    private static void addOption(String name, String argName, String description, boolean required) {
        OptionBuilder.withDescription(description);
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArg();
        if (required) {
            OptionBuilder.isRequired();
        }
        options.addOption(OptionBuilder.create(name));
    }

    /**
     * Command-line entry point: runs this tool through {@code ToolRunner} and exits the JVM
     * with the status it returns.
     *
     * @param strArr command-line arguments
     * @throws Exception if the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(new FilterSentencePairs(), strArr);
        System.exit(exitCode);
    }
}
