package ivory.lsh.bitext;

import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.collection.wikipedia.WikipediaPage;
import edu.umd.cloud9.io.array.ArrayListOfIntsWritable;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.util.map.HMapIV;
import ivory.core.RetrievalEnvironment;
import ivory.core.util.CLIRUtils;
import ivory.lsh.data.WikiDocInfo;
import ivory.sqe.retrieval.Constants;
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
import opennlp.model.RealValueFileEventStream;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairsOld.class */
public class FindParallelSentencePairsOld extends Configured implements Tool {
    /** Logger shared by the driver and by the nested mapper/reducer classes. */
    private static final Logger sLogger = Logger.getLogger(FindParallelSentencePairsOld.class);
    /** Same value as the first numeric argument given to PreprocessHelper in both configure() methods. */
    private static final int MinVectorTerms = 3;
    /** Same value as the second numeric argument given to PreprocessHelper in both configure() methods. */
    private static final int MinSentenceLength = 5;
    /** Language id the mapper stores in WikiDocInfo for English-side pages; the reducer matches on it via getLangID(). */
    private static final int E = -1;
    /** Language id the mapper stores in WikiDocInfo for non-English (F-side) pages; the reducer matches on it via getLangID(). */
    private static final int F = 1;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairsOld$Docs.class */
    /** Hadoop counters that track work at the document level. */
    public enum Docs {
        /** Incremented by the mapper once per record it emits for an English page. */
        pairsE,
        /** Incremented by the mapper once per record it emits for a non-English page. */
        pairsF,
        /** Incremented by the reducer for every key for which both an E and an F record were read. */
        pairs,
        /** Incremented by the reducer when a key is incomplete and the F side is the one found missing. */
        pairsIncompleteF,
        /** Incremented by the reducer when a key is incomplete and the E side is the one found missing. */
        pairsIncompleteE
    }

    /* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairsOld$MyMapper.class */
    private static class MyMapper extends MapReduceBase implements Mapper<Writable, Indexable, PairOfInts, WikiDocInfo> {
        private HMapIV<ArrayListOfIntsWritable> pwsimMapping;
        private PairOfInts keyOut;
        private JobConf mJob;
        private WikiDocInfo valOut;
        private PreprocessHelper helper;

        private MyMapper() {
        }

        public void configure(JobConf jobConf) {
            FindParallelSentencePairsOld.sLogger.setLevel(Level.INFO);
            this.mJob = jobConf;
            this.pwsimMapping = new HMapIV<>();
            try {
                this.helper = new PreprocessHelper(3, 5, jobConf);
            } catch (Exception e) {
                e.printStackTrace();
            }
            this.keyOut = new PairOfInts();
            this.valOut = new WikiDocInfo();
        }

        private static void loadPairs(HMapIV<ArrayListOfIntsWritable> hMapIV, String str, JobConf jobConf, Reporter reporter) {
            try {
                SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(jobConf), DistributedCache.getLocalCacheFiles(jobConf)[14], jobConf);
                PairOfInts pairOfInts = (PairOfInts) reader.getKeyClass().newInstance();
                for (IntWritable intWritable = (IntWritable) reader.getValueClass().newInstance(); reader.next(pairOfInts, intWritable); intWritable = (IntWritable) reader.getValueClass().newInstance()) {
                    int rightElement = pairOfInts.getRightElement() - 1000000000;
                    int leftElement = pairOfInts.getLeftElement();
                    if (str.equals(Constants.English)) {
                        if (!hMapIV.containsKey(leftElement)) {
                            hMapIV.put(leftElement, new ArrayListOfIntsWritable());
                        }
                        ((ArrayListOfIntsWritable) hMapIV.get(leftElement)).add(rightElement);
                    } else {
                        if (!hMapIV.containsKey(rightElement)) {
                            hMapIV.put(rightElement, new ArrayListOfIntsWritable());
                        }
                        ((ArrayListOfIntsWritable) hMapIV.get(rightElement)).add(leftElement);
                    }
                    pairOfInts = (PairOfInts) reader.getKeyClass().newInstance();
                }
                reader.close();
                FindParallelSentencePairsOld.sLogger.info(hMapIV.size() + " pairs loaded.");
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        public void map(Writable writable, Indexable indexable, OutputCollector<PairOfInts, WikiDocInfo> outputCollector, Reporter reporter) throws IOException {
            int i = ((IntWritable) writable).get();
            WikipediaPage wikipediaPage = (WikipediaPage) indexable;
            String language = wikipediaPage.getLanguage();
            if (this.pwsimMapping.isEmpty()) {
                loadPairs(this.pwsimMapping, language, this.mJob, reporter);
                FindParallelSentencePairsOld.sLogger.debug(Integer.valueOf(this.pwsimMapping.size()));
            }
            if (this.pwsimMapping.containsKey(i)) {
                ArrayListOfIntsWritable arrayListOfIntsWritable = (ArrayListOfIntsWritable) this.pwsimMapping.get(i);
                ArrayListWritable<HMapSFW> arrayListWritable = new ArrayListWritable<>();
                ArrayListOfIntsWritable arrayListOfIntsWritable2 = new ArrayListOfIntsWritable();
                try {
                    ArrayListWritable<Text> eSentences = language.equals(Constants.English) ? this.helper.getESentences(wikipediaPage.getContent(), arrayListWritable, arrayListOfIntsWritable2) : this.helper.getFSentences(wikipediaPage.getContent(), arrayListWritable, arrayListOfIntsWritable2);
                    if (eSentences.size() != arrayListWritable.size()) {
                        throw new RuntimeException("Sentences.size != Vectors.size");
                    }
                    Iterator it = arrayListOfIntsWritable.iterator();
                    while (it.hasNext()) {
                        int intValue = ((Integer) it.next()).intValue();
                        if (language.equals(Constants.English)) {
                            this.keyOut.set(intValue, i);
                            this.valOut.set(-1, arrayListWritable, eSentences);
                            reporter.incrCounter(Docs.pairsE, 1L);
                            reporter.incrCounter(Sentences.pairsE, arrayListWritable.size());
                        } else {
                            this.keyOut.set(i, intValue);
                            this.valOut.set(1, arrayListWritable, eSentences);
                            reporter.incrCounter(Docs.pairsF, 1L);
                            reporter.incrCounter(Sentences.pairsF, arrayListWritable.size());
                        }
                        outputCollector.collect(this.keyOut, this.valOut);
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                    throw new RuntimeException(e);
                }
            }
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((Writable) obj, (Indexable) obj2, (OutputCollector<PairOfInts, WikiDocInfo>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairsOld$MyReducer.class */
    private static class MyReducer extends MapReduceBase implements Reducer<PairOfInts, WikiDocInfo, Text, Text> {
        private int fDocno;
        private int eDocno;
        private int classifierPositiveId;
        private ArrayListWritable<HMapSFW> fVectors;
        private ArrayListWritable<HMapSFW> eVectors;
        private ArrayListWritable<Text> fSentences;
        private ArrayListWritable<Text> eSentences;
        private PreprocessHelper helper;
        private float classifierThreshold;
        private Text emptyValue = new Text();

        private MyReducer() {
        }

        public void configure(JobConf jobConf) {
            FindParallelSentencePairsOld.sLogger.setLevel(Level.INFO);
            try {
                this.helper = new PreprocessHelper(3, 5, jobConf);
            } catch (Exception e) {
                e.printStackTrace();
            }
            this.classifierPositiveId = jobConf.getInt("ClassifierId", -1);
            if (this.classifierPositiveId != 0 && this.classifierPositiveId != 1) {
                throw new RuntimeException("Id of parallel label in MaxEnt classifier not specified properly: " + this.classifierPositiveId);
            }
            this.classifierThreshold = jobConf.getFloat("ClassifierThreshold", 2.0f);
            if (this.classifierThreshold > 1.0f) {
                throw new RuntimeException("Classifier confidence threshold > 1, provide value in [0,1]: " + this.classifierThreshold);
            }
            this.eVectors = new ArrayListWritable<>();
            this.fVectors = new ArrayListWritable<>();
            this.eSentences = new ArrayListWritable<>();
            this.fSentences = new ArrayListWritable<>();
        }

        public void reduce(PairOfInts pairOfInts, Iterator<WikiDocInfo> it, OutputCollector<Text, Text> outputCollector, Reporter reporter) throws IOException {
            this.eVectors.clear();
            this.fVectors.clear();
            this.eSentences.clear();
            this.fSentences.clear();
            this.fDocno = pairOfInts.getLeftElement();
            this.eDocno = pairOfInts.getRightElement();
            int i = 0;
            int i2 = 0;
            while (it.hasNext() && (i < 1 || i2 < 1)) {
                WikiDocInfo next = it.next();
                if (next.getLangID() == 1 && this.fVectors.isEmpty()) {
                    i2++;
                    this.fVectors = next.getVectors();
                    this.fSentences = next.getSentences();
                    reporter.incrCounter(Sentences.pairsF, this.fVectors.size());
                } else if (next.getLangID() == -1 && this.eVectors.isEmpty()) {
                    i++;
                    this.eVectors = next.getVectors();
                    this.eSentences = next.getSentences();
                    reporter.incrCounter(Sentences.pairsE, this.eVectors.size());
                }
            }
            if (i < 1 || i2 < 1) {
                FindParallelSentencePairsOld.sLogger.debug("Read " + i + "," + i2 + " pages: =" + this.eDocno + "," + this.fDocno);
                if (this.fVectors.isEmpty()) {
                    reporter.incrCounter(Docs.pairsIncompleteF, 1L);
                    return;
                } else {
                    reporter.incrCounter(Docs.pairsIncompleteE, 1L);
                    return;
                }
            }
            reporter.incrCounter(Docs.pairs, 1L);
            if (this.fVectors.size() == 0 || this.eVectors.size() == 0) {
                return;
            }
            reporter.incrCounter(Sentences.pairsCandidate, this.fVectors.size() * this.eVectors.size());
            int i3 = 0;
            long j = 0;
            for (int i4 = 0; i4 < this.fVectors.size(); i4++) {
                HMapSFW hMapSFW = (HMapSFW) this.fVectors.get(i4);
                int length = ((Text) this.fSentences.get(i4)).getLength();
                for (int i5 = 0; i5 < this.eVectors.size(); i5++) {
                    HMapSFW hMapSFW2 = (HMapSFW) this.eVectors.get(i5);
                    int length2 = ((Text) this.eSentences.get(i5)).getLength();
                    if (length2 > 2 * length || length > 2 * length2) {
                        reporter.incrCounter(Sentences.pairsFilteredBySentRatio, 1L);
                    } else {
                        reporter.incrCounter(Sentences.pairsProcessed, 1L);
                        i3++;
                        long currentTimeMillis = System.currentTimeMillis();
                        String[] computeFeaturesF1 = CLIRUtils.computeFeaturesF1(hMapSFW2, hMapSFW, length2, length);
                        j += System.currentTimeMillis() - currentTimeMillis;
                        if (computeFeaturesF1 == null) {
                            throw new RuntimeException("SHOULD NOT HAPPEN!");
                        }
                        if (this.helper.getClassifier().eval(computeFeaturesF1, RealValueFileEventStream.parseContexts(computeFeaturesF1))[this.classifierPositiveId] > this.classifierThreshold) {
                            reporter.incrCounter(Sentences.parallel, 1L);
                            outputCollector.collect(new Text(this.fSentences.get(i4) + "<GERMAN2ENGLISH>" + this.eSentences.get(i5)), this.emptyValue);
                        }
                    }
                }
            }
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            reduce((PairOfInts) obj, (Iterator<WikiDocInfo>) it, (OutputCollector<Text, Text>) outputCollector, reporter);
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:ivory/lsh/bitext/FindParallelSentencePairsOld$Sentences.class */
    /** Hadoop counters that track work at the sentence level. */
    public enum Sentences {
        /** Incremented by the E-side sentence-vector count, in the mapper per emitted record and again in the reducer when the E record is read. */
        pairsE,
        /** Incremented by the F-side sentence-vector count, in the mapper per emitted record and again in the reducer when the F record is read. */
        pairsF,
        /** Sentence pairs that passed the length-ratio filter and were handed to feature computation. */
        pairsProcessed,
        /** Number of F vectors times number of E vectors for each complete, non-empty document pair. */
        pairsCandidate,
        /** Not incremented anywhere in this class. */
        pairsFilteredByVectorSize,
        /** Sentence pairs skipped because one sentence is more than twice as long as the other. */
        pairsFilteredBySentRatio,
        /** Sentence pairs whose classifier score exceeded the threshold and were written to the output. */
        parallel
    }

    /**
     * Logs the expected command-line arguments and prints Hadoop's generic options to stdout.
     *
     * @return always -1, so callers can return it directly as an error exit code
     */
    private static int printUsage() {
        final String usage = "usage: [cl-pwsim-output-path] [output-path] [e-path] [f-path] [e-dir] [f-dir] [vocab-dir] [e-lang] [f-lang] [classifier] [threshold] [classifier parallel-label id]";
        sLogger.info(usage);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * Configures and runs the sentence-pair extraction job.
     *
     * <p>Expects exactly twelve arguments, in the order listed by {@link #printUsage()}. The order in
     * which files are added to the distributed cache is significant: tasks look them up by position.
     *
     * @param strArr command-line arguments
     * @return 0 after the job has run, -1 if the argument count is wrong
     * @throws Exception if an argument cannot be parsed, a URI is malformed, or the job fails
     */
    public int run(String[] strArr) throws Exception {
        if (strArr.length != 12) {
            return printUsage();
        }
        JobConf conf = new JobConf(getConf(), FindParallelSentencePairsOld.class);

        // Positional arguments.
        String pwsimOutputPath = strArr[0];
        String outputPath = strArr[1];
        String eCollectionPath = strArr[2];
        String fCollectionPath = strArr[3];
        String eDir = strArr[4];
        String fDir = strArr[5];
        RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, FileSystem.get(conf));
        String vocabDir = strArr[6];
        String eLang = strArr[7];
        String fLang = strArr[8];
        String classifierFile = strArr[9];
        float threshold = Float.parseFloat(strArr[10]);
        int positiveLabelId = Integer.parseInt(strArr[11]);

        conf.setJobName("FindParallelSentences_" + fLang + "-" + eLang + "_F1=" + threshold + "[" + positiveLabelId + "]");

        // Resource files located under the vocabulary directory.
        String eSentModel = vocabDir + "/" + eLang + "-sent.bin";
        String eTokenModel = vocabDir + "/" + eLang + "-token.bin";
        String eVocabOfE2F = vocabDir + "/vocab." + eLang + "-" + fLang + "." + eLang;
        String eVocabOfF2E = vocabDir + "/vocab." + fLang + "-" + eLang + "." + eLang;
        String fSentModel = vocabDir + "/" + fLang + "-sent.bin";
        String fTokenModel = vocabDir + "/" + fLang + "-token.bin";
        String fVocabOfF2E = vocabDir + "/vocab." + fLang + "-" + eLang + "." + fLang;
        String fVocabOfE2F = vocabDir + "/vocab." + eLang + "-" + fLang + "." + fLang;
        String ttableF2E = vocabDir + "/ttable." + fLang + "-" + eLang;

        // Settings read back by the tasks.
        conf.set("eDir", eDir);
        conf.set("fDir", fDir);
        conf.set("eLang", eLang);
        conf.set("fLang", fLang);
        conf.setInt("NumReducers", 50);
        conf.setFloat("ClassifierThreshold", threshold);
        conf.setInt("ClassifierId", positiveLabelId);

        // Distributed cache: positions 0-14, order must not change.
        sLogger.info("caching files...");
        sLogger.info("caching files...0,1,2,3,4");
        DistributedCache.addCacheFile(new URI(eEnv.getDfByTermData()), conf);
        DistributedCache.addCacheFile(new URI(eSentModel), conf);
        DistributedCache.addCacheFile(new URI(eTokenModel), conf);
        DistributedCache.addCacheFile(new URI(eVocabOfE2F), conf);
        DistributedCache.addCacheFile(new URI(eVocabOfF2E), conf);
        sLogger.info("caching files...5,6,7,8,9");
        DistributedCache.addCacheFile(new URI(fDir + "/transDf.dat"), conf);
        DistributedCache.addCacheFile(new URI(fSentModel), conf);
        DistributedCache.addCacheFile(new URI(fTokenModel), conf);
        DistributedCache.addCacheFile(new URI(fVocabOfF2E), conf);
        DistributedCache.addCacheFile(new URI(fVocabOfE2F), conf);
        sLogger.info("caching files...10,11,12,13,14");
        DistributedCache.addCacheFile(new URI(ttableF2E), conf);
        DistributedCache.addCacheFile(new URI(vocabDir + "/ttable." + eLang + "-" + fLang), conf);
        DistributedCache.addCacheFile(new URI(eEnv.getIndexTermsData()), conf);
        DistributedCache.addCacheFile(new URI(classifierFile), conf);
        DistributedCache.addCacheFile(new URI(pwsimOutputPath), conf);

        // Input and output locations.
        FileInputFormat.addInputPaths(conf, eCollectionPath);
        FileInputFormat.addInputPaths(conf, fCollectionPath);
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        // Task tuning.
        conf.setInt("mapred.task.timeout", 60000000);
        conf.set("mapred.child.java.opts", "-Xmx2000m");
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
        conf.setNumMapTasks(100);
        conf.setNumReduceTasks(50);
        conf.setInt("mapred.min.split.size", 2000000000);

        // Formats and classes.
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setMapOutputKeyClass(PairOfInts.class);
        conf.setMapOutputValueClass(WikiDocInfo.class);
        conf.setMapperClass(MyMapper.class);
        conf.setReducerClass(MyReducer.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        JobClient.runJob(conf);
        return 0;
    }

    /**
     * Command-line entry point: runs the tool through ToolRunner and exits with its return code.
     *
     * @param strArr command-line arguments, passed to {@link #run(String[])} after generic-option parsing
     */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(new FindParallelSentencePairsOld(), strArr);
        System.exit(exitCode);
    }
}
