package edu.umd.hooka;

import edu.umd.hooka.alignment.aer.ReferenceAlignment;
import edu.umd.hooka.corpora.Chunk;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
import edu.umd.hooka.corpora.ParallelChunk;
import edu.umd.hooka.corpora.ParallelCorpusReader;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.streaming.StreamXmlRecordReader;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:edu/umd/hooka/CorpusVocabNormalizerAndNumberizer.class */
public class CorpusVocabNormalizerAndNumberizer {
    private static final Logger sLogger = Logger.getLogger(CorpusVocabNormalizerAndNumberizer.class);
    static final String SRC_LANG = "ha.sourcelang";
    static final String TGT_LANG = "ha.targetlang";

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:edu/umd/hooka/CorpusVocabNormalizerAndNumberizer$BitextCompilerCounters.class */
    public enum BitextCompilerCounters {
        /** Incremented in the mapper by the word count of the phrase numberized against the "E" vocabulary. */
        EN_WORDS,
        /** Companion word counter to {@link #EN_WORDS}; presumably the "F" (source) side -- confirm at the increment site. */
        FR_WORDS,
        /** Incremented once for every phrase pair the mapper emits. */
        CHUNKS,
        /** Incremented when a record lacks a chunk for the configured source or target language. */
        WRONG_LANGUAGE,
        /** Incremented when the source chunk is over the mapper's length limit and the record is skipped. */
        SRC_TOO_LONG,
        /** Incremented when the target chunk is over the mapper's length limit and the record is skipped. */
        TGT_TOO_LONG;

        /**
         * Returns every counter in declaration order.
         *
         * <p>Decompiler-renamed twin of the implicit {@code values()} method. The caller receives
         * its own array, so modifying it never affects later calls.
         *
         * @return a newly allocated array holding all constants of this enum
         */
        public static BitextCompilerCounters[] valuesCustom() {
            return values().clone();
        }
    }

    /* loaded from: input_file:edu/umd/hooka/CorpusVocabNormalizerAndNumberizer$BitextCompilerMapper.class */
    /**
     * Mapper that parses one parallel-chunk record, preprocesses and numberizes the words of its
     * source and target sides, and emits the result as a {@link PhrasePair} keyed by the chunk id.
     *
     * <p>The two vocabularies are {@code static}, so every mapper instance in the same JVM shares
     * them; they are created lazily in {@link #configure(JobConf)} and written out in
     * {@link #close()}. Their initialisation is not synchronised.
     */
    public static class BitextCompilerMapper extends MapReduceBase implements Mapper<Text, Text, Text, PhrasePair> {
        /** Records whose source or target chunk reports a length above this are skipped. */
        private static final int MAX_CHUNK_LENGTH = 200;

        /** Vocabulary used for the target-language side; written to {@code <root>/vocab.E}. */
        static Vocab vocE = null;
        /** Vocabulary used for the source-language side; written to {@code <root>/vocab.F}. */
        static Vocab vocF = null;
        // The next four fields are not used anywhere in this class; kept because they are
        // package-visible and may be referenced from elsewhere.
        String outputBase = null;
        Path pf = null;
        Path pe = null;
        Path pa = null;
        ParallelCorpusReader pcr = new ParallelCorpusReader();
        Language src = null;
        Language tgt = null;
        /** Word preprocessor applied to the source side. */
        AlignmentWordPreprocessor sawp = null;
        /** Word preprocessor applied to the target side. */
        AlignmentWordPreprocessor tawp = null;
        LanguagePair lp = null;
        JobConf job_ = null;
        /** Reusable output key, overwritten with the chunk id on every call to {@code map}. */
        Text ok = new Text("");

        /**
         * Reads the language pair from the job configuration, makes sure the shared vocabularies
         * exist and builds the word preprocessors.
         *
         * @param jobConf job configuration; must carry {@code ha.sourcelang} and {@code ha.targetlang}
         */
        public void configure(JobConf jobConf) {
            // Logging for the whole outer class is switched off inside tasks.
            CorpusVocabNormalizerAndNumberizer.sLogger.setLevel(Level.OFF);
            this.src = Language.languageForISO639_1(jobConf.get(CorpusVocabNormalizerAndNumberizer.SRC_LANG));
            this.tgt = Language.languageForISO639_1(jobConf.get(CorpusVocabNormalizerAndNumberizer.TGT_LANG));
            CorpusVocabNormalizerAndNumberizer.sLogger.debug("Source language: " + this.src.code());
            CorpusVocabNormalizerAndNumberizer.sLogger.debug("Target language: " + this.tgt.code());

            // Only the in-process vocabularies are supported. The alternative vocab-server client
            // path (keys "ha.vocabserver.host"/"port1"/"port2") was guarded by a constant-false
            // condition and could never run, so it has been dropped.
            if (vocE == null) {
                vocE = new VocabularyWritable();
            }
            if (vocF == null) {
                vocF = new VocabularyWritable();
            }

            this.lp = LanguagePair.languageForISO639_1Pair(this.src.code() + "-" + this.tgt.code());
            if (jobConf.getBoolean("ha.trunc.use", true)) {
                this.sawp = AlignmentWordPreprocessor.CreatePreprocessor(this.lp, this.src, jobConf);
                this.tawp = AlignmentWordPreprocessor.CreatePreprocessor(this.lp, this.tgt, jobConf);
            } else {
                this.sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, jobConf);
                this.tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, jobConf);
            }
            this.job_ = jobConf;
        }

        /**
         * Maps each word to its id in {@code vocab}, adding words that are not present yet.
         *
         * @param strArr words to numberize
         * @param vocab vocabulary to look up / extend
         * @return array of ids, parallel to {@code strArr}
         */
        public int[] convertStrings(String[] strArr, Vocab vocab) {
            int[] ids = new int[strArr.length];
            // Checked once so the per-word message is not built when INFO logging is disabled.
            boolean logIds = CorpusVocabNormalizerAndNumberizer.sLogger.isInfoEnabled();
            for (int i = 0; i < strArr.length; i++) {
                ids[i] = vocab.addOrGet(strArr[i]);
                if (logIds) {
                    CorpusVocabNormalizerAndNumberizer.sLogger.info(strArr[i] + "-->" + ids[i]);
                }
            }
            return ids;
        }

        /**
         * Writes both vocabularies to {@code <root>/vocab.E} and {@code <root>/vocab.F}, where
         * {@code root} is the job configuration value of that name.
         *
         * @throws RuntimeException wrapping the {@link IOException} if either file cannot be written
         */
        public void close() {
            String root = this.job_.get("root", (String) null);
            String targetVocabFile = root + "/vocab.E";
            String sourceVocabFile = root + "/vocab.F";
            System.err.println("Target: " + vocE.size() + " types. Writing to " + targetVocabFile);
            System.err.println("Source: " + vocF.size() + " types .Writing to " + sourceVocabFile);
            try {
                FileSystem fileSystem = FileSystem.get(this.job_);
                writeVocab(fileSystem, new Path(targetVocabFile), vocE);
                writeVocab(fileSystem, new Path(sourceVocabFile), vocF);
            } catch (IOException e) {
                // Keep the original message but attach the cause so the stack trace is not lost.
                throw new RuntimeException("Vocab couldn't be written to disk.\n" + e.toString(), e);
            }
        }

        /** Serialises {@code vocab} to {@code path}, closing the stream even if the write fails. */
        private static void writeVocab(FileSystem fileSystem, Path path, Vocab vocab) throws IOException {
            DataOutputStream out = new DataOutputStream(new BufferedOutputStream(fileSystem.create(path)));
            try {
                ((VocabularyWritable) vocab).write(out);
            } finally {
                out.close();
            }
        }

        /**
         * Converts one parallel-chunk record into a numberized phrase pair.
         *
         * <p>Records that lack either language, or whose chunk on either side is longer than
         * {@link #MAX_CHUNK_LENGTH}, are counted and skipped. A reference alignment is attached to
         * the emitted pair when the record provides one for the configured language pair.
         *
         * @param text the raw record text handed to the parallel corpus reader
         * @param text2 the record value; not used
         * @param outputCollector receives (chunk id, phrase pair)
         * @param reporter used for the {@link BitextCompilerCounters} counters
         * @throws IOException if the collector fails
         */
        public void map(Text text, Text text2, OutputCollector<Text, PhrasePair> outputCollector, Reporter reporter) throws IOException {
            ParallelChunk pchunk = this.pcr.parseString(text.toString());
            this.ok.set(pchunk.idString());
            Chunk srcChunk = pchunk.getChunk(this.src);
            Chunk tgtChunk = pchunk.getChunk(this.tgt);
            if (srcChunk == null || tgtChunk == null) {
                reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1L);
                return;
            }
            if (srcChunk.getLength() > MAX_CHUNK_LENGTH) {
                reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1L);
                return;
            }
            if (tgtChunk.getLength() > MAX_CHUNK_LENGTH) {
                reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1L);
                return;
            }

            // Target side is numberized first, then source, matching the original call order.
            CorpusVocabNormalizerAndNumberizer.sLogger.debug("Target sentence:");
            int[] tgtIds = convertStrings(this.tawp.preprocessWordsForAlignment(tgtChunk.getWords()), vocE);
            CorpusVocabNormalizerAndNumberizer.sLogger.debug("Source sentence:");
            int[] srcIds = convertStrings(this.sawp.preprocessWordsForAlignment(srcChunk.getWords()), vocF);

            Phrase tgtPhrase = new Phrase(tgtIds, 0);
            Phrase srcPhrase = new Phrase(srcIds, 1);
            PhrasePair phrasePair = new PhrasePair(srcPhrase, tgtPhrase);
            ReferenceAlignment referenceAlignment = pchunk.getReferenceAlignment(this.lp);
            if (referenceAlignment != null) {
                phrasePair.setAlignment(referenceAlignment);
            }

            reporter.incrCounter(BitextCompilerCounters.EN_WORDS, tgtPhrase.getWords().length);
            // Previously referenced an undefined decompiler temporary; the source-side phrase is
            // the only other phrase in scope and the counterpart of EN_WORDS.
            reporter.incrCounter(BitextCompilerCounters.FR_WORDS, srcPhrase.getWords().length);
            reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1L);
            outputCollector.collect(this.ok, phrasePair);
        }

        // The explicit map(Object, Object, OutputCollector, Reporter) bridge emitted by the
        // decompiler is intentionally omitted: the compiler generates that bridge itself, and
        // declaring it by hand clashes with the erasure of Mapper.map.
    }

    /* loaded from: input_file:edu/umd/hooka/CorpusVocabNormalizerAndNumberizer$XMLInput.class */
    /**
     * File input format that hands each split to a {@link StreamXmlRecordReader}.
     * Inputs for which a compression codec is registered are rejected.
     *
     * <p>NOTE(review): the codec factory is only created in {@link #configure(JobConf)}, and this
     * class does not itself declare {@code JobConfigurable}. Confirm the framework actually calls
     * {@code configure}; while the factory is {@code null} every file is treated as uncompressed.
     */
    public static class XMLInput extends FileInputFormat<Text, Text> {
        private CompressionCodecFactory compressionCodecs = null;

        /**
         * Builds the codec factory from the job configuration.
         *
         * @param jobConf job configuration used to discover the available codecs
         */
        public void configure(JobConf jobConf) {
            this.compressionCodecs = new CompressionCodecFactory(jobConf);
        }

        /**
         * A file may be split unless a compression codec matches its path.
         *
         * @param fileSystem unused
         * @param path file under consideration
         * @return {@code true} when no codec is known for {@code path}
         */
        protected boolean isSplitable(FileSystem fileSystem, Path path) {
            return !isCompressed(path);
        }

        /**
         * Opens the split's file and wraps it in a {@link StreamXmlRecordReader}.
         *
         * @param inputSplit the split to read; must be a {@link FileSplit}
         * @param jobConf job configuration passed through to the reader
         * @param reporter receives the split description as its status
         * @return a reader over the XML records of the split
         * @throws IOException if the file cannot be opened
         * @throws RuntimeException if a compression codec matches the file
         */
        public RecordReader<Text, Text> getRecordReader(InputSplit inputSplit, JobConf jobConf, Reporter reporter) throws IOException {
            reporter.setStatus(inputSplit.toString());
            FileSplit fileSplit = (FileSplit) inputSplit;
            Path path = fileSplit.getPath();
            // Reject compressed input before opening the file so no stream is left unclosed.
            if (isCompressed(path)) {
                throw new RuntimeException("Not handling compression!");
            }
            FSDataInputStream in = path.getFileSystem(jobConf).open(path);
            return new StreamXmlRecordReader(in, fileSplit, reporter, jobConf, FileSystem.get(jobConf));
        }

        /** Returns whether a codec factory is available and recognises {@code path}. */
        private boolean isCompressed(Path path) {
            return this.compressionCodecs != null && this.compressionCodecs.getCodec(path) != null;
        }
    }

    /**
     * Configures and synchronously runs the "bitext.compile" job: XML {@code <pchunk>} records
     * from {@code str} are mapped by {@link BitextCompilerMapper} and written to {@code path} as a
     * sequence file of (Text, PhrasePair), using one map task hint and one identity reducer.
     *
     * <p>The earlier version carried vocab-server and thread teardown code behind constant-false
     * conditions; it could never execute (and dereferenced variables that were always
     * {@code null}), so it has been removed. Exceptions from the job propagate unchanged.
     *
     * @param configuration base configuration; expected to carry the language settings read by the mapper
     * @param str input path(s), as accepted by {@code FileInputFormat.setInputPaths(JobConf, String)}
     * @param path output directory for the sequence file
     * @throws IOException if the job cannot be submitted or fails
     */
    public static void preprocessAndNumberizeFiles(Configuration configuration, String str, Path path) throws IOException {
        sLogger.setLevel(Level.INFO);
        JobConf jobConf = new JobConf(configuration);
        jobConf.setJobName("bitext.compile");

        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(PhrasePair.class);
        jobConf.setMapperClass(BitextCompilerMapper.class);
        jobConf.setReducerClass(IdentityReducer.class);
        jobConf.setNumMapTasks(1);
        jobConf.setNumReduceTasks(1);

        // Input: records delimited by <pchunk ... </pchunk>, read through XMLInput.
        FileInputFormat.setInputPaths(jobConf, str);
        jobConf.set("stream.recordreader.begin", "<pchunk");
        jobConf.set("stream.recordreader.end", "</pchunk>");
        jobConf.set("stream.recordreader.slowmatch", "false");
        jobConf.set("stream.recordreader.maxrec", "100000");
        jobConf.setInputFormat(XMLInput.class);

        FileOutputFormat.setOutputPath(jobConf, path);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);
        jobConf.setJarByClass(CorpusVocabNormalizerAndNumberizer.class);

        System.out.println("Running job " + jobConf.getJobName());
        System.out.println("Input: " + str);
        System.out.println("Output: " + path);
        JobClient.runJob(jobConf);
    }

    /**
     * Development entry point: runs the bitext compilation job for German to English on
     * hard-coded input and output locations. Any failure is printed to standard error.
     *
     * @param strArr command-line arguments; ignored
     */
    public static void main(String[] strArr) {
        // An unused array of developer-local paths was removed from here.
        try {
            Configuration configuration = new Configuration();
            configuration.set(SRC_LANG, "de");
            configuration.set(TGT_LANG, "en");
            preprocessAndNumberizeFiles(configuration, "/umd-lin/fture/mt/eu-nc-wmt2008.de-en.xml", new Path("/umd-lin/fture/mt/aligner/comp-bitext"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
