package ivory.lsh.eval;

import edu.umd.cloud9.io.map.HMapIFW;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.io.map.HMapSIW;
import edu.umd.cloud9.util.array.ArrayListOfInts;
import edu.umd.cloud9.util.map.MapKF;
import edu.umd.cloud9.util.map.MapKI;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.alignment.HadoopAlign;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.core.util.CLIRUtils;
import ivory.pwsim.score.Bm25;
import ivory.pwsim.score.ScoringModel;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/lsh/eval/BitextClassifierUtils.class */
public class BitextClassifierUtils {
    // Allocated in main(); not read anywhere in the visible code.
    static HMapSIW numSentencesPerDocE;
    static HMapSIW numSentencesPerDocF;
    // Mean token count per line of the second file read by readSentences (the lines that also
    // feed fSentTfs/deSentLengths); used as the BM25 average length in translateDocVectors.
    static float avgDeDocLeng;
    // Mean token count per line of the first file read by readSentences (the lines that also
    // feed eSentTfs/enSentLengths); passed to buildDocVectors by the driver.
    static float avgEnDocLeng;
    // Not referenced in the visible code.
    static float avgGDocLeng;
    // Vocabularies and translation tables, loaded in runPrepareSentenceExtractionData from the
    // paths given on the command line.
    static Vocab eVocabSrc;
    static Vocab eVocabTrg;
    static Vocab fVocabSrc;
    static Vocab fVocabTrg;
    static TTable_monolithic_IFAs f2e_Probs;
    static TTable_monolithic_IFAs e2f_Probs;
    // Command-line option set; (re)built on every parseArgs call and printed by printUsage.
    private static Options options;
    // Command-line option names.
    private static final String FLANG_OPTION = "f_lang";
    private static final String ELANG_OPTION = "e_lang";
    private static final String FBITEXT_OPTION = "f_bitext";
    private static final String EBITEXT_OPTION = "e_bitext";
    private static final String FSRC_OPTION = "f_srcvocab";
    private static final String ESRC_OPTION = "e_srcvocab";
    private static final String FTRG_OPTION = "f_trgvocab";
    private static final String ETRG_OPTION = "e_trgvocab";
    private static final String F2E_OPTION = "f2e_ttable";
    private static final String E2F_OPTION = "e2f_ttable";
    private static final String ETOK_OPTION = "e_tokenizer";
    private static final String FTOK_OPTION = "f_tokenizer";
    private static final String ESTOP_OPTION = "e_stopwords";
    private static final String FSTOP_OPTION = "f_stopwords";
    private static final String FEAT_OPTION = "feature";
    private static final String PROB_OPTION = "prob";
    // Registered as an accepted option in parseArgs but never read back in the visible code.
    private static final String LIBJARS_OPTION = "libjars";
    // Not referenced in the visible code.
    static List<HMapSIW> fDocs = new ArrayList();
    // Per-line term-frequency maps and trimmed raw lines, appended by readSentences. Entry k of
    // the f-side lists and entry k of the e-side lists come from the same line number.
    static List<HMapSIW> fSentTfs = new ArrayList();
    static List<String> fSents = new ArrayList();
    // Not referenced in the visible code.
    static List<HMapSIW> eDocs = new ArrayList();
    static List<HMapSIW> eSentTfs = new ArrayList();
    static List<String> eSents = new ArrayList();
    // Per-line token counts, parallel to eSentTfs and fSentTfs respectively.
    static ArrayListOfInts enSentLengths = new ArrayListOfInts();
    static ArrayListOfInts deSentLengths = new ArrayListOfInts();
    // Not referenced in the visible code.
    static List<HMapSIW> gDocs = new ArrayList();
    // Per-term counts of how many lines contain the term: dfE for the lines stored in eSentTfs,
    // dfD for the lines stored in fSentTfs (both filled by readSentences).
    static HMapSIW dfE = new HMapSIW();
    static HMapSIW dfD = new HMapSIW();
    // Not referenced in the visible code.
    static HMapSIW dfG = new HMapSIW();

    /**
     * Translates every term-frequency map in {@code list} with {@code CLIRUtils.translateTFs}
     * and converts the result into a BM25-weighted term vector via
     * {@code CLIRUtils.createTermDocVector}.
     *
     * <p>The BM25 model is configured with {@code list.size()} as the document count and the
     * static {@code avgDeDocLeng} as the average document length, so {@code readSentences} must
     * have run first.
     *
     * @param str first argument handed to the tokenizer factory (the driver passes the e-language code)
     * @param str2 tokenizer model path handed to the tokenizer factory
     * @param str3 stopword file path; {@code str3 + ".stemmed"} is passed as the stemmed stopword file
     * @param list term-frequency maps to translate, one per sentence
     * @param hMapSIW document-frequency table used when weighting the translated vectors
     * @return one vector per element of {@code list}, in the same order
     */
    private List<HMapSFW> translateDocVectors(String str, String str2, String str3, List<HMapSIW> list, HMapSIW hMapSIW) {
        Bm25 scorer = new Bm25();
        scorer.setDocCount(list.size());
        scorer.setAvgDocLength(avgDeDocLeng);

        List<HMapSFW> translatedVectors = new ArrayList<HMapSFW>(list.size());
        Tokenizer tokenizer = TokenizerFactory.createTokenizer(str, str2, true, str3, str3 + ".stemmed", null);

        for (HMapSIW termFreqs : list) {
            HMapIFW translatedTfs = new HMapIFW();
            int docLength = 0;
            try {
                docLength = CLIRUtils.translateTFs(termFreqs, translatedTfs, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, tokenizer, (Logger) null);
            } catch (IOException e) {
                // Best effort: a sentence that fails to translate still yields a vector (built
                // from whatever reached translatedTfs, with length 0) so that the output stays
                // index-aligned with the input list.
                e.printStackTrace();
            }
            // Use the caller-supplied DF table rather than reaching for the global dfE; the
            // only caller in this class passes dfE, so results are unchanged.
            translatedVectors.add(CLIRUtils.createTermDocVector(docLength, translatedTfs, eVocabTrg, (ScoringModel) scorer, hMapSIW, true, (Logger) null));
        }
        return translatedVectors;
    }

    /**
     * Reads two line-aligned UTF-8 text files in lockstep, tokenizes each line and fills the
     * static per-sentence tables.
     *
     * <p>For every line of {@code str} the corresponding line of {@code str2} is read. Lines of
     * {@code str} feed {@code eSents}, {@code eSentTfs}, {@code enSentLengths} and {@code dfE};
     * lines of {@code str2} feed {@code fSents}, {@code fSentTfs}, {@code deSentLengths} and
     * {@code dfD}. When reading finishes, {@code avgEnDocLeng} and {@code avgDeDocLeng} are set
     * to the mean token count per line on the respective side (0 when the input is empty).
     * Extra trailing lines in {@code str2} are ignored.
     *
     * <p>Both readers are always closed. I/O failures propagate to the caller instead of being
     * printed and ignored, so the caller never proceeds with partially filled tables.
     *
     * @param str path of the file whose lines populate the e-side tables
     * @param str2 path of the file whose lines populate the f-side tables
     * @param str3 first tokenizer-factory argument for the {@code str} side (language code in the driver)
     * @param str4 first tokenizer-factory argument for the {@code str2} side (language code in the driver)
     * @param vocab unused
     * @param vocab2 unused
     * @param str5 tokenizer model path for the {@code str2} side
     * @param str6 tokenizer model path for the {@code str} side
     * @param str7 stopword file path for the {@code str2} side
     * @param str8 stopword file path for the {@code str} side
     * @throws IOException if either file cannot be opened or read, or if {@code str2} has fewer
     *     lines than {@code str}
     */
    private void readSentences(String str, String str2, String str3, String str4, Vocab vocab, Vocab vocab2, String str5, String str6, String str7, String str8) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        Tokenizer eTokenizer = TokenizerFactory.createTokenizer(str3, str6, true, str8, str8 + ".stemmed", null);
        Tokenizer fTokenizer = TokenizerFactory.createTokenizer(str4, str5, true, str7, str7 + ".stemmed", null);

        float fTokenTotal = 0.0f;
        float eTokenTotal = 0.0f;
        int sentenceCount = 0;

        BufferedReader eReader = new BufferedReader(new InputStreamReader(new FileInputStream(str), "UTF-8"));
        try {
            BufferedReader fReader = new BufferedReader(new InputStreamReader(new FileInputStream(str2), "UTF-8"));
            try {
                String eLine;
                while ((eLine = eReader.readLine()) != null) {
                    String fLine = fReader.readLine();
                    if (fLine == null) {
                        // Previously this surfaced as a bare NullPointerException.
                        throw new IOException("Line count mismatch: " + str2 + " ended after " + sentenceCount + " lines but " + str + " has more");
                    }
                    fLine = fLine.trim();
                    eLine = eLine.trim();

                    // f-side: fall back to whitespace splitting when no tokenizer is available.
                    String[] fTokens = fTokenizer == null ? fLine.split(" ") : fTokenizer.processContent(fLine);
                    HMapSIW fTermFreqs = new HMapSIW();
                    for (String token : fTokens) {
                        // Count each term once per sentence in the DF table.
                        if (!fTermFreqs.containsKey(token)) {
                            dfD.increment(token);
                        }
                        fTermFreqs.increment(token);
                    }

                    String[] eTokens = eTokenizer.processContent(eLine);
                    HMapSIW eTermFreqs = new HMapSIW();
                    for (String token : eTokens) {
                        if (!eTermFreqs.containsKey(token)) {
                            dfE.increment(token);
                        }
                        eTermFreqs.increment(token);
                    }

                    fTokenTotal += fTokens.length;
                    eTokenTotal += eTokens.length;
                    enSentLengths.add(eTokens.length);
                    deSentLengths.add(fTokens.length);
                    eSentTfs.add(eTermFreqs);
                    fSentTfs.add(fTermFreqs);
                    eSents.add(eLine);
                    fSents.add(fLine);
                    sentenceCount++;
                }
            } finally {
                fReader.close();
            }
        } finally {
            eReader.close();
        }

        // Avoid 0/0 = NaN averages on empty input.
        avgDeDocLeng = sentenceCount == 0 ? 0.0f : fTokenTotal / sentenceCount;
        avgEnDocLeng = sentenceCount == 0 ? 0.0f : eTokenTotal / sentenceCount;
    }

    /**
     * Builds one length-normalized BM25 term vector per term-frequency map.
     *
     * <p>For each map, the document length is the sum of its term frequencies. Every term is
     * weighted with {@code Bm25.computeDocumentWeight} using the document frequency looked up in
     * {@code hMapSIW}; only strictly positive weights are kept. The kept weights are then divided
     * by their Euclidean norm.
     *
     * @param list term-frequency maps, one per document
     * @param f average document length given to the BM25 model
     * @param hMapSIW document-frequency table
     * @return one weight vector per element of {@code list}, in the same order
     */
    private List<HMapSFW> buildDocVectors(List<HMapSIW> list, float f, HMapSIW hMapSIW) {
        Bm25 scorer = new Bm25();
        scorer.setDocCount(list.size());
        scorer.setAvgDocLength(f);

        List<HMapSFW> vectors = new ArrayList<HMapSFW>();
        for (HMapSIW termFreqs : list) {
            // First pass: total number of tokens in this document.
            int docLength = 0;
            for (MapKI.Entry tfEntry : termFreqs.entrySet()) {
                docLength += tfEntry.getValue();
            }

            // Second pass: BM25 weight per term, accumulating the squared norm as we go.
            HMapSFW weights = new HMapSFW();
            float sumOfSquares = 0.0f;
            for (MapKI.Entry tfEntry : termFreqs.entrySet()) {
                String term = (String) tfEntry.getKey();
                int tf = tfEntry.getValue();
                scorer.setDF(hMapSIW.get(term));
                float weight = scorer.computeDocumentWeight(tf, docLength);
                if (weight > 0.0f) {
                    weights.put(term, weight);
                    sumOfSquares += weight * weight;
                }
            }

            // Normalize in place; when nothing was kept the map is empty and no division occurs.
            float norm = (float) Math.sqrt(sumOfSquares);
            for (MapKF.Entry weightEntry : weights.entrySet()) {
                String term = (String) weightEntry.getKey();
                weights.put(term, weights.get(term) / norm);
            }
            vectors.add(weights);
        }
        return vectors;
    }

    /**
     * Reads a UTF-8 text file into a list with one element per line.
     *
     * <p>This is best effort: if the file cannot be opened or a read fails, the stack trace is
     * printed and the lines read so far (possibly none) are returned. The reader is closed on
     * every path, including after a failed read.
     *
     * @param str path of the file to read
     * @return the lines read, in file order; never {@code null}
     */
    private List<String> readAlignments(String str) {
        List<String> lines = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(str), "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException and UnsupportedEncodingException, both of
            // which are IOException subclasses.
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return lines;
    }

    /**
     * Prints one feature line to standard output for every pairing of an entry of {@code list5}
     * with an entry of {@code list6}.
     *
     * <p>A pair is labelled {@code parallel} when both indices are equal and {@code non_parallel}
     * otherwise. The features come from {@code CLIRUtils.computeFeaturesF1}, {@code F2} or
     * {@code F3} depending on {@code i}; for any other value of {@code i} nothing is printed for
     * the pair, although it is still counted. A summary line with the pair count and elapsed
     * milliseconds is printed at the end.
     *
     * @param list raw sentences parallel to {@code list5}
     * @param list2 raw sentences parallel to {@code list6}
     * @param tokenizer tokenizer forwarded to {@code computeFeaturesF3}
     * @param tokenizer2 second tokenizer forwarded to {@code computeFeaturesF3}
     * @param list3 term-frequency maps parallel to {@code list5}
     * @param list4 term-frequency maps parallel to {@code list6}
     * @param list5 vectors driving the outer loop; lengths are looked up in {@code deSentLengths}
     * @param list6 vectors driving the inner loop; lengths are looked up in {@code enSentLengths}
     * @param i feature set id (1, 2 or 3)
     * @param f value forwarded as the last argument of {@code computeFeaturesF2}/{@code F3}
     * @param list7 unused
     */
    private void prepareTrainTestData(List<String> list, List<String> list2, Tokenizer tokenizer, Tokenizer tokenizer2, List<HMapSIW> list3, List<HMapSIW> list4, List<HMapSFW> list5, List<HMapSFW> list6, int i, float f, List<String> list7) {
        int pairCount = 0;
        long startMillis = System.currentTimeMillis();

        for (int outer = 0; outer < list5.size(); outer++) {
            HMapSFW outerVector = list5.get(outer);
            HMapSIW outerTfs = list3.get(outer);
            String outerSentence = list.get(outer);

            for (int inner = 0; inner < list6.size(); inner++) {
                HMapSFW innerVector = list6.get(inner);
                HMapSIW innerTfs = list4.get(inner);
                String innerSentence = list2.get(inner);

                String label;
                if (outer == inner) {
                    label = "parallel";
                } else {
                    label = "non_parallel";
                }

                String[] features;
                switch (i) {
                    case 1:
                        features = CLIRUtils.computeFeaturesF1(innerVector, outerVector, enSentLengths.get(inner), deSentLengths.get(outer));
                        break;
                    case 2:
                        features = CLIRUtils.computeFeaturesF2(innerTfs, innerVector, outerTfs, outerVector, enSentLengths.get(inner), deSentLengths.get(outer), eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, f);
                        break;
                    case 3:
                        features = CLIRUtils.computeFeaturesF3(outerSentence, innerSentence, tokenizer, tokenizer2, innerTfs, innerVector, outerTfs, outerVector, enSentLengths.get(inner), deSentLengths.get(outer), eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, f);
                        break;
                    default:
                        features = null;
                        break;
                }

                if (features != null) {
                    System.out.println(concat(features) + " " + label);
                }
                pairCount++;
            }
        }

        long elapsedMillis = System.currentTimeMillis() - startMillis;
        System.out.println("Computed " + pairCount + " F" + i + " instances in " + elapsedMillis);
    }

    /**
     * Runs the whole pipeline: loads vocabularies and translation tables from the local file
     * system, reads the bitext, builds e-side BM25 vectors, builds translated f-side vectors, and
     * prints one feature line per sentence pair. Progress and timing messages go to standard
     * output.
     *
     * <p>Argument meanings below follow the call made from {@code main}.
     *
     * @param str f-language code
     * @param str2 e-language code
     * @param str3 f-side bitext path
     * @param str4 e-side bitext path
     * @param str5 f-language stopword file
     * @param str6 e-language stopword file
     * @param str7 path loaded into {@code fVocabSrc}
     * @param str8 path loaded into {@code eVocabTrg}
     * @param str9 path loaded into {@code eVocabSrc}
     * @param str10 path loaded into {@code fVocabTrg}
     * @param str11 path loaded into {@code f2e_Probs}
     * @param str12 path loaded into {@code e2f_Probs}
     * @param str13 f-language tokenizer model
     * @param str14 e-language tokenizer model
     * @param i feature set id
     * @param f probability threshold forwarded to feature computation
     * @param str15 optional alignment file; read with {@code readAlignments} when non-null
     * @throws RuntimeException wrapping any exception raised along the way, after the vocabulary
     *     and translation-table paths have been echoed to standard error
     */
    public void runPrepareSentenceExtractionData(String str, String str2, String str3, String str4, String str5, String str6, String str7, String str8, String str9, String str10, String str11, String str12, String str13, String str14, int i, float f, String str15) {
        try {
            LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
            List<String> alignments = (str15 != null) ? readAlignments(str15) : null;

            eVocabSrc = HadoopAlign.loadVocab(new Path(str9), localFs);
            eVocabTrg = HadoopAlign.loadVocab(new Path(str8), localFs);
            fVocabSrc = HadoopAlign.loadVocab(new Path(str7), localFs);
            fVocabTrg = HadoopAlign.loadVocab(new Path(str10), localFs);
            f2e_Probs = new TTable_monolithic_IFAs(localFs, new Path(str11), true);
            e2f_Probs = new TTable_monolithic_IFAs(localFs, new Path(str12), true);

            Tokenizer fTokenizer = TokenizerFactory.createTokenizer((FileSystem) localFs, str, str13, false);
            Tokenizer eTokenizer = TokenizerFactory.createTokenizer((FileSystem) localFs, str2, str14, false);

            long readStart = System.currentTimeMillis();
            readSentences(str4, str3, str2, str, eVocabTrg, fVocabSrc, str13, str14, str5, str6);
            long readDone = System.currentTimeMillis();
            System.out.println("Sentences read in " + (readDone - readStart) + " ms. Number of sentences: " + fSentTfs.size() + " = " + eSentTfs.size());

            List<HMapSFW> eVectors = buildDocVectors(eSentTfs, avgEnDocLeng, dfE);
            long eVectorsDone = System.currentTimeMillis();
            System.out.println("E vectors created in " + (eVectorsDone - readDone) + " ms");

            List<HMapSFW> fVectors = translateDocVectors(str2, str14, str6, fSentTfs, dfE);
            long fVectorsDone = System.currentTimeMillis();
            System.out.println("F vectors created in " + (fVectorsDone - eVectorsDone) + " ms. Number of vectors: " + fVectors.size() + " = " + eVectors.size());

            prepareTrainTestData(fSents, eSents, fTokenizer, eTokenizer, fSentTfs, eSentTfs, fVectors, eVectors, i, f, alignments);
            System.out.println("Features computed in " + (System.currentTimeMillis() - fVectorsDone) + " ms");
        } catch (Exception e) {
            // Echo the resource paths so a failed run shows which inputs were in play.
            String[] resourcePaths = {str9, str8, str7, str10, str11, str12};
            for (String resourcePath : resourcePaths) {
                System.err.println(resourcePath);
            }
            throw new RuntimeException(e);
        }
    }

    /**
     * Command-line entry point. Parses the arguments, prints usage (and exits) when parsing
     * fails, otherwise runs the sentence-pair feature extraction and reports the total time.
     *
     * @param strArr command-line arguments; see {@code parseArgs} for the accepted options
     * @throws Exception if the feature id or probability threshold is not a valid number, or if
     *     the pipeline fails
     */
    public static void main(String[] strArr) throws Exception {
        CommandLine cmdline = parseArgs(strArr);
        if (cmdline == null) {
            printUsage();
            return;
        }

        BitextClassifierUtils utils = new BitextClassifierUtils();
        numSentencesPerDocE = new HMapSIW();
        numSentencesPerDocF = new HMapSIW();
        long startMillis = System.currentTimeMillis();

        String fLang = cmdline.getOptionValue(FLANG_OPTION);
        String eLang = cmdline.getOptionValue(ELANG_OPTION);
        String fBitext = cmdline.getOptionValue(FBITEXT_OPTION);
        String eBitext = cmdline.getOptionValue(EBITEXT_OPTION);
        String fStopwords = cmdline.getOptionValue(FSTOP_OPTION);
        String eStopwords = cmdline.getOptionValue(ESTOP_OPTION);
        String fSrcVocab = cmdline.getOptionValue(FSRC_OPTION);
        String eTrgVocab = cmdline.getOptionValue(ETRG_OPTION);
        String eSrcVocab = cmdline.getOptionValue(ESRC_OPTION);
        String fTrgVocab = cmdline.getOptionValue(FTRG_OPTION);
        String f2eTable = cmdline.getOptionValue(F2E_OPTION);
        String e2fTable = cmdline.getOptionValue(E2F_OPTION);
        String fTokenizerModel = cmdline.getOptionValue(FTOK_OPTION);
        String eTokenizerModel = cmdline.getOptionValue(ETOK_OPTION);
        int featureSet = Integer.parseInt(cmdline.getOptionValue(FEAT_OPTION));

        // The threshold is optional and defaults to 0.
        float probThreshold = 0.0f;
        if (cmdline.hasOption(PROB_OPTION)) {
            probThreshold = Float.parseFloat(cmdline.getOptionValue(PROB_OPTION));
        }

        utils.runPrepareSentenceExtractionData(fLang, eLang, fBitext, eBitext, fStopwords, eStopwords, fSrcVocab, eTrgVocab, eSrcVocab, fTrgVocab, f2eTable, e2fTable, fTokenizerModel, eTokenizerModel, featureSet, probThreshold, null);

        System.out.println("Done in " + (System.currentTimeMillis() - startMillis) + " ms");
    }

    /**
     * Prints the help text for the currently registered options and terminates the JVM with
     * exit status -1.
     */
    private static void printUsage() {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp("BitextClassifierUtils", options);
        System.exit(-1);
    }

    /**
     * Joins the elements of {@code strArr} into one string, appending a single space after every
     * element (including the last one).
     *
     * <p>A {@code null} element is rendered as the text {@code "null"}. Uses a
     * {@link StringBuilder} so the cost is linear in the total output length.
     *
     * @param strArr the values to join; must not be {@code null}
     * @return the joined string, empty when {@code strArr} has no elements
     * @throws NullPointerException if {@code strArr} is {@code null}
     */
    public String concat(String[] strArr) {
        StringBuilder joined = new StringBuilder();
        for (String element : strArr) {
            joined.append(element).append(' ');
        }
        return joined.toString();
    }

    /**
     * Rebuilds the static {@code options} set and parses the command line against it.
     *
     * <p>All options take one argument. Every option except {@code prob} and {@code libjars} is
     * required.
     *
     * @param strArr raw command-line arguments
     * @return the parsed command line, or {@code null} after printing the parser's message to
     *     standard error when the arguments are invalid
     */
    private static CommandLine parseArgs(String[] strArr) {
        options = new Options();

        final String langCodes = "en|de|tr|cs|zh|ar|es";
        final String vocabArg = "path to Vocab object";
        final String ttableArg = "path to TTable object";
        final String tokenizerArg = "path to Tokenizer object";
        final String stopwordArg = "path to stopword list";

        addArgOption(FLANG_OPTION, langCodes, "two-letter code for f-language", true);
        addArgOption(ELANG_OPTION, langCodes, "two-letter code for e-language", true);
        addArgOption(FBITEXT_OPTION, "path", "source-side of training bitext", true);
        addArgOption(EBITEXT_OPTION, "path", "target-side of training bitext", true);
        addArgOption(FSRC_OPTION, vocabArg, "source vocabulary (f-side) of P(e|f)", true);
        addArgOption(ESRC_OPTION, vocabArg, "source vocabulary (e-side) of P(f|e)", true);
        addArgOption(FTRG_OPTION, vocabArg, "target vocabulary (f-side) of P(f|e)", true);
        addArgOption(ETRG_OPTION, vocabArg, "target vocabulary (e-side) of P(e|f)", true);
        addArgOption(F2E_OPTION, ttableArg, "translation table P(e|f)", true);
        addArgOption(E2F_OPTION, ttableArg, "translation table P(f|e)", true);
        addArgOption(FTOK_OPTION, tokenizerArg, "tokenizer model for f-language", true);
        addArgOption(ETOK_OPTION, tokenizerArg, "tokenizer model for e-language", true);
        addArgOption(FSTOP_OPTION, stopwordArg, "stopwords for f-language", true);
        addArgOption(ESTOP_OPTION, stopwordArg, "stopwords for e-language", true);
        addArgOption(FEAT_OPTION, "1|2|3", "id of feature set to be used", true);
        addArgOption(PROB_OPTION, "0-1", "lower threshold for token translation probability", false);
        addArgOption(LIBJARS_OPTION, "jar packages", "Hadoop option to load external jars", false);

        try {
            return new GnuParser().parse(options, strArr);
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            return null;
        }
    }

    /**
     * Registers a single-argument option in the static {@code options} set.
     *
     * @param name option name as typed on the command line
     * @param argName placeholder shown for the option's argument in the help text
     * @param description help text for the option
     * @param required whether the parser must reject command lines lacking this option
     */
    private static void addArgOption(String name, String argName, String description, boolean required) {
        // OptionBuilder accumulates settings in static state until create() is called.
        OptionBuilder.withDescription(description);
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArg();
        if (required) {
            OptionBuilder.isRequired();
        }
        options.addOption(OptionBuilder.create(name));
    }
}
