package ivory.lsh.bitext;

import com.google.common.collect.Maps;
import edu.umd.cloud9.io.array.ArrayListOfIntsWritable;
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.map.HMapIFW;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.io.map.HMapSIW;
import edu.umd.cloud9.util.map.MapKI;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.DefaultFrequencySortedDictionary;
import ivory.core.data.dictionary.FrequencySortedDictionary;
import ivory.core.data.stat.DfTableArray;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.core.util.CLIRUtils;
import ivory.pwsim.score.Bm25;
import ivory.pwsim.score.ScoringModel;
import ivory.sqe.retrieval.Constants;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
import opennlp.model.MaxentModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/lsh/bitext/PreprocessHelper.class */
/**
 * Loads the per-language resources needed to preprocess a bitext pair (sentence detectors,
 * tokenizers, vocabularies, translation tables, scoring models and an optional classifier) and
 * uses them to split text into sentences and to build term vectors for those sentences.
 *
 * <p>Throughout this class the prefix {@code f} refers to the language read from the
 * {@code "fLang"} configuration property and {@code e} to the one read from {@code "eLang"}.
 *
 * <p>Two loading modes exist: the {@link JobConf} constructor locates model files among the local
 * {@link DistributedCache} files by file-name fragment, while the {@link Configuration}
 * constructor reads explicit file paths from configuration properties and uses the local file
 * system for everything.
 */
public class PreprocessHelper {
    private static final Logger sLogger = Logger.getLogger(PreprocessHelper.class);

    /** Per-language value handed to {@code ScoringModel.setAvgDocLength} for sentence scoring. */
    private static final HMapSIW lang2AvgSentLen = new HMapSIW();

    /** Matches strings made up of digits only; such vector terms do not count as real terms. */
    private static final Pattern ALL_DIGITS = Pattern.compile("\\d+");

    /** Matches strings made up of whitespace only; such lines are skipped. */
    private static final Pattern ALL_WHITESPACE = Pattern.compile("\\s+");

    /** Substrings that cause an e-side sentence to be discarded. */
    private static final String[] E_SKIP_MARKERS = {"date:", "jpg", "png", "gif", "fontsize:", "category:"};

    /**
     * Substrings that cause an f-side sentence to be discarded.
     * NOTE(review): unlike the e-side list this one has no "gif" entry; kept as found.
     */
    private static final String[] F_SKIP_MARKERS = {"datei:", "jpg", "png", "fontsize:", "kategorie:"};

    static {
        lang2AvgSentLen.put(Constants.English, 21);
        lang2AvgSentLen.put(Constants.German, 16);
        lang2AvgSentLen.put(Constants.Chinese, 27);
        lang2AvgSentLen.put(Constants.French, 18);
        lang2AvgSentLen.put("tr", 12);
        lang2AvgSentLen.put(Constants.Arabic, 22);
        lang2AvgSentLen.put("es", 19);
        lang2AvgSentLen.put("cs", 18);
    }

    private final String eLang;
    private final String fLang;
    private final String eDir;
    private final int minVectorTerms;
    private final int minSentenceLength;

    private SentenceDetectorME fModel;
    private SentenceDetectorME eModel;
    private Tokenizer fTok;
    private Tokenizer eTok;
    private VocabularyWritable eVocabSrc;
    private VocabularyWritable eVocabTrg;
    private VocabularyWritable fVocabTrg;
    private VocabularyWritable fVocabSrc;
    private TTable_monolithic_IFAs f2eProbs;
    private TTable_monolithic_IFAs e2fProbs;
    private ScoringModel fScoreFn;
    private ScoringModel eScoreFn;
    private MaxentModel classifier;
    private DfTableArray dfTable;
    private DefaultFrequencySortedDictionary dict;

    /**
     * Creates a helper whose model files are looked up in the job's local DistributedCache files.
     * Also sets this class's logger level to INFO.
     *
     * @param i minimum number of non-numeric terms a sentence vector must contain to be kept
     * @param i2 minimum number of tokens a sentence must contain to be kept
     * @param jobConf job configuration providing "fLang", "eLang", "eDir" and tokenizer settings
     * @throws Exception if any model cannot be located or loaded
     */
    public PreprocessHelper(int i, int i2, JobConf jobConf) throws Exception {
        sLogger.setLevel(Level.INFO);
        this.fLang = jobConf.get("fLang");
        this.eLang = jobConf.get("eLang");
        this.eDir = jobConf.get("eDir");
        this.minVectorTerms = i;
        this.minSentenceLength = i2;
        loadModels(jobConf);
    }

    /**
     * Creates a helper whose model files are given as explicit paths in {@code configuration}
     * and are read from the local file system.
     *
     * @param i minimum number of non-numeric terms a sentence vector must contain to be kept
     * @param i2 minimum number of tokens a sentence must contain to be kept
     * @param configuration configuration providing "fLang", "eLang", "eDir" and the model paths
     * @throws Exception if any model cannot be loaded
     */
    public PreprocessHelper(int i, int i2, Configuration configuration) throws Exception {
        this.fLang = configuration.get("fLang");
        this.eLang = configuration.get("eLang");
        this.eDir = configuration.get("eDir");
        this.minVectorTerms = i;
        this.minSentenceLength = i2;
        loadFModels(configuration);
        loadEModels(configuration);
    }

    /**
     * (Re)loads all f-side and then all e-side models from the job's DistributedCache.
     *
     * @param jobConf job configuration to read settings and cache files from
     * @throws Exception if any model cannot be located or loaded
     */
    public void loadModels(JobConf jobConf) throws Exception {
        loadFModels(jobConf);
        loadEModels(jobConf);
    }

    /**
     * Loads the f-side sentence detector, all four vocabularies, both translation tables, the
     * f-side tokenizer and scoring model, and (when present in the cache) the classifier.
     */
    private void loadFModels(JobConf jobConf) throws Exception {
        sLogger.info("Loading models for " + this.fLang + " ...");
        FileSystem fileSystem = FileSystem.get(jobConf);
        LocalFileSystem local = FileSystem.getLocal(jobConf);

        String sentDetectorFile = getSentDetectorFile(this.fLang);
        String eVocabSrcFile = getSrcVocab(this.eLang, this.fLang);
        String eVocabTrgFile = getTrgVocab(this.fLang, this.eLang);
        String fVocabSrcFile = getSrcVocab(this.fLang, this.eLang);
        String fVocabTrgFile = getTrgVocab(this.eLang, this.fLang);
        String f2eTTableFile = getTTable(this.fLang, this.eLang);
        String e2fTTableFile = getTTable(this.eLang, this.fLang);
        String classifierFile = getClassifierFile();

        // Order matters: a cache file is assigned to the FIRST fragment its path contains.
        String[] fragments = {
            sentDetectorFile, eVocabSrcFile, eVocabTrgFile, fVocabSrcFile,
            fVocabTrgFile, f2eTTableFile, e2fTTableFile, classifierFile
        };
        String[] labels = {
            "--> sentdetector", "--> eVocabSrcFile", "--> eVocabTrgFile", "--> fVocabSrcFile",
            "--> fVocabTrgFile", "--> f2e_ttableFile", "--> e2f_ttableFile", "--> classifier model"
        };
        Map<String, Path> cached =
                locateCachedFiles(DistributedCache.getLocalCacheFiles(jobConf), fragments, labels);

        Path sentDetectorPath = requireCached(cached, sentDetectorFile);
        this.fModel = loadSentenceDetector(local, sentDetectorPath);
        sLogger.info("Sentence model created successfully from " + sentDetectorPath);

        this.eVocabSrc = HadoopAlign.loadVocab(requireCached(cached, eVocabSrcFile), local);
        this.eVocabTrg = HadoopAlign.loadVocab(requireCached(cached, eVocabTrgFile), local);
        this.fVocabSrc = HadoopAlign.loadVocab(requireCached(cached, fVocabSrcFile), local);
        this.fVocabTrg = HadoopAlign.loadVocab(requireCached(cached, fVocabTrgFile), local);
        this.f2eProbs = new TTable_monolithic_IFAs(local, requireCached(cached, f2eTTableFile), true);
        this.e2fProbs = new TTable_monolithic_IFAs(local, requireCached(cached, e2fTTableFile), true);

        String tokenizerFile = jobConf.get("fTokenizer");
        this.fTok = TokenizerFactory.createTokenizer(fileSystem, this.fLang, tokenizerFile, true, jobConf.get("fStopword"), jobConf.get("fStemmedStopword"), null);
        sLogger.info("Tokenizer and vocabs created successfully from " + this.fLang + " " + tokenizerFile + "," + jobConf.get("fStopword") + "," + jobConf.get("fStemmedStopword"));

        this.fScoreFn = new Bm25();
        this.fScoreFn.setAvgDocLength(lang2AvgSentLen.get(this.fLang));
        this.fScoreFn.setDocCount(new RetrievalEnvironment(this.eDir, fileSystem).readCollectionDocumentCount());

        // The classifier is optional in this mode: it is only built when a matching file is cached.
        Path classifierPath = cached.get(classifierFile);
        if (classifierPath != null) {
            this.classifier = new MoreGenericModelReader(classifierPath, (FileSystem) local).constructModel();
            sLogger.info("Bitext classifier created successfully from " + classifierPath);
        }
    }

    /**
     * Loads the e-side sentence detector (from the DistributedCache), tokenizer, scoring model,
     * term dictionary and document-frequency table (from the index at {@code eDir}).
     */
    private void loadEModels(JobConf jobConf) throws Exception {
        sLogger.info("Loading models for " + this.eLang + " ...");
        String sentDetectorFile = getSentDetectorFile(this.eLang);
        Map<String, Path> cached = locateCachedFiles(
                DistributedCache.getLocalCacheFiles(jobConf),
                new String[] {sentDetectorFile},
                new String[] {"--> sentdetector"});

        this.eModel = loadSentenceDetector(FileSystem.getLocal(jobConf), requireCached(cached, sentDetectorFile));
        sLogger.info("Sentence model created successfully.");

        FileSystem fileSystem = FileSystem.get(jobConf);
        RetrievalEnvironment env = new RetrievalEnvironment(this.eDir, fileSystem);
        sLogger.info("Environment created successfully at " + this.eDir);

        String tokenizerFile = jobConf.get("eTokenizer");
        this.eTok = TokenizerFactory.createTokenizer(fileSystem, this.eLang, tokenizerFile, true, jobConf.get("eStopword"), jobConf.get("eStemmedStopword"), null);
        sLogger.info("Tokenizer and vocabs created successfully from " + this.eLang + " " + tokenizerFile + "," + jobConf.get("eStopword") + "," + jobConf.get("eStemmedStopword"));

        this.eScoreFn = new Bm25();
        this.eScoreFn.setAvgDocLength(lang2AvgSentLen.get(this.eLang));
        this.eScoreFn.setDocCount(env.readCollectionDocumentCount());
        this.dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fileSystem);
        this.dfTable = new DfTableArray(new Path(env.getDfByTermData()), fileSystem);
    }

    /**
     * Builds a term vector for an f-side text, starting from an empty term-frequency map.
     *
     * @param str text to vectorize
     * @return the term vector, or {@code null} if it has fewer than the configured minimum number
     *     of non-numeric terms
     */
    public HMapSFW createFDocVector(String str) {
        return createFDocVector(str, new HMapSIW());
    }

    /**
     * Builds a term vector for an f-side text. The text is tokenized with the f-side tokenizer,
     * token counts are added to {@code hMapSIW}, the counts are passed term by term through
     * {@code CLIRUtils.updateTFsByTerm} with the vocabularies and translation tables, and the
     * result is turned into a vector by {@code CLIRUtils.createTermDocVector}.
     *
     * @param str text to vectorize
     * @param hMapSIW term-frequency map that is incremented in place with the tokens of {@code str}
     * @return the term vector, or {@code null} if it has fewer than the configured minimum number
     *     of non-numeric terms
     */
    public HMapSFW createFDocVector(String str, HMapSIW hMapSIW) {
        String[] tokens = this.fTok.processContent(str);
        for (String token : tokens) {
            hMapSIW.increment(token);
        }
        HMapIFW translatedTfs = new HMapIFW();
        for (MapKI.Entry<String> entry : hMapSIW.entrySet()) {
            translatedTfs = CLIRUtils.updateTFsByTerm(entry.getKey(), entry.getValue(), translatedTfs, this.eVocabSrc, this.eVocabTrg, this.fVocabSrc, this.fVocabTrg, this.e2fProbs, this.f2eProbs, this.eTok, sLogger);
        }
        HMapSFW vector = CLIRUtils.createTermDocVector(tokens.length, translatedTfs, this.eVocabTrg, this.fScoreFn, this.dict, this.dfTable, true, sLogger);
        return countNonNumericTerms(vector) < this.minVectorTerms ? null : vector;
    }

    /**
     * Builds a term vector for an e-side text, starting from an empty term-frequency map.
     *
     * @param str text to vectorize
     * @return the term vector, or {@code null} if it has fewer than the configured minimum number
     *     of non-numeric terms
     */
    public HMapSFW createEDocVector(String str) {
        return createEDocVector(str, new HMapSIW());
    }

    /**
     * Builds a term vector for an e-side text. The text is tokenized with the e-side tokenizer,
     * token counts are added to {@code hMapSIW}, and the counts are turned into a vector by
     * {@code CLIRUtils.createTermDocVector}.
     *
     * @param str text to vectorize
     * @param hMapSIW term-frequency map that is incremented in place with the tokens of {@code str}
     * @return the term vector, or {@code null} if it has fewer than the configured minimum number
     *     of non-numeric terms
     */
    public HMapSFW createEDocVector(String str, HMapSIW hMapSIW) {
        String[] tokens = this.eTok.processContent(str);
        for (String token : tokens) {
            hMapSIW.increment(token);
        }
        HMapSFW vector = CLIRUtils.createTermDocVector(tokens.length, hMapSIW, this.eScoreFn, (FrequencySortedDictionary) this.dict, this.dfTable, true, sLogger);
        return countNonNumericTerms(vector) < this.minVectorTerms ? null : vector;
    }

    /**
     * Splits e-side text into sentences and vectorizes each one that passes the filters. Text is
     * split on newlines, blank lines are skipped, and each remaining line is run through the
     * e-side sentence detector. A sentence is kept only if it contains none of the skip markers,
     * has at least the configured minimum number of tokens, and yields a non-null vector.
     *
     * @param str text to split
     * @param arrayListWritable output list; the vector of every kept sentence is appended
     * @param arrayListOfIntsWritable optional output list (may be {@code null}); the token count
     *     of every kept sentence is appended
     * @return the kept sentences, in order, parallel to the entries appended to the output lists
     */
    public ArrayListWritable<Text> getESentences(String str, ArrayListWritable<HMapSFW> arrayListWritable, ArrayListOfIntsWritable arrayListOfIntsWritable) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException {
        ArrayListWritable<Text> sentences = new ArrayListWritable<>();
        for (String line : str.split("\n")) {
            if (line.isEmpty() || ALL_WHITESPACE.matcher(line).matches()) {
                continue;
            }
            for (String sentence : this.eModel.sentDetect(line)) {
                if (containsAny(sentence, E_SKIP_MARKERS)) {
                    continue;
                }
                int numTokens = this.eTok.getNumberTokens(sentence);
                if (numTokens < this.minSentenceLength) {
                    continue;
                }
                HMapSFW vector = createEDocVector(sentence);
                if (vector == null) {
                    continue;
                }
                arrayListWritable.add(vector);
                sentences.add(new Text(sentence));
                if (arrayListOfIntsWritable != null) {
                    arrayListOfIntsWritable.add(numTokens);
                }
            }
        }
        return sentences;
    }

    /**
     * Splits f-side text into sentences and vectorizes each one that passes the filters. Works
     * like {@link #getESentences} but with the f-side models and skip markers, and additionally
     * rewrites every U+3002 character as {@code ". "} before sentence detection.
     *
     * @param str text to split
     * @param arrayListWritable output list; the vector of every kept sentence is appended
     * @param arrayListOfIntsWritable optional output list (may be {@code null}); the token count
     *     of every kept sentence is appended
     * @return the kept sentences, in order, parallel to the entries appended to the output lists
     */
    public ArrayListWritable<Text> getFSentences(String str, ArrayListWritable<HMapSFW> arrayListWritable, ArrayListOfIntsWritable arrayListOfIntsWritable) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException {
        sLogger.debug("text length=" + str.length());
        ArrayListWritable<Text> sentences = new ArrayListWritable<>();
        String[] lines = str.split("\n");
        sLogger.debug("num lines=" + lines.length);
        for (String rawLine : lines) {
            // Replace U+3002 with ". " so the sentence detector sees a boundary it recognises.
            String line = rawLine.replace("\u3002", ". ");
            sLogger.debug("line=" + line);
            if (line.isEmpty() || ALL_WHITESPACE.matcher(line).matches()) {
                continue;
            }
            for (String sentence : this.fModel.sentDetect(line)) {
                sLogger.debug("sent=" + sentence);
                if (containsAny(sentence, F_SKIP_MARKERS)) {
                    continue;
                }
                int numTokens = this.fTok.getNumberTokens(sentence);
                if (numTokens < this.minSentenceLength) {
                    continue;
                }
                HMapSFW vector = createFDocVector(sentence);
                if (vector == null) {
                    continue;
                }
                arrayListWritable.add(vector);
                sentences.add(new Text(sentence));
                sLogger.debug("added=" + vector);
                if (arrayListOfIntsWritable != null) {
                    arrayListOfIntsWritable.add(numTokens);
                }
            }
        }
        // NOTE(review): resets the logger level after every call, overriding any DEBUG setting
        // applied elsewhere; looks like a debugging leftover but is kept to preserve behaviour.
        sLogger.setLevel(Level.INFO);
        return sentences;
    }

    /** Returns the number of keys in {@code vector} that do not consist solely of digits. */
    private static int countNonNumericTerms(HMapSFW vector) {
        int count = 0;
        for (String term : vector.keySet()) {
            if (!ALL_DIGITS.matcher(term).matches()) {
                count++;
            }
        }
        return count;
    }

    /** Returns whether {@code text} contains at least one of {@code markers} as a substring. */
    private static boolean containsAny(String text, String[] markers) {
        for (String marker : markers) {
            if (text.contains(marker)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Maps each name fragment to the cache file whose path contains it. Every cache file is
     * logged, matched against the fragments in array order, and recorded under the first fragment
     * that matches (later matches for the same fragment overwrite earlier ones).
     *
     * @param cacheFiles local DistributedCache files; {@code null} is reported as an error
     * @param fragments file-name fragments to look for, in priority order
     * @param labels log line emitted when the fragment at the same index matches
     * @throws IOException if {@code cacheFiles} is {@code null}
     */
    private static Map<String, Path> locateCachedFiles(Path[] cacheFiles, String[] fragments, String[] labels) throws IOException {
        if (cacheFiles == null) {
            throw new IOException("No local DistributedCache files available; cannot load models");
        }
        Map<String, Path> located = Maps.newHashMap();
        for (Path cacheFile : cacheFiles) {
            sLogger.info("In DistributedCache: " + cacheFile);
            String location = cacheFile.toString();
            for (int k = 0; k < fragments.length; k++) {
                if (location.contains(fragments[k])) {
                    located.put(fragments[k], cacheFile);
                    sLogger.info(labels[k]);
                    break;
                }
            }
        }
        return located;
    }

    /** Returns the cache file recorded for {@code fragment}, failing with a clear message if absent. */
    private static Path requireCached(Map<String, Path> cached, String fragment) throws IOException {
        Path path = cached.get(fragment);
        if (path == null) {
            throw new IOException("Required file matching '" + fragment + "' was not found in the DistributedCache");
        }
        return path;
    }

    /** Returns the path stored under {@code key}, failing with a message that names the key. */
    private static Path requiredPath(Configuration configuration, String key) {
        String value = configuration.get(key);
        if (value == null) {
            throw new IllegalArgumentException("Missing required configuration property '" + key + "'");
        }
        return new Path(value);
    }

    /** Reads a sentence model from {@code modelPath} and closes the stream once it is loaded. */
    private static SentenceDetectorME loadSentenceDetector(FileSystem fs, Path modelPath) throws IOException {
        try (InputStream in = fs.open(modelPath)) {
            return new SentenceDetectorME(new SentenceModel(in));
        }
    }

    private String getSentDetectorFile(String str) {
        return String.valueOf(str) + "-sent.bin";
    }

    private String getClassifierFile() {
        return "classifier-";
    }

    private String getTTable(String str, String str2) {
        return "ttable." + str + "-" + str2;
    }

    private String getTrgVocab(String str, String str2) {
        return "vocab." + str + "-" + str2 + "." + str2;
    }

    private String getSrcVocab(String str, String str2) {
        return "vocab." + str + "-" + str2 + "." + str;
    }

    /** Returns the classifier, or {@code null} if none was loaded. */
    public MaxentModel getClassifier() {
        return this.classifier;
    }

    /** Returns the e-side tokenizer. */
    public Tokenizer getETokenizer() {
        return this.eTok;
    }

    /** Returns the f-side tokenizer. */
    public Tokenizer getFTokenizer() {
        return this.fTok;
    }

    /** Returns the e-to-f translation table. */
    public TTable_monolithic_IFAs getE2F() {
        return this.e2fProbs;
    }

    /** Returns the f-to-e translation table. */
    public TTable_monolithic_IFAs getF2E() {
        return this.f2eProbs;
    }

    /** Returns the f-side source vocabulary. */
    public Vocab getFSrc() {
        return this.fVocabSrc;
    }

    /** Returns the e-side target vocabulary. */
    public Vocab getETrg() {
        return this.eVocabTrg;
    }

    /** Returns the e-side source vocabulary. */
    public Vocab getESrc() {
        return this.eVocabSrc;
    }

    /** Returns the f-side target vocabulary. */
    public Vocab getFTrg() {
        return this.fVocabTrg;
    }

    /** Returns the f-side sentence detector. */
    public SentenceDetectorME getFSentenceModel() {
        return this.fModel;
    }

    /** Returns the e-side sentence detector. */
    public SentenceDetectorME getESentenceModel() {
        return this.eModel;
    }

    /**
     * Loads the f-side sentence detector, all four vocabularies, both translation tables, the
     * f-side tokenizer and scoring model, and the classifier, all from paths named in
     * {@code configuration} and read from the local file system.
     */
    private void loadFModels(Configuration configuration) throws Exception {
        sLogger.info("Loading models for " + this.fLang + " ...");
        LocalFileSystem local = FileSystem.getLocal(configuration);

        // NOTE(review): the f-side detector is read from "eSentDetectorFile" while the e-side
        // loader reads "fSentDetectorFile". This looks swapped relative to the JobConf loaders
        // but is preserved because existing callers may set the keys accordingly -- confirm.
        this.fModel = loadSentenceDetector(local, requiredPath(configuration, "eSentDetectorFile"));
        sLogger.info("Sentence model created successfully.");

        this.eVocabSrc = HadoopAlign.loadVocab(requiredPath(configuration, "eVocabSrcFile"), local);
        this.eVocabTrg = HadoopAlign.loadVocab(requiredPath(configuration, "eVocabTrgFile"), local);
        this.fVocabSrc = HadoopAlign.loadVocab(requiredPath(configuration, "fVocabSrcFile"), local);
        this.fVocabTrg = HadoopAlign.loadVocab(requiredPath(configuration, "fVocabTrgFile"), local);
        this.f2eProbs = new TTable_monolithic_IFAs(local, requiredPath(configuration, "f2e_ttableFile"), true);
        this.e2fProbs = new TTable_monolithic_IFAs(local, requiredPath(configuration, "e2f_ttableFile"), true);

        this.fTok = TokenizerFactory.createTokenizer(local, this.fLang, configuration.get("fTokenizer"), true, configuration.get("fStopword"), null, null);
        sLogger.info("Tokenizer and vocabs created successfully.");

        this.fScoreFn = new Bm25();
        this.fScoreFn.setAvgDocLength(lang2AvgSentLen.get(this.fLang));
        this.fScoreFn.setDocCount(new RetrievalEnvironment(this.eDir, local).readCollectionDocumentCount());

        // Unlike the JobConf mode, the classifier is mandatory here.
        this.classifier = new MoreGenericModelReader(requiredPath(configuration, "modelFileName"), (FileSystem) local).constructModel();
    }

    /**
     * Loads the e-side sentence detector, tokenizer, scoring model, term dictionary and
     * document-frequency table, all read from the local file system.
     */
    private void loadEModels(Configuration configuration) throws Exception {
        sLogger.info("Loading models for " + this.eLang + " ...");
        LocalFileSystem local = FileSystem.getLocal(configuration);

        // NOTE(review): reads "fSentDetectorFile" for the e-side detector; see loadFModels.
        this.eModel = loadSentenceDetector(local, requiredPath(configuration, "fSentDetectorFile"));
        sLogger.info("Sentence model created successfully.");

        RetrievalEnvironment env = new RetrievalEnvironment(this.eDir, local);
        sLogger.info("Environment created successfully.");

        this.eTok = TokenizerFactory.createTokenizer(local, this.eLang, configuration.get("eTokenizer"), true, configuration.get("eStopword"), null, null);
        sLogger.info("Tokenizer and vocabs created successfully.");

        this.eScoreFn = new Bm25();
        this.eScoreFn.setAvgDocLength(lang2AvgSentLen.get(this.eLang));
        this.eScoreFn.setDocCount(env.readCollectionDocumentCount());
        this.dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), local);
        this.dfTable = new DfTableArray(new Path(env.getDfByTermData()), local);
    }

    /** Returns the out-of-vocabulary rate of {@code str} against the f-side source vocabulary. */
    public float getFOOVRate(String str) {
        return this.fTok.getOOVRate(str, this.fVocabSrc);
    }

    /** Returns the out-of-vocabulary rate of {@code str} against the e-side source vocabulary. */
    public float getEOOVRate(String str) {
        return this.eTok.getOOVRate(str, this.eVocabSrc);
    }
}
