package ivory.core.preprocess;

import edu.umd.cloud9.io.map.HMapIFW;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.mapred.NullInputFormat;
import edu.umd.cloud9.mapred.NullMapper;
import edu.umd.cloud9.mapred.NullOutputFormat;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.map.MapIF;
import edu.umd.hooka.Vocab;
import edu.umd.hooka.alignment.HadoopAlign;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.TermDocVector;
import ivory.core.data.stat.DocLengthTable4B;
import ivory.core.data.stat.PrefixEncodedGlobalStats;
import ivory.core.util.CLIRUtils;
import ivory.pwsim.score.ScoringModel;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/core/preprocess/BuildTranslatedTermDocVectors.class */
public class BuildTranslatedTermDocVectors extends PowerTool {
    /** Logger used by this tool and by {@code MyMapperTrans}. */
    private static final Logger LOG = Logger.getLogger(BuildTranslatedTermDocVectors.class);
    /**
     * Sampling divisor: {@code MyMapperTrans.map} skips every document whose docno is not a
     * multiple of this value, so the value 1 keeps every document.
     */
    private static int SAMPLING = 1;
    /** Configuration keys returned by {@link #getRequiredParameters()}. */
    public static final String[] RequiredParameters = {Constants.IndexPath, "Ivory.ScoringModel"};

    /* loaded from: input_file:ivory/core/preprocess/BuildTranslatedTermDocVectors$DF.class */
    /** Hadoop counters for the job that writes the translated document-frequency table. */
    protected enum DF {
        /** Incremented once for every entry {@code DataWriterMapper} appends to the translated Df file. */
        TransDf,
        /** Not referenced anywhere in this file. */
        NoDf
    }

    /* loaded from: input_file:ivory/core/preprocess/BuildTranslatedTermDocVectors$DataWriterMapper.class */
    /**
     * Mapper that builds the translated document-frequency table and writes it to the sequence
     * file named by the {@code TransDfFile} job property. It is the mapper class of the
     * single-map-task job configured in {@code createTranslatedDFFile}.
     */
    private static class DataWriterMapper extends NullMapper {
        private DataWriterMapper() {
        }

        /**
         * Loads the E2F vocabularies, the E2F translation table and the collection's df
         * statistics, translates the df table and writes it out as
         * {@code (IntWritable, FloatWritable)} pairs.
         *
         * @param jobConf job configuration; the properties {@code Constants.IndexPath},
         *        {@code TransDfFile}, {@code Ivory.E_Vocab_E2F}, {@code Ivory.F_Vocab_E2F} and
         *        {@code Ivory.TTable_E2F} are read from it
         * @param reporter receives one {@code DF.TransDf} counter increment per written entry
         * @throws IOException if a vocabulary, the translation table or the statistics cannot be
         *         read, or if the output file cannot be written
         * @throws RuntimeException if any of the required input files does not exist
         */
        public void run(JobConf jobConf, Reporter reporter) throws IOException {
            Logger logger = Logger.getLogger(DataWriterMapper.class);
            logger.setLevel(Level.DEBUG);

            String indexPath = jobConf.get(Constants.IndexPath);
            FileSystem fs = FileSystem.get(jobConf);
            RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

            String transDfFile = jobConf.get("TransDfFile");
            String eVocabFile = jobConf.get("Ivory.E_Vocab_E2F");
            String fVocabFile = jobConf.get("Ivory.F_Vocab_E2F");
            String ttableFile = jobConf.get("Ivory.TTable_E2F");
            String termsFile = env.getIndexTermsData();
            String dfByTermFile = env.getDfByTermData();
            logger.debug(ttableFile + eVocabFile + termsFile);

            if (!fs.exists(new Path(fVocabFile)) || !fs.exists(new Path(eVocabFile)) || !fs.exists(new Path(ttableFile)) || !fs.exists(new Path(termsFile)) || !fs.exists(new Path(dfByTermFile))) {
                throw new RuntimeException("Error: Translation files do not exist!");
            }

            // Load failures propagate to the caller: swallowing them would hand null
            // vocabularies or a null table to CLIRUtils.translateDFTable below.
            Vocab eVocab = HadoopAlign.loadVocab(new Path(eVocabFile), jobConf);
            Vocab fVocab = HadoopAlign.loadVocab(new Path(fVocabFile), jobConf);
            TTable_monolithic_IFAs e2fProbs = new TTable_monolithic_IFAs(fs, new Path(ttableFile), true);

            PrefixEncodedGlobalStats globalStats = new PrefixEncodedGlobalStats(new Path(termsFile), fs);
            globalStats.loadDFStats(new Path(dfByTermFile), fs);
            HMapIFW transDfTable = CLIRUtils.translateDFTable(eVocab, fVocab, e2fProbs, globalStats);

            SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, new Path(transDfFile), IntWritable.class, FloatWritable.class);
            try {
                for (MapIF.Entry entry : transDfTable.entrySet()) {
                    reporter.incrCounter(DF.TransDf, 1L);
                    writer.append(new IntWritable(entry.getKey()), new FloatWritable(entry.getValue()));
                }
            } finally {
                // Release the output stream even when an append fails part-way through.
                writer.close();
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/BuildTranslatedTermDocVectors$Docs.class */
    /** Hadoop counters describing what {@code MyMapperTrans.map} did with each processed document. */
    public enum Docs {
        /** The translated term vector was empty; nothing was emitted. */
        ZERO,
        /** The translated term vector had fewer entries than the configured minimum; nothing was emitted. */
        SHORT,
        /** The translated term vector was emitted. */
        Total
    }

    /* loaded from: input_file:ivory/core/preprocess/BuildTranslatedTermDocVectors$MyMapperTrans.class */
    /**
     * Mapper that converts each term document vector into a translated, weighted term vector.
     *
     * <p>{@link #configure(JobConf)} loads the translated df table, four vocabularies and two
     * translation tables from the distributed cache; {@code map} then translates and weights one
     * document per call. The erased bridge overload of {@code map} is generated by the compiler
     * and is therefore not declared here.
     */
    private static class MyMapperTrans extends MapReduceBase implements Mapper<IntWritable, TermDocVector, IntWritable, HMapSFW> {
        /**
         * Offset added to the docno of every document whose language is not "english".
         * NOTE(review): presumably keeps the docno ranges of the two languages disjoint - confirm.
         */
        private static final int NON_ENGLISH_DOCNO_OFFSET = 1000000000;
        /** Number of distributed-cache files {@link #configure(JobConf)} reads (indices 0 to 6). */
        private static final int NUM_CACHE_FILES = 7;

        private ScoringModel model;
        private HMapIFW transDfTable;
        // The fields below are static, so they are shared by every mapper instance in one JVM;
        // each call to configure() overwrites them.
        static Vocab eVocabSrc;
        static Vocab fVocabSrc;
        static Vocab fVocabTrg;
        static Vocab eVocabTrg;
        static TTable_monolithic_IFAs f2e_Probs;
        static TTable_monolithic_IFAs e2f_Probs;
        static float avgDocLen;
        static int numDocs;
        static boolean isNormalize;
        /** Value of {@code Ivory.Lang}; {@code map} dereferences it without a null check. */
        private String language;
        /** Minimum number of entries a translated vector needs in order to be emitted. */
        int MIN_SIZE = 0;

        private MyMapperTrans() {
        }

        /**
         * Reads the job settings and loads all side data from the distributed cache.
         *
         * <p>The cache files are read by position: 0 translated df table, 1 target E vocabulary,
         * 2 source F vocabulary, 3 F2E translation table, 4 source E vocabulary, 5 target F
         * vocabulary, 6 E2F translation table.
         *
         * @param jobConf job configuration
         * @throws RuntimeException if the local file system, the cache files, the df table, the
         *         vocabularies or translation tables, or the scoring model cannot be initialized;
         *         the underlying exception, when there is one, is attached as the cause
         */
        public void configure(JobConf jobConf) {
            numDocs = jobConf.getInt(Constants.CollectionDocumentCount, -1);
            avgDocLen = jobConf.getFloat("Ivory.AvgDocLen", -1.0f);
            isNormalize = jobConf.getBoolean("Ivory.Normalize", false);
            this.language = jobConf.get("Ivory.Lang");
            BuildTranslatedTermDocVectors.LOG.debug(numDocs + " " + avgDocLen);
            this.MIN_SIZE = jobConf.getInt("Ivory.MinNumTerms", 0);

            LocalFileSystem localFs;
            try {
                localFs = FileSystem.getLocal(jobConf);
            } catch (IOException e) {
                // Without a local file system none of the cache files can be read.
                throw new RuntimeException("Error initializing local file system!", e);
            }

            Path[] cacheFiles;
            try {
                cacheFiles = DistributedCache.getLocalCacheFiles(jobConf);
            } catch (IOException e) {
                throw new RuntimeException("Error initializing cache file paths!", e);
            }
            if (cacheFiles == null || cacheFiles.length < NUM_CACHE_FILES) {
                int found = cacheFiles == null ? 0 : cacheFiles.length;
                throw new RuntimeException("Error initializing cache file paths! Expected " + NUM_CACHE_FILES + " local cache files but found " + found);
            }

            // Each stage has its own try block so that a failure is reported under the message
            // of the stage that actually failed.
            try {
                this.transDfTable = CLIRUtils.readTransDfTable(cacheFiles[0], localFs);
            } catch (Exception e) {
                BuildTranslatedTermDocVectors.LOG.info(e.getMessage());
                throw new RuntimeException("Error initializing DfTable!", e);
            }

            try {
                eVocabTrg = HadoopAlign.loadVocab(cacheFiles[1], localFs);
                fVocabSrc = HadoopAlign.loadVocab(cacheFiles[2], localFs);
                f2e_Probs = new TTable_monolithic_IFAs(localFs, cacheFiles[3], true);
                eVocabSrc = HadoopAlign.loadVocab(cacheFiles[4], localFs);
                fVocabTrg = HadoopAlign.loadVocab(cacheFiles[5], localFs);
                e2f_Probs = new TTable_monolithic_IFAs(localFs, cacheFiles[6], true);
            } catch (IOException e) {
                throw new RuntimeException("Error initializing vocabularies/prob table!", e);
            }

            try {
                this.model = (ScoringModel) Class.forName(jobConf.get("Ivory.ScoringModel")).newInstance();
                this.model.setDocCount(numDocs);
                this.model.setAvgDocLength(avgDocLen);
                if (jobConf.get("debug") != null) {
                    BuildTranslatedTermDocVectors.LOG.setLevel(Level.DEBUG);
                }
                BuildTranslatedTermDocVectors.LOG.debug(Integer.valueOf(numDocs));
                BuildTranslatedTermDocVectors.LOG.debug(Float.valueOf(avgDocLen));
                BuildTranslatedTermDocVectors.LOG.debug("---------");
            } catch (Exception e) {
                throw new RuntimeException("Error initializing Ivory.ScoringModel!", e);
            }
        }

        /**
         * Translates and weights one document and emits it unless the result is empty or has
         * fewer than {@code MIN_SIZE} entries.
         *
         * <p>Documents whose docno is not a multiple of the sampling divisor are skipped without
         * touching any counter. For non-English collections the key is modified in place by
         * adding {@code NON_ENGLISH_DOCNO_OFFSET} before it is emitted.
         *
         * @param intWritable docno of the document; may be modified in place
         * @param termDocVector term vector of the document
         * @param outputCollector receives the (docno, translated weighted vector) pair
         * @param reporter receives exactly one {@code Docs} counter increment per processed document
         * @throws IOException if the collector fails to accept the output
         */
        public void map(IntWritable intWritable, TermDocVector termDocVector, OutputCollector<IntWritable, HMapSFW> outputCollector, Reporter reporter) throws IOException {
            if (intWritable.get() % BuildTranslatedTermDocVectors.SAMPLING != 0) {
                return;
            }
            if (!this.language.equals("english")) {
                intWritable.set(intWritable.get() + NON_ENGLISH_DOCNO_OFFSET);
            }

            // translateTFs fills translatedTfs and its return value is passed straight on to
            // createTermDocVector together with the filled map.
            HMapIFW translatedTfs = new HMapIFW();
            HMapSFW weightedVector = CLIRUtils.createTermDocVector(
                    CLIRUtils.translateTFs(termDocVector, translatedTfs, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, BuildTranslatedTermDocVectors.LOG),
                    translatedTfs, eVocabSrc, this.model, this.transDfTable, isNormalize, BuildTranslatedTermDocVectors.LOG);

            if (weightedVector.isEmpty()) {
                reporter.incrCounter(Docs.ZERO, 1L);
            } else if (weightedVector.size() < this.MIN_SIZE) {
                reporter.incrCounter(Docs.SHORT, 1L);
            } else {
                reporter.incrCounter(Docs.Total, 1L);
                outputCollector.collect(intWritable, weightedVector);
            }
        }
    }

    /**
     * Creates the tool.
     *
     * @param configuration configuration handed to the {@code PowerTool} superclass constructor
     */
    public BuildTranslatedTermDocVectors(Configuration configuration) {
        super(configuration);
    }

    /**
     * Returns the configuration keys this tool declares as required.
     *
     * @return the shared {@code RequiredParameters} array itself, not a copy
     */
    public String[] getRequiredParameters() {
        return RequiredParameters;
    }

    /**
     * Builds the translated Df file if needed, then runs the map-only job that writes the
     * translated, weighted term document vectors.
     *
     * <p>If the output directory already exists the job is skipped and 0 is returned.
     *
     * @return 0 when the job finished or there was nothing to do
     * @throws IllegalArgumentException if one of the vocabulary or translation-table parameters
     *         is not set
     * @throws RuntimeException if the document-length table cannot be loaded or the job fails
     *         with an {@code IOException}; the original exception is attached as the cause
     * @throws Exception for any other failure, e.g. a malformed cache file URI
     */
    public int runTool() throws Exception {
        String indexPath = getConf().get(Constants.IndexPath);
        String scoringModel = getConf().get("Ivory.ScoringModel");
        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, FileSystem.get(getConf()));
        String outputPath = env.getWeightedTermDocVectorsDirectory();
        String transDfFile = indexPath + "/transDf.dat";
        String fVocabF2E = getConf().get("Ivory.F_Vocab_F2E");
        String eVocabF2E = getConf().get("Ivory.E_Vocab_F2E");
        String ttableF2E = getConf().get("Ivory.TTable_F2E");
        String eVocabE2F = getConf().get("Ivory.E_Vocab_E2F");
        String fVocabE2F = getConf().get("Ivory.F_Vocab_E2F");
        String ttableE2F = getConf().get("Ivory.TTable_E2F");

        createTranslatedDFFile(transDfFile);

        JobConf jobConf = new JobConf(getConf(), BuildTranslatedTermDocVectors.class);
        jobConf.setJobName("BuildTranslatedTermDocVectors");
        FileSystem fs = FileSystem.get(jobConf);
        if (fs.exists(new Path(outputPath))) {
            LOG.info(outputPath + ": Translated term doc vectors already exist! Nothing to do for this job...");
            return 0;
        }

        // Cache order matters: MyMapperTrans.configure reads these files by index, after the
        // translated Df file at index 0.
        String[] cacheParameters = {"Ivory.E_Vocab_F2E", "Ivory.F_Vocab_F2E", "Ivory.TTable_F2E", "Ivory.E_Vocab_E2F", "Ivory.F_Vocab_E2F", "Ivory.TTable_E2F"};
        String[] cacheFiles = {eVocabF2E, fVocabF2E, ttableF2E, eVocabE2F, fVocabE2F, ttableE2F};
        for (int i = 0; i < cacheFiles.length; i++) {
            if (cacheFiles[i] == null) {
                // Name the missing key instead of failing with a bare NPE from new URI(null).
                throw new IllegalArgumentException("Missing required parameter: " + cacheParameters[i]);
            }
        }

        String collectionName = getConf().get(Constants.CollectionName);
        String inputPath = env.getTermDocVectorsDirectory();
        LOG.info("Preparing to build document vectors using " + scoringModel);
        LOG.info("Document vectors to be stored in " + outputPath);
        LOG.info("CollectionName: " + collectionName);
        LOG.info("Input path: " + inputPath);
        jobConf.set(Constants.IndexPath, indexPath);
        jobConf.set("Ivory.ScoringModel", scoringModel);

        DocLengthTable4B docLengths;
        try {
            docLengths = new DocLengthTable4B(env.getDoclengthsData(), fs);
        } catch (IOException e) {
            throw new RuntimeException("Error initializing Doclengths file", e);
        }

        try {
            LOG.info(docLengths.getAvgDocLength() + " is average doc len.");
            LOG.info(docLengths.getDocCount() + " is num docs.");
            jobConf.setFloat("Ivory.AvgDocLen", docLengths.getAvgDocLength());
            jobConf.setInt(Constants.CollectionDocumentCount, env.readCollectionDocumentCount());
            jobConf.setNumMapTasks(300);
            jobConf.setNumReduceTasks(0);
            jobConf.set("mapred.child.java.opts", "-Xmx2048m");
            jobConf.setInt("mapred.map.max.attempts", 10);
            jobConf.setInt("mapred.reduce.max.attempts", 10);
            jobConf.setInt("mapred.task.timeout", 6000000);

            DistributedCache.addCacheFile(new URI(transDfFile), jobConf);
            for (String cacheFile : cacheFiles) {
                DistributedCache.addCacheFile(new URI(cacheFile), jobConf);
            }

            FileInputFormat.setInputPaths(jobConf, new Path[]{new Path(inputPath)});
            FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
            jobConf.setInputFormat(SequenceFileInputFormat.class);
            jobConf.setMapOutputKeyClass(IntWritable.class);
            jobConf.setMapOutputValueClass(HMapSFW.class);
            jobConf.setOutputKeyClass(IntWritable.class);
            jobConf.setOutputValueClass(HMapSFW.class);
            jobConf.setOutputFormat(SequenceFileOutputFormat.class);
            jobConf.setMapperClass(MyMapperTrans.class);

            long startTime = System.currentTimeMillis();
            JobClient.runJob(jobConf);
            LOG.info("Job finished in " + ((System.currentTimeMillis() - startTime) / 1000.0d) + " seconds");
            return 0;
        } catch (IOException e) {
            // Kept as an unchecked exception, but with an accurate message and the cause.
            throw new RuntimeException("Error running BuildTranslatedTermDocVectors job", e);
        }
    }

    /**
     * Creates the translated Df file by running a single-map-task job with
     * {@code DataWriterMapper}, unless the file already exists.
     *
     * @param str path of the translated Df file; passed to the job as {@code TransDfFile}
     * @throws RuntimeException if the existence check or the job fails with an
     *         {@code IOException}; the caller registers this file in the distributed cache
     *         next, so continuing without it would only postpone the failure
     */
    private void createTranslatedDFFile(String str) {
        JobConf jobConf = new JobConf(getConf(), BuildTranslatedTermDocVectors.class);
        jobConf.setJobName("BuildTranslatedDfTable");
        try {
            if (FileSystem.get(jobConf).exists(new Path(str))) {
                LOG.info("Translated Df file already exists! Nothing to do for this job...");
                return;
            }

            LOG.info("Creating translated Df file ...");
            jobConf.set("mapred.child.java.opts", "-Xmx2048m");
            jobConf.setInt("mapred.map.max.attempts", 10);
            jobConf.setInt("mapred.reduce.max.attempts", 10);
            jobConf.setInt("mapred.task.timeout", 6000000);
            jobConf.set("TransDfFile", str);
            jobConf.setSpeculativeExecution(false);
            jobConf.setNumMapTasks(1);
            jobConf.setNumReduceTasks(0);
            jobConf.setInputFormat(NullInputFormat.class);
            jobConf.setOutputFormat(NullOutputFormat.class);
            jobConf.setMapperClass(DataWriterMapper.class);
            JobClient.runJob(jobConf);
            LOG.info("Done");
        } catch (IOException e) {
            throw new RuntimeException("Error creating translated Df file " + str, e);
        }
    }
}
