package ivory.core.preprocess;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.mapreduce.NullInputFormat;
import edu.umd.cloud9.mapreduce.NullMapper;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.util.array.ArrayListOfInts;
import edu.umd.cloud9.util.map.HMapII;
import edu.umd.cloud9.util.map.MapII;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.data.document.LazyTermDocVector;
import ivory.core.data.document.TermDocVector;
import ivory.core.tokenize.DocumentProcessingUtils;
import ivory.core.tokenize.Tokenizer;
import ivory.core.util.DelimitedValuesFileReader;
import java.io.IOException;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.LineReader;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/core/preprocess/BuildTermDocVectors.class */
public class BuildTermDocVectors extends PowerTool {
    /** Configuration key carrying the directory of per-task doc length files to the doc length job. */
    private static final String InputPath = "Ivory.InputPath";

    /** Configuration key carrying the path of the consolidated doc length data file to write. */
    private static final String DocLengthDataFile = "Ivory.DocLengthDataFile";

    private static final Logger LOG = Logger.getLogger(BuildTermDocVectors.class);

    /** Configuration keys returned by {@link #getRequiredParameters()}. */
    public static final String[] RequiredParameters = {
        Constants.CollectionName,
        Constants.CollectionPath,
        Constants.IndexPath,
        Constants.InputFormat,
        Constants.Tokenizer,
        Constants.DocnoMappingClass,
        Constants.DocnoOffset
    };

    /* loaded from: input_file:ivory/core/preprocess/BuildTermDocVectors$DocLengthDataWriterMapper.class */
    /**
     * Input-less mapper that merges the per-task doc length text files into one binary
     * doc length data file.
     *
     * <p>Every file in the directory named by the {@code Ivory.InputPath} configuration key
     * (except those whose name starts with {@code "_"}) is read as lines of
     * {@code docno<TAB>length}. The output file, named by {@code Ivory.DocLengthDataFile},
     * contains: the docno offset, then {@code maxDocno - docnoOffset}, then one int per
     * position {@code 1..maxDocno - docnoOffset} of the length table.</p>
     */
    private static class DocLengthDataWriterMapper extends NullMapper {
        private DocLengthDataWriterMapper() {
        }

        /**
         * Reads all doc length records, validates their docnos, and writes the consolidated file.
         *
         * @param context task context supplying the configuration and the {@link DocLengths} counters
         * @throws RuntimeException if a record is malformed, if a docno lies outside
         *         {@code [docnoOffset, docnoOffset + collectionDocCount]}, or if any I/O fails
         *         (the {@link IOException} is attached as the cause)
         */
        public void runSafely(Mapper<NullWritable, NullWritable, NullWritable, NullWritable>.Context context) {
            try {
                Configuration conf = context.getConfiguration();
                int collectionDocCount = conf.getInt(Constants.CollectionDocumentCount, -1);
                String inputPath = conf.get(BuildTermDocVectors.InputPath);
                String dataFile = conf.get(BuildTermDocVectors.DocLengthDataFile);
                int docnoOffset = conf.getInt(Constants.DocnoOffset, 0);
                Path inputDir = new Path(inputPath);
                BuildTermDocVectors.LOG.info("InputPath: " + inputPath);
                BuildTermDocVectors.LOG.info("DocLengthDataFile: " + dataFile);
                BuildTermDocVectors.LOG.info("DocnoOffset: " + docnoOffset);
                BuildTermDocVectors.LOG.info("CollectionDocCount: " + collectionDocCount);

                FileSystem fs = FileSystem.get(conf);
                // Indexed by (docno - docnoOffset).
                int[] lengths = new int[collectionDocCount + 1];
                int maxDocno = 0;
                int minDocno = Integer.MAX_VALUE;

                for (FileStatus status : fs.listStatus(inputDir)) {
                    Path file = status.getPath();
                    if (file.getName().startsWith("_")) {
                        continue;
                    }
                    BuildTermDocVectors.LOG.info("processing " + file);
                    LineReader reader = new LineReader(fs.open(file));
                    try {
                        Text line = new Text();
                        while (reader.readLine(line) > 0) {
                            String[] fields = line.toString().split("\\t+", 2);
                            if (fields.length < 2) {
                                throw new RuntimeException("Error: malformed doc length record \"" + line + "\" in " + file + "!");
                            }
                            int docno = Integer.parseInt(fields[0]);
                            int length = Integer.parseInt(fields[1]);
                            if (docno < docnoOffset) {
                                throw new RuntimeException("Error: docno " + docno + " < docnoOffset " + docnoOffset + "!");
                            }
                            int slot = docno - docnoOffset;
                            if (slot >= lengths.length) {
                                throw new RuntimeException("Error: docno - docnoOffset " + slot + " >= collectionDocCount " + collectionDocCount + "!");
                            }
                            lengths[slot] = length;
                            maxDocno = Math.max(maxDocno, docno);
                            minDocno = Math.min(minDocno, docno);
                        }
                    } finally {
                        // Release the input stream even when a record is rejected.
                        reader.close();
                    }
                    context.getCounter(DocLengths.Files).increment(1L);
                }
                BuildTermDocVectors.LOG.info("min docno: " + minDocno);
                BuildTermDocVectors.LOG.info("max docno: " + maxDocno);

                int written = 0;
                long sumOfLengths = 0;
                FSDataOutputStream out = fs.create(new Path(dataFile), true);
                try {
                    out.writeInt(docnoOffset);
                    out.writeInt(maxDocno - docnoOffset);
                    // Position 0 of the table is never written; output starts at position 1.
                    for (int slot = 1; slot <= maxDocno - docnoOffset; slot++) {
                        out.writeInt(lengths[slot]);
                        written++;
                        sumOfLengths += lengths[slot];
                    }
                } finally {
                    out.close();
                }
                // Counters are published once, after the file has been written and closed.
                context.getCounter(DocLengths.Count).increment(written);
                context.getCounter(DocLengths.SumOfDocLengths).increment(sumOfLengths);
                BuildTermDocVectors.LOG.info(written + " doc lengths written");
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /* loaded from: input_file:ivory/core/preprocess/BuildTermDocVectors$DocLengths.class */
    /** Counters describing the document length data produced by the mappers in this class. */
    protected enum DocLengths {
        /** Number of document length entries written. */
        Count,

        /** Running total of the document lengths written. */
        SumOfDocLengths,

        /** Number of per-task doc length files read by the doc length data writer. */
        Files
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/BuildTermDocVectors$Docs.class */
    /** Per-document counters maintained by the term doc vector mapper. */
    public enum Docs {
        /** Documents whose mapped docno was not positive and which were therefore not processed. */
        Skipped,

        /** Documents for which a term doc vector was emitted. */
        Total,

        /** Documents whose parse produced no entries at all. */
        Empty
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/core/preprocess/BuildTermDocVectors$MapTime.class */
    /** Counters accumulating wall-clock milliseconds spent in phases of the map call. */
    public enum MapTime {
        /** Time spent emitting the term doc vector to the task context. */
        Spilling,

        /** Time spent parsing the document into term positions. */
        Parsing
    }

    /* loaded from: input_file:ivory/core/preprocess/BuildTermDocVectors$MyMapper.class */
    protected static class MyMapper extends Mapper<Writable, Indexable, IntWritable, TermDocVector> {
        private static final IntWritable key = new IntWritable();
        private static final LazyTermDocVector docVector = new LazyTermDocVector();
        private static final HMapII doclengths = new HMapII();
        private Tokenizer tokenizer;
        private DocnoMapping docMapping;
        private int docno;

        protected MyMapper() {
        }

        public void setup(Mapper<Writable, Indexable, IntWritable, TermDocVector>.Context context) throws IOException {
            Configuration configuration = context.getConfiguration();
            try {
                LocalFileSystem local = FileSystem.getLocal(configuration);
                this.docMapping = (DocnoMapping) Class.forName(configuration.get(Constants.DocnoMappingClass)).newInstance();
                if (configuration.get("mapred.job.tracker").equals("local")) {
                    this.docMapping.loadMapping(new RetrievalEnvironment(context.getConfiguration().get(Constants.IndexPath), local).getDocnoMappingData(), local);
                } else {
                    this.docMapping.loadMapping(DistributedCache.getLocalCacheFiles(configuration)[0], local);
                }
                try {
                    this.tokenizer = (Tokenizer) Class.forName(configuration.get(Constants.Tokenizer)).newInstance();
                    this.tokenizer.configure(configuration, FileSystem.get(configuration));
                } catch (Exception e) {
                    e.printStackTrace();
                    throw new RuntimeException("Error initializing tokenizer: " + e.getMessage());
                }
            } catch (Exception e2) {
                throw new RuntimeException("Error initializing docno mapping!", e2);
            }
        }

        public void map(Writable writable, Indexable indexable, Mapper<Writable, Indexable, IntWritable, TermDocVector>.Context context) throws IOException, InterruptedException {
            int i;
            this.docno = this.docMapping.getDocno(indexable.getDocid());
            if (this.docno <= 0) {
                context.getCounter(Docs.Skipped).increment(1L);
                return;
            }
            long currentTimeMillis = System.currentTimeMillis();
            Map<String, ArrayListOfInts> parseDocument = DocumentProcessingUtils.parseDocument(indexable, this.tokenizer);
            context.getCounter(MapTime.Parsing).increment(System.currentTimeMillis() - currentTimeMillis);
            if (parseDocument.size() == 0) {
                context.getCounter(Docs.Empty).increment(1L);
                i = 0;
            } else {
                i = parseDocument.get("").get(0);
                parseDocument.remove("");
            }
            long currentTimeMillis2 = System.currentTimeMillis();
            key.set(this.docno);
            docVector.setTermPositionsMap(parseDocument);
            context.write(key, docVector);
            context.getCounter(MapTime.Spilling).increment(System.currentTimeMillis() - currentTimeMillis2);
            context.getCounter(Docs.Total).increment(1L);
            doclengths.put(this.docno, i);
        }

        public void cleanup(Mapper<Writable, Indexable, IntWritable, TermDocVector>.Context context) throws IOException, InterruptedException {
            if (doclengths.size() == 0) {
                throw new RuntimeException("Error: Doclength table empty!");
            }
            long j = 0;
            Configuration configuration = context.getConfiguration();
            String str = configuration.get("mapred.task.id");
            String str2 = configuration.get(Constants.IndexPath);
            FileSystem fileSystem = FileSystem.get(configuration);
            Path path = new Path(str2 + "/doclengths/" + this.docno + "." + str);
            FSDataOutputStream create = fileSystem.create(path, false);
            long j2 = 0;
            int i = 0;
            for (MapII.Entry entry : doclengths.entrySet()) {
                create.write((entry.getKey() + DelimitedValuesFileReader.DEFAULT_DELIMITER + entry.getValue() + "\n").getBytes());
                j += r0.getBytes().length;
                i++;
                j2 += entry.getValue();
            }
            create.close();
            BuildTermDocVectors.LOG.info("Expected length of doclengths file: " + j);
            long len = fileSystem.listStatus(path)[0].getLen();
            BuildTermDocVectors.LOG.info("Actual length of doclengths file: " + len);
            if (j == 0) {
                throw new RuntimeException("Error: zero bytesCnt at " + path);
            }
            if (len == 0) {
                throw new RuntimeException("Error: zero bytesActual at " + path);
            }
            if (j != len) {
                throw new RuntimeException(String.format("Error writing Doclengths file: %d %d %s", Long.valueOf(j), Long.valueOf(len), path.toString()));
            }
            context.getCounter(DocLengths.Count).increment(i);
            context.getCounter(DocLengths.SumOfDocLengths).increment(j2);
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, Mapper.Context context) throws IOException, InterruptedException {
            map((Writable) obj, (Indexable) obj2, (Mapper<Writable, Indexable, IntWritable, TermDocVector>.Context) context);
        }
    }

    /**
     * Returns the configuration keys this tool requires.
     *
     * @return the shared {@link #RequiredParameters} array itself (not a copy)
     */
    public String[] getRequiredParameters() {
        return BuildTermDocVectors.RequiredParameters;
    }

    /**
     * Creates the tool around the given configuration.
     *
     * @param conf configuration handed to the {@code PowerTool} superclass constructor
     */
    public BuildTermDocVectors(Configuration conf) {
        super(conf);
    }

    /**
     * Builds term doc vectors for the collection and then the consolidated doc length data.
     *
     * <p>Steps, in order:</p>
     * <ol>
     *   <li>Verify that the docno mapping data exists and add it to the distributed cache.</li>
     *   <li>Return immediately if the term doc vectors directory already exists.</li>
     *   <li>Record the collection metadata in the index and run the term doc vector job.</li>
     *   <li>Record the document count taken from the {@link Docs#Total} counter.</li>
     *   <li>Return if the doc length data already exists; otherwise run the doc length job
     *       and record the average document length.</li>
     * </ol>
     *
     * @return 0 when the work completes or is skipped because the output already exists
     * @throws RuntimeException if the docno mapping data is missing or either job fails
     * @throws ClassCastException if the configured input format class is not an
     *         {@code InputFormat}
     * @throws Exception on any other failure while configuring or running the jobs
     */
    public int runTool() throws Exception {
        Configuration conf = getConf();
        FileSystem fs = FileSystem.get(conf);

        String indexPath = conf.get(Constants.IndexPath);
        String collectionName = conf.get(Constants.CollectionName);
        String collectionPath = conf.get(Constants.CollectionPath);
        String inputFormat = conf.get(Constants.InputFormat);
        String tokenizer = conf.get(Constants.Tokenizer);
        String mappingClass = conf.get(Constants.DocnoMappingClass);
        int docnoOffset = conf.getInt(Constants.DocnoOffset, 0);
        int numReducers = conf.getInt(Constants.TermDocVectorSegments, 0);

        LOG.info("PowerTool: " + BuildTermDocVectors.class.getSimpleName());
        LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
        LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
        LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath));
        LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat));
        LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer));
        LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass));
        LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset));
        LOG.info(String.format(" - %s: %s", Constants.TermDocVectorSegments, numReducers));
        LOG.info(String.format(" - %s: %s", Constants.CollectionVocab, conf.get(Constants.CollectionVocab)));
        LOG.info(String.format(" - %s: %s", Constants.StopwordList, conf.get(Constants.StopwordList)));

        RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
        Path mappingFile = env.getDocnoMappingData();
        if (!fs.exists(mappingFile)) {
            throw new RuntimeException("Error, docno mapping data file " + mappingFile + " doesn't exist!");
        }
        DistributedCache.addCacheFile(mappingFile.toUri(), conf);

        Path outputPath = new Path(env.getTermDocVectorsDirectory());
        if (fs.exists(outputPath)) {
            LOG.info("TermDocVectors already exist: Skipping!");
            return 0;
        }

        env.writeCollectionName(collectionName);
        env.writeCollectionPath(collectionPath);
        env.writeInputFormat(inputFormat);
        env.writeDocnoMappingClass(mappingClass);
        env.writeTokenizerClass(tokenizer);
        env.writeDocnoOffset(docnoOffset);

        conf.set("mapreduce.task.timeout", "6000000");
        conf.set("mapreduce.map.memory.mb", "2048");
        conf.set("mapreduce.map.java.opts", "-Xmx2048m");

        Job vectorJob = Job.getInstance(conf, BuildTermDocVectors.class.getSimpleName() + ":" + collectionName);
        vectorJob.setJarByClass(BuildTermDocVectors.class);
        vectorJob.setNumReduceTasks(numReducers);
        FileInputFormat.addInputPaths(vectorJob, collectionPath);
        FileOutputFormat.setOutputPath(vectorJob, outputPath);
        SequenceFileOutputFormat.setOutputCompressionType(vectorJob, SequenceFile.CompressionType.RECORD);
        // asSubclass gives the bounded Class type the API requires and rejects a
        // misconfigured class name here rather than inside the running job.
        vectorJob.setInputFormatClass(
            Class.forName(inputFormat).asSubclass(org.apache.hadoop.mapreduce.InputFormat.class));
        vectorJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        vectorJob.setMapOutputKeyClass(IntWritable.class);
        vectorJob.setMapOutputValueClass(LazyTermDocVector.class);
        vectorJob.setOutputKeyClass(IntWritable.class);
        vectorJob.setOutputValueClass(LazyTermDocVector.class);
        vectorJob.setMapperClass(MyMapper.class);

        long vectorStart = System.currentTimeMillis();
        boolean vectorJobOk = vectorJob.waitForCompletion(true);
        LOG.info("Job Finished in " + ((System.currentTimeMillis() - vectorStart) / 1000.0d) + " seconds");
        if (!vectorJobOk) {
            // Do not record a document count derived from a failed job.
            throw new RuntimeException("Error: job " + vectorJob.getJobName() + " failed!");
        }

        int numDocs = (int) vectorJob.getCounters().findCounter(Docs.Total).getValue();
        env.writeCollectionDocumentCount(numDocs);

        Path doclengthsFile = env.getDoclengthsData();
        if (fs.exists(doclengthsFile)) {
            LOG.info("DocLength data exists: Skipping!");
            return 0;
        }

        conf.setInt(Constants.CollectionDocumentCount, numDocs);
        conf.set(InputPath, env.getDoclengthsDirectory().toString());
        conf.set(DocLengthDataFile, doclengthsFile.toString());
        conf.set("mapreduce.map.memory.mb", "2048");
        conf.set("mapreduce.map.java.opts", "-Xmx2048m");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

        LOG.info("Writing doc length data to " + doclengthsFile + "...");
        Job lengthJob = Job.getInstance(conf, "DocLengthTable:" + collectionName);
        lengthJob.setJarByClass(BuildTermDocVectors.class);
        lengthJob.setNumReduceTasks(0);
        lengthJob.setInputFormatClass(NullInputFormat.class);
        lengthJob.setOutputFormatClass(NullOutputFormat.class);
        lengthJob.setMapperClass(DocLengthDataWriterMapper.class);

        long lengthStart = System.currentTimeMillis();
        boolean lengthJobOk = lengthJob.waitForCompletion(true);
        LOG.info("Job finished in " + ((System.currentTimeMillis() - lengthStart) / 1000.0d) + " seconds");
        if (!lengthJobOk) {
            // Do not record an average length derived from a failed job.
            throw new RuntimeException("Error: job " + lengthJob.getJobName() + " failed!");
        }

        long sumOfDocLengths = lengthJob.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue();
        env.writeCollectionAverageDocumentLength(((float) sumOfDocLengths) / numDocs);
        return 0;
    }
}
