package ivory.lsh.eval;

import edu.umd.cloud9.collection.DocnoMapping;
import edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMapping;
import edu.umd.cloud9.collection.wikipedia.WikipediaPage;
import edu.umd.cloud9.io.SequenceFileUtils;
import edu.umd.cloud9.io.map.HMapIIW;
import ivory.core.Constants;
import ivory.core.RetrievalEnvironment;
import ivory.core.util.DelimitedValuesFileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.SortedMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/lsh/eval/ExtractWikipedia.class */
public class ExtractWikipedia extends Configured implements Tool {
    /**
     * Zero-length array. NOTE(review): presumably the list of job parameters this tool
     * requires (none here) — nothing in this file reads it; confirm against external callers.
     */
    public static final String[] RequiredParameters = new String[0];
    /** Shared log4j logger; the nested mapper classes log through it as well. */
    private static final Logger sLogger = Logger.getLogger(ExtractWikipedia.class);

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:ivory/lsh/eval/ExtractWikipedia$Count.class */
    /**
     * Hadoop counters incremented by the mappers in this file.
     *
     * <ul>
     *   <li>{@code INTER} - pages in which a {@code [[de:...]]} link was found.</li>
     *   <li>{@code DOCS} - pages that passed the sample filter and were processed.</li>
     *   <li>{@code SKIPPED} - pages whose docid maps to a negative docno.</li>
     * </ul>
     */
    public enum Count {
        INTER,
        DOCS,
        SKIPPED;

        /**
         * Returns all constants of this enum in declaration order.
         *
         * @return a newly allocated array; modifying it does not affect later calls
         */
        public static Count[] valuesCustom() {
            return values().clone();
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:ivory/lsh/eval/ExtractWikipedia$Maps.class */
    /**
     * Hadoop counters for title lookups: {@code FOUND} when a page title has an entry in the
     * title-to-docno map, {@code NOTFOUND} when it does not.
     */
    public enum Maps {
        FOUND,
        NOTFOUND;

        /**
         * Returns all constants of this enum in declaration order.
         *
         * @return a newly allocated array; modifying it does not affect later calls
         */
        public static Maps[] valuesCustom() {
            return values().clone();
        }
    }

    /* loaded from: input_file:ivory/lsh/eval/ExtractWikipedia$MyMapperEn2DeDocno.class */
    static class MyMapperEn2DeDocno extends MapReduceBase implements Mapper<IntWritable, WikipediaPage, IntWritable, IntWritable> {
        SortedMap<WritableComparable, Writable> title2Docno;
        private DocnoMapping mDocMapping;
        String sampleDocnosFile;
        HMapIIW samplesMap = null;

        MyMapperEn2DeDocno() {
        }

        public void configure(JobConf jobConf) {
            ExtractWikipedia.sLogger.setLevel(Level.DEBUG);
            try {
                this.title2Docno = SequenceFileUtils.readFileIntoMap(new Path(jobConf.get("TitleDocnoFile")));
            } catch (IOException e) {
                e.printStackTrace();
            }
            this.sampleDocnosFile = jobConf.get("SampleDocnosFile");
            if (this.sampleDocnosFile != null) {
                this.samplesMap = new HMapIIW();
                try {
                    LineReader lineReader = new LineReader(FileSystem.get(jobConf).open(new Path(this.sampleDocnosFile)));
                    Text text = new Text();
                    while (lineReader.readLine(text) != 0) {
                        this.samplesMap.put(Integer.parseInt(text.toString()), 1);
                    }
                    lineReader.close();
                } catch (IOException e2) {
                }
                ExtractWikipedia.sLogger.info("Loaded " + this.samplesMap.size() + " samples");
            } else {
                ExtractWikipedia.sLogger.info("No sample file read.");
            }
            this.mDocMapping = new WikipediaDocnoMapping();
            try {
                FileSystem fileSystem = FileSystem.get(jobConf);
                this.mDocMapping.loadMapping(new RetrievalEnvironment(jobConf.get(Constants.IndexPath), fileSystem).getDocnoMappingData(), fileSystem);
            } catch (IOException e3) {
                e3.printStackTrace();
            }
        }

        public void map(IntWritable intWritable, WikipediaPage wikipediaPage, OutputCollector<IntWritable, IntWritable> outputCollector, Reporter reporter) throws IOException {
            int docno = this.mDocMapping.getDocno(wikipediaPage.getDocid());
            if (docno < 0) {
                reporter.incrCounter(Count.SKIPPED, 1L);
                return;
            }
            int i = docno + 1000000000;
            if (this.samplesMap != null && !this.samplesMap.containsKey(i)) {
                ExtractWikipedia.sLogger.info(String.valueOf(i) + " not found!");
                ExtractWikipedia.sLogger.info(this.samplesMap);
                return;
            }
            Text text = new Text(wikipediaPage.getTitle());
            IntWritable intWritable2 = this.title2Docno.get(text);
            if (intWritable2 == null) {
                reporter.incrCounter(Maps.NOTFOUND, 1L);
                ExtractWikipedia.sLogger.debug(text + " does not have language link");
            } else {
                reporter.incrCounter(Maps.FOUND, 1L);
                ExtractWikipedia.sLogger.debug("English docno: " + intWritable2.get());
                ExtractWikipedia.sLogger.debug("German title: " + text);
                outputCollector.collect(intWritable2, new IntWritable(i));
            }
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((IntWritable) obj, (WikipediaPage) obj2, (OutputCollector<IntWritable, IntWritable>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:ivory/lsh/eval/ExtractWikipedia$MyMapperGetDocs.class */
    /**
     * Mapper that emits the content of every page whose offset docno
     * ({@code docno + 1000000000}) appears in the first column of the sample file.
     *
     * <p>Configuration keys read: {@code SampleDocnosFile} (optional; each line holds two
     * integers separated by {@code DelimitedValuesFileReader.DEFAULT_DELIMITER}) and
     * {@code Constants.IndexPath}. Without a sample file the first-column set stays empty and
     * nothing is emitted.
     */
    static class MyMapperGetDocs extends MapReduceBase implements Mapper<LongWritable, WikipediaPage, Text, Text> {
        Text valText;
        Text keyText;
        private DocnoMapping mDocMapping;
        /** Value of {@code SampleDocnosFile}; may be {@code null}. */
        String sampleDocnosFile;
        /** Allocated when a sample file is configured; not otherwise consulted by this mapper. */
        HMapIIW samplesMap = null;
        /** First column of the sample file. */
        HashSet<Integer> germandocnos = new HashSet<>();
        /** Second column of the sample file; populated but not consulted by {@code map}. */
        HashSet<Integer> englishdocnos = new HashSet<>();

        MyMapperGetDocs() {
        }

        /**
         * Loads the optional sample pairs and the docno mapping.
         *
         * @param jobConf job configuration supplying the keys listed on the class
         * @throws RuntimeException wrapping the {@link IOException} if an input cannot be read
         * @throws IllegalArgumentException if a non-blank sample line has fewer than two fields
         */
        public void configure(JobConf jobConf) {
            ExtractWikipedia.sLogger.setLevel(Level.DEBUG);
            this.keyText = new Text();
            this.valText = new Text();

            this.sampleDocnosFile = jobConf.get("SampleDocnosFile");
            if (this.sampleDocnosFile != null) {
                this.samplesMap = new HMapIIW();
                loadSamplePairs(jobConf);
            }

            this.mDocMapping = new WikipediaDocnoMapping();
            try {
                FileSystem fileSystem = FileSystem.get(jobConf);
                this.mDocMapping.loadMapping(new RetrievalEnvironment(jobConf.get(Constants.IndexPath), fileSystem).getDocnoMappingData(), fileSystem);
            } catch (IOException e) {
                throw new RuntimeException("Unable to load docno mapping", e);
            }
        }

        /** Reads the two-column sample file into {@link #germandocnos} and {@link #englishdocnos}. */
        private void loadSamplePairs(JobConf jobConf) {
            try {
                LineReader lineReader = new LineReader(FileSystem.get(jobConf).open(new Path(this.sampleDocnosFile)));
                try {
                    Text line = new Text();
                    while (lineReader.readLine(line) != 0) {
                        String row = line.toString();
                        if (row.trim().length() == 0) {
                            continue; // a blank (e.g. trailing) line carries no pair
                        }
                        String[] fields = row.split(DelimitedValuesFileReader.DEFAULT_DELIMITER);
                        if (fields.length < 2) {
                            // Report the offending line instead of an opaque index error.
                            throw new IllegalArgumentException("Malformed sample line (expected two delimited docnos): " + row);
                        }
                        this.germandocnos.add(Integer.parseInt(fields[0].trim()));
                        this.englishdocnos.add(Integer.parseInt(fields[1].trim()));
                    }
                } finally {
                    // Release the stream even when reading or parsing fails.
                    lineReader.close();
                }
            } catch (IOException e) {
                // A partially loaded sample set would silently drop requested documents.
                throw new RuntimeException("Unable to read sample docnos from " + this.sampleDocnosFile, e);
            }
        }

        /**
         * Emits {@code ("DOCNO " + offsetDocno, "\n" + content)} for sampled pages.
         *
         * @param longWritable input key (unused)
         * @param wikipediaPage page being processed
         * @param outputCollector receives the key/content pair
         * @param reporter used for the {@link Count} counters
         * @throws IOException if the collector fails
         */
        public void map(LongWritable longWritable, WikipediaPage wikipediaPage, OutputCollector<Text, Text> outputCollector, Reporter reporter) throws IOException {
            int docno = this.mDocMapping.getDocno(wikipediaPage.getDocid());
            if (docno < 0) {
                reporter.incrCounter(Count.SKIPPED, 1L);
                return;
            }
            int offsetDocno = docno + 1000000000;
            if (this.germandocnos.contains(offsetDocno)) {
                reporter.incrCounter(Count.DOCS, 1L);
                this.valText.set("\n" + wikipediaPage.getContent());
                this.keyText.set("DOCNO " + offsetDocno);
                outputCollector.collect(this.keyText, this.valText);
            }
        }
    }

    /* loaded from: input_file:ivory/lsh/eval/ExtractWikipedia$MyMapperTitle2Docno.class */
    /**
     * Mapper used by step 1 of {@link ExtractWikipedia#run(String[])}: for each page whose raw
     * XML contains a {@code [[de:Title]]} link it emits {@code (Title, docno)}, with any
     * {@code #anchor} suffix stripped from the title.
     *
     * <p>Configuration keys read: {@code SampleDocnosFile} (optional; one integer docno per
     * line, restricting which pages are processed) and {@code Constants.IndexPath}.
     */
    static class MyMapperTitle2Docno extends MapReduceBase implements Mapper<IntWritable, WikipediaPage, Text, IntWritable> {
        /**
         * Matches a {@code [[de:...]]} link. The quantifier is reluctant so the capture stops
         * at the first {@code ]]} rather than swallowing further links on the same line.
         * Compiled once instead of on every {@code map} call.
         */
        private static final Pattern GERMAN_LINK_PATTERN = Pattern.compile("\\[\\[de:(.+?)\\]\\]");

        Text valText;
        IntWritable keyInt;
        private DocnoMapping mDocMapping;
        /** Value of {@code SampleDocnosFile}; {@code null} disables sample filtering. */
        String sampleDocnosFile;
        /** Set of sampled docnos, or {@code null} when no sample file is configured. */
        HMapIIW samplesMap = null;

        MyMapperTitle2Docno() {
        }

        /**
         * Loads the optional sample set and the docno mapping.
         *
         * @param jobConf job configuration supplying the keys listed on the class
         * @throws RuntimeException wrapping the {@link IOException} if an input cannot be read
         */
        public void configure(JobConf jobConf) {
            this.keyInt = new IntWritable();
            this.valText = new Text();

            this.sampleDocnosFile = jobConf.get("SampleDocnosFile");
            if (this.sampleDocnosFile != null) {
                loadSamples(jobConf);
            }

            this.mDocMapping = new WikipediaDocnoMapping();
            try {
                FileSystem fileSystem = FileSystem.get(jobConf);
                this.mDocMapping.loadMapping(new RetrievalEnvironment(jobConf.get(Constants.IndexPath), fileSystem).getDocnoMappingData(), fileSystem);
            } catch (IOException e) {
                throw new RuntimeException("Unable to load docno mapping", e);
            }
        }

        /** Reads {@link #sampleDocnosFile} (one integer per line) into {@link #samplesMap}. */
        private void loadSamples(JobConf jobConf) {
            this.samplesMap = new HMapIIW();
            try {
                LineReader lineReader = new LineReader(FileSystem.get(jobConf).open(new Path(this.sampleDocnosFile)));
                try {
                    Text line = new Text();
                    while (lineReader.readLine(line) != 0) {
                        // Tolerate trailing whitespace / blank lines instead of failing in parseInt.
                        String docno = line.toString().trim();
                        if (docno.length() > 0) {
                            this.samplesMap.put(Integer.parseInt(docno), 1);
                        }
                    }
                } finally {
                    // Release the stream even when reading or parsing fails.
                    lineReader.close();
                }
            } catch (IOException e) {
                // A partially loaded sample set would silently filter out valid pages.
                throw new RuntimeException("Unable to read sample docnos from " + this.sampleDocnosFile, e);
            }
        }

        /**
         * Emits {@code (German title, docno)} when the page links to a German article.
         *
         * @param intWritable input key (unused)
         * @param wikipediaPage page being processed
         * @param outputCollector receives the title/docno pair
         * @param reporter used for the {@link Count} counters
         * @throws IOException if the collector fails
         */
        public void map(IntWritable intWritable, WikipediaPage wikipediaPage, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
            int docno = this.mDocMapping.getDocno(wikipediaPage.getDocid());
            if (docno < 0) {
                reporter.incrCounter(Count.SKIPPED, 1L);
                return;
            }
            if (this.samplesMap != null && !this.samplesMap.containsKey(docno)) {
                return;
            }
            reporter.incrCounter(Count.DOCS, 1L);
            String rawXML = wikipediaPage.getRawXML();
            Matcher matcher = GERMAN_LINK_PATTERN.matcher(rawXML);
            if (!matcher.find()) {
                return;
            }
            String link = matcher.group(1);
            ExtractWikipedia.sLogger.debug(wikipediaPage.getTitle());
            ExtractWikipedia.sLogger.debug(link);
            // Drop a "#section" anchor. indexOf is used because "#".split("#") yields an
            // empty array, which made the former split(...)[0] throw on such a link.
            int anchor = link.indexOf('#');
            String germanTitle = anchor >= 0 ? link.substring(0, anchor) : link;
            this.keyInt.set(docno);
            this.valText.set(germanTitle);
            outputCollector.collect(this.valText, this.keyInt);
            reporter.incrCounter(Count.INTER, 1L);
            ExtractWikipedia.sLogger.debug(rawXML);
            ExtractWikipedia.sLogger.debug("---------");
        }
    }

    /**
     * Writes the expected command-line arguments to standard output.
     *
     * @return always {@code -1}, which {@code run} passes on as its result
     */
    private static int printUsage() {
        final String usage = "usage: [en-collection-path] [en-index-path] [intermediate-output-path] [de-collection-path] [de-index-path] [output-path] [sample-docnos-path]";
        System.out.println(usage);
        return -1;
    }

    /**
     * Runs the two extraction jobs back to back.
     *
     * <p>Step 1 reads the first collection with {@link MyMapperTitle2Docno} and writes a
     * sequence file of (title, docno) pairs to the intermediate path, which is deleted first.
     * Step 2 reads the second collection with {@link MyMapperEn2DeDocno}, pointing it at
     * {@code part-00000} of the intermediate output and at the sample docnos file.
     *
     * @param strArr exactly seven arguments, in the order printed by {@code printUsage()}
     * @return {@code 0} after both jobs have run, or {@code -1} if the argument count is wrong
     * @throws Exception if either job or the file-system access fails
     */
    public int run(String[] strArr) throws Exception {
        if (strArr.length != 7) {
            return printUsage();
        }

        // ---- Step 1: first collection -> intermediate (title, docno) sequence file ----
        final String enCollectionPath = strArr[0];
        final String enIndexPath = strArr[1];
        final String intermediatePath = strArr[2];

        JobConf step1 = new JobConf(getConf(), ExtractWikipedia.class);
        step1.set(Constants.IndexPath, enIndexPath);
        step1.setJobName("ExtractWikipedia-Step1");
        sLogger.info("Extracting information from En-Wikipedia...");
        sLogger.info("InputPath: " + enCollectionPath);
        sLogger.info("Output Path: " + intermediatePath);
        sLogger.info("Index Path: " + enIndexPath);
        // Clear any previous intermediate output so the job can write there.
        FileSystem.get(step1).delete(new Path(intermediatePath), true);
        applySharedJobSettings(step1, enCollectionPath, intermediatePath);
        step1.setMapOutputKeyClass(Text.class);
        step1.setMapOutputValueClass(IntWritable.class);
        step1.setOutputKeyClass(Text.class);
        step1.setOutputValueClass(IntWritable.class);
        step1.setMapperClass(MyMapperTitle2Docno.class);
        step1.setReducerClass(IdentityReducer.class);
        step1.setOutputFormat(SequenceFileOutputFormat.class);
        JobClient.runJob(step1);

        // ---- Step 2: second collection -> docno pairs ----
        final String deCollectionPath = strArr[3];
        final String deIndexPath = strArr[4];
        final String outputPath = strArr[5];
        final String sampleDocnosPath = strArr[6];

        JobConf step2 = new JobConf(getConf(), ExtractWikipedia.class);
        step2.set(Constants.IndexPath, deIndexPath);
        step2.set("TitleDocnoFile", intermediatePath + "/part-00000");
        step2.set("SampleDocnosFile", sampleDocnosPath);
        step2.setJobName("ExtractWikipedia-Step2");
        sLogger.info("Extracting information from De-Wikipedia...");
        sLogger.info("InputPath: " + deCollectionPath);
        sLogger.info("Output Path: " + outputPath);
        sLogger.info("Index Path: " + deIndexPath);
        sLogger.info("Sample file: " + sampleDocnosPath);
        applySharedJobSettings(step2, deCollectionPath, outputPath);
        step2.setMapOutputKeyClass(IntWritable.class);
        step2.setMapOutputValueClass(IntWritable.class);
        step2.setOutputKeyClass(IntWritable.class);
        step2.setOutputValueClass(IntWritable.class);
        step2.setMapperClass(MyMapperEn2DeDocno.class);
        step2.setReducerClass(IdentityReducer.class);
        JobClient.runJob(step2);
        return 0;
    }

    /** Applies the input/output paths and the resource settings that both steps share. */
    private static void applySharedJobSettings(JobConf conf, String inputPath, String outputPath) {
        FileInputFormat.setInputPaths(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
        FileOutputFormat.setCompressOutput(conf, false);
        conf.set("mapred.child.java.opts", "-Xmx2048m");
        conf.setInt("mapred.map.max.attempts", 10);
        conf.setInt("mapred.reduce.max.attempts", 10);
        conf.setInt("mapred.task.timeout", 6000000);
        conf.setNumMapTasks(100);
        conf.setNumReduceTasks(1);
        conf.setInputFormat(SequenceFileInputFormat.class);
    }

    /**
     * Command-line entry point: runs this tool through {@link ToolRunner} and exits the JVM
     * with the value returned by {@link #run(String[])}.
     *
     * @param strArr command-line arguments handed to {@code ToolRunner}
     * @throws Exception propagated from {@code ToolRunner.run}
     */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new ExtractWikipedia(), strArr);
        System.exit(exitCode);
    }
}
