package edu.umd.cloud9.webgraph;

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfIntString;
import edu.umd.cloud9.util.PowerTool;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.log4j.Logger;

/* loaded from: input_file:edu/umd/cloud9/webgraph/CollectHostnames.class */
public class CollectHostnames extends PowerTool {
    private static final Logger sLogger = Logger.getLogger(CollectHostnames.class);
    public static final String[] RequiredParameters = {"Cloud9.InputPath", "Cloud9.OutputPath", "Cloud9.Mappers", "Cloud9.Reducers"};

    /* loaded from: input_file:edu/umd/cloud9/webgraph/CollectHostnames$Map.class */
    public static class Map extends MapReduceBase implements Mapper<IntWritable, ArrayListWritable<AnchorText>, PairOfIntString, IntWritable> {
        private static final PairOfIntString keyWord = new PairOfIntString();
        private static final IntWritable valueWord = new IntWritable();
        private static String host;

        public void map(IntWritable intWritable, ArrayListWritable<AnchorText> arrayListWritable, OutputCollector<PairOfIntString, IntWritable> outputCollector, Reporter reporter) throws IOException {
            Iterator<E> it = arrayListWritable.iterator();
            while (it.hasNext()) {
                AnchorText anchorText = (AnchorText) it.next();
                if (anchorText.isURL()) {
                    try {
                        host = new URI(anchorText.getText()).getHost();
                    } catch (Exception e) {
                        return;
                    }
                }
            }
            Iterator<E> it2 = arrayListWritable.iterator();
            while (it2.hasNext()) {
                AnchorText anchorText2 = (AnchorText) it2.next();
                if (anchorText2.isExternalOutLink()) {
                    valueWord.set(intWritable.get());
                    Iterator<Integer> it3 = anchorText2.iterator();
                    while (it3.hasNext()) {
                        keyWord.set(it3.next().intValue(), host);
                        outputCollector.collect(keyWord, valueWord);
                    }
                }
            }
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, OutputCollector outputCollector, Reporter reporter) throws IOException {
            map((IntWritable) obj, (ArrayListWritable<AnchorText>) obj2, (OutputCollector<PairOfIntString, IntWritable>) outputCollector, reporter);
        }
    }

    /* loaded from: input_file:edu/umd/cloud9/webgraph/CollectHostnames$Partition.class */
    protected static class Partition implements Partitioner<PairOfIntString, IntWritable> {
        protected Partition() {
        }

        public void configure(JobConf jobConf) {
        }

        public int getPartition(PairOfIntString pairOfIntString, IntWritable intWritable, int i) {
            return Math.abs(pairOfIntString.getLeftElement() % i);
        }
    }

    /* loaded from: input_file:edu/umd/cloud9/webgraph/CollectHostnames$Reduce.class */
    /**
     * Reducer that gathers, per target document, one or more {@link AnchorText}
     * entries for every host name that links to it.
     *
     * <p>Keys arrive as {@code (targetDocument, host)}. Entries are accumulated while
     * the target document stays the same and are written out, sorted, when the
     * target document changes; the final document is written from {@link #close()}.
     * NOTE(review): this relies on all keys of one document reaching the same reducer
     * consecutively - confirm against the key ordering and the partitioner in use.
     */
    public static class Reduce extends MapReduceBase implements Reducer<PairOfIntString, IntWritable, IntWritable, ArrayListWritable<AnchorText>> {
        /** Once an entry holds this many documents, a new entry is started for the same host. */
        private static final int MAX_DOCUMENTS_PER_ENTRY = 1048576;

        // Per-instance (not static) state, so nothing leaks from one task into the next
        // when task JVMs are reused.
        private final ArrayListWritable<AnchorText> arrayList = new ArrayListWritable<>();
        private final IntWritable keyWord = new IntWritable();
        private OutputCollector<IntWritable, ArrayListWritable<AnchorText>> outputCollector;
        private boolean firstTime = true;
        private int currentDocument;

        /**
         * Adds the source documents of one {@code (targetDocument, host)} key to the
         * entries of the current target document, flushing the previous document first
         * if the target changed.
         *
         * @param pairOfIntString the key: target document id (left) and host name (right)
         * @param it the source document ids for this key
         * @param outputCollector2 receives completed documents; also remembered for {@link #close()}
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(PairOfIntString pairOfIntString, Iterator<IntWritable> it, OutputCollector<IntWritable, ArrayListWritable<AnchorText>> outputCollector2, Reporter reporter) throws IOException {
            // Remember the collector so close() can emit the last document.
            outputCollector = outputCollector2;

            final int targetDocument = pairOfIntString.getLeftElement();
            if (firstTime) {
                firstTime = false;
                arrayList.clear();
                currentDocument = targetDocument;
            } else if (currentDocument != targetDocument) {
                emitCurrentDocument(outputCollector2);
                currentDocument = targetDocument;
                arrayList.clear();
            }

            final String host = pairOfIntString.getRightElement();
            AnchorText current = new AnchorText(AnchorTextConstants.Type.OTHER_TYPES.val, host);
            arrayList.add(current);
            while (it.hasNext()) {
                final int sourceDocument = it.next().get();
                if (current.getSize() < MAX_DOCUMENTS_PER_ENTRY) {
                    current.addDocument(sourceDocument);
                } else {
                    // The current entry is full: continue in a fresh entry for the same host.
                    current = new AnchorText(AnchorTextConstants.Type.OTHER_TYPES.val, host, sourceDocument);
                    arrayList.add(current);
                }
            }
        }

        /**
         * Writes the entries of the last target document.
         *
         * <p>Does nothing if {@code reduce} was never invoked; previously that case
         * dereferenced a null collector.
         *
         * @throws IOException if the collector fails
         */
        @Override
        public void close() throws IOException {
            if (firstTime) {
                return;
            }
            emitCurrentDocument(outputCollector);
        }

        /** Sorts the accumulated entries and writes them under the current document id. */
        private void emitCurrentDocument(OutputCollector<IntWritable, ArrayListWritable<AnchorText>> collector) throws IOException {
            Collections.sort(arrayList);
            keyWord.set(currentDocument);
            collector.collect(keyWord, arrayList);
        }
    }

    /**
     * Returns the configuration keys this tool declares as required.
     *
     * @return the shared {@link #RequiredParameters} array itself (not a copy)
     */
    @Override // edu.umd.cloud9.util.PowerTool
    public String[] getRequiredParameters() {
        return CollectHostnames.RequiredParameters;
    }

    /**
     * Creates the tool, handing the configuration to the {@code PowerTool} constructor.
     *
     * @param configuration the configuration later read back through {@code getConf()}
     *     in {@link #runTool()}
     */
    public CollectHostnames(Configuration configuration) {
        super(configuration);
    }

    /**
     * Configures and runs the CollectHostnames MapReduce job.
     *
     * <p>Reads {@code Cloud9.InputPath}, {@code Cloud9.OutputPath}, {@code Cloud9.Mappers}
     * (default 1) and {@code Cloud9.Reducers} (default {@code DriverUtil.DEFAULT_REDUCERS})
     * from the configuration. If the output path already exists the job is not run.
     *
     * @return always 0, whether the job ran or was skipped
     * @throws Exception if the file system cannot be accessed or the job fails
     */
    @Override // edu.umd.cloud9.util.PowerTool
    public int runTool() throws Exception {
        JobConf jobConf = new JobConf(getConf(), CollectHostnames.class);
        FileSystem fileSystem = FileSystem.get(jobConf);

        int numMappers = jobConf.getInt("Cloud9.Mappers", 1);
        int numReducers = jobConf.getInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
        String inputPath = jobConf.get("Cloud9.InputPath");
        String outputPath = jobConf.get("Cloud9.OutputPath");

        // The banner previously named a different tool ("PropagateHostname").
        sLogger.info("CollectHostnames");
        sLogger.info(" - input path: " + inputPath);
        sLogger.info(" - output path: " + outputPath);

        // Decide whether to skip before spending any effort on job configuration.
        Path outputDir = new Path(outputPath);
        if (fileSystem.exists(outputDir)) {
            sLogger.info(outputPath + " already exists! Skipping this step...");
            return 0;
        }

        jobConf.setJobName("CollectHostnames");
        jobConf.set("mapred.child.java.opts", "-Xmx4096m");
        jobConf.setInt("mapred.task.timeout", 60000000);
        jobConf.setNumMapTasks(numMappers);
        jobConf.setNumReduceTasks(numReducers);

        jobConf.setMapperClass(Map.class);
        jobConf.setPartitionerClass(Partition.class);
        jobConf.setReducerClass(Reduce.class);

        jobConf.setMapOutputKeyClass(PairOfIntString.class);
        jobConf.setMapOutputValueClass(IntWritable.class);
        jobConf.setOutputKeyClass(IntWritable.class);
        jobConf.setOutputValueClass(ArrayListWritable.class);

        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(jobConf, true);
        SequenceFileOutputFormat.setOutputCompressionType(jobConf, SequenceFile.CompressionType.BLOCK);
        SequenceFileInputFormat.setInputPaths(jobConf, inputPath);
        FileOutputFormat.setOutputPath(jobConf, outputDir);

        JobClient.runJob(jobConf);
        return 0;
    }
}
