package com.digitalpebble.stormcrawler.indexing;

import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.RobotsTags;
import com.digitalpebble.stormcrawler.util.URLUtil;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/digitalpebble/stormcrawler/indexing/AbstractIndexerBolt.class */
/**
 * Base class for indexer bolts. It reads the indexing-related configuration in
 * {@link #prepare(Map, TopologyContext, OutputCollector)} and offers helpers to
 * decide whether a document should be indexed, which metadata should be kept
 * (and under which field names), and which URL value should be used for the
 * document.
 */
public abstract class AbstractIndexerBolt extends BaseRichBolt {
    /** Configuration key listing the {@code metadataKey=fieldName} mappings. */
    public static final String metadata2fieldParamName = "indexer.md.mapping";
    /** Configuration key holding a single {@code key=value} filter on the metadata. */
    public static final String metadataFilterParamName = "indexer.md.filter";
    /** Configuration key holding the name of the field used for the text. */
    public static final String textFieldParamName = "indexer.text.fieldname";
    /** Configuration key holding the name of the field used for the URL. */
    public static final String urlFieldParamName = "indexer.url.fieldname";
    /** Configuration key holding the metadata key under which the canonical URL is stored. */
    public static final String canonicalMetadataParamName = "indexer.canonical.name";

    /** Matches an index suffix such as {@code [2]} in a metadata mapping key; compiled once. */
    private static final Pattern INDEXED_KEY_PATTERN = Pattern.compile("\\[(\\d+)\\]");

    private final Logger LOG = LoggerFactory.getLogger(getClass());
    /** {@code {key, value}} pair a document must have in its metadata, or null when no filter is set. */
    private String[] filterKeyValue = null;
    /** Metadata key (optionally suffixed with {@code [n]}) to target field name. */
    private Map<String, String> metadata2field = new HashMap<String, String>();
    private String fieldNameForText = null;
    private String fieldNameForURL = null;
    private String canonicalMetadataName = null;

    /**
     * Loads the metadata filter, the text / URL field names, the canonical
     * metadata key and the metadata-to-field mappings from the configuration.
     * Entries which do not contain a {@code '='} are logged and ignored.
     *
     * @param map the topology configuration
     * @param topologyContext unused by this implementation
     * @param outputCollector unused by this implementation
     */
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        String filter = ConfUtils.getString(map, metadataFilterParamName);
        if (StringUtils.isNotBlank(filter)) {
            String[] keyValue = splitKeyValue(filter);
            if (keyValue != null) {
                this.filterKeyValue = keyValue;
            } else {
                this.LOG.error("Can't split into key value : {}", filter);
            }
        }
        this.fieldNameForText = ConfUtils.getString(map, textFieldParamName);
        this.fieldNameForURL = ConfUtils.getString(map, urlFieldParamName);
        this.canonicalMetadataName = ConfUtils.getString(map, canonicalMetadataParamName);
        for (String mapping : ConfUtils.loadListFromConf(metadata2fieldParamName, map)) {
            String[] keyValue = splitKeyValue(mapping);
            if (keyValue != null) {
                this.metadata2field.put(keyValue[0], keyValue[1]);
            } else {
                this.LOG.error("Can't split into key value : {}", mapping);
            }
        }
    }

    /**
     * Splits {@code key=value} on the first {@code '='} and trims both sides.
     *
     * @return a two-element array {@code {key, value}}, or null if there is no {@code '='}
     */
    private static String[] splitKeyValue(String mapping) {
        int separator = mapping.indexOf('=');
        if (separator == -1) {
            return null;
        }
        return new String[] {
            mapping.substring(0, separator).trim(), mapping.substring(separator + 1).trim()
        };
    }

    /**
     * Determines whether a document should be kept.
     *
     * @param metadata the metadata of the document
     * @return false if the robots no-index flag is {@code "true"} (case
     *     insensitive), or if a filter is configured and the metadata does not
     *     contain the expected value for the filter key; true otherwise
     */
    public boolean filterDocument(Metadata metadata) {
        if ("true".equalsIgnoreCase(metadata.getFirstValue(RobotsTags.ROBOTS_NO_INDEX))) {
            return false;
        }
        // no filter configured: everything is kept
        if (this.filterKeyValue == null) {
            return true;
        }
        String[] values = metadata.getValues(this.filterKeyValue[0]);
        if (values == null) {
            return false;
        }
        return ArrayUtils.contains(values, this.filterKeyValue[1]);
    }

    /**
     * Returns the metadata selected by the configured mappings, keyed by target
     * field name. A mapping key of the form {@code name[n]} selects only the
     * n-th value of {@code name}; mappings whose metadata key has no values, or
     * whose index is beyond the available values, are skipped.
     *
     * @param metadata the metadata of the document
     * @return a new map of field name to values, possibly empty
     */
    public Map<String, String[]> filterMetadata(Metadata metadata) {
        Map<String, String[]> fieldValues = new HashMap<String, String[]>();
        for (Map.Entry<String, String> entry : this.metadata2field.entrySet()) {
            // -1 means "no index given, keep all the values"
            int index = -1;
            String key = entry.getKey();
            Matcher matcher = INDEXED_KEY_PATTERN.matcher(key);
            if (matcher.find()) {
                index = Integer.parseInt(matcher.group(1));
                // the actual metadata key is whatever precedes the [n] suffix
                key = key.substring(0, matcher.start());
            }
            String[] values = metadata.getValues(key);
            if (values == null || values.length == 0 || index >= values.length) {
                continue;
            }
            if (index == -1) {
                fieldValues.put(entry.getValue(), values);
            } else {
                fieldValues.put(entry.getValue(), new String[] {values[index]});
            }
        }
        return fieldValues;
    }

    /**
     * Returns the value to use for the URL of a document: the canonical URL
     * found in the metadata when one is configured, present, well-formed and on
     * the same host as the document URL; the document URL otherwise.
     *
     * @param tuple a tuple with a {@code url} string field and a {@code metadata} field
     * @return the canonical URL in external form, or the original URL
     */
    public String valueForURL(Tuple tuple) {
        String originalUrl = tuple.getStringByField("url");
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        // functionality deactivated: test the configured value, not the name of
        // the configuration key (which is a non-blank constant)
        if (StringUtils.isBlank(this.canonicalMetadataName)) {
            return originalUrl;
        }
        String canonicalValue = metadata.getFirstValue(this.canonicalMetadataName);
        if (StringUtils.isBlank(canonicalValue)) {
            return originalUrl;
        }
        try {
            URL source = new URL(originalUrl);
            URL canonical = URLUtil.resolveURL(source, canonicalValue);
            // only trust a canonical URL pointing to the same host
            if (source.getHost().equals(canonical.getHost())) {
                return canonical.toExternalForm();
            }
            this.LOG.info("Canonical URL references a different host, ignoring in {} ", originalUrl);
        } catch (MalformedURLException e) {
            this.LOG.error("Malformed canonical URL {} was found in {} ", canonicalValue, originalUrl);
        }
        return originalUrl;
    }

    /**
     * @return the configured field name for the text, as read from the
     *     configuration in {@code prepare}
     */
    public String fieldNameForText() {
        return this.fieldNameForText;
    }

    /**
     * @return the configured field name for the URL, as read from the
     *     configuration in {@code prepare}
     */
    public String fieldNameForURL() {
        return this.fieldNameForURL;
    }

    /**
     * Declares the status stream with the fields {@code url}, {@code metadata}
     * and a third field named after the status stream.
     */
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declareStream(
                Constants.StatusStreamName,
                new Fields("url", "metadata", Constants.StatusStreamName));
    }
}
