package com.digitalpebble.stormcrawler.protocol;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.google.common.cache.Cache;
import com.google.common.primitives.Ints;
import crawlercommons.robots.BaseRobotRules;
import java.net.URL;
import java.util.LinkedList;
import java.util.Locale;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.Config;

/* loaded from: input_file:com/digitalpebble/stormcrawler/protocol/HttpRobotRulesParser.class */
/**
 * Resolves the robots.txt rules that apply to a URL by fetching
 * {@code /robots.txt} through the supplied {@link Protocol}, following at most
 * one level of HTTP redirection, and caching the outcome per
 * protocol/host/port.
 *
 * <p>Lookups consult the inherited {@code ERRORCACHE} first and then
 * {@code CACHE}; only when both miss is the robots.txt file fetched.
 */
public class HttpRobotRulesParser extends RobotRulesParser {
    /** Path under which a host serves its own robots.txt file. */
    private static final String ROBOTS_TXT_PATH = "/robots.txt";

    /**
     * When {@code false}, a 403 response for robots.txt yields
     * {@code FORBID_ALL_RULES}; when {@code true}, it is handled like any other
     * non-200, non-5xx status and yields {@code EMPTY_RULES}. Overwritten by
     * {@link #setConf(Config)}.
     */
    protected boolean allowForbidden = false;

    HttpRobotRulesParser() {
    }

    /**
     * Creates a parser and immediately configures it from {@code config}.
     *
     * @param config the topology configuration passed on to {@link #setConf(Config)}
     */
    public HttpRobotRulesParser(Config config) {
        setConf(config);
    }

    /**
     * Applies the parent configuration and then reads
     * {@code http.robots.403.allow} into {@link #allowForbidden}, passing
     * {@code true} as the fallback argument to {@code ConfUtils.getBoolean}.
     *
     * @param config the configuration to read from
     */
    @Override
    public void setConf(Config config) {
        super.setConf(config);
        this.allowForbidden = ConfUtils.getBoolean(config, "http.robots.403.allow", true);
    }

    /**
     * Builds the cache key for a URL as {@code protocol:host:port}, with the
     * protocol and host lower-cased and the protocol's default port substituted
     * when the URL carries no explicit port.
     *
     * @param url the URL whose protocol, host and port identify the cache entry
     * @return the cache key for {@code url}
     */
    protected static String getCacheKey(URL url) {
        final String scheme = url.getProtocol().toLowerCase(Locale.ROOT);
        final String host = url.getHost().toLowerCase(Locale.ROOT);
        int port = url.getPort();
        if (port == -1) {
            // No explicit port in the URL: fall back to the scheme's default so that
            // "http://host/" and "http://host:80/" share one cache entry.
            port = url.getDefaultPort();
        }
        return scheme + ":" + host + ":" + port;
    }

    /**
     * Returns the robots rules applicable to {@code url}, fetching and caching
     * them on a cache miss.
     *
     * <p>Outcome by final status code: 200 parses the body; 403 yields
     * {@code FORBID_ALL_RULES} unless {@link #allowForbidden} is set; 5xx and
     * any thrown {@link Throwable} yield {@code EMPTY_RULES} stored in
     * {@code ERRORCACHE}; everything else yields {@code EMPTY_RULES} stored in
     * {@code CACHE}.
     *
     * <p>On a cache miss the returned instance additionally carries the content
     * lengths of the responses fetched (one entry per request made); the cached
     * instance does not.
     *
     * @param protocol the protocol implementation used to fetch robots.txt
     * @param url a URL on the host whose rules are wanted
     * @return the rules for the host of {@code url}; never {@code null} on the
     *         fetch path
     */
    @Override
    public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url) {
        final String cacheKey = getCacheKey(url);

        // Previously failed fetches are looked up first, then regular entries.
        final BaseRobotRules knownFailure = (BaseRobotRules) ERRORCACHE.getIfPresent(cacheKey);
        if (knownFailure != null) {
            return knownFailure;
        }
        final BaseRobotRules knownRules = (BaseRobotRules) CACHE.getIfPresent(cacheKey);
        if (knownRules != null) {
            return knownRules;
        }

        boolean fetchSucceeded = true;
        URL redirectTarget = null;
        BaseRobotRules rules;
        LOG.debug("Cache miss {} for {}", cacheKey, url);
        final LinkedList<Integer> fetchedLengths = new LinkedList<Integer>();
        try {
            ProtocolResponse response =
                    protocol.getProtocolOutput(new URL(url, ROBOTS_TXT_PATH).toString(), Metadata.empty);
            int statusCode = response.getStatusCode();
            fetchedLengths.add(Integer.valueOf(contentLength(response)));

            // Follow a single level of redirection.
            if (isRedirect(statusCode)) {
                final String location = response.getMetadata().getFirstValue(HttpHeaders.LOCATION);
                if (StringUtils.isNotBlank(location)) {
                    // Location values that do not start with "http" are resolved
                    // against the original URL.
                    redirectTarget = !location.startsWith("http") ? new URL(url, location) : new URL(location);
                    response = protocol.getProtocolOutput(redirectTarget.toString(), Metadata.empty);
                    statusCode = response.getStatusCode();
                    fetchedLengths.add(Integer.valueOf(contentLength(response)));
                }
            }

            if (statusCode == 200) {
                final String contentType = response.getMetadata().getFirstValue(HttpHeaders.CONTENT_TYPE);
                rules = parseRules(url.toString(), response.getContent(), contentType, this.agentNames);
            } else if (statusCode == 403 && !this.allowForbidden) {
                rules = FORBID_ALL_RULES;
            } else if (statusCode >= 500) {
                // Server-side failure: store the permissive result in the error cache.
                fetchSucceeded = false;
                rules = EMPTY_RULES;
            } else {
                rules = EMPTY_RULES;
            }
        } catch (Throwable th) {
            // Best effort: any failure while fetching or parsing is logged and
            // recorded in the error cache rather than propagated.
            LOG.info("Couldn't get robots.txt for {} : {}", url, th.toString());
            fetchSucceeded = false;
            rules = EMPTY_RULES;
        }

        final RobotRules cacheEntry = new RobotRules(rules);
        final Cache<String, BaseRobotRules> targetCache = fetchSucceeded ? CACHE : ERRORCACHE;
        final String cacheName = fetchSucceeded ? "success" : "error";

        LOG.debug("Caching robots for {} under key {} in cache {}", new Object[] {url, cacheKey, cacheName});
        targetCache.put(cacheKey, cacheEntry);

        // Also cache for the redirect target's host, but only when the redirect
        // landed on that host's own /robots.txt. A redirect to any other path
        // serves rules meant for the original host; storing them under the target
        // host's key would make later lookups for that host return the wrong rules.
        if (redirectTarget != null
                && !redirectTarget.getHost().equalsIgnoreCase(url.getHost())
                && ROBOTS_TXT_PATH.equals(redirectTarget.getFile())) {
            final String redirectKey = getCacheKey(redirectTarget);
            LOG.debug("Caching robots for {} under key {} in cache {}",
                    new Object[] {redirectTarget, redirectKey, cacheName});
            targetCache.put(redirectKey, cacheEntry);
        }

        // The instance handed back is separate from the cached one so that the
        // fetched content lengths are attached only to this call's result.
        final RobotRules result = new RobotRules(rules);
        result.setContentLengthFetched(Ints.toArray(fetchedLengths));
        return result;
    }

    /** Tells whether {@code statusCode} is one of the redirect codes that is followed. */
    private static boolean isRedirect(int statusCode) {
        return statusCode == 301 || statusCode == 302 || statusCode == 307 || statusCode == 308;
    }

    /** Returns the length of the response body, or 0 when the response has no content. */
    private static int contentLength(ProtocolResponse response) {
        final byte[] content = response.getContent();
        return content != null ? content.length : 0;
    }
}
