package edu.uci.ics.crawler4j.robotstxt;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.url.WebURLFactory;
import edu.uci.ics.crawler4j.util.Util;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.http.NoHttpResponseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.class */
/**
 * Decides whether a URL may be crawled according to the robots.txt of its host.
 *
 * <p>Directives are fetched per host through the configured {@link PageFetcher}, parsed with
 * {@link RobotstxtParser}, and kept in an in-memory cache keyed by lower-cased host name. All
 * accesses to the cache are guarded by synchronizing on the cache map itself.
 */
public class RobotstxtServer {
    private static final Logger logger = LoggerFactory.getLogger(RobotstxtServer.class);

    /** Value passed as {@code maxBytes} by the three-argument constructor. */
    private static final int DEFAULT_MAX_BYTES = 16384;

    /** Upper bound on fetches (initial request plus followed redirects) per robots.txt lookup. */
    private static final int MAX_FETCH_ATTEMPTS = 3;

    /** Byte limit handed to {@link PageFetchResult#fetchContent} when reading robots.txt. */
    private static final int MAX_ROBOTS_TXT_BYTES = 512000;

    private static final int HTTP_OK = 200;

    protected WebURLFactory factory;
    protected RobotstxtConfig config;

    /** Only assigned through {@link #setCrawlConfig(CrawlConfig)}; may be null until then. */
    protected CrawlConfig crawlConfig;

    /** Host name (lower case) to parsed directives; guard every access with its own monitor. */
    protected final Map<String, HostDirectives> host2directivesCache;
    protected PageFetcher pageFetcher;

    /** Stored from the constructor; not read by any code in this class. */
    private final int maxBytes;

    /**
     * Creates a server using the default {@code maxBytes} value (16384).
     *
     * @param robotstxtConfig robots.txt handling configuration
     * @param pageFetcher fetcher used to download robots.txt files
     * @param webURLFactory factory for the {@link WebURL} objects used in fetches
     */
    public RobotstxtServer(RobotstxtConfig robotstxtConfig, PageFetcher pageFetcher, WebURLFactory webURLFactory) {
        this(robotstxtConfig, pageFetcher, DEFAULT_MAX_BYTES, webURLFactory);
    }

    /**
     * Creates a server with an explicit {@code maxBytes} value.
     *
     * @param robotstxtConfig robots.txt handling configuration
     * @param pageFetcher fetcher used to download robots.txt files
     * @param i value stored in the {@code maxBytes} field
     * @param webURLFactory factory for the {@link WebURL} objects used in fetches
     */
    public RobotstxtServer(RobotstxtConfig robotstxtConfig, PageFetcher pageFetcher, int i, WebURLFactory webURLFactory) {
        this.host2directivesCache = new HashMap<>();
        this.config = robotstxtConfig;
        this.pageFetcher = pageFetcher;
        this.maxBytes = i;
        this.factory = webURLFactory;
    }

    /** Returns the host of {@code url} in lower case, used as the cache key. */
    private static String getHost(URL url) {
        // Locale.ROOT keeps the key stable regardless of the JVM default locale
        // (e.g. Turkish would otherwise map 'I' to a dotless 'i').
        return url.getHost().toLowerCase(Locale.ROOT);
    }

    /** Returns true for the HTTP redirect status codes that are followed when fetching robots.txt. */
    private static boolean isRedirect(int statusCode) {
        switch (statusCode) {
            case 300:
            case 301:
            case 302:
            case 303:
            case 307:
            case 308:
                return true;
            default:
                return false;
        }
    }

    /**
     * Checks whether the robots.txt directives of the URL's host permit fetching its path.
     *
     * @param webURL the URL to check
     * @return {@code true} if robots.txt handling is disabled, the URL is malformed, or the
     *     host's directives allow the path; otherwise the result of the directives check
     * @throws IOException declared for compatibility with callers
     * @throws InterruptedException if the robots.txt fetch is interrupted and the crawl
     *     configuration requests halting on error
     */
    public boolean allows(WebURL webURL) throws IOException, InterruptedException {
        if (!this.config.isEnabled()) {
            return true;
        }
        try {
            URL url = new URL(webURL.getURL());
            String host = getHost(url);
            String path = url.getPath();
            HostDirectives hostDirectives;
            // Reads must hold the same lock as writes: HashMap is not safe for concurrent access.
            synchronized (this.host2directivesCache) {
                hostDirectives = this.host2directivesCache.get(host);
                if (hostDirectives != null && hostDirectives.needsRefetch()) {
                    this.host2directivesCache.remove(host);
                    hostDirectives = null;
                }
            }
            if (hostDirectives == null) {
                hostDirectives = fetchDirectives(url);
            }
            return hostDirectives.allows(path);
        } catch (MalformedURLException e) {
            logger.error("Bad URL in Robots.txt: " + webURL.getURL(), e);
            logger.warn("RobotstxtServer: default: allow {}", webURL.getURL());
            return true;
        }
    }

    /**
     * Downloads and parses robots.txt for the host of {@code url}, then caches the result.
     *
     * <p>Up to {@value #MAX_FETCH_ATTEMPTS} requests are made while following redirects. Any
     * failure to fetch or parse is logged and results in a default {@link HostDirectives}
     * built from the configuration, which is cached like a successful result.
     *
     * @param url a URL on the host whose robots.txt is wanted
     * @return the directives for the host, never null
     * @throws IOException declared for compatibility; I/O failures are logged, not propagated
     * @throws InterruptedException if interrupted while the crawl config says to halt on error
     */
    private HostDirectives fetchDirectives(URL url) throws IOException, InterruptedException {
        WebURL robotsTxtUrl = this.factory.newWebUrl();
        String host = getHost(url);
        int port = url.getPort();
        String portSuffix = (port == -1 || port == url.getDefaultPort()) ? "" : ":" + port;
        robotsTxtUrl.setURL(url.getProtocol() + "://" + host + portSuffix + "/robots.txt");

        HostDirectives hostDirectives = null;
        PageFetchResult pageFetchResult = null;
        try {
            for (int attempt = 0; attempt < MAX_FETCH_ATTEMPTS; attempt++) {
                pageFetchResult = this.pageFetcher.fetchPage(robotsTxtUrl);
                if (!isRedirect(pageFetchResult.getStatusCode()) || pageFetchResult.getMovedToUrl() == null) {
                    break;
                }
                robotsTxtUrl.setURL(pageFetchResult.getMovedToUrl());
                pageFetchResult.discardContentIfNotConsumed();
            }
            if (pageFetchResult.getStatusCode() == HTTP_OK) {
                Page page = new Page(robotsTxtUrl);
                pageFetchResult.fetchContent(page, MAX_ROBOTS_TXT_BYTES);
                hostDirectives = parseDirectives(page, robotsTxtUrl);
            } else {
                logger.debug("Can't read this robots.txt: {}  as it's status code is {}", robotsTxtUrl.getURL(), Integer.valueOf(pageFetchResult.getStatusCode()));
            }
        } catch (SocketException | SocketTimeoutException | UnknownHostException | NoHttpResponseException e) {
            // Connection-level failures are treated as "no robots.txt on this host".
            logger.trace("robots.txt probably does not exist.", e);
        } catch (PageBiggerThanMaxSizeException e) {
            logger.error("Error occurred while fetching (robots) url: {}, {}", robotsTxtUrl.getURL(), e.getMessage());
        } catch (IOException e) {
            logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
        } catch (InterruptedException | RuntimeException e) {
            // crawlConfig is optional (setter-injected), so a missing one means "do not halt".
            if (this.crawlConfig != null && this.crawlConfig.isHaltOnError()) {
                throw e;
            }
            if (e instanceof InterruptedException) {
                // Keep the interrupt visible to the caller even though we continue.
                Thread.currentThread().interrupt();
            }
            logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
        } finally {
            if (pageFetchResult != null) {
                pageFetchResult.discardContentIfNotConsumed();
            }
        }

        if (hostDirectives == null) {
            // Cache a default entry too, so a failing host is not re-fetched on every URL.
            hostDirectives = new HostDirectives(this.config);
        }
        cacheDirectives(host, hostDirectives);
        return hostDirectives;
    }

    /**
     * Parses the fetched robots.txt page if its content type is textual (plain text or HTML).
     *
     * @return the parsed directives, or null when the content type is not readable as text
     */
    private HostDirectives parseDirectives(Page page, WebURL robotsTxtUrl) throws IOException {
        String contentType = page.getContentType();
        boolean html = contentType != null && contentType.contains("html");
        if (Util.hasPlainTextContent(contentType) || html) {
            // Use the declared charset when there is one; otherwise UTF-8 rather than the
            // platform default, so results do not depend on the machine running the crawl.
            String content = page.getContentCharset() == null
                    ? new String(page.getContentData(), StandardCharsets.UTF_8)
                    : new String(page.getContentData(), page.getContentCharset());
            return RobotstxtParser.parse(content, this.config);
        }
        logger.warn("Can't read this robots.txt: {}  as it is not written in plain text, contentType: {}", robotsTxtUrl.getURL(), contentType);
        return null;
    }

    /** Stores directives for a host, first evicting the least recently accessed entry if full. */
    private void cacheDirectives(String host, HostDirectives hostDirectives) {
        synchronized (this.host2directivesCache) {
            // ">=" (not "==") keeps the bound effective even if the size ever overshoots.
            if (this.host2directivesCache.size() >= this.config.getCacheSize()) {
                String oldestHost = null;
                long oldestAccess = Long.MAX_VALUE;
                for (Map.Entry<String, HostDirectives> entry : this.host2directivesCache.entrySet()) {
                    long lastAccessTime = entry.getValue().getLastAccessTime();
                    if (lastAccessTime < oldestAccess) {
                        oldestAccess = lastAccessTime;
                        oldestHost = entry.getKey();
                    }
                }
                if (oldestHost != null) {
                    this.host2directivesCache.remove(oldestHost);
                }
            }
            this.host2directivesCache.put(host, hostDirectives);
        }
    }

    /**
     * Sets the crawl configuration consulted for the halt-on-error decision during fetches.
     *
     * @param crawlConfig the crawl configuration
     */
    public void setCrawlConfig(CrawlConfig crawlConfig) {
        this.crawlConfig = crawlConfig;
    }
}
