package edu.uci.ics.crawler4j.parser;

import crawlercommons.filters.basic.BasicURLNormalizer;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.TLDList;
import edu.uci.ics.crawler4j.url.WebURLFactory;
import edu.uci.ics.crawler4j.util.Net;
import edu.uci.ics.crawler4j.util.Util;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import org.apache.commons.lang3.Validate;

/* loaded from: input_file:edu/uci/ics/crawler4j/parser/Parser.class */
/**
 * Turns the raw content of a fetched {@link Page} into parse data.
 *
 * <p>{@link #parse(Page)} inspects the page's content type and dispatches to one of four
 * routines (binary, CSS, plain text, or the HTML fallback). Each routine ends by storing the
 * resulting parse data on the page via {@code page.setParseData(...)}.
 *
 * <p>The {@code create*ParseData} methods and the getters are {@code protected} so that
 * subclasses can substitute their own parse-data instances.
 */
public class Parser {
    private final CrawlConfig config;
    private final HtmlParser htmlContentParser;
    /** Language detector; {@code null} when language detection was disabled at construction. */
    private final TikaLanguageDetector languageDetector;
    private final Net net;
    private final WebURLFactory factory;
    private final BasicURLNormalizer normalizer;

    /**
     * Creates a parser that uses a new {@link TikaHtmlParser}, built from the same arguments,
     * for HTML content.
     *
     * @param crawlConfig        crawl configuration consulted while parsing
     * @param basicURLNormalizer normalizer handed to the HTML parser and to CSS parse data
     * @param tLDList            TLD list handed to the HTML parser and to {@link Net}
     * @param webURLFactory      URL factory handed to the HTML parser, {@link Net} and CSS parse data
     * @throws IOException propagated from the collaborators constructed here
     */
    public Parser(CrawlConfig crawlConfig, BasicURLNormalizer basicURLNormalizer, TLDList tLDList, WebURLFactory webURLFactory) throws IOException {
        this(crawlConfig, basicURLNormalizer, new TikaHtmlParser(crawlConfig, basicURLNormalizer, tLDList, webURLFactory), tLDList, webURLFactory);
    }

    /**
     * Creates a parser that uses the supplied {@link HtmlParser} for HTML content.
     *
     * <p>A {@link TikaLanguageDetector} is created only when
     * {@code crawlConfig.isLanguageDetection()} is {@code true} at this point.
     *
     * @param crawlConfig        crawl configuration consulted while parsing
     * @param basicURLNormalizer normalizer later passed to CSS parse data
     * @param htmlParser         parser used for content that is not binary, CSS or plain text
     * @param tLDList            TLD list handed to {@link Net}
     * @param webURLFactory      URL factory handed to {@link Net} and to CSS parse data
     * @throws IOException propagated from the collaborators constructed here
     */
    public Parser(CrawlConfig crawlConfig, BasicURLNormalizer basicURLNormalizer, HtmlParser htmlParser, TLDList tLDList, WebURLFactory webURLFactory) throws IOException {
        this.config = crawlConfig;
        this.htmlContentParser = htmlParser;
        this.net = new Net(crawlConfig, tLDList, webURLFactory);
        this.factory = webURLFactory;
        this.normalizer = basicURLNormalizer;
        this.languageDetector = crawlConfig.isLanguageDetection() ? new TikaLanguageDetector() : null;
    }

    /**
     * Parses the page according to its content type and attaches the result to the page.
     *
     * <p>The content type is tested in this order: binary, CSS text, plain text; anything else
     * is handled by the HTML parser.
     *
     * @param page the fetched page to parse; its parse data is set on success
     * @throws NotAllowedContentException if the page is binary and the configuration does not
     *                                    include binary content in crawling
     * @throws IllegalStateException      if binary parsing left the HTML value unset
     * @throws Exception                  anything propagated from the underlying parsers
     */
    public void parse(Page page) throws Exception {
        String contentType = page.getContentType();
        if (Util.hasBinaryContent(contentType)) {
            parseBinary(page);
        } else if (Util.hasCssTextContent(contentType)) {
            parseCss(page);
        } else if (Util.hasPlainTextContent(contentType)) {
            parsePlainText(page);
        } else {
            parseHtml(page);
        }
    }

    /** Handles binary content: rejects it, processes it, or substitutes an empty HTML shell. */
    private void parseBinary(Page page) throws Exception {
        if (!this.config.isIncludeBinaryContentInCrawling()) {
            throw new NotAllowedContentException();
        }
        BinaryParseData binaryData = createBinaryParseData();
        if (this.config.isProcessBinaryContentInCrawling()) {
            binaryData.parseBinaryContentAndSetHtml(page);
        } else {
            // Binary processing is switched off: use an empty document as the HTML value.
            binaryData.setHtml("<html></html>");
        }
        String html = binaryData.getHtml();
        Validate.validState(html != null, "BinaryParseData.parseBinaryContentAndSetHtml(...) should initialize the html value");
        binaryData.setOutgoingUrls(this.net.extractUrls(html));
        page.setParseData(binaryData);
    }

    /** Handles CSS content: decodes the text and lets the parse data extract its own URLs. */
    private void parseCss(Page page) throws Exception {
        CssParseData cssData = createCssParseData();
        setTextContent(cssData, page);
        cssData.parseAndSetOutgoingUrls(page);
        page.setParseData(cssData);
    }

    /** Handles plain-text content: decodes the text and extracts URLs from it. */
    private void parsePlainText(Page page) throws Exception {
        TextParseData textData = createTextParseData();
        setTextContent(textData, page);
        textData.parseAndSetOutgoingUrls(page);
        // NOTE(review): the call below sets the outgoing URLs again right after
        // parseAndSetOutgoingUrls(page); confirm whether both calls are required.
        textData.setOutgoingUrls(this.net.extractUrls(textData.getTextContent()));
        page.setParseData(textData);
    }

    /** Handles every other content type through the HTML parser. */
    private void parseHtml(Page page) throws Exception {
        HtmlParseData htmlData = createHtmlParseData(page);
        if (page.getContentCharset() == null) {
            // No charset known for the page yet: adopt the one reported by the parse data.
            page.setContentCharset(htmlData.getContentCharset());
        }
        // The detector exists only if detection was enabled at construction time. Guard
        // against the flag being switched on afterwards, which would otherwise dereference
        // null here; in that case fall back to the same empty language as the disabled path.
        if (this.config.isLanguageDetection() && this.languageDetector != null) {
            page.setLanguage(this.languageDetector.detect(htmlData.getText()));
        } else {
            page.setLanguage("");
        }
        page.setParseData(htmlData);
    }

    /**
     * Decodes the page's raw content into text and stores it on the given parse data.
     *
     * <p>UTF-8 is used when the page reports no content charset; otherwise the page's own
     * charset is used.
     *
     * @param textParseData receiver of the decoded text
     * @param page          source of the raw bytes and the charset
     * @throws UnsupportedEncodingException if the page's charset is not supported
     */
    private void setTextContent(TextParseData textParseData, Page page) throws UnsupportedEncodingException {
        if (page.getContentCharset() == null) {
            textParseData.setTextContent(new String(page.getContentData(), StandardCharsets.UTF_8));
        } else {
            textParseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
        }
    }

    /**
     * Factory hook for the parse data used with binary content.
     *
     * @return a new {@link BinaryParseData}
     */
    protected BinaryParseData createBinaryParseData() {
        return new BinaryParseData();
    }

    /**
     * Factory hook for the parse data used with CSS content.
     *
     * @return a new {@link CssParseData} built from this parser's URL factory, normalizer and
     *         the configuration's halt-on-error flag
     */
    protected CssParseData createCssParseData() {
        return new CssParseData(getFactory(), getNormalizer(), getConfig().isHaltOnError());
    }

    /**
     * Factory hook for the parse data used with plain-text content.
     *
     * @return a new {@link TextParseData}
     */
    protected TextParseData createTextParseData() {
        return new TextParseData();
    }

    /**
     * Produces the parse data for HTML content by delegating to the configured HTML parser.
     *
     * @param page the page to parse
     * @return the parse data returned by {@link #getHtmlContentParser()}
     * @throws Exception anything propagated from the HTML parser
     */
    protected HtmlParseData createHtmlParseData(Page page) throws Exception {
        return getHtmlContentParser().parse(page);
    }

    /** @return the crawl configuration given at construction */
    protected CrawlConfig getConfig() {
        return this.config;
    }

    /** @return the URL factory given at construction */
    protected WebURLFactory getFactory() {
        return this.factory;
    }

    /** @return the URL normalizer given at construction */
    protected BasicURLNormalizer getNormalizer() {
        return this.normalizer;
    }

    /** @return the HTML parser used for content that is not binary, CSS or plain text */
    protected HtmlParser getHtmlContentParser() {
        return this.htmlContentParser;
    }
}
