package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
import edu.uci.ics.crawler4j.url.TLDList;
import edu.uci.ics.crawler4j.url.WebURLFactory;
import edu.uci.ics.crawler4j.util.Net;
import edu.uci.ics.crawler4j.util.Util;
import java.io.IOException;
import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageDetector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:edu/uci/ics/crawler4j/parser/Parser.class */
/**
 * Dispatches a fetched {@link Page} to the parsing strategy that matches its content type
 * (binary, CSS, plain text, or — as the fallback — HTML) and stores the resulting parse data
 * on the page.
 */
public class Parser {
    private static final Logger logger = LoggerFactory.getLogger(Parser.class);
    private final CrawlConfig config;
    private final HtmlParser htmlContentParser;
    private final LanguageDetector languageDetector;
    private final Net net;
    private final WebURLFactory factory;

    /**
     * Creates a parser that uses a {@link TikaHtmlParser} for HTML content.
     *
     * @param crawlConfig crawl configuration consulted while parsing
     * @param tLDList TLD list handed to the HTML parser and the URL extractor
     * @param webURLFactory factory handed to the HTML parser, the URL extractor and CSS parse data
     * @throws IllegalAccessException declared for callers; kept for interface compatibility
     * @throws InstantiationException declared for callers; kept for interface compatibility
     * @throws IOException if the delegated constructor fails (it loads language-detection models)
     */
    public Parser(CrawlConfig crawlConfig, TLDList tLDList, WebURLFactory webURLFactory) throws IllegalAccessException, InstantiationException, IOException {
        this(crawlConfig, new TikaHtmlParser(crawlConfig, tLDList, webURLFactory), tLDList, webURLFactory);
    }

    /**
     * Creates a parser that uses the supplied HTML parser.
     *
     * <p>Also builds the {@link Net} URL extractor and an {@link OptimaizeLangDetector}, whose
     * models are loaded immediately.
     *
     * @param crawlConfig crawl configuration consulted while parsing
     * @param htmlParser parser used for every page that is not binary, CSS or plain text
     * @param tLDList TLD list handed to the URL extractor
     * @param webURLFactory factory handed to the URL extractor and to CSS parse data
     * @throws IOException declared by this constructor; presumably raised when the
     *     language-detection models cannot be loaded — confirm against Tika
     */
    public Parser(CrawlConfig crawlConfig, HtmlParser htmlParser, TLDList tLDList, WebURLFactory webURLFactory) throws IOException {
        this.config = crawlConfig;
        this.htmlContentParser = htmlParser;
        this.net = new Net(crawlConfig, tLDList, webURLFactory);
        this.factory = webURLFactory;
        this.languageDetector = new OptimaizeLangDetector();
        this.languageDetector.loadModels();
    }

    /**
     * Parses the page according to its content type and attaches the parse data to it.
     *
     * <p>The content type is checked in this order: binary, CSS, plain text; anything else is
     * handed to the HTML parser.
     *
     * @param page the fetched page; its parse data (and, for HTML, language and possibly charset)
     *     is set by this call
     * @param str passed through unchanged to {@link HtmlParser#parse}; presumably the context URL
     *     of the page — TODO confirm against the caller
     * @throws NotAllowedContentException if the page is binary and binary content is not included
     *     in the crawl
     * @throws ParseException if the content cannot be parsed
     */
    public void parse(Page page, String str) throws NotAllowedContentException, ParseException {
        if (Util.hasBinaryContent(page.getContentType())) {
            parseBinary(page);
        } else if (Util.hasCssTextContent(page.getContentType())) {
            parseCss(page);
        } else if (Util.hasPlainTextContent(page.getContentType())) {
            parsePlainText(page);
        } else {
            parseHtml(page, str);
        }
    }

    /** Handles binary content: optionally processes it, then extracts URLs from the resulting HTML. */
    private void parseBinary(Page page) throws NotAllowedContentException, ParseException {
        BinaryParseData binaryParseData = new BinaryParseData();
        if (!this.config.isIncludeBinaryContentInCrawling()) {
            throw new NotAllowedContentException();
        }
        if (this.config.isProcessBinaryContentInCrawling()) {
            try {
                binaryParseData.setBinaryContent(page.getContentData());
            } catch (Exception e) {
                if (this.config.isHaltOnError()) {
                    throw new ParseException(e);
                }
                // Best effort when not halting on error: log and fall through to the HTML check.
                logger.error("Error parsing file", e);
            }
        } else {
            // Binary processing is disabled: use an empty document so URL extraction still runs.
            binaryParseData.setHtml("<html></html>");
        }
        // The parse data is attached before the null check, so it is present on the page even
        // when the ParseException below is thrown.
        page.setParseData(binaryParseData);
        if (binaryParseData.getHtml() == null) {
            throw new ParseException();
        }
        binaryParseData.setOutgoingUrls(this.net.extractUrls(binaryParseData.getHtml()));
    }

    /** Handles CSS content: decodes the text and lets the parse data derive its outgoing URLs. */
    private void parseCss(Page page) throws ParseException {
        try {
            CssParseData cssParseData = new CssParseData(this.factory);
            cssParseData.setTextContent(decodeContent(page));
            cssParseData.setOutgoingUrls(page.getWebURL());
            page.setParseData(cssParseData);
        } catch (Exception e) {
            logger.error("{}, while parsing css: {}", e.getMessage(), page.getWebURL().getURL());
            // Keep the cause so the failure stays diagnosable, as the plain-text branch does.
            throw new ParseException(e);
        }
    }

    /** Handles plain-text content: decodes the text and extracts URLs from it. */
    private void parsePlainText(Page page) throws ParseException {
        try {
            TextParseData textParseData = new TextParseData();
            textParseData.setTextContent(decodeContent(page));
            textParseData.setOutgoingUrls(this.net.extractUrls(textParseData.getTextContent()));
            page.setParseData(textParseData);
        } catch (Exception e) {
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
            throw new ParseException(e);
        }
    }

    /** Handles everything else via the HTML parser, then records charset (if unset) and language. */
    private void parseHtml(Page page, String str) throws ParseException {
        HtmlParseData htmlParseData = this.htmlContentParser.parse(page, str);
        if (page.getContentCharset() == null) {
            page.setContentCharset(htmlParseData.getContentCharset());
        }
        page.setLanguage(this.languageDetector.detect(htmlParseData.getText()).getLanguage());
        page.setParseData(htmlParseData);
    }

    /**
     * Decodes the page's raw bytes using the page's charset, or the JVM default charset when the
     * page declares none.
     */
    private static String decodeContent(Page page) throws IOException {
        if (page.getContentCharset() == null) {
            return new String(page.getContentData());
        }
        return new String(page.getContentData(), page.getContentCharset());
    }
}
