package com.digitalpebble.stormcrawler.bolt;

import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.parse.DocumentFragmentBuilder;
import com.digitalpebble.stormcrawler.parse.JSoupFilter;
import com.digitalpebble.stormcrawler.parse.JSoupFilters;
import com.digitalpebble.stormcrawler.parse.Outlink;
import com.digitalpebble.stormcrawler.parse.ParseData;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.parse.TextExtractor;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.protocol.ProtocolResponse;
import com.digitalpebble.stormcrawler.util.CharsetIdentification;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.RefreshTag;
import com.digitalpebble.stormcrawler.util.RobotsTags;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/digitalpebble/stormcrawler/bolt/JSoupParserBolt.class */
/**
 * Bolt that parses HTML content with JSoup.
 *
 * <p>For every incoming tuple (fields {@code url}, {@code content}, {@code metadata}) it optionally
 * re-detects the mime type, identifies the charset, parses the document, gathers the {@code
 * a[href]} links together with their anchor texts, honours robots directives and meta refresh
 * redirections, runs the configured JSoup and parse filters, and finally emits:
 *
 * <ul>
 *   <li>the discovered outlinks, redirections and errors on the status stream;
 *   <li>one tuple per parsed (sub)document on the default stream.
 * </ul>
 */
public class JSoupParserBolt extends StatusEmitterBolt {
    /** Metadata key under which the anchor texts of an outlink are added. */
    public static final String ANCHORS_KEY_NAME = "anchors";

    private static final Logger LOG = LoggerFactory.getLogger(JSoupParserBolt.class);

    /** Tika detector used by {@link #guessMimeType(String, String, byte[])}. */
    private final Detector detector = TikaConfig.getDefaultConfig().getDetector();

    private MultiCountMetric eventCounter;
    private ParseFilter parseFilters = null;
    private JSoupFilter jsoupFilters = null;
    private TextExtractor textExtractor;

    // Configuration, populated in prepare(); the initial values mirror the defaults used there.
    private boolean detectMimeType = true;
    private boolean trackAnchors = true;
    private boolean emitOutlinks = true;
    /** Maximum number of outlinks emitted per page; a negative value means no limit. */
    private int maxOutlinksPerPage = -1;
    private boolean robotsNoFollowStrict = true;
    private boolean treatNonHtmlAsError = true;
    private int maxLengthCharsetDetection = -1;
    private String protocolMDprefix;
    private boolean robotsHeaderSkip;
    private boolean robotsMetaSkip;
    private boolean fastCharsetDetection;
    private boolean ignoreMetaRedirections;

    /**
     * Registers the metric counter, loads the filters and reads the parser configuration.
     *
     * @param map the topology configuration
     * @param topologyContext context used to register the {@link MultiCountMetric}
     * @param outputCollector collector handed over to the parent class
     */
    @Override
    public void prepare(Map<String, Object> map, TopologyContext topologyContext, OutputCollector outputCollector) {
        super.prepare(map, topologyContext, outputCollector);
        this.eventCounter = topologyContext.registerMetric(getClass().getSimpleName(), new MultiCountMetric(), 10);
        this.parseFilters = ParseFilters.fromConf(map);
        this.jsoupFilters = JSoupFilters.fromConf(map);
        this.emitOutlinks = ConfUtils.getBoolean(map, "parser.emitOutlinks", true);
        this.trackAnchors = ConfUtils.getBoolean(map, "track.anchors", true);
        this.robotsNoFollowStrict = ConfUtils.getBoolean(map, RobotsTags.ROBOTS_NO_FOLLOW_STRICT, true);
        this.treatNonHtmlAsError = ConfUtils.getBoolean(map, "jsoup.treat.non.html.as.error", true);
        this.detectMimeType = ConfUtils.getBoolean(map, "detect.mimetype", true);
        this.maxLengthCharsetDetection = ConfUtils.getInt(map, "detect.charset.maxlength", -1);
        this.fastCharsetDetection = ConfUtils.getBoolean(map, "detect.charset.fast", false);
        this.maxOutlinksPerPage = ConfUtils.getInt(map, "parser.emitOutlinks.max.per.page", -1);
        this.protocolMDprefix = ConfUtils.getString(map, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, "");
        this.robotsHeaderSkip = ConfUtils.getBoolean(map, "http.robots.headers.skip", false);
        this.robotsMetaSkip = ConfUtils.getBoolean(map, "http.robots.meta.skip", false);
        this.ignoreMetaRedirections = ConfUtils.getBoolean(map, "jsoup.ignore.meta.redirections", false);
        this.textExtractor = new TextExtractor(map);
    }

    /**
     * Parses the document carried by the tuple and emits the outcome.
     *
     * <p>Every code path acks the tuple: failures are reported through {@link #handleException} as
     * an ERROR status, non-HTML content is either reported as an error or passed on unparsed, and a
     * meta refresh redirection short-circuits the processing with a REDIRECTION status.
     *
     * @param tuple input tuple with the fields {@code content}, {@code url} and {@code metadata}
     */
    public void execute(Tuple tuple) {
        final byte[] content = tuple.getBinaryByField("content");
        final String url = tuple.getStringByField("url");
        final Metadata metadata = (Metadata) tuple.getValueByField("metadata");

        LOG.info("Parsing : starting {}", url);

        String mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE, this.protocolMDprefix);
        if (this.detectMimeType) {
            try {
                mimeType = guessMimeType(url, mimeType, content);
                metadata.setValue("parse.Content-Type", mimeType);
            } catch (Exception e) {
                handleException(url, e, metadata, tuple, "mimetype guessing",
                        "Exception while guessing mimetype on " + url + ": " + e);
                return;
            }
        }

        if (!isHtmlOrUnspecified(mimeType)) {
            if (this.treatNonHtmlAsError) {
                final String errorMessage = "Exception content-type " + mimeType + " for " + url;
                // the exception only serves to name the error counter
                handleException(url, new RuntimeException(errorMessage), metadata, tuple,
                        "content-type checking", errorMessage);
            } else {
                LOG.info("Unsupported mimetype {} - passing on : {}", mimeType, url);
                this.collector.emit(tuple, new Values(url, content, metadata, ""));
                this.collector.ack(tuple);
            }
            return;
        }

        final long start = System.currentTimeMillis();
        final String charset = this.fastCharsetDetection
                ? CharsetIdentification.getCharsetFast(metadata, content, this.maxLengthCharsetDetection)
                : CharsetIdentification.getCharset(metadata, content, this.maxLengthCharsetDetection);
        LOG.debug("Charset identified as {} in {} msec", charset, System.currentTimeMillis() - start);

        // robots directives from the HTTP headers, unless configured to skip them
        final RobotsTags robotsTags = this.robotsHeaderSkip
                ? new RobotsTags()
                : new RobotsTags(metadata, this.protocolMDprefix);

        try {
            final String html = Charset.forName(charset).decode(ByteBuffer.wrap(content)).toString();
            final Document document = Parser.htmlParser().parseInput(html, url);

            if (!this.robotsMetaSkip) {
                final Element robotsMeta = document.selectFirst("meta[name~=(?i)robots][content]");
                if (robotsMeta != null) {
                    robotsTags.extractMetaTags(robotsMeta.attr("content"));
                }
            }
            robotsTags.normaliseToMetadata(metadata);

            final Map<String, List<String>> anchorsByTarget = collectAnchorsByTarget(document, url, robotsTags);
            final String text = this.textExtractor.text(document.body());

            metadata.setValue("parse.Content-Encoding", charset);
            metadata.setValue("parsed.by", getClass().getName());
            LOG.info("Parsed {} in {} msec", url, System.currentTimeMillis() - start);

            if (!this.ignoreMetaRedirections && emitMetaRefreshRedirection(tuple, url, metadata, document)) {
                return;
            }

            final ParseResult parseResult = new ParseResult(toOutlinks(url, metadata, anchorsByTarget));
            final ParseData parseData = parseResult.get(url);
            parseData.setMetadata(metadata);
            parseData.setText(text);
            parseData.setContent(content);

            try {
                this.jsoupFilters.filter(url, content, document, parseResult);
            } catch (RuntimeException e) {
                handleException(url, e, metadata, tuple, "jsoup filtering",
                        "Exception while running jsoup filters on " + url + ": " + e);
                return;
            }

            try {
                // the DOM conversion is only paid for when a parse filter asks for it
                this.parseFilters.filter(url, content,
                        this.parseFilters.needsDOM() ? DocumentFragmentBuilder.fromJsoup(document) : null,
                        parseResult);
                emitParseResult(tuple, parseResult);
                LOG.info("Total for {} - {} msec", url, System.currentTimeMillis() - start);
                this.collector.ack(tuple);
                this.eventCounter.scope("tuple_success").incr();
            } catch (RuntimeException e) {
                handleException(url, e, metadata, tuple, "content filtering",
                        "Exception while running parse filters on " + url + ": " + e);
            }
        } catch (Throwable th) {
            handleException(url, th, metadata, tuple, "content parsing",
                    "Exception while parsing " + url + ": " + th);
        }
    }

    /** Returns true when the mime type is blank or contains "html" (case-insensitive). */
    private static boolean isHtmlOrUnspecified(String mimeType) {
        return StringUtils.isBlank(mimeType) || mimeType.toLowerCase(Locale.ROOT).contains("html");
    }

    /**
     * Gathers the resolved {@code a[href]} targets of the document with their anchor texts.
     *
     * <p>Returns an empty map when the page is marked nofollow and strict mode is on. In strict
     * mode links carrying {@code rel="nofollow"} are dropped; otherwise nofollow links (or all links
     * of a nofollow page) are kept but their anchor text is not recorded.
     *
     * @throws MalformedURLException if the page URL itself cannot be parsed
     */
    private Map<String, List<String>> collectAnchorsByTarget(Document document, String url, RobotsTags robotsTags)
            throws MalformedURLException {
        if (robotsTags.isNoFollow() && this.robotsNoFollowStrict) {
            return new HashMap<>(0);
        }

        final Elements links = document.select("a[href]");
        final Map<String, List<String>> anchorsByTarget = new HashMap<>(links.size());
        final URL baseUrl = new URL(url);

        for (Element link : links) {
            final boolean relNoFollow = "nofollow".equalsIgnoreCase(link.attr("rel"));
            if (relNoFollow && this.robotsNoFollowStrict) {
                continue;
            }
            final boolean noFollow = relNoFollow || robotsTags.isNoFollow();

            final String href = link.attr("href");
            String target = null;
            try {
                target = StringUtil.resolve(baseUrl, href).toExternalForm();
            } catch (MalformedURLException e) {
                LOG.debug("Cannot resolve URL with baseURL : {} and href : {}", baseUrl, href, e);
            }
            if (StringUtils.isBlank(target)) {
                continue;
            }

            final List<String> anchors = anchorsByTarget.computeIfAbsent(target, key -> new LinkedList<>());
            final String anchorText = link.text();
            if (!noFollow && StringUtils.isNotBlank(anchorText)) {
                anchors.add(anchorText);
            }
        }
        return anchorsByTarget;
    }

    /**
     * Looks for a meta refresh redirection and, if one is found, reports the page as redirected.
     *
     * <p>When a target is found it is stored under the {@code _redirTo} metadata key, emitted as an
     * outlink if redirections are allowed, and the tuple is acked with a REDIRECTION status.
     *
     * @return true if the tuple has been fully handled as a redirection; false if there is no
     *     redirection or the page URL turned out to be malformed, in which case the caller carries
     *     on with the regular processing
     */
    private boolean emitMetaRefreshRedirection(Tuple tuple, String url, Metadata metadata, Document document) {
        try {
            final Element refreshMeta = document.selectFirst("meta[http-equiv~=(?i)refresh][content]");
            final String redirection =
                    refreshMeta != null ? RefreshTag.extractRefreshURL(refreshMeta.attr("content")) : null;
            if (StringUtils.isBlank(redirection)) {
                return false;
            }

            LOG.info("Found redir in {} to {}", url, redirection);
            metadata.setValue("_redirTo", redirection);
            if (allowRedirs()) {
                emitOutlink(tuple, new URL(url), redirection, metadata, new String[0]);
            }
            this.collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.REDIRECTION));
            this.collector.ack(tuple);
            this.eventCounter.scope("tuple_success").incr();
            return true;
        } catch (MalformedURLException e) {
            LOG.error("MalformedURLException on {}", url);
            return false;
        }
    }

    /**
     * Emits the outlinks (as DISCOVERED on the status stream, if enabled) followed by one tuple per
     * parsed document on the default stream. Does not ack the tuple.
     */
    private void emitParseResult(Tuple tuple, ParseResult parseResult) {
        if (this.emitOutlinks) {
            List<Outlink> outlinks = parseResult.getOutlinks();
            // Stream.limit rejects negative sizes, so any negative setting means "no limit"
            if (this.maxOutlinksPerPage >= 0) {
                outlinks = outlinks.stream().limit(this.maxOutlinksPerPage).collect(Collectors.toList());
            }
            for (Outlink outlink : outlinks) {
                this.collector.emit(Constants.StatusStreamName, tuple,
                        new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
            }
        }

        final Iterator<Map.Entry<String, ParseData>> documents = parseResult.iterator();
        while (documents.hasNext()) {
            final Map.Entry<String, ParseData> entry = documents.next();
            final ParseData data = entry.getValue();
            this.collector.emit(tuple,
                    new Values(entry.getKey(), data.getContent(), data.getMetadata(), data.getText()));
        }
    }

    /**
     * Reports a failure: logs the message, stores the error source and message in the metadata,
     * emits the URL with an ERROR status, acks the tuple and increments the error counters.
     *
     * @param url URL of the document being processed
     * @param th the failure; its simple class name is part of the counter name
     * @param metadata metadata of the document, updated with the error details
     * @param tuple the tuple being processed
     * @param errorSource short description of the failing step
     * @param errorMessage message to log and to store in the metadata
     */
    private void handleException(String url, Throwable th, Metadata metadata, Tuple tuple, String errorSource,
            String errorMessage) {
        LOG.error(errorMessage);
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, errorSource);
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        this.collector.emit(Constants.StatusStreamName, tuple, new Values(url, metadata, Status.ERROR));
        this.collector.ack(tuple);
        final String counterName = "error_" + errorSource.replace(' ', '_') + "_" + th.getClass().getSimpleName();
        this.eventCounter.scope(counterName).incrBy(1L);
        this.eventCounter.scope("parse exception").incrBy(1L);
    }

    /**
     * Declares the fields of the default stream ({@code url}, {@code content}, {@code metadata},
     * {@code text}) in addition to whatever the parent class declares.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        super.declareOutputFields(outputFieldsDeclarer);
        outputFieldsDeclarer.declare(new Fields("url", "content", "metadata", "text"));
    }

    /**
     * Detects the mime type of a document with the Tika detector, using the URL, the declared
     * content type (if any) and the content length as hints alongside the bytes themselves.
     *
     * @param url URL of the document, passed to the detector as the resource name
     * @param httpContentType content type declared for the document; may be blank
     * @param content raw bytes of the document
     * @return the detected media type as a string
     * @throws IllegalStateException if reading from the in-memory stream fails
     */
    public String guessMimeType(String url, String httpContentType, byte[] content) {
        final org.apache.tika.metadata.Metadata hints = new org.apache.tika.metadata.Metadata();
        if (StringUtils.isNotBlank(httpContentType)) {
            hints.set("Content-Type", httpContentType);
        }
        hints.set("resourceName", url);
        hints.set("Content-Length", Integer.toString(content.length));

        try (ByteArrayInputStream stream = new ByteArrayInputStream(content)) {
            return this.detector.detect(stream, hints).toString();
        } catch (IOException e) {
            throw new IllegalStateException("Unexpected IOException", e);
        }
    }

    /**
     * Turns the raw link targets into filtered outlinks, de-duplicated on their final target URL.
     *
     * <p>Targets rejected by {@code filterOutlink} are counted as {@code outlink_filtered}. When
     * several raw targets map to the same outlink, their anchors are merged into the first one.
     * Anchors are only recorded if anchor tracking is enabled.
     *
     * @param url URL of the page the links were found on
     * @param metadata metadata of that page
     * @param slinks link targets mapped to the anchor texts found for them
     * @return the outlinks to keep; empty if {@code url} is malformed
     */
    protected List<Outlink> toOutlinks(String url, Metadata metadata, Map<String, List<String>> slinks) {
        final URL sourceUrl;
        try {
            sourceUrl = new URL(url);
        } catch (MalformedURLException e) {
            LOG.error("MalformedURLException on {}", url);
            this.eventCounter.scope("error_invalid_source_url").incrBy(1L);
            return new LinkedList<>();
        }

        final Map<String, Outlink> outlinksByTarget = new HashMap<>();
        for (Map.Entry<String, List<String>> link : slinks.entrySet()) {
            final Outlink candidate = filterOutlink(sourceUrl, link.getKey(), metadata, new String[0]);
            if (candidate == null) {
                this.eventCounter.scope("outlink_filtered").incr();
                continue;
            }

            // filtering may normalise different raw targets to the same URL
            final Outlink existing = outlinksByTarget.get(candidate.getTargetURL());
            final Outlink outlink = existing != null ? existing : candidate;

            final List<String> anchors = link.getValue();
            if (this.trackAnchors && !anchors.isEmpty()) {
                outlink.getMetadata().addValues(ANCHORS_KEY_NAME, anchors);
                outlink.setAnchor(anchors.get(0));
            }

            if (existing == null) {
                outlinksByTarget.put(outlink.getTargetURL(), outlink);
                this.eventCounter.scope("outlink_kept").incr();
            }
        }
        return new LinkedList<>(outlinksByTarget.values());
    }
}
