package com.digitalpebble.stormcrawler.bolt;

import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.parse.Outlink;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.protocol.ProtocolResponse;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.google.common.primitives.Bytes;
import com.rometools.rome.feed.synd.SyndContent;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.SyndFeedInput;
import java.io.ByteArrayInputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;

/* loaded from: input_file:com/digitalpebble/stormcrawler/bolt/FeedParserBolt.class */
public class FeedParserBolt extends StatusEmitterBolt {
    public static final String isFeedKey = "isFeed";
    private static final Logger LOG = LoggerFactory.getLogger(FeedParserBolt.class);
    private ParseFilter parseFilters;
    private String protocolMDprefix;
    private boolean sniffWhenNoMDKey = false;
    private int filterHoursSincePub = -1;

    public void execute(Tuple tuple) {
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        byte[] binaryByField = tuple.getBinaryByField("content");
        String stringByField = tuple.getStringByField("url");
        LOG.debug("Processing {}", stringByField);
        boolean parseBoolean = Boolean.parseBoolean(metadata.getFirstValue(isFeedKey));
        if (!parseBoolean && this.sniffWhenNoMDKey) {
            String firstValue = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE, this.protocolMDprefix);
            if (firstValue == null || !firstValue.contains("rss+xml")) {
                byte[] bytes = "<rss ".getBytes();
                byte[] bArr = binaryByField;
                if (binaryByField.length > 100) {
                    bArr = Arrays.copyOfRange(binaryByField, 0, 100);
                }
                if (Bytes.indexOf(bArr, bytes) != -1) {
                    LOG.info("{} detected as rss feed based on content", stringByField);
                    parseBoolean = true;
                }
            } else {
                parseBoolean = true;
            }
        }
        if (!parseBoolean) {
            LOG.debug("Not a feed {}", stringByField);
            this.collector.emit(tuple, tuple.getValues());
            this.collector.ack(tuple);
            return;
        }
        metadata.setValue(isFeedKey, "true");
        try {
            ParseResult parseResult = new ParseResult(parseFeed(stringByField, binaryByField, metadata));
            parseResult.set(stringByField, metadata);
            try {
                this.parseFilters.filter(stringByField, binaryByField, null, parseResult);
                for (Outlink outlink : parseResult.getOutlinks()) {
                    this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED}));
                }
                LOG.info("Feed parser done {}", stringByField);
                this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.FETCHED}));
                this.collector.ack(tuple);
            } catch (RuntimeException e) {
                String str = "Exception while running parse filters on " + stringByField + ": " + e;
                LOG.error(str, e);
                metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
                metadata.setValue(Constants.STATUS_ERROR_MESSAGE, str);
                this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
                this.collector.ack(tuple);
            }
        } catch (Exception e2) {
            String str2 = "Exception while parsing " + stringByField + ": " + e2;
            LOG.error(str2, e2);
            metadata.setValue(Constants.STATUS_ERROR_SOURCE, "feed parsing");
            metadata.setValue(Constants.STATUS_ERROR_MESSAGE, str2);
            this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
            this.collector.ack(tuple);
        }
    }

    private List<Outlink> parseFeed(String str, byte[] bArr, Metadata metadata) throws Exception {
        ArrayList arrayList = new ArrayList();
        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bArr);
        try {
            SyndFeed build = new SyndFeedInput().build(new InputSource(byteArrayInputStream));
            byteArrayInputStream.close();
            URL url = new URL(str);
            for (SyndEntry syndEntry : build.getEntries()) {
                String link = syndEntry.getLink();
                if (StringUtils.isBlank(link)) {
                    link = syndEntry.getUri();
                    if (StringUtils.isBlank(link)) {
                    }
                }
                Outlink filterOutlink = filterOutlink(url, link, metadata, new String[0]);
                if (filterOutlink != null) {
                    String title = syndEntry.getTitle();
                    if (StringUtils.isNotBlank(title)) {
                        filterOutlink.getMetadata().setValue("feed.title", title.trim());
                    }
                    Date publishedDate = syndEntry.getPublishedDate();
                    if (publishedDate != null) {
                        if (this.filterHoursSincePub != -1) {
                            Calendar calendar = Calendar.getInstance();
                            calendar.add(10, -this.filterHoursSincePub);
                            if (publishedDate.before(calendar.getTime())) {
                                LOG.info("{} has a published date {} which is more than {} hours old", new Object[]{link, publishedDate, Integer.valueOf(this.filterHoursSincePub)});
                            }
                        }
                        filterOutlink.getMetadata().setValue("feed.publishedDate", publishedDate.toString());
                    }
                    SyndContent description = syndEntry.getDescription();
                    if (description != null && StringUtils.isNotBlank(description.getValue())) {
                        filterOutlink.getMetadata().setValue("feed.description", description.getValue());
                    }
                    arrayList.add(filterOutlink);
                }
            }
            return arrayList;
        } catch (Throwable th) {
            try {
                byteArrayInputStream.close();
            } catch (Throwable th2) {
                th.addSuppressed(th2);
            }
            throw th;
        }
    }

    @Override // com.digitalpebble.stormcrawler.bolt.StatusEmitterBolt
    public void prepare(Map<String, Object> map, TopologyContext topologyContext, OutputCollector outputCollector) {
        super.prepare(map, topologyContext, outputCollector);
        this.sniffWhenNoMDKey = ConfUtils.getBoolean(map, "feed.sniffContent", false);
        this.filterHoursSincePub = ConfUtils.getInt(map, "feed.filter.hours.since.published", -1);
        this.parseFilters = ParseFilters.fromConf(map);
        this.protocolMDprefix = ConfUtils.getString(map, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, "");
    }

    @Override // com.digitalpebble.stormcrawler.bolt.StatusEmitterBolt
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        super.declareOutputFields(outputFieldsDeclarer);
        outputFieldsDeclarer.declare(new Fields(new String[]{"url", "content", "metadata"}));
    }
}
