package com.digitalpebble.stormcrawler.bolt;

import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.parse.Outlink;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.google.common.primitives.Bytes;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/digitalpebble/stormcrawler/bolt/SiteMapParserBolt.class */
public class SiteMapParserBolt extends StatusEmitterBolt {
    public static final String isSitemapKey = "isSitemap";
    private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserBolt.class);
    private ParseFilter parseFilters;
    private boolean strictMode = false;
    private boolean sniffWhenNoSMKey = false;
    private int filterHoursSinceModified = -1;
    private int maxOffsetGuess = 300;

    public void execute(Tuple tuple) {
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        byte[] binaryByField = tuple.getBinaryByField("content");
        String stringByField = tuple.getStringByField("url");
        if (!Boolean.valueOf(metadata.getFirstValue(isSitemapKey)).booleanValue()) {
            int i = -1;
            if (this.sniffWhenNoSMKey) {
                byte[] bytes = "http://www.sitemaps.org/schemas/sitemap/0.9".getBytes();
                byte[] bArr = binaryByField;
                if (binaryByField.length > this.maxOffsetGuess && this.maxOffsetGuess > 0) {
                    bArr = Arrays.copyOfRange(binaryByField, 0, this.maxOffsetGuess);
                }
                i = Bytes.indexOf(bArr, bytes);
                if (i != -1) {
                    LOG.info("{} detected as sitemap based on content", stringByField);
                }
            }
            if (i == -1) {
                this.collector.emit(tuple, tuple.getValues());
                this.collector.ack(tuple);
                return;
            }
        }
        try {
            List<Outlink> parseSiteMap = parseSiteMap(stringByField, binaryByField, metadata.getFirstValue(HttpHeaders.CONTENT_TYPE), metadata);
            try {
                ParseResult parseResult = new ParseResult();
                parseResult.setOutlinks(parseSiteMap);
                parseResult.get(stringByField).setMetadata(metadata);
                this.parseFilters.filter(stringByField, binaryByField, null, parseResult);
                for (Outlink outlink : parseSiteMap) {
                    this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED}));
                }
                this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.FETCHED}));
                this.collector.ack(tuple);
            } catch (RuntimeException e) {
                String str = "Exception while running parse filters on " + stringByField + ": " + e;
                LOG.error(str);
                metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
                metadata.setValue(Constants.STATUS_ERROR_MESSAGE, str);
                this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
                this.collector.ack(tuple);
            }
        } catch (Exception e2) {
            String str2 = "Exception while parsing " + stringByField + ": " + e2;
            LOG.error(str2);
            metadata.setValue(Constants.STATUS_ERROR_SOURCE, "sitemap parsing");
            metadata.setValue(Constants.STATUS_ERROR_MESSAGE, str2);
            this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
            this.collector.ack(tuple);
        }
    }

    private List<Outlink> parseSiteMap(String str, byte[] bArr, String str2, Metadata metadata) throws UnknownFormatException, IOException {
        SiteMapParser siteMapParser = new SiteMapParser(this.strictMode);
        URL url = new URL(str);
        AbstractSiteMap parseSiteMap = (StringUtils.isBlank(str2) || str2.contains("octet-stream")) ? siteMapParser.parseSiteMap(bArr, url) : siteMapParser.parseSiteMap(str2, bArr, url);
        ArrayList arrayList = new ArrayList();
        if (parseSiteMap.isIndex()) {
            for (AbstractSiteMap abstractSiteMap : ((SiteMapIndex) parseSiteMap).getSitemaps()) {
                String externalForm = abstractSiteMap.getUrl().toExternalForm();
                Date lastModified = abstractSiteMap.getLastModified();
                String str3 = "";
                if (lastModified != null) {
                    if (this.filterHoursSinceModified != -1) {
                        Calendar calendar = Calendar.getInstance();
                        calendar.add(10, -this.filterHoursSinceModified);
                        if (lastModified.before(calendar.getTime())) {
                            LOG.info("{} has a modified date {} which is more than {} hours old", new Object[]{externalForm, lastModified.toString(), Integer.valueOf(this.filterHoursSinceModified)});
                        }
                    }
                    str3 = lastModified.toString();
                }
                Outlink filterOutlink = filterOutlink(url, externalForm, metadata, isSitemapKey, "true", "sitemap.lastModified", str3);
                if (filterOutlink != null) {
                    arrayList.add(filterOutlink);
                    LOG.debug("{} : [sitemap] {}", str, externalForm);
                }
            }
        } else {
            for (SiteMapURL siteMapURL : ((SiteMap) parseSiteMap).getSiteMapUrls()) {
                siteMapURL.getPriority();
                siteMapURL.getChangeFrequency();
                String externalForm2 = siteMapURL.getUrl().toExternalForm();
                String str4 = "";
                Date lastModified2 = siteMapURL.getLastModified();
                if (lastModified2 != null) {
                    if (this.filterHoursSinceModified != -1) {
                        Calendar calendar2 = Calendar.getInstance();
                        calendar2.add(10, -this.filterHoursSinceModified);
                        if (lastModified2.before(calendar2.getTime())) {
                            LOG.info("{} has a modified date {} which is more than {} hours old", new Object[]{externalForm2, lastModified2.toString(), Integer.valueOf(this.filterHoursSinceModified)});
                        }
                    }
                    str4 = lastModified2.toString();
                }
                Outlink filterOutlink2 = filterOutlink(url, externalForm2, metadata, isSitemapKey, "false", "sitemap.lastModified", str4);
                if (filterOutlink2 != null) {
                    arrayList.add(filterOutlink2);
                    LOG.debug("{} : [sitemap] {}", str, externalForm2);
                }
            }
        }
        return arrayList;
    }

    @Override // com.digitalpebble.stormcrawler.bolt.StatusEmitterBolt
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        super.prepare(map, topologyContext, outputCollector);
        this.sniffWhenNoSMKey = ConfUtils.getBoolean(map, "sitemap.sniffContent", false);
        this.filterHoursSinceModified = ConfUtils.getInt(map, "sitemap.filter.hours.since.modified", -1);
        this.parseFilters = ParseFilters.fromConf(map);
        this.maxOffsetGuess = ConfUtils.getInt(map, "sitemap.offset.guess", 300);
    }

    @Override // com.digitalpebble.stormcrawler.bolt.StatusEmitterBolt
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        super.declareOutputFields(outputFieldsDeclarer);
        outputFieldsDeclarer.declare(new Fields(new String[]{"url", "content", "metadata"}));
    }
}
