package com.digitalpebble.stormcrawler.bolt;

import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.parse.Outlink;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.persistence.DefaultScheduler;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.google.common.primitives.Bytes;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.metric.api.MeanReducer;
import org.apache.storm.metric.api.ReducedMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/digitalpebble/stormcrawler/bolt/SiteMapParserBolt.class */
/**
 * Bolt which extracts the URLs listed in sitemap files and sitemap indexes.
 *
 * <p>A document is treated as a sitemap when its metadata carries
 * {@code isSitemap=true}, or when that key is absent and the beginning of the
 * content contains the sitemap namespace. Documents which are not sitemaps are
 * passed through unchanged on the default stream; for sitemaps, every retained
 * link is emitted on the status stream as {@code DISCOVERED} and the sitemap
 * itself as {@code FETCHED} (or {@code ERROR} if parsing or filtering failed).
 */
public class SiteMapParserBolt extends StatusEmitterBolt {
    /** Metadata key flagging a document as a sitemap ("true" / "false"). */
    public static final String isSitemapKey = "isSitemap";
    public static final String foundSitemapKey = "foundSitemap";

    private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserBolt.class);

    /** Namespace whose presence near the start of a document marks it as a sitemap. */
    private static final byte[] clue =
            "http://www.sitemaps.org/schemas/sitemap/0.9"
                    .getBytes(java.nio.charset.StandardCharsets.UTF_8);

    /** Metadata key under which the last-modified date of a sitemap entry is stored. */
    private static final String LAST_MODIFIED_KEY = "sitemap.lastModified";

    private SiteMapParser parser;
    private ParseFilter parseFilters;
    private ReducedMetric averagedMetrics;
    private List<Extension> extensionsToParse;

    /** Entries modified more than this many hours ago are dropped; -1 disables the filter. */
    private int filterHoursSinceModified = -1;

    /** Number of leading bytes inspected by {@link #sniff(byte[])}; values <= 0 scan everything. */
    private int maxOffsetGuess = 300;

    /** Increment applied to the scheduling delay of successive sub-sitemaps; <= 0 disables it. */
    private int scheduleSitemapsWithDelay = -1;

    /**
     * Processes one fetched document: passes it through if it is not a sitemap,
     * otherwise parses it, runs the parse filters and emits the outcome on the
     * status stream. The tuple is always acked.
     *
     * @param tuple input tuple with the fields "url", "content" and "metadata"
     */
    public void execute(Tuple tuple) {
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        byte[] content = tuple.getBinaryByField("content");
        String url = tuple.getStringByField("url");
        String contentType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
        LOG.debug("Processing {}", url);

        boolean looksLikeSitemap = sniff(content);
        if (looksLikeSitemap) {
            // the namespace was found, so the mime-type can safely be forced to XML
            contentType = "application/xml";
        }

        String isSitemap = metadata.getFirstValue(isSitemapKey);
        boolean treatAsSitemap = Boolean.parseBoolean(isSitemap);
        // rely on the content only when the metadata does not say anything
        if (isSitemap == null && looksLikeSitemap) {
            LOG.info("{} detected as sitemap based on content", url);
            treatAsSitemap = true;
        }

        if (!treatAsSitemap) {
            LOG.debug("Not a sitemap {}", url);
            metadata.setValue(isSitemapKey, "false");
            // just pass the document on to the next bolt
            this.collector.emit(tuple, tuple.getValues());
            this.collector.ack(tuple);
            return;
        }

        List<Outlink> outlinks;
        try {
            outlinks = parseSiteMap(url, content, contentType, metadata);
        } catch (Exception e) {
            emitError(tuple, url, metadata, "sitemap parsing",
                    "Exception while parsing " + url + ": " + e, e);
            return;
        }

        metadata.setValue(isSitemapKey, "true");
        ParseResult parseResult = new ParseResult(outlinks);
        parseResult.set(url, metadata);

        // only the filtering itself is guarded, so that a failure while
        // emitting is not misreported as a content filtering error
        try {
            this.parseFilters.filter(url, content, null, parseResult);
        } catch (RuntimeException e) {
            emitError(tuple, url, metadata, "content filtering",
                    "Exception while running parse filters on " + url + ": " + e, e);
            return;
        }

        for (Outlink outlink : parseResult.getOutlinks()) {
            this.collector.emit(Constants.StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
        this.collector.emit(Constants.StatusStreamName, tuple,
                new Values(url, metadata, Status.FETCHED));
        this.collector.ack(tuple);
    }

    /**
     * Logs a failure (with its stack trace), records it in the metadata and
     * reports the URL with status {@code ERROR} before acking the tuple.
     */
    private void emitError(Tuple tuple, String url, Metadata metadata, String source,
            String message, Exception cause) {
        LOG.error(message, cause);
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, source);
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, message);
        this.collector.emit(Constants.StatusStreamName, tuple,
                new Values(url, metadata, Status.ERROR));
        this.collector.ack(tuple);
    }

    /**
     * Parses a sitemap or sitemap index and converts its entries into outlinks.
     *
     * @param url URL of the sitemap
     * @param content raw bytes of the sitemap
     * @param contentType mime-type hint; when blank or "octet-stream" the parser
     *     is called without it
     * @param parentMetadata metadata of the sitemap, handed to {@code filterOutlink}
     * @return the entries which passed the age filter and {@code filterOutlink}
     * @throws UnknownFormatException if the parser does not recognise the format
     * @throws IOException if the URL is malformed or the content cannot be read
     */
    private List<Outlink> parseSiteMap(String url, byte[] content, String contentType,
            Metadata parentMetadata) throws UnknownFormatException, IOException {
        URL sitemapUrl = new URL(url);

        long start = System.currentTimeMillis();
        AbstractSiteMap siteMap;
        if (StringUtils.isBlank(contentType) || contentType.contains("octet-stream")) {
            siteMap = this.parser.parseSiteMap(content, sitemapUrl);
        } else {
            siteMap = this.parser.parseSiteMap(contentType, content, sitemapUrl);
        }
        this.averagedMetrics.update(Long.valueOf(System.currentTimeMillis() - start));

        // computed once per sitemap rather than once per entry
        Date cutoff = modifiedCutoff();

        List<Outlink> outlinks = new ArrayList<>();
        if (siteMap.isIndex()) {
            addSubSitemaps((SiteMapIndex) siteMap, url, sitemapUrl, parentMetadata, cutoff, outlinks);
        } else {
            addSitemapUrls((SiteMap) siteMap, url, sitemapUrl, parentMetadata, cutoff, outlinks);
        }
        return outlinks;
    }

    /** Adds the sub-sitemaps of an index as outlinks flagged {@code isSitemap=true}. */
    private void addSubSitemaps(SiteMapIndex index, String url, URL sitemapUrl,
            Metadata parentMetadata, Date cutoff, List<Outlink> outlinks) {
        int delay = 0;
        for (AbstractSiteMap subSitemap : index.getSitemaps()) {
            String target = subSitemap.getUrl().toExternalForm();
            Date lastModified = subSitemap.getLastModified();
            if (isTooOld(target, lastModified, cutoff)) {
                continue;
            }
            Outlink outlink = filterOutlink(sitemapUrl, target, parentMetadata,
                    isSitemapKey, "true", LAST_MODIFIED_KEY, asString(lastModified));
            if (outlink == null) {
                continue;
            }
            if (this.scheduleSitemapsWithDelay > 0) {
                // the first retained sub-sitemap gets no delay, each following
                // one is pushed back by one more increment
                if (delay > 0) {
                    outlink.getMetadata().setValue(DefaultScheduler.DELAY_METADATA,
                            Integer.toString(delay));
                }
                delay += this.scheduleSitemapsWithDelay;
            }
            outlinks.add(outlink);
            LOG.debug("{} : [sitemap] {}", url, target);
        }
    }

    /** Adds the URLs of a plain sitemap as outlinks flagged {@code isSitemap=false}. */
    private void addSitemapUrls(SiteMap siteMap, String url, URL sitemapUrl,
            Metadata parentMetadata, Date cutoff, List<Outlink> outlinks) {
        for (SiteMapURL entry : siteMap.getSiteMapUrls()) {
            String target = entry.getUrl().toExternalForm();
            Date lastModified = entry.getLastModified();
            if (isTooOld(target, lastModified, cutoff)) {
                continue;
            }
            Outlink outlink = filterOutlink(sitemapUrl, target, parentMetadata,
                    isSitemapKey, "false", LAST_MODIFIED_KEY, asString(lastModified));
            if (outlink == null) {
                continue;
            }
            parseExtensionAttributes(entry, outlink.getMetadata());
            outlinks.add(outlink);
            LOG.debug("{} : [sitemap] {}", url, target);
        }
    }

    /**
     * Returns the instant before which entries are considered too old, or
     * {@code null} when the age filter is disabled (-1).
     */
    private Date modifiedCutoff() {
        if (this.filterHoursSinceModified == -1) {
            return null;
        }
        Calendar calendar = Calendar.getInstance();
        calendar.add(Calendar.HOUR, -this.filterHoursSinceModified);
        return calendar.getTime();
    }

    /**
     * Tells whether an entry must be dropped because it was last modified
     * before the cutoff. Entries without a date are never dropped.
     */
    private boolean isTooOld(String target, Date lastModified, Date cutoff) {
        if (cutoff == null || lastModified == null || !lastModified.before(cutoff)) {
            return false;
        }
        LOG.info("{} has a modified date {} which is more than {} hours old",
                target, lastModified, Integer.valueOf(this.filterHoursSinceModified));
        return true;
    }

    /** Returns the string form of the date, or an empty string when there is none. */
    private static String asString(Date date) {
        return date == null ? "" : date.toString();
    }

    /**
     * Copies the attributes of every enabled sitemap extension into the
     * metadata, under keys of the form {@code <extension name>.<attribute>}.
     * Attributes with a null value are skipped.
     *
     * @param siteMapURL sitemap entry to read the extension attributes from
     * @param metadata metadata to add the attribute values to
     */
    public void parseExtensionAttributes(SiteMapURL siteMapURL, Metadata metadata) {
        for (Extension extension : this.extensionsToParse) {
            ExtensionMetadata[] attributes = siteMapURL.getAttributesForExtension(extension);
            if (attributes == null) {
                continue;
            }
            for (ExtensionMetadata attribute : attributes) {
                for (Map.Entry<String, String[]> entry : attribute.asMap().entrySet()) {
                    if (entry.getValue() != null) {
                        metadata.addValues(extension.name() + "." + entry.getKey(),
                                Arrays.asList(entry.getValue()));
                    }
                }
            }
        }
    }

    /**
     * Creates the sitemap parser and reads the bolt configuration:
     * {@code sitemap.filter.hours.since.modified}, {@code sitemap.offset.guess},
     * {@code sitemap.schedule.delay} and {@code sitemap.extensions}.
     */
    @Override // com.digitalpebble.stormcrawler.bolt.StatusEmitterBolt
    public void prepare(Map<String, Object> map, TopologyContext topologyContext, OutputCollector outputCollector) {
        super.prepare(map, topologyContext, outputCollector);
        this.parser = new SiteMapParser(false);
        this.filterHoursSinceModified = ConfUtils.getInt(map, "sitemap.filter.hours.since.modified", -1);
        this.parseFilters = ParseFilters.fromConf(map);
        this.maxOffsetGuess = ConfUtils.getInt(map, "sitemap.offset.guess", 300);
        this.averagedMetrics = topologyContext.registerMetric("sitemap_average_processing_time",
                new ReducedMetric(new MeanReducer()), 30);
        this.scheduleSitemapsWithDelay = ConfUtils.getInt(map, "sitemap.schedule.delay",
                this.scheduleSitemapsWithDelay);

        List<String> extensionNames = ConfUtils.loadListFromConf("sitemap.extensions", map);
        this.extensionsToParse = new ArrayList<>(extensionNames.size());
        for (String extensionName : extensionNames) {
            // an unknown name makes valueOf throw IllegalArgumentException
            Extension extension = Extension.valueOf(extensionName);
            this.parser.enableExtension(extension);
            this.extensionsToParse.add(extension);
        }
    }

    /** Declares the default stream ("url", "content", "metadata") on top of the inherited ones. */
    @Override // com.digitalpebble.stormcrawler.bolt.StatusEmitterBolt
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        super.declareOutputFields(outputFieldsDeclarer);
        outputFieldsDeclarer.declare(new Fields("url", "content", "metadata"));
    }

    /**
     * Checks whether the sitemap namespace occurs within the first
     * {@code maxOffsetGuess} bytes of the content (or anywhere in it when
     * {@code maxOffsetGuess} is not positive).
     *
     * @param content raw document bytes, may be {@code null}
     * @return {@code true} if the namespace was found
     */
    private boolean sniff(byte[] content) {
        if (content == null) {
            return false;
        }
        byte[] beginning = content;
        if (this.maxOffsetGuess > 0 && content.length > this.maxOffsetGuess) {
            beginning = Arrays.copyOfRange(content, 0, this.maxOffsetGuess);
        }
        return Bytes.indexOf(beginning, clue) != -1;
    }
}
