package com.digitalpebble.storm.crawler.bolt;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import com.digitalpebble.storm.crawler.Constants;
import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.filtering.URLFilters;
import com.digitalpebble.storm.crawler.parse.Outlink;
import com.digitalpebble.storm.crawler.parse.ParseFilter;
import com.digitalpebble.storm.crawler.parse.ParseFilters;
import com.digitalpebble.storm.crawler.parse.ParseResult;
import com.digitalpebble.storm.crawler.persistence.Status;
import com.digitalpebble.storm.crawler.protocol.HttpHeaders;
import com.digitalpebble.storm.crawler.util.ConfUtils;
import com.digitalpebble.storm.crawler.util.MetadataTransfer;
import com.digitalpebble.storm.crawler.util.URLUtil;
import com.google.common.primitives.Bytes;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/digitalpebble/storm/crawler/bolt/SiteMapParserBolt.class */
public class SiteMapParserBolt extends BaseRichBolt {
    public static final String isSitemapKey = "isSitemap";
    private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserBolt.class);
    private OutputCollector collector;
    private boolean strictMode = false;
    private boolean sniffWhenNoSMKey = false;
    private MetadataTransfer metadataTransfer;
    private URLFilters urlFilters;
    private ParseFilter parseFilters;

    public void execute(Tuple tuple) {
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        byte[] binaryByField = tuple.getBinaryByField("content");
        String stringByField = tuple.getStringByField("url");
        if (!Boolean.valueOf(metadata.getFirstValue(isSitemapKey)).booleanValue()) {
            int i = -1;
            if (this.sniffWhenNoSMKey) {
                byte[] bytes = "http://www.sitemaps.org/schemas/sitemap/0.9".getBytes();
                byte[] bArr = binaryByField;
                if (binaryByField.length > 200) {
                    bArr = Arrays.copyOfRange(binaryByField, 0, 200);
                }
                i = Bytes.indexOf(bArr, bytes);
                if (i != -1) {
                    LOG.info("{} detected as sitemap based on content", stringByField);
                }
            }
            if (i == -1) {
                this.collector.emit(tuple, tuple.getValues());
                this.collector.ack(tuple);
                return;
            }
        }
        String firstValue = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
        Collections.emptyList();
        try {
            List<Outlink> parseSiteMap = parseSiteMap(stringByField, binaryByField, firstValue, metadata);
            try {
                ParseResult parseResult = new ParseResult();
                parseResult.setOutlinks(parseSiteMap);
                parseResult.get(stringByField).setMetadata(metadata);
                this.parseFilters.filter(stringByField, binaryByField, null, parseResult);
                for (Outlink outlink : parseSiteMap) {
                    this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED}));
                }
                this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.FETCHED}));
                this.collector.ack(tuple);
            } catch (RuntimeException e) {
                String str = "Exception while running parse filters on " + stringByField + ": " + e;
                LOG.error(str);
                metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content filtering");
                metadata.setValue(Constants.STATUS_ERROR_MESSAGE, str);
                this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
                this.collector.ack(tuple);
            }
        } catch (Exception e2) {
            String str2 = "Exception while parsing " + stringByField + ": " + e2;
            LOG.error(str2);
            metadata.setValue(Constants.STATUS_ERROR_SOURCE, "sitemap parsing");
            metadata.setValue(Constants.STATUS_ERROR_MESSAGE, str2);
            this.collector.emit(Constants.StatusStreamName, tuple, new Values(new Object[]{stringByField, metadata, Status.ERROR}));
            this.collector.ack(tuple);
        }
    }

    private List<Outlink> parseSiteMap(String str, byte[] bArr, String str2, Metadata metadata) throws UnknownFormatException, IOException {
        SiteMapParser siteMapParser = new SiteMapParser(this.strictMode);
        URL url = new URL(str);
        AbstractSiteMap parseSiteMap = StringUtils.isBlank(str2) ? siteMapParser.parseSiteMap(bArr, url) : siteMapParser.parseSiteMap(str2, bArr, url);
        ArrayList arrayList = new ArrayList();
        if (parseSiteMap.isIndex()) {
            Iterator it = ((SiteMapIndex) parseSiteMap).getSitemaps().iterator();
            while (it.hasNext()) {
                String externalForm = ((AbstractSiteMap) it.next()).getUrl().toExternalForm();
                try {
                    externalForm = URLUtil.resolveURL(url, externalForm).toExternalForm();
                    if (this.urlFilters != null) {
                        externalForm = this.urlFilters.filter(url, metadata, externalForm);
                    }
                    if (!StringUtils.isBlank(externalForm)) {
                        Metadata metaForOutlink = this.metadataTransfer.getMetaForOutlink(externalForm, str, metadata);
                        metaForOutlink.setValue(isSitemapKey, "true");
                        Outlink outlink = new Outlink(externalForm);
                        outlink.setMetadata(metaForOutlink);
                        arrayList.add(outlink);
                        LOG.debug("{} : [sitemap] {}", str, externalForm);
                    }
                } catch (MalformedURLException e) {
                    LOG.debug("MalformedURLException on {}", externalForm);
                }
            }
        } else {
            for (SiteMapURL siteMapURL : ((SiteMap) parseSiteMap).getSiteMapUrls()) {
                siteMapURL.getPriority();
                siteMapURL.getChangeFrequency();
                String externalForm2 = siteMapURL.getUrl().toExternalForm();
                try {
                    externalForm2 = URLUtil.resolveURL(url, externalForm2).toExternalForm();
                    if (this.urlFilters != null) {
                        externalForm2 = this.urlFilters.filter(url, metadata, externalForm2);
                    }
                    if (!StringUtils.isBlank(externalForm2)) {
                        Metadata metaForOutlink2 = this.metadataTransfer.getMetaForOutlink(externalForm2, str, metadata);
                        metaForOutlink2.setValue(isSitemapKey, "false");
                        Outlink outlink2 = new Outlink(externalForm2);
                        outlink2.setMetadata(metaForOutlink2);
                        arrayList.add(outlink2);
                        LOG.debug("{} : [sitemap] {}", str, externalForm2);
                    }
                } catch (MalformedURLException e2) {
                    LOG.debug("MalformedURLException on {}", externalForm2);
                }
            }
        }
        return arrayList;
    }

    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        this.metadataTransfer = MetadataTransfer.getInstance(map);
        this.urlFilters = URLFilters.emptyURLFilters;
        this.sniffWhenNoSMKey = ConfUtils.getBoolean(map, "sitemap.sniffContent", false);
        String string = ConfUtils.getString(map, "urlfilters.config.file", "urlfilters.json");
        if (string != null) {
            try {
                this.urlFilters = new URLFilters(map, string);
            } catch (IOException e) {
                LOG.error("Exception caught while loading the URLFilters");
                throw new RuntimeException("Exception caught while loading the URLFilters", e);
            }
        }
        String string2 = ConfUtils.getString(map, "parsefilters.config.file", "parsefilters.json");
        this.parseFilters = ParseFilters.emptyParseFilter;
        if (string2 != null) {
            try {
                this.parseFilters = new ParseFilters(map, string2);
            } catch (IOException e2) {
                LOG.error("Exception caught while loading the ParseFilters");
                throw new RuntimeException("Exception caught while loading the ParseFilters", e2);
            }
        }
    }

    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declare(new Fields(new String[]{"url", "content", "metadata"}));
        outputFieldsDeclarer.declareStream(Constants.StatusStreamName, new Fields(new String[]{"url", "metadata", Constants.StatusStreamName}));
    }
}
