package com.digitalpebble.storm.crawler.parse.filter;

import com.digitalpebble.storm.crawler.parse.ParseData;
import com.digitalpebble.storm.crawler.parse.ParseFilter;
import com.digitalpebble.storm.crawler.parse.ParseResult;
import com.fasterxml.jackson.databind.JsonNode;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NodeList;

/* loaded from: input_file:com/digitalpebble/storm/crawler/parse/filter/ContentFilter.class */
/**
 * Parse filter that replaces the text of a document with the text content of
 * the nodes selected by the first configured XPath expression that yields a
 * non-blank result.
 *
 * <p>The expressions are read from the JSON configuration passed to
 * {@link #configure(Map, JsonNode)} and are tried in the order in which they
 * were declared. If no expression matches, or every match produces only blank
 * text, the document text is left untouched.
 */
public class ContentFilter extends ParseFilter {
    private static final Logger LOG = LoggerFactory.getLogger(ContentFilter.class);

    /** Factory-provided XPath instance used to compile the configured expressions. */
    private final XPath xpath = XPathFactory.newInstance().newXPath();

    /**
     * Compiled expressions, in configuration order. Starts out empty so that
     * {@link #filter} is a harmless no-op if {@link #configure} was never called.
     */
    private List<XPathExpression> expressions = new ArrayList<XPathExpression>();

    /**
     * Evaluates the configured expressions against the DOM and, on the first
     * one that yields non-blank text, stores that text on the parse data
     * registered under {@code str}.
     *
     * @param str key (the document URL, judging by the log messages) used to
     *        look up the parse data in {@code parseResult}
     * @param bArr raw content of the document; not used by this filter
     * @param documentFragment DOM of the document, used as the XPath context
     * @param parseResult holder from which the parse data to update is obtained
     */
    @Override
    public void filter(String str, byte[] bArr, DocumentFragment documentFragment, ParseResult parseResult) {
        ParseData parseData = parseResult.get(str);

        // Try the expressions in order and stop at the first usable match.
        for (XPathExpression expression : this.expressions) {
            try {
                NodeList matches = (NodeList) expression.evaluate(documentFragment, XPathConstants.NODESET);
                int matchCount = matches.getLength();
                if (matchCount == 0) {
                    continue;
                }

                // Concatenate the text of every matched node, one per line.
                StringBuilder extracted = new StringBuilder();
                for (int i = 0; i < matchCount; i++) {
                    extracted.append(matches.item(i).getTextContent()).append("\n");
                }
                String newText = extracted.toString();

                // A match that carries no text is useless: fall through to the next expression.
                if (StringUtils.isBlank(newText)) {
                    LOG.debug("Found match for doc {} but empty text extracted - skipping", str);
                    continue;
                }

                // Logging arguments are evaluated regardless of the log level, so a
                // missing previous text must not be allowed to abort the filter.
                // NOTE(review): assumes getText() can return null when no text was set - confirm.
                String previousText = parseData.getText();
                int previousLength = previousText == null ? 0 : previousText.length();
                LOG.debug("Restricted text for doc {}. Text size was {} and is now {}",
                        new Object[] {str, previousLength, newText.length()});

                parseData.setText(newText);
                return;
            } catch (XPathExpressionException e) {
                // Best effort: a failing expression must not prevent the remaining ones from running.
                LOG.error("Caught XPath expression", e);
            }
        }
    }

    /**
     * Compiles every field value of {@code jsonNode} as an XPath expression.
     * Field names are ignored; only the declaration order of the values matters.
     *
     * <p>The new expression list replaces the previous one only if all
     * expressions compile, so a failed call leaves the filter in its prior state.
     *
     * @param map configuration map; not used by this filter
     * @param jsonNode JSON object whose field values are XPath expressions
     * @throws RuntimeException if any expression cannot be compiled; the
     *         {@link XPathExpressionException} is preserved as the cause
     */
    @Override
    public void configure(Map map, JsonNode jsonNode) {
        List<XPathExpression> compiled = new ArrayList<XPathExpression>();
        Iterator<Map.Entry<String, JsonNode>> fields = jsonNode.fields();
        while (fields.hasNext()) {
            String expression = fields.next().getValue().asText();
            try {
                compiled.add(this.xpath.compile(expression));
            } catch (XPathExpressionException e) {
                throw new RuntimeException("Can't compile expression : " + expression, e);
            }
        }
        this.expressions = compiled;
    }

    /**
     * Indicates that this filter needs the DOM of the document.
     *
     * @return always {@code true}
     */
    @Override
    public boolean needsDOM() {
        return true;
    }
}
