package com.digitalpebble.storm.crawler.filtering.regex;

import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.filtering.URLFilter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;

/* loaded from: input_file:com/digitalpebble/storm/crawler/filtering/regex/RegexURLNormalizer.class */
/**
 * URL filter that rewrites a URL string by applying an ordered list of
 * regular-expression substitution rules.
 *
 * <p>Rules come either from an inline JSON array ({@code urlNormalizers}) or
 * from an XML resource on the classpath ({@code regexNormalizerFile}, falling
 * back to {@code default-regex-normalizers.xml}). The XML root element is
 * expected to be {@code <regex-normalize>}, containing {@code <regex>}
 * elements that each hold a {@code <pattern>} and a {@code <substitution>}.
 */
public class RegexURLNormalizer implements URLFilter {
    private static final Logger LOG = LoggerFactory.getLogger(RegexURLNormalizer.class);
    private static final List<Rule> EMPTY_RULES = Collections.emptyList();

    /** Active rules; starts empty so {@link #filter} is safe before {@link #configure}. */
    private List<Rule> rules = EMPTY_RULES;

    /** A compiled pattern together with the replacement string applied to its matches. */
    public static class Rule {
        public Pattern pattern;
        public String substitution;

        private Rule() {
        }
    }

    /**
     * Loads the normalization rules.
     *
     * @param map      configuration map; not read by this filter
     * @param jsonNode filter parameters; if it holds an array under
     *                 {@code urlNormalizers} the rules are read from it, otherwise
     *                 they are read from the classpath resource named by
     *                 {@code regexNormalizerFile} (or the default file when that
     *                 key, or the whole node, is absent)
     */
    @Override
    public void configure(Map map, JsonNode jsonNode) {
        JsonNode inlineRules = jsonNode != null ? jsonNode.get("urlNormalizers") : null;
        if (inlineRules != null && inlineRules.isArray()) {
            this.rules = readRules((ArrayNode) inlineRules);
            return;
        }
        JsonNode fileNode = jsonNode != null ? jsonNode.get("regexNormalizerFile") : null;
        String fileName = fileNode != null ? fileNode.textValue() : "default-regex-normalizers.xml";
        this.rules = readRules(fileName);
    }

    /**
     * Applies every rule, in order, to the URL string.
     *
     * @param url      not used by this filter
     * @param metadata not used by this filter
     * @param str      the URL string to normalize; may be {@code null}
     * @return the rewritten URL, or {@code null} when the input is {@code null}
     *         or the rules reduce it to the empty string
     */
    @Override
    public String filter(URL url, Metadata metadata, String str) {
        if (str == null) {
            return null;
        }
        String normalized = str;
        for (Rule rule : this.rules) {
            normalized = rule.pattern.matcher(normalized).replaceAll(rule.substitution);
        }
        return normalized.isEmpty() ? null : normalized;
    }

    /**
     * Builds rules from a JSON array of objects carrying {@code pattern} and
     * {@code substitution} fields. Entries with a missing or blank pattern, or a
     * pattern that does not compile, are skipped; a missing substitution is
     * treated as the empty string.
     */
    private List<Rule> readRules(ArrayNode arrayNode) {
        List<Rule> parsed = new ArrayList<>();
        for (JsonNode entry : arrayNode) {
            if (entry == null || entry.isNull()) {
                LOG.warn("bad config: 'regex' element is null");
                continue;
            }
            JsonNode patternNode = entry.get("pattern");
            JsonNode substitutionNode = entry.get("substitution");
            String substitution = substitutionNode != null ? substitutionNode.asText() : "";
            if (patternNode == null || !StringUtils.isNotBlank(patternNode.asText())) {
                continue;
            }
            Rule rule = createRule(patternNode.asText(), substitution);
            if (rule != null) {
                parsed.add(rule);
            }
        }
        return parsed.isEmpty() ? EMPTY_RULES : parsed;
    }

    /**
     * Reads rules from an XML resource looked up through this class's class
     * loader. Returns an empty list (after logging) when the name is
     * {@code null}, the resource cannot be found, or reading fails.
     */
    private List<Rule> readRules(String str) {
        if (str == null) {
            LOG.error("Error loading rules: no rules file name available");
            return EMPTY_RULES;
        }
        InputStream stream = getClass().getClassLoader().getResourceAsStream(str);
        if (stream == null) {
            LOG.error("Error loading rules from file: {} (resource not found)", str);
            return EMPTY_RULES;
        }
        // Closing the reader also closes the underlying resource stream.
        try (Reader reader = new InputStreamReader(stream, StandardCharsets.UTF_8)) {
            return readConfiguration(reader);
        } catch (Exception e) {
            LOG.error("Error loading rules from file: " + str, e);
            return EMPTY_RULES;
        }
    }

    /**
     * Parses the XML rules document. Unexpected element names are logged but
     * still processed; any parsing failure yields an empty rule list.
     */
    private List<Rule> readConfiguration(Reader reader) {
        List<Rule> parsed = new ArrayList<>();
        try {
            Element root = DocumentBuilderFactory.newInstance()
                    .newDocumentBuilder()
                    .parse(new InputSource(reader))
                    .getDocumentElement();
            if (!"regex-normalize".equals(root.getTagName())) {
                LOG.error("bad conf file: top-level element not <regex-normalize>");
            }
            NodeList children = root.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                Node child = children.item(i);
                if (!(child instanceof Element)) {
                    continue;
                }
                Element regexElement = (Element) child;
                if (!"regex".equals(regexElement.getTagName())) {
                    LOG.warn("bad conf file: element not <regex>");
                }
                Rule rule = parseRule(regexElement);
                // createRule yields null for an invalid regex; never store that,
                // otherwise filter() would fail on the null entry.
                if (rule != null) {
                    parsed.add(rule);
                }
            }
        } catch (Exception e) {
            LOG.error("error parsing conf file", e);
            return EMPTY_RULES;
        }
        return parsed.isEmpty() ? EMPTY_RULES : parsed;
    }

    /**
     * Extracts one rule from a {@code <regex>} element.
     *
     * @return the rule, or {@code null} when the pattern or the substitution is
     *         missing or the pattern does not compile
     */
    private Rule parseRule(Element regexElement) {
        String pattern = null;
        String substitution = null;
        NodeList fields = regexElement.getChildNodes();
        for (int i = 0; i < fields.getLength(); i++) {
            Node field = fields.item(i);
            if (!(field instanceof Element)) {
                continue;
            }
            Element fieldElement = (Element) field;
            String tag = fieldElement.getTagName();
            if ("pattern".equals(tag)) {
                String text = firstText(fieldElement);
                if (text != null) {
                    pattern = text;
                }
            } else if ("substitution".equals(tag)) {
                if (!fieldElement.hasChildNodes()) {
                    // Only an empty <substitution/> means "replace with nothing".
                    substitution = "";
                } else {
                    String text = firstText(fieldElement);
                    if (text != null) {
                        substitution = text;
                    }
                }
            }
        }
        if (pattern == null || substitution == null) {
            return null;
        }
        return createRule(pattern, substitution);
    }

    /**
     * Returns the data of the element's first child when that child is a text
     * node, or {@code null} when the element is empty or starts with another
     * kind of node (logged as a warning).
     */
    private static String firstText(Element element) {
        Node first = element.getFirstChild();
        if (first == null) {
            return null;
        }
        if (first instanceof Text) {
            return ((Text) first).getData();
        }
        LOG.warn("bad conf file: <{}> does not start with text", element.getTagName());
        return null;
    }

    /**
     * Compiles a rule.
     *
     * @param str  the regular expression
     * @param str2 the replacement string
     * @return the rule, or {@code null} (after logging) when the expression is invalid
     */
    private Rule createRule(String str, String str2) {
        try {
            Rule rule = new Rule();
            rule.pattern = Pattern.compile(str);
            rule.substitution = str2;
            return rule;
        } catch (PatternSyntaxException e) {
            LOG.error("skipped rule: " + str + " -> " + str2
                    + " : invalid regular expression pattern", e);
            return null;
        }
    }
}
