package com.digitalpebble.storm.crawler.filtering.basic;

import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.filtering.URLFilter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.lang.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/digitalpebble/storm/crawler/filtering/basic/BasicURLNormalizer.class */
/**
 * URL filter that rewrites a URL string into a normalized form.
 *
 * <p>Depending on its configuration it can:
 * <ul>
 *   <li>strip the anchor ({@code #ref}) part of the URL,</li>
 *   <li>turn the first {@code &} into a {@code ?} when the URL contains no {@code ?} at all,</li>
 *   <li>drop a configured set of query parameters and emit the remaining ones sorted by name.</li>
 * </ul>
 */
public class BasicURLNormalizer implements URLFilter {
    private static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class);

    /** When true, the anchor part of the URL is removed. Enabled by default. */
    boolean removeAnchorPart = true;

    /** When true, a URL without any '?' gets its first '&' replaced by '?'. Enabled by default. */
    boolean unmangleQueryString = true;

    /** Names of the query parameters to strip; empty unless populated by {@link #configure}. */
    final Set<String> queryElementsToRemove = new TreeSet<>();

    /** Orders query parameters by name so that the rebuilt query string is deterministic. */
    Comparator<NameValuePair> comp = new Comparator<NameValuePair>() {
        @Override
        public int compare(NameValuePair left, NameValuePair right) {
            return left.getName().compareTo(right.getName());
        }
    };

    /**
     * Normalizes {@code str} according to the configured options.
     *
     * @param url      not used by this filter
     * @param metadata not used by this filter
     * @param str      the URL to normalize
     * @return the normalized URL, or {@code null} when {@code str} had to be parsed
     *         (anchor removal or query-element filtering) and is not a valid URL
     */
    @Override
    public String filter(URL url, Metadata metadata, String str) {
        if (this.removeAnchorPart) {
            try {
                String ref = new URL(str).getRef();
                if (ref != null) {
                    str = str.replace("#" + ref, "");
                }
            } catch (MalformedURLException e) {
                return null;
            }
        }
        if (this.unmangleQueryString) {
            str = unmangleQueryString(str);
        }
        if (!this.queryElementsToRemove.isEmpty()) {
            str = filterQueryElements(str);
        }
        return str;
    }

    /**
     * Reads the optional settings {@code removeAnchorPart}, {@code unmangleQueryString} and
     * {@code queryElementsToRemove} from {@code jsonNode}. Settings that are absent keep
     * their current value.
     *
     * @param map      not used by this filter
     * @param jsonNode the JSON parameters of this filter
     */
    @Override
    public void configure(Map map, JsonNode jsonNode) {
        JsonNode anchorNode = jsonNode.get("removeAnchorPart");
        if (anchorNode != null) {
            this.removeAnchorPart = anchorNode.booleanValue();
        }
        JsonNode unmangleNode = jsonNode.get("unmangleQueryString");
        if (unmangleNode != null) {
            this.unmangleQueryString = unmangleNode.booleanValue();
        }
        // Kept as a plain JsonNode: narrowing to ArrayNode before the isArray() check
        // would fail on a misconfigured (non-array) value instead of logging the warning.
        JsonNode elementsNode = jsonNode.get("queryElementsToRemove");
        if (elementsNode != null) {
            if (!elementsNode.isArray()) {
                LOG.warn("Failed to configure queryElementsToRemove.  Not an array: {}", elementsNode.toString());
                return;
            }
            for (JsonNode element : elementsNode) {
                this.queryElementsToRemove.add(element.asText());
            }
        }
    }

    /**
     * Removes the query parameters listed in {@link #queryElementsToRemove} and rebuilds the
     * URL from its protocol, host, port, path, the remaining parameters (sorted by name) and
     * the anchor, if any.
     *
     * @param str the URL to process
     * @return {@code str} unchanged when it has no query string, the rebuilt URL otherwise,
     *         or {@code null} if {@code str} is not a valid URL
     */
    private String filterQueryElements(String str) {
        try {
            URL parsed = new URL(str);
            String query = parsed.getQuery();
            if (StringUtils.isEmpty(query)) {
                return str;
            }
            List<NameValuePair> pairs = new ArrayList<>();
            URLEncodedUtils.parse(pairs, new Scanner(query), "UTF-8");
            Iterator<NameValuePair> it = pairs.iterator();
            while (it.hasNext()) {
                if (this.queryElementsToRemove.contains(it.next().getName())) {
                    it.remove();
                }
            }
            StringBuilder file = new StringBuilder();
            if (parsed.getPath() != null) {
                file.append(parsed.getPath());
            }
            if (!pairs.isEmpty()) {
                Collections.sort(pairs, this.comp);
                file.append('?').append(URLEncodedUtils.format(pairs, StandardCharsets.UTF_8));
            }
            if (parsed.getRef() != null) {
                file.append('#').append(parsed.getRef());
            }
            return new URL(parsed.getProtocol(), parsed.getHost(), parsed.getPort(), file.toString()).toString();
        } catch (MalformedURLException e) {
            LOG.warn("Invalid urlToFilter {}. {}", str, e);
            return null;
        }
    }

    /**
     * Replaces the first {@code &} with {@code ?} when the URL contains an {@code &} past
     * its first character and no {@code ?} anywhere.
     *
     * @param str the URL to repair
     * @return the repaired URL, or {@code str} itself when the condition does not hold
     */
    private String unmangleQueryString(String str) {
        if (str.indexOf('&') <= 0) {
            return str;
        }
        if (str.indexOf('?') != -1) {
            return str;
        }
        return str.replaceFirst("&", "?");
    }
}
