package com.digitalpebble.stormcrawler.filtering.basic;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.filtering.URLFilter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/digitalpebble/stormcrawler/filtering/basic/BasicURLNormalizer.class */
/**
 * URL filter that rewrites a URL into a normalized form.
 *
 * <p>Depending on its configuration it strips the fragment, repairs query strings whose first
 * separator is {@code &} instead of {@code ?}, removes selected query parameters and values that
 * consist of 32 hexadecimal digits, sorts the remaining query parameters by name, lower-cases the
 * host, normalizes percent-escapes and finally runs the result through {@link URI#normalize()}.
 *
 * <p>Recognized configuration keys (see {@link #configure(Map, JsonNode)}): {@code
 * removeAnchorPart}, {@code unmangleQueryString}, {@code queryElementsToRemove}, {@code
 * checkValidURI} and {@code removeHashes}.
 */
public class BasicURLNormalizer implements URLFilter {
    private static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class);

    /** Matches a standard two-digit percent-escape such as {@code %2f}. */
    private static final Pattern UNESCAPE_RULE_PATTERN = Pattern.compile("%([0-9A-Fa-f]{2})");

    /** Matches the non-standard four-digit {@code %uXXXX} escape form. */
    private static final Pattern ILLEGAL_ESCAPE_PATTERN = Pattern.compile("%u([0-9A-Fa-f]{4})");

    /** Indexed by ASCII code: {@code true} for characters that are written without escaping. */
    private static final boolean[] UNESCAPED_CHARACTERS = new boolean[128];

    /** Matches a value made of exactly 32 hexadecimal digits. */
    private static final Pattern HEX_32_PATTERN = Pattern.compile("[a-fA-F\\d]{32}");

    static {
        // Letters, digits and '-', '.', '_', '~' are the only characters left unescaped.
        for (int c = 0; c < UNESCAPED_CHARACTERS.length; c++) {
            UNESCAPED_CHARACTERS[c] =
                    ('A' <= c && c <= 'Z')
                            || ('a' <= c && c <= 'z')
                            || ('0' <= c && c <= '9')
                            || c == '-'
                            || c == '.'
                            || c == '_'
                            || c == '~';
        }
    }

    /** When {@code true}, the fragment ({@code #...}) is removed from the URL. */
    boolean removeAnchorPart = true;

    /** When {@code true}, a URL with {@code &} but no {@code ?} gets its first {@code &} replaced. */
    boolean unmangleQueryString = true;

    /** When {@code true}, the result must parse as a {@link URI} and is normalized through it. */
    boolean checkValidURI = true;

    /** When {@code true}, query parameters whose value is 32 hexadecimal digits are removed. */
    boolean removeHashes = false;

    /** Names of the query parameters to drop. */
    final Set<String> queryElementsToRemove = new TreeSet<>();

    /** Orders query parameters by name only. */
    Comparator<NameValuePair> comp =
            new Comparator<NameValuePair>() {
                @Override
                public int compare(NameValuePair left, NameValuePair right) {
                    return left.getName().compareTo(right.getName());
                }
            };

    /**
     * Normalizes {@code str}.
     *
     * @param url the URL of the document in which {@code str} was found; only used for logging
     * @param metadata not used by this filter
     * @param str the URL to normalize
     * @return the normalized URL, or {@code null} if it cannot be parsed as a URL or (when {@code
     *     checkValidURI} is set) as a URI
     */
    @Override
    public String filter(URL url, Metadata metadata, String str) {
        String candidate = str.trim();

        if (this.removeAnchorPart) {
            try {
                String ref = new URL(candidate).getRef();
                if (ref != null) {
                    candidate = candidate.replace("#" + ref, "");
                }
            } catch (MalformedURLException e) {
                return null;
            }
        }

        if (this.unmangleQueryString) {
            candidate = unmangleQueryString(candidate);
        }

        if (!this.queryElementsToRemove.isEmpty() || this.removeHashes) {
            candidate = processQueryElements(candidate);
            if (candidate == null) {
                // processQueryElements has already logged why the URL was rejected.
                return null;
            }
        }

        try {
            URL parsed = new URL(candidate);
            String file = parsed.getFile();
            String protocol = parsed.getProtocol();
            String host = parsed.getHost();

            // The URL is only rebuilt when one of its components actually differs.
            boolean changed = !candidate.startsWith(protocol);

            if (host != null) {
                String lowerCaseHost = host.toLowerCase(Locale.ROOT);
                if (!host.equals(lowerCaseHost)) {
                    host = lowerCaseHost;
                    changed = true;
                }
            }

            int port = parsed.getPort();
            String normalizedFile = escapePath(unescapePath(file));
            if (!file.equals(normalizedFile)) {
                changed = true;
            }

            if (changed) {
                // NOTE(review): URL.getFile() does not include the fragment, so a rebuilt URL
                // loses it even when removeAnchorPart is false - confirm this is intended.
                candidate = new URL(protocol, host, port, normalizedFile).toString();
            }

            if (this.checkValidURI) {
                try {
                    candidate = URI.create(candidate).normalize().toString();
                } catch (IllegalArgumentException e) {
                    LOG.info("Invalid URI {} from {} ", candidate, url);
                    return null;
                }
            }
            return candidate;
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Reads the filter settings from {@code jsonNode}. Keys that are absent leave the
     * corresponding setting untouched.
     *
     * @param map not used by this filter
     * @param jsonNode the JSON object holding the parameters of this filter
     */
    @Override
    public void configure(Map map, JsonNode jsonNode) {
        JsonNode node = jsonNode.get("removeAnchorPart");
        if (node != null) {
            this.removeAnchorPart = node.booleanValue();
        }

        node = jsonNode.get("unmangleQueryString");
        if (node != null) {
            this.unmangleQueryString = node.booleanValue();
        }

        // Held as a plain JsonNode: the value is only known to be an array after isArray().
        node = jsonNode.get("queryElementsToRemove");
        if (node != null) {
            if (node.isArray()) {
                for (JsonNode element : node) {
                    this.queryElementsToRemove.add(element.asText());
                }
            } else {
                LOG.warn(
                        "Failed to configure queryElementsToRemove.  Not an array: {}",
                        node.toString());
            }
        }

        node = jsonNode.get("checkValidURI");
        if (node != null) {
            this.checkValidURI = node.booleanValue();
        }

        node = jsonNode.get("removeHashes");
        if (node != null) {
            this.removeHashes = node.booleanValue();
        }
    }

    /**
     * Removes unwanted query parameters and sorts the remaining ones by name. Parameters appended
     * to the last path segment with {@code ;} are moved into the query string first.
     *
     * @param str the URL to process
     * @return {@code str} itself when it has no query, the rewritten URL otherwise, or {@code
     *     null} if {@code str} is not a valid URL
     */
    private String processQueryElements(String str) {
        try {
            URL parsed = new URL(str);
            String query = parsed.getQuery();
            String path = parsed.getPath();

            if (path.contains(";")) {
                String[] segments = path.split("/");
                String lastSegment = segments[segments.length - 1];
                int semicolon = lastSegment.indexOf(';');
                if (semicolon != -1) {
                    // Turn "name;a=1;b=2" into segment "name" plus query part "a=1&b=2".
                    segments[segments.length - 1] = lastSegment.substring(0, semicolon);
                    String pathParameters = lastSegment.substring(semicolon + 1).replace(';', '&');
                    query = (query == null) ? pathParameters : query + "&" + pathParameters;

                    StringBuilder rebuiltPath = new StringBuilder();
                    for (String segment : segments) {
                        if (StringUtils.isNotBlank(segment)) {
                            rebuiltPath.append("/").append(segment);
                        }
                    }
                    path = rebuiltPath.toString();
                }
            }

            if (StringUtils.isEmpty(query)) {
                return str;
            }

            List<NameValuePair> parameters = URLEncodedUtils.parse(query, StandardCharsets.UTF_8);
            Iterator<NameValuePair> it = parameters.iterator();
            while (it.hasNext()) {
                NameValuePair parameter = it.next();
                String value = parameter.getValue();
                boolean unwantedName = this.queryElementsToRemove.contains(parameter.getName());
                boolean looksLikeHash =
                        this.removeHashes
                                && value != null
                                && HEX_32_PATTERN.matcher(value).matches();
                if (unwantedName || looksLikeHash) {
                    it.remove();
                }
            }

            StringBuilder file = new StringBuilder();
            if (StringUtils.isNotBlank(path)) {
                file.append(path);
            }
            if (!parameters.isEmpty()) {
                Collections.sort(parameters, this.comp);
                file.append('?').append(URLEncodedUtils.format(parameters, StandardCharsets.UTF_8));
            }
            if (parsed.getRef() != null) {
                file.append('#').append(parsed.getRef());
            }

            return new URL(
                            parsed.getProtocol(),
                            parsed.getHost(),
                            parsed.getPort(),
                            file.toString())
                    .toString();
        } catch (MalformedURLException e) {
            LOG.warn("Invalid urlToFilter {}. {}", str, e);
            return null;
        }
    }

    /**
     * Replaces the first {@code &} by {@code ?} when the URL contains an {@code &} past its first
     * character and no {@code ?} at all; otherwise returns the URL unchanged.
     */
    private String unmangleQueryString(String str) {
        boolean hasQuestionMark = str.indexOf('?') != -1;
        if (str.indexOf('&') <= 0 || hasQuestionMark) {
            return str;
        }
        return str.replaceFirst("&", "?");
    }

    /**
     * Decodes the escapes that do not need to be escaped.
     *
     * <p>First every {@code %uXXXX} sequence is replaced by the character with that code. Then
     * each {@code %XX} sequence is either decoded (when it denotes a character flagged in
     * {@link #UNESCAPED_CHARACTERS}) or kept with its hexadecimal digits upper-cased.
     */
    private String unescapePath(String str) {
        String path = str;

        // Pass 1: non-standard %uXXXX escapes.
        Matcher illegalEscape = ILLEGAL_ESCAPE_PATTERN.matcher(path);
        if (illegalEscape.find()) {
            StringBuilder decoded = new StringBuilder(path.length());
            int copiedUpTo = 0;
            do {
                decoded.append(path, copiedUpTo, illegalEscape.start());
                decoded.append((char) Integer.parseInt(illegalEscape.group(1), 16));
                copiedUpTo = illegalEscape.end();
            } while (illegalEscape.find());
            decoded.append(path, copiedUpTo, path.length());
            path = decoded.toString();
        }

        // Pass 2: standard %XX escapes.
        Matcher escape = UNESCAPE_RULE_PATTERN.matcher(path);
        if (!escape.find()) {
            return path;
        }
        StringBuilder result = new StringBuilder(path.length());
        int copiedUpTo = 0;
        do {
            result.append(path, copiedUpTo, escape.start());
            int code = Integer.parseInt(escape.group(1), 16);
            if (code < UNESCAPED_CHARACTERS.length && UNESCAPED_CHARACTERS[code]) {
                result.append((char) code);
            } else {
                result.append(escape.group().toUpperCase(Locale.ROOT));
            }
            copiedUpTo = escape.end();
        } while (escape.find());
        result.append(path, copiedUpTo, path.length());
        return result.toString();
    }

    /**
     * Percent-escapes, byte by byte over the UTF-8 encoding of {@code str}, every byte below
     * {@code 0x21} as well as {@code [}, {@code \}, {@code ]} and {@code |}.
     */
    private String escapePath(String str) {
        StringBuilder escaped = new StringBuilder(str.length());
        for (byte b : str.getBytes(StandardCharsets.UTF_8)) {
            // Java bytes are signed, so the bytes of non-ASCII characters are negative and are
            // caught by the "< 0x21" test together with the control characters and the space.
            if (b < 0x21 || b == '[' || b == '\\' || b == ']' || b == '|') {
                String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT);
                escaped.append('%');
                if (hex.length() == 1) {
                    escaped.append('0');
                }
                escaped.append(hex);
            } else {
                escaped.append((char) b);
            }
        }
        return escaped.toString();
    }
}
