package com.digitalpebble.stormcrawler.util;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.ibm.icu.text.CharsetDetector;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.BOMInputStream;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;

/* loaded from: input_file:com/digitalpebble/stormcrawler/util/CharsetIdentification.class */
/**
 * Static helpers that guess the character encoding of a fetched document.
 *
 * <p>The clues consulted are, in order of trust: a byte order mark at the start of the content,
 * the charset declared in the HTTP {@code Content-Type} metadata, the charset declared in an HTML
 * {@code <meta>} element, and finally statistical detection on the raw bytes (ICU
 * {@link CharsetDetector}). When nothing usable is found, {@link #DEFAULT_CHARSET} is returned.
 */
public class CharsetIdentification {
    /** Charset reported when no other clue yields a supported charset. */
    public static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

    /** Extracts the value of a {@code charset=} parameter, tolerating an opening quote. */
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");

    /** Quote characters stripped from a candidate charset name before validation. */
    private static final Pattern QUOTES = Pattern.compile("[\"']");

    /** Literal searched by the fast path of {@link #getCharsetFromMeta(byte[], int)}. */
    private static final String META_CHARSET_PREFIX = "<meta charset=\"";

    /**
     * Number of extra bytes read past the configured limit when a {@code <meta charset="...">}
     * value was cut in half by that limit. Charset names are short, so a small fixed window is
     * enough and keeps the work bounded on hostile input.
     */
    private static final int META_VALUE_LOOKAHEAD = 128;

    /**
     * Identifies the charset by returning the first clue that gives an answer: BOM, then HTTP
     * metadata, then HTML meta element, then statistical detection.
     *
     * @param metadata document metadata; the value under {@link HttpHeaders#CONTENT_TYPE} is read
     * @param bArr raw document content
     * @param i maximum number of leading bytes to inspect for the meta and text based detection;
     *     a non-positive value means no limit
     * @return the name of the identified charset, or the name of {@link #DEFAULT_CHARSET}
     */
    public static String getCharsetFast(Metadata metadata, byte[] bArr, int i) {
        String charsetFromBOM = getCharsetFromBOM(bArr);
        if (charsetFromBOM != null) {
            return charsetFromBOM;
        }
        String charsetFromHTTP = getCharsetFromHTTP(metadata);
        if (charsetFromHTTP != null) {
            return charsetFromHTTP;
        }
        String charsetFromMeta = getCharsetFromMeta(bArr, i);
        if (charsetFromMeta != null) {
            return charsetFromMeta;
        }
        String charsetFromText = getCharsetFromText(bArr, null, i);
        return charsetFromText != null ? charsetFromText : DEFAULT_CHARSET.name();
    }

    /**
     * Identifies the charset, cross-checking the declared values before trusting them.
     *
     * <p>A BOM wins outright. Otherwise, if the HTTP metadata and the HTML meta element agree
     * (case-insensitively), that charset is returned. If only one of the two is present it is
     * passed to the statistical detector as a hint; if they disagree, or both are missing, the
     * detector runs without a hint.
     *
     * @param metadata document metadata; the value under {@link HttpHeaders#CONTENT_TYPE} is read
     * @param bArr raw document content
     * @param i maximum number of leading bytes to inspect for the meta and text based detection;
     *     a non-positive value means no limit
     * @return the name of the identified charset, or the name of {@link #DEFAULT_CHARSET}
     */
    public static String getCharset(Metadata metadata, byte[] bArr, int i) {
        String charsetFromBOM = getCharsetFromBOM(bArr);
        if (charsetFromBOM != null) {
            return charsetFromBOM;
        }
        String charsetFromHTTP = getCharsetFromHTTP(metadata);
        String charsetFromMeta = getCharsetFromMeta(bArr, i);
        if (charsetFromHTTP != null && charsetFromMeta != null && charsetFromHTTP.equalsIgnoreCase(charsetFromMeta)) {
            return charsetFromHTTP;
        }
        // Only a single, uncontested declaration is used as a hint for the detector.
        String hint = null;
        if (charsetFromHTTP != null && charsetFromMeta == null) {
            hint = charsetFromHTTP;
        } else if (charsetFromHTTP == null && charsetFromMeta != null) {
            hint = charsetFromMeta;
        }
        String charsetFromText = getCharsetFromText(bArr, hint, i);
        return charsetFromText != null ? charsetFromText : DEFAULT_CHARSET.name();
    }

    /**
     * Reads the charset declared in the content type stored in the metadata.
     *
     * @param metadata document metadata, may be {@code null}
     * @return a supported charset name, or {@code null} if none is declared
     */
    private static String getCharsetFromHTTP(Metadata metadata) {
        if (metadata == null) {
            return null;
        }
        return getCharsetFromContentType(metadata.getFirstValue(HttpHeaders.CONTENT_TYPE));
    }

    /**
     * Looks for a byte order mark at the start of the content.
     *
     * <p>NOTE(review): per the commons-io documentation the single-argument
     * {@link BOMInputStream} constructor only looks for a UTF-8 BOM; confirm whether other BOMs
     * should be recognised as well.
     *
     * @param bArr raw document content
     * @return the charset name associated with the BOM, or {@code null} if there is no BOM or the
     *     content could not be read
     */
    private static String getCharsetFromBOM(byte[] bArr) {
        try (BOMInputStream bomIn = new BOMInputStream(new ByteArrayInputStream(bArr))) {
            ByteOrderMark bom = bomIn.getBOM();
            return bom == null ? null : bom.getCharsetName();
        } catch (IOException ignored) {
            // Best effort: an unreadable stream simply means "no BOM information".
            return null;
        }
    }

    /**
     * Runs statistical charset detection on the content.
     *
     * @param bArr raw document content
     * @param str declared encoding handed to the detector as a hint, or {@code null}
     * @param i maximum number of leading bytes given to the detector; a non-positive value means
     *     no limit
     * @return a supported charset name, or {@code null} if detection failed or the detected
     *     charset is not supported
     */
    private static String getCharsetFromText(byte[] bArr, String str, int i) {
        CharsetDetector detector = new CharsetDetector();
        detector.enableInputFilter(true);
        if (str != null) {
            detector.setDeclaredEncoding(str);
        }
        // Same limit semantics as getCharsetFromMeta: only a positive limit smaller than the
        // content truncates it. This also keeps negative values away from copyOfRange, which
        // would otherwise throw.
        byte[] sample = bArr;
        if (i > 0 && i < bArr.length) {
            sample = Arrays.copyOfRange(bArr, 0, i);
        }
        detector.setText(sample);
        try {
            return validateCharset(detector.detect().getName());
        } catch (Exception ignored) {
            // detect() may yield no match (null) or fail on odd input; treat both as "unknown".
            return null;
        }
    }

    /**
     * Looks for a charset declaration in the HTML markup.
     *
     * <p>A cheap string search for {@code <meta charset="...">} is tried first. If that literal is
     * not present, the text is parsed with jsoup and the first usable
     * {@code <meta http-equiv=content-type>} or {@code <meta charset>} element is used.
     *
     * @param bArr raw document content
     * @param i maximum number of leading bytes to inspect; a non-positive value means no limit
     * @return a supported charset name, or {@code null} if none is declared
     */
    private static String getCharsetFromMeta(byte[] bArr, int i) {
        int length = bArr.length;
        if (i > 0 && i < length) {
            length = i;
        }
        // Decoded as UTF-8 regardless of the real encoding: the markup of interest is ASCII.
        String html = new String(bArr, 0, length, DEFAULT_CHARSET);

        int tagStart = html.indexOf(META_CHARSET_PREFIX);
        if (tagStart != -1) {
            int valueStart = tagStart + META_CHARSET_PREFIX.length();
            int valueEnd = html.indexOf('"', valueStart);
            if (valueEnd == -1 && length < bArr.length) {
                // The limit cut the attribute value: look a little further, once, with a fixed
                // bound. (Growing the window step by step through recursion is unbounded in both
                // stack depth and time when the closing quote never comes.)
                int extended = Math.min(bArr.length, length + META_VALUE_LOOKAHEAD);
                html = new String(bArr, 0, extended, DEFAULT_CHARSET);
                valueEnd = html.indexOf('"', valueStart);
            }
            if (valueEnd == -1) {
                return null;
            }
            return validateCharset(html.substring(valueStart, valueEnd));
        }

        try {
            for (Element meta : Parser.htmlParser().parseInput(html, "dummy")
                    .select("meta[http-equiv=content-type], meta[charset]")) {
                String found = null;
                if (meta.hasAttr("http-equiv")) {
                    found = getCharsetFromContentType(meta.attr("content"));
                }
                if (found == null && meta.hasAttr("charset")) {
                    // Validate like every other source so callers never get an unusable name.
                    found = validateCharset(meta.attr("charset"));
                }
                if (found != null) {
                    return found;
                }
            }
        } catch (Exception ignored) {
            // Best effort: markup that cannot be parsed carries no charset information.
            return null;
        }
        return null;
    }

    /**
     * Extracts the charset parameter from a content type value such as
     * {@code text/html; charset=utf-8}.
     *
     * @param str content type value, may be {@code null}
     * @return a supported charset name, or {@code null} if absent or unsupported
     */
    private static String getCharsetFromContentType(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = charsetPattern.matcher(str);
        if (matcher.find()) {
            return validateCharset(matcher.group(1).trim().replace("charset=", ""));
        }
        return null;
    }

    /**
     * Checks that a candidate charset name is supported by this JVM.
     *
     * @param str candidate name, may be {@code null}; surrounding whitespace and quote characters
     *     are removed before the check
     * @return the cleaned name if supported, its upper-cased form if only that is supported, or
     *     {@code null} if the name is empty, illegal or unsupported
     */
    private static String validateCharset(String str) {
        if (str == null || str.length() == 0) {
            return null;
        }
        String cleaned = QUOTES.matcher(str.trim()).replaceAll("");
        try {
            if (Charset.isSupported(cleaned)) {
                return cleaned;
            }
            String upperCase = cleaned.toUpperCase(Locale.ENGLISH);
            if (Charset.isSupported(upperCase)) {
                return upperCase;
            }
            return null;
        } catch (IllegalCharsetNameException ignored) {
            // The name contains characters that are not legal in a charset name.
            return null;
        }
    }
}
