package rita;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:rita/Tokenizer.class */
public class Tokenizer {
    // Placeholder that escapeAbbrevs() substitutes for every "." inside a known abbreviation
    // and that unescapeAbbrevs() turns back into "." after sentence splitting.
    private static final String DELIM = "___";
    // Used by popTags(): an underscore sitting between a letter (or a backslash, comma or period)
    // and a following letter is replaced by a single space.
    private static final Pattern UNDERSCORE = Pattern.compile("([a-zA-Z]|[\\\\,\\\\.])_([a-zA-Z])");
    // Default sentence pattern for sentences(): starts at a non-whitespace character, lazily runs
    // up to '.', '!' or '?', allows one closing double quote, and must be followed by whitespace
    // or the end of the input.
    private static final Pattern SPLITTER = Pattern.compile("(\\S.+?[.!?][\"””]?)(?=\\s+|$)");
    // Whole token made up only of opening brackets.
    private static final Pattern LBRACKS = Pattern.compile("^[\\[\\(\\{⟨]+$");
    // Whole token made up only of closing brackets.
    private static final Pattern RBRACKS = Pattern.compile("^[\\)\\]\\}⟩]+$");
    // Whole token made up only of the listed punctuation characters; untokenize() tests both the
    // current and the previous token against it when deciding whether to insert the delimiter.
    // NOTE(review): the name presumably stands for "no space before" - confirm.
    private static final Pattern NB_PUNCT = Pattern.compile("^[,\\.;:\\?!)\"\"“”’‘`'%…℃\\^\\*°/⁄\\-@]+$");
    // Whole token made up only of the listed symbols; untokenize() tests the previous token
    // against it. NOTE(review): the name presumably stands for "no space after" - confirm.
    private static final Pattern NA_PUNCT = Pattern.compile("^[\\^\\*\\$/⁄#\\-@°]+$");
    // Whole token made up only of quote characters (the set also contains '(').
    private static final Pattern QUOTES = Pattern.compile("^[(\"\"“”’‘`''«»‘’]+$");
    // Whole token made up only of single-quote style characters (including the backtick).
    private static final Pattern SQUOTES = Pattern.compile("^[’‘`']+$");
    // Whole token made up only of apostrophe characters.
    private static final Pattern APOS = Pattern.compile("^[’'’]+$");
    // "www" or "WWW", optionally followed by one digit; untokenize() uses it to glue "www" + ".".
    private static final Pattern WWW = Pattern.compile("^(www[0-9]?|WWW[0-9]?)$");
    // Lower-case domain labels that untokenize() attaches directly after a "." token.
    private static final Pattern DOMIN = Pattern.compile("^(com|org|edu|net|xyz|gov|int|eu|hk|tw|cn|de|ch|fr)$");
    // Token consisting solely of ASCII letters; used by tokens() to filter words.
    private static final Pattern ALPHA_RE = Pattern.compile("^[A-Za-z]+$");
    // Rewrite tables used by tokenize(String, String). Each TOKPATn[i] is applied with
    // replaceAll(TOKREPn[i]) in index order, so the order of entries matters and the pattern and
    // replacement arrays must have equal lengths (checked in the static initializer).
    //
    // Pass 1: the leading entries rewrite abbreviations such as "e.g." into underscore-delimited
    // placeholders (for example "_eg_") and "..." into "_elipsisDDD_"; the remaining entries pad
    // punctuation, brackets, "--" and quotes with spaces and add a space at both ends of the text.
    private static final Pattern[] TOKPAT1 = {Pattern.compile("([Ee])[.]([Gg])[.]"), Pattern.compile("([Ii])[.]([Ee])[.]"), Pattern.compile("([Aa])[\\.]([Mm])[\\.]"), Pattern.compile("([Pp])[\\.]([Mm])[\\.]"), Pattern.compile("(Cap)[\\.]"), Pattern.compile("([Cc])[\\.]"), Pattern.compile("([Ee][Tt])[\\s]([Aa][Ll])[\\.]"), Pattern.compile("(ect|ECT)[\\.]"), Pattern.compile("([Pp])[\\.]([Ss])[\\.]"), Pattern.compile("([Pp])[\\.]([Ss])"), Pattern.compile("([Pp])([Hh])[\\.]([Dd])"), Pattern.compile("([Rr])[\\.]([Ii])[\\.]([Pp])"), Pattern.compile("([Vv])([Ss]?)[\\.]"), Pattern.compile("([Mm])([Rr]|[Ss]|[Xx])[\\.]"), Pattern.compile("([Dd])([Rr])[\\.]"), Pattern.compile("([Pp])([Ff])[\\.]"), Pattern.compile("([Ii])([Nn])([Dd]|[Cc])[\\.]"), Pattern.compile("([Cc])([Oo])[\\.][\\,][\\s]([Ll])([Tt])([Dd])[\\.]"), Pattern.compile("([Cc])([Oo])[\\.][\\s]([Ll])([Tt])([Dd])[\\.]"), Pattern.compile("([Cc])([Oo])[\\.][\\,]([Ll])([Tt])([Dd])[\\.]"), Pattern.compile("([Cc])([Oo])([Rr]?)([Pp]?)[\\.]"), Pattern.compile("([Ll])([Tt])([Dd])[\\.]"), Pattern.compile("(Prof|PROF|prof)[\\.]"), Pattern.compile("\\.{3}"), Pattern.compile("([\\?\\!\\\"\\u201C\\\\.,;:@#$%&])"), Pattern.compile("\\s+"), Pattern.compile(",([^0-9])"), Pattern.compile("([^.])([.])([\\])}>\\\"'’]*)\\s*$"), Pattern.compile("([\\[\\](){}<>⟨⟩])"), Pattern.compile("--"), Pattern.compile("$"), Pattern.compile("^"), Pattern.compile("([^'])' | '"), Pattern.compile(" ‘"), Pattern.compile("'([SMD]) ")};
    // Pass 2 (only applied when RiTa.SPLIT_CONTRACTIONS is set): expands contractions such as
    // "can't" -> "can not" and "'ve " -> " have ".
    private static final Pattern[] TOKPAT2 = {Pattern.compile("([Cc])an['’]t"), Pattern.compile("([Dd])idn['’]t"), Pattern.compile("([CcWw])ouldn['’]t"), Pattern.compile("([Ss])houldn['’]t"), Pattern.compile("([Ii])t['’]s"), Pattern.compile("n['’]t "), Pattern.compile("['’]ve "), Pattern.compile("['’]re ")};
    // Pass 3: collapses whitespace, pads a few remaining symbols, and turns the placeholders
    // produced by pass 1 back into their dotted abbreviation form.
    private static final Pattern[] TOKPAT3 = {Pattern.compile(" ([A-Z]) \\."), Pattern.compile("\\s+"), Pattern.compile("^\\s+"), Pattern.compile("\\^"), Pattern.compile("°"), Pattern.compile("…"), Pattern.compile("([\\w])([’'])\\s"), Pattern.compile("_elipsisDDD_"), Pattern.compile("_([Ee])([Gg])_"), Pattern.compile("_([Ii])([Ee])_"), Pattern.compile("_([Aa])([Mm])_"), Pattern.compile("_([Pp])([Mm])_"), Pattern.compile("_(Cap)_"), Pattern.compile("_([Cc])_"), Pattern.compile("_([Ee][Tt])zzz([Aa][Ll])_"), Pattern.compile("_(ect|ECT)_"), Pattern.compile("_([Pp])([Ss])dot_"), Pattern.compile("_([Pp])([Ss])_"), Pattern.compile("_([Pp])([Hh])([Dd])_"), Pattern.compile("_([Rr])([Ii])([Pp])_"), Pattern.compile("_([Vv])([Ss]?)_"), Pattern.compile("_([Mm])([Rr]|[Ss]|[Xx])_"), Pattern.compile("_([Dd])([Rr])_"), Pattern.compile("_([Pp])([Ff])_"), Pattern.compile("_([Ii])([Nn])([Dd]|[Cc])_"), Pattern.compile("_([Cc])([Oo])dcs([Ll])([Tt])([Dd])_"), Pattern.compile("_([Cc])([Oo])ds([Ll])([Tt])([Dd])_"), Pattern.compile("_([Cc])([Oo])dc([Ll])([Tt])([Dd])_"), Pattern.compile("_([Cc])([Oo])([Rr]?)([Pp]?)_"), Pattern.compile("_([Ll])([Tt])([Dd])_"), Pattern.compile("_(Prof|PROF|prof)_")};
    // Replacement strings for pass 1; entry i pairs with TOKPAT1[i].
    private static final String[] TOKREP1 = {"_$1$2_", "_$1$2_", "_$1$2_", "_$1$2_", "_Cap_", "_$1_", "_$1zzz$2_", "_$1_", "_$1$2dot_", "_$1$2_", "_$1$2$3_", "_$1$2$3_", "_$1$2_", "_$1$2_", "_$1$2_", "_$1$2_", "_$1$2$3_", "_$1$2dcs$3$4$5_", "_$1$2ds$3$4$5_", "_$1$2dc$3$4$5_", "_$1$2$3$4_", "_$1$2$3_", "_$1_", "_elipsisDDD_", " $1 ", " ", " , $1", "$1 $2$3 ", " $1 ", " -- ", " ", " ", "$1 ' ", " ‘ ", " '$1 "};
    // Replacement strings for pass 2; entry i pairs with TOKPAT2[i].
    private static final String[] TOKREP2 = {"$1an not", "$1id not", "$1ould not", "$1hould not", " $1t is", " not ", " have ", " are "};
    // Replacement strings for pass 3; entry i pairs with TOKPAT3[i]. Some restored forms contain
    // an underscore (e.g. "$1_$2."), which popTags() later converts to a space via UNDERSCORE.
    private static final String[] TOKREP3 = {" $1. ", " ", "", " ^ ", " ° ", " … ", "$1 $2 ", " ... ", "$1.$2.", "$1.$2.", "$1.$2.", "$1.$2.", "Cap.", "$1.", "$1_$2.", "$1.", "$1.$2.", "$1.$2", "$1$2.$3", "$1.$2.$3", "$1$2.", "$1$2.", "$1$2.", "$1$2.", "$1$2$3.", "$1$2.,_$3$4$5.", "$1$2._$3$4$5.", "$1$2.,$3$4$5.", "$1$2$3$4.", "$1$2$3.", "$1."};
    // One or more consecutive line breaks (LF or CRLF); sentences() replaces each run by a space.
    private static final Pattern LINEBREAKS = Pattern.compile("(\r?\n)+");
    // An opening/closing/self-closing tag, a DOCTYPE declaration or a comment. The flag value 2
    // passed to compile() is Pattern.CASE_INSENSITIVE. Used by pushTags() to find tags in raw
    // text and by preProcessTags() to validate a re-joined run of tokens.
    private static final Pattern HTML_TAGS_RE = Pattern.compile("(<\\/?[a-z][a-z0-9='\"#;:&\\s\\-\\+\\/\\.\\?]*\\/?>|<!DOCTYPE[^>]*>|<!--[^>-]*-->)", 2);
    // Whole-token tag classifiers (case-insensitive): [0] self-closing tag, [1] opening tag,
    // [2] closing tag, [3] DOCTYPE, [4] comment. Only [1] and [2] are referenced in this file's
    // visible code (by untokenize()).
    // NOTE(review): entry [0] ends with "$>" (a '>' required after end-of-input), so it looks
    // like it can never match - confirm whether the trailing '>' is a typo.
    private static final Pattern[] UNTOKENIZE_HTMLTAG_RE = {Pattern.compile("^ *<[a-z][a-z0-9='\"#;:&\\s\\-\\+\\/\\.\\?]*\\/> *$>", 2), Pattern.compile("^ *<([a-z][a-z0-9='\"#;:&\\s\\-\\+\\/\\.\\?]*[a-z0-9='\"#;:&\\s\\-\\+\\.\\?]|[a-z])> *$", 2), Pattern.compile("^ *<\\/[a-z][a-z0-9='\"#;:&\\s\\-\\+\\/\\.\\?]*> *$", 2), Pattern.compile("^ *<!DOCTYPE[^>]*> *$", 2), Pattern.compile("^ *<!--[^->]*--> *$", 2)};
    // A token that is just "<", optionally padded with spaces.
    private static final Pattern LT_RE = Pattern.compile("^ *< *$");
    // A token that is just ">", optionally padded with spaces.
    private static final Pattern GT_RE = Pattern.compile("^ *> *$");
    // A single '!', '-' or '/' token (optionally space-padded); tagSubarrayToString() glues such
    // tokens onto the opening "<" without spaces.
    private static final Pattern TAGSTART_RE = Pattern.compile("^ *[!\\-\\/] *$");
    // A single '-' or '/' token (optionally space-padded); tagSubarrayToString() glues such
    // tokens onto the closing ">" without spaces.
    private static final Pattern TAGEND_RE = Pattern.compile("^ *[\\-\\/] *$");

    /**
     * Tokenizes {@code str} with the built-in rule tables.
     *
     * <p>Shorthand for {@link #tokenize(String, String)} with no custom split expression.
     *
     * @param str the text to split
     * @return the tokens produced by the two-argument overload
     */
    public static String[] tokenize(String str) {
        final String noCustomSplit = null;
        return tokenize(str, noCustomSplit);
    }

    /**
     * Joins tokens back into text, using a single space wherever a separator is required.
     *
     * <p>Shorthand for {@link #untokenize(String[], String)} with {@code " "} as delimiter.
     *
     * @param strArr the tokens to join
     * @return the reassembled text
     */
    public static String untokenize(String[] strArr) {
        final String singleSpace = " ";
        return untokenize(strArr, singleSpace);
    }

    /**
     * Returns the sorted, de-duplicated, lower-cased alphabetic tokens of {@code str}.
     *
     * <p>Shorthand for {@link #tokens(String, String)} with no custom split expression.
     *
     * @param str the text to analyse
     * @return the result of the two-argument overload
     */
    public static String[] tokens(String str) {
        final String noCustomSplit = null;
        return tokens(str, noCustomSplit);
    }

    /**
     * Returns the distinct purely-alphabetic tokens of {@code str}, lower-cased and sorted in
     * natural {@link String} order.
     *
     * <p>The text is first split with {@link #tokenize(String, String)}; only tokens accepted by
     * {@code RE.test(ALPHA_RE, token)} are kept. Lower-casing uses {@link Locale#ROOT} so the
     * result does not depend on the JVM's default locale (for example, "I" never becomes a
     * dotless "i" under a Turkish locale).
     *
     * @param str  the text to analyse
     * @param str2 optional split expression handed to {@code tokenize}; {@code null} selects the
     *             built-in rules
     * @return a new array of unique lower-case words, possibly empty
     */
    public static String[] tokens(String str, String str2) {
        HashSet<String> unique = new HashSet<>();
        for (String token : tokenize(str, str2)) {
            if (RE.test(ALPHA_RE, token)) {
                unique.add(token.toLowerCase(Locale.ROOT));
            }
        }
        List<String> sorted = new ArrayList<>(unique);
        Collections.sort(sorted);
        return sorted.toArray(new String[0]);
    }

    /**
     * Splits {@code str} into sentences.
     *
     * <p>Line-break runs are replaced by a space, periods inside known abbreviations are masked
     * via {@code escapeAbbrevs}, every match of the sentence pattern is collected, and the mask
     * is removed again from each match.
     *
     * @param str     the text to split
     * @param pattern the sentence pattern to use, or {@code null} for the default
     *                {@code SPLITTER}
     * @return the matched sentences; a one-element array holding the unmodified {@code str} when
     *         the input is {@code null}, empty, or produces no match
     */
    public static String[] sentences(String str, Pattern pattern) {
        if (str == null || str.isEmpty()) {
            return new String[]{str};
        }
        String flattened = LINEBREAKS.matcher(str).replaceAll(" ");
        Pattern sentencePattern = (pattern != null) ? pattern : SPLITTER;
        List<String> found = new ArrayList<>();
        for (Matcher m = sentencePattern.matcher(escapeAbbrevs(flattened)); m.find(); ) {
            found.add(m.group());
        }
        if (found.isEmpty()) {
            return new String[]{str};
        }
        return unescapeAbbrevs(found.toArray(new String[0]));
    }

    /**
     * Reassembles a token array into text, inserting {@code str} between tokens only where the
     * surrounding punctuation, quotes, brackets and HTML tags call for a separator.
     *
     * <p>Tokens are first passed through {@code preProcessTags} so that a tag split into
     * several tokens is handled as one token. The result is trimmed before it is returned.
     *
     * <p>Differences from the previous implementation: a {@code null} array now yields
     * {@code ""} (the old null guard ran only after the array had already been dereferenced),
     * and an empty-string token no longer triggers a {@code StringIndexOutOfBoundsException}
     * when it is inspected as the "previous" token.
     *
     * @param strArr the tokens to join; {@code null} or empty yields {@code ""}
     * @param str    the delimiter to insert where a separator is required
     * @return the joined, trimmed text
     */
    public static String untokenize(String[] strArr, String str) {
        // Guard before touching the array: preProcessTags dereferences it.
        if (strArr == null || strArr.length == 0) {
            return "";
        }
        String[] tokens = preProcessTags(strArr);
        if (tokens.length == 0) {
            return "";
        }

        boolean withinQuote = QUOTES.matcher(tokens[0]).matches();
        boolean afterQuote = false;       // the previous quote token closed a quotation
        boolean commaAfterQuote = false;  // a comma directly followed a quote token
        boolean nextNoSpace = false;      // "www" + "." seen: attach the next token directly
        StringBuilder result = new StringBuilder(tokens[0]);

        for (int i = 1; i < tokens.length; i++) {
            String current = tokens[i];
            if (current == null) {
                continue;
            }
            String previous = tokens[i - 1];
            boolean isLast = i == tokens.length - 1;

            // Classification of the current token.
            boolean thisComma = current.equals(",");
            boolean thisNBPunct = NB_PUNCT.matcher(current).matches()
                    || UNTOKENIZE_HTMLTAG_RE[2].matcher(current).matches();
            boolean thisQuote = QUOTES.matcher(current).matches();
            boolean thisLBracket = LBRACKS.matcher(current).matches();
            boolean thisRBracket = RBRACKS.matcher(current).matches();
            boolean thisDomain = DOMIN.matcher(current).matches();
            boolean thisApostrophe = APOS.matcher(current).matches();

            // Classification of the previous token.
            boolean lastComma = previous.equals(",");
            boolean lastNBPunct = NB_PUNCT.matcher(previous).matches();
            boolean lastNAPunct = NA_PUNCT.matcher(previous).matches()
                    || UNTOKENIZE_HTMLTAG_RE[1].matcher(previous).matches();
            boolean lastQuote = QUOTES.matcher(previous).matches();
            boolean lastLBracket = LBRACKS.matcher(previous).matches();
            boolean lastRBracket = RBRACKS.matcher(previous).matches();
            // endsWith is safe for an empty token, unlike charAt(length - 1).
            boolean lastEndsWithS = previous.endsWith("s")
                    && !previous.equals("is") && !previous.equals("Is") && !previous.equals("IS");
            boolean lastIsWWW = WWW.matcher(previous).matches();

            boolean nextIsS = !isLast
                    && (tokens[i + 1].equals("s") || tokens[i + 1].equals("S"));

            if ((previous.equals(".") && thisDomain) || nextNoSpace) {
                // Domain label after ".", or the token right after "www.": no separator.
                nextNoSpace = false;
                result.append(current);
                continue;
            }

            if (current.equals(".") && lastIsWWW) {
                nextNoSpace = true;
            } else if (thisLBracket) {
                result.append(str);
            } else if (lastRBracket) {
                if (!thisNBPunct && !thisLBracket) {
                    result.append(str);
                }
            } else if (thisQuote) {
                if (withinQuote) {
                    // Closing quote: attach directly to the quoted text.
                    afterQuote = true;
                    withinQuote = false;
                } else if (!(thisApostrophe && (lastEndsWithS || nextIsS))) {
                    // Opening quote (and not a possessive apostrophe): separate from the left.
                    withinQuote = true;
                    afterQuote = false;
                    result.append(str);
                }
            } else if (afterQuote && !thisNBPunct) {
                result.append(str);
                afterQuote = false;
            } else if (lastQuote && thisComma) {
                commaAfterQuote = true;
            } else if (commaAfterQuote && lastComma) {
                result.append(str);
                commaAfterQuote = false;
            } else if ((!thisNBPunct && !lastQuote && !lastNAPunct && !lastLBracket && !thisRBracket)
                    || (!isLast && thisNBPunct && lastNBPunct && !lastNAPunct && !lastQuote
                            && !lastLBracket && !thisRBracket)) {
                result.append(str);
            }

            result.append(current);

            // Trailing possessive quote after a word ending in "s": separate from what follows.
            if (thisNBPunct && !lastNBPunct && !withinQuote
                    && SQUOTES.matcher(current).matches() && lastEndsWithS) {
                result.append(str);
            }
        }
        return result.toString().trim();
    }

    /**
     * Splits {@code str} into tokens.
     *
     * <p>When {@code str2} is given, the text is simply split with {@link String#split(String)}.
     * Otherwise the text is trimmed, HTML tags are swapped for placeholders by {@code pushTags},
     * the three rewrite passes ({@code TOKPAT1..3} with {@code TOKREP1..3}; pass 2 only when
     * {@code RiTa.SPLIT_CONTRACTIONS} is set) are applied in order, the text is split on
     * whitespace, and {@code popTags} puts the original tags back.
     *
     * <p>{@code pushTags} is invoked once and both of its results are reused; the previous
     * version ran the whole tag scan twice.
     *
     * @param str  the text to tokenize
     * @param str2 optional regular expression to split on; {@code null} selects the built-in
     *             rules
     * @return an empty array for {@code null} input, a single empty string for empty input,
     *         otherwise the tokens
     */
    public static String[] tokenize(String str, String str2) {
        if (str == null) {
            return new String[0];
        }
        if (str.length() == 0) {
            return new String[]{""};
        }
        if (str2 != null) {
            return str.split(str2);
        }

        // pushTags returns [list of extracted tags, text with placeholders].
        List<Object> pushed = pushTags(str.trim());
        @SuppressWarnings("unchecked") // element 0 is the ArrayList<String> built by pushTags
        ArrayList<String> tags = (ArrayList<String>) pushed.get(0);
        String text = (String) pushed.get(1);

        for (int i = 0; i < TOKPAT1.length; i++) {
            text = TOKPAT1[i].matcher(text).replaceAll(TOKREP1[i]);
        }
        if (RiTa.SPLIT_CONTRACTIONS) {
            for (int i = 0; i < TOKPAT2.length; i++) {
                text = TOKPAT2[i].matcher(text).replaceAll(TOKREP2[i]);
            }
        }
        for (int i = 0; i < TOKPAT3.length; i++) {
            text = TOKPAT3[i].matcher(text).replaceAll(TOKREP3[i]);
        }
        return popTags(text.trim().split("\\s+"), tags).toArray(new String[0]);
    }

    /**
     * Extracts HTML tags from {@code str} and masks them with placeholders.
     *
     * <p>Tags are located in the original text with {@code HTML_TAGS_RE}. For the tag found at
     * position {@code n} (0-based, in order of appearance) every occurrence of that tag's text in
     * the working copy is replaced by {@code " _HTMLTAG<n>_ "}.
     *
     * @param str the raw text
     * @return a two-element list: the {@code ArrayList} of tag strings in order of appearance,
     *         followed by the masked text
     */
    private static List<Object> pushTags(String str) {
        ArrayList<String> tags = new ArrayList<>();
        // The matcher keeps scanning the original text while the masked copy is rewritten.
        Matcher tagFinder = HTML_TAGS_RE.matcher(str);
        String masked = str;
        for (int index = 0; tagFinder.find(); index++) {
            String tag = tagFinder.group();
            tags.add(tag);
            masked = masked.replace(tag, " _HTMLTAG" + index + "_ ");
        }
        return Arrays.<Object>asList(tags, masked);
    }

    /**
     * Restores HTML tags in a token array and un-masks underscores.
     *
     * <p>Each token containing {@code "_HTMLTAG"} is replaced by the next saved tag, which is
     * removed from the front of {@code arrayList}. Other tokens containing an underscore have
     * {@code UNDERSCORE} matches rewritten to {@code "$1 $2"}; all remaining tokens are copied
     * unchanged.
     *
     * <p>If a token contains {@code "_HTMLTAG"} but no saved tag is left (for instance because
     * the input text itself contained that literal), the token is treated as an ordinary token
     * instead of failing with an {@code IndexOutOfBoundsException}.
     *
     * @param strArr    the tokens produced after the rewrite passes
     * @param arrayList the saved tags in order of appearance; consumed from the front
     * @return a new list of tokens with tags restored
     */
    private static ArrayList<String> popTags(String[] strArr, ArrayList<String> arrayList) {
        ArrayList<String> restored = new ArrayList<>(strArr.length);
        for (String token : strArr) {
            if (token.contains("_HTMLTAG") && !arrayList.isEmpty()) {
                restored.add(arrayList.remove(0));
            } else if (token.contains("_")) {
                restored.add(UNDERSCORE.matcher(token).replaceAll("$1 $2"));
            } else {
                restored.add(token);
            }
        }
        return restored;
    }

    /**
     * Merges runs of tokens that together spell an HTML tag into a single token.
     *
     * <p>Whenever a "&lt;" token is found, the following tokens are collected up to and including
     * the next "&lt;" or "&gt;" token (or the end of the array):
     * <ul>
     *   <li>if the run was cut short by another "&lt;", the tokens before it are emitted
     *       unchanged and scanning resumes at that second "&lt;";</li>
     *   <li>if the run ends in "&gt;" and the concatenated tokens match {@code HTML_TAGS_RE},
     *       the run is replaced by the string built by {@code tagSubarrayToString};</li>
     *   <li>otherwise the run is emitted unchanged.</li>
     * </ul>
     *
     * <p>A "&lt;" that is the very last token is kept. Previously the run then consisted of
     * that "&lt;" alone, was mistaken for a run terminated by a second "&lt;", and the token was
     * dropped from the output.
     *
     * @param strArr the tokens to scan; must not be {@code null} or contain {@code null}
     * @return a new array with tag runs merged
     */
    private static String[] preProcessTags(String[] strArr) {
        List<String> merged = new ArrayList<>();
        int i = 0;
        while (i < strArr.length) {
            String token = strArr[i];
            if (!LT_RE.matcher(token).matches()) {
                merged.add(token);
                i++;
                continue;
            }

            // Collect the candidate run starting at this "<".
            List<String> candidate = new ArrayList<>();
            candidate.add(token);
            int scan = i + 1;
            while (scan < strArr.length) {
                candidate.add(strArr[scan]);
                if (LT_RE.matcher(strArr[scan]).matches() || GT_RE.matcher(strArr[scan]).matches()) {
                    break;
                }
                scan++;
            }

            String last = candidate.get(candidate.size() - 1);
            if (candidate.size() > 1 && LT_RE.matcher(last).matches()) {
                // Hit another "<": flush what came before it and restart from that "<".
                candidate.remove(candidate.size() - 1);
                merged.addAll(candidate);
                i = scan;
            } else if (GT_RE.matcher(last).matches()
                    && HTML_TAGS_RE.matcher(String.join("", candidate)).matches()) {
                merged.add(tagSubarrayToString(candidate.toArray(new String[0])));
                i = scan + 1;
            } else {
                // Unterminated run, lone trailing "<", or "<...>" that is not a valid tag.
                merged.addAll(candidate);
                i = scan + 1;
            }
        }
        return merged.toArray(new String[0]);
    }

    /**
     * Builds the text of a single HTML tag from the tokens it was split into.
     *
     * <p>The first and last tokens (trimmed) form the outer edges. Tokens right after the first
     * one that match {@code TAGSTART_RE} are appended to the opening edge, and tokens right
     * before the last one that match {@code TAGEND_RE} are prepended to the closing edge, all
     * without separators. The tokens left in the middle are joined with
     * {@link #untokenize(String[])}.
     *
     * @param strArr the tokens of one tag, from "&lt;" through "&gt;"
     * @return the reassembled tag text
     */
    private static String tagSubarrayToString(String[] strArr) {
        final int lastIndex = strArr.length - 1;
        StringBuilder opening = new StringBuilder(strArr[0].trim());
        String closing = strArr[lastIndex].trim();

        // Absorb leading '!', '-' or '/' tokens into the opening edge.
        int bodyStart = 1;
        for (; bodyStart < lastIndex && TAGSTART_RE.matcher(strArr[bodyStart]).matches(); bodyStart++) {
            opening.append(strArr[bodyStart].trim());
        }

        // Absorb trailing '-' or '/' tokens into the closing edge.
        int bodyEnd = lastIndex - 1;
        for (; bodyEnd > bodyStart && TAGEND_RE.matcher(strArr[bodyEnd]).matches(); bodyEnd--) {
            closing = strArr[bodyEnd].trim() + closing;
        }

        String[] inner = Arrays.copyOfRange(strArr, bodyStart, bodyEnd + 1);
        return opening + untokenize(inner) + closing;
    }

    /**
     * Replaces every {@code DELIM} placeholder with "." in each element of {@code strArr}.
     *
     * <p>The array is modified in place and the same instance is returned.
     *
     * @param strArr sentences that may contain the placeholder
     * @return {@code strArr} itself, after the substitution
     */
    private static String[] unescapeAbbrevs(String[] strArr) {
        int slot = 0;
        for (String sentence : strArr) {
            strArr[slot++] = sentence.replaceAll(DELIM, ".");
        }
        return strArr;
    }

    /**
     * Masks the periods inside known abbreviations so they are not taken for sentence ends.
     *
     * <p>For every entry of {@code RiTa.ABRV}, each occurrence in {@code str} is replaced by the
     * same text with "." swapped for {@code DELIM}. Entries whose masked form equals the entry
     * itself (no "." in it, or an empty string) are skipped: replacing them would change
     * nothing, so the repeat-until-gone loop below would never terminate.
     *
     * @param str the text to mask
     * @return the text with abbreviation periods replaced by {@code DELIM}
     */
    private static String escapeAbbrevs(String str) {
        for (String abbrev : RiTa.ABRV) {
            String masked = abbrev.replace(".", DELIM);
            if (masked.equals(abbrev)) {
                continue; // nothing to mask; also prevents an endless loop
            }
            // Each pass removes at least one "." from the text, so this terminates.
            while (str.contains(abbrev)) {
                str = str.replace(abbrev, masked);
            }
        }
        return str;
    }

    // Class-load sanity check: every pattern table must have a replacement table of equal
    // length, because tokenize() pairs them by index.
    static {
        boolean tablesAligned = TOKPAT1.length == TOKREP1.length
                && TOKPAT2.length == TOKREP2.length
                && TOKPAT3.length == TOKREP3.length;
        if (!tablesAligned) {
            throw new RiTaException("Invalid Tokenizer");
        }
    }
}
