package org.snu.ids.kkma.ma;

import java.lang.Character;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import org.snu.ids.kkma.util.Util;

/* loaded from: input_file:org/snu/ids/kkma/ma/Tokenizer.class */
/**
 * Splits a string into {@link Token}s.
 *
 * <p>Tokenizing happens in two passes: first every entry of
 * {@link #PREDEFINED_TOKEN_PATTERN} is matched against the text (URLs,
 * hyphenated words, numbers, emoticons), then the characters no pattern
 * claimed are grouped into runs that share one {@link CharSetType}.
 */
public class Tokenizer {
    /**
     * Patterns that are matched before character-class tokenizing.
     *
     * <p>They are applied in array order, and text claimed by an earlier
     * pattern is blanked out before later patterns are tried, so the order of
     * the entries is significant.
     */
    public static final TokenPattern[] PREDEFINED_TOKEN_PATTERN = {
        new TokenPattern("[hH][tT][tT][pP]([sS]*)://[a-zA-Z0-9/_.?=%&\\-]+", CharSetType.COMBINED),
        new TokenPattern("[a-zA-Z0-9]+[-][a-zA-Z0-9]+", CharSetType.COMBINED),
        new TokenPattern("(ㅋ|ㅠ|ㅜ|ㅎ){2,}", CharSetType.EMOTICON),
        new TokenPattern("(히|흐|크|키|케|캬){2,}", CharSetType.EMOTICON),
        new TokenPattern("(\\^){3,}", CharSetType.EMOTICON),
        new TokenPattern("[-]?[0-9]+([,][0-9]{3})*([.][0-9]+)?", CharSetType.NUMBER),
        new TokenPattern("[(][\\^]([.]|_|[-]|o|0|O|3|~|[ ])?[\\^][']?[)]", CharSetType.EMOTICON),
        new TokenPattern("[d][\\^]([.]|_|[-]|o|0|O|3|~|[ ])?[\\^][b]", CharSetType.EMOTICON),
        new TokenPattern("[\\^]([.]|_|[-]|o|0|O|3|~|[ ])?[\\^]([;]+|['\"avVㅗ])?", CharSetType.EMOTICON),
        new TokenPattern("[(];_;[)]", CharSetType.EMOTICON),
        new TokenPattern("[(]T[_.~oO\\^]?T[)]", CharSetType.EMOTICON),
        new TokenPattern("ㅜ[_.]?ㅜ", CharSetType.EMOTICON),
        new TokenPattern("ㅡ[_.]?ㅜ", CharSetType.EMOTICON),
        new TokenPattern("ㅜ[_.]?ㅡ", CharSetType.EMOTICON),
        new TokenPattern("ㅠ[_.]?ㅠ", CharSetType.EMOTICON),
        new TokenPattern("ㅡ[_.]?ㅠ", CharSetType.EMOTICON),
        new TokenPattern("ㅠ[_.]?ㅡ", CharSetType.EMOTICON),
        new TokenPattern("ㅠ[_.]?ㅜ", CharSetType.EMOTICON),
        new TokenPattern("ㅜ[_.]?ㅠ", CharSetType.EMOTICON),
        new TokenPattern("[(][-](_|[.])?[-]([;]+|[aㅗ])?[)](zzZ)?", CharSetType.EMOTICON),
        new TokenPattern("[-](_|[.])?[-]([;]+|[aㅗ]|(zzZ))?", CharSetType.EMOTICON),
        new TokenPattern("[ㅡ](_|[.])?[ㅡ]([;]+|[aㅗ]|(zzZ))?", CharSetType.EMOTICON),
        new TokenPattern("[(][>]([.]|_)?[<][)]", CharSetType.EMOTICON),
        new TokenPattern("[>]([.]|_)?[<]", CharSetType.EMOTICON),
        new TokenPattern("[(][>]([.]|_)?[>][)]", CharSetType.EMOTICON),
        new TokenPattern("[>]([.]|_)?[>]", CharSetType.EMOTICON),
        new TokenPattern("[(][¬]([.]|_)?[¬][)]", CharSetType.EMOTICON),
        new TokenPattern("[¬]([.]|_)?[¬]", CharSetType.EMOTICON),
        new TokenPattern("[(]'(_|[.])\\^[)]", CharSetType.EMOTICON),
        new TokenPattern("'(_|[.])\\^", CharSetType.EMOTICON),
        new TokenPattern("\\^(_|[.])[~]", CharSetType.EMOTICON),
        new TokenPattern("[~](_|[.])\\^", CharSetType.EMOTICON),
        new TokenPattern("[(][.][_][.][)]", CharSetType.EMOTICON),
        new TokenPattern("[(]['][_]['][)]", CharSetType.EMOTICON),
        new TokenPattern("[(][,][_][,][)]", CharSetType.EMOTICON),
        new TokenPattern("[(][X][_][X][)]", CharSetType.EMOTICON),
        new TokenPattern("[O][_.][o]", CharSetType.EMOTICON),
        new TokenPattern("[o][_.][O]", CharSetType.EMOTICON),
        new TokenPattern("m[(]_ _[)]m", CharSetType.EMOTICON)
    };

    /**
     * Tokenizes {@code str}.
     *
     * <p>Predefined patterns are extracted first. The remaining characters are
     * then grouped into runs of equal {@link CharSetType}; within
     * {@code SYMBOL} and {@code ETC} runs a new token additionally starts
     * whenever the character itself changes, so only repeats of the same
     * character stay together.
     *
     * @param str the text to tokenize
     * @return the tokens ordered by {@link Collections#sort(List)}, or
     *         {@code null} when {@code Util.valid(str)} is false
     */
    public static List<Token> tokenize(String str) {
        if (!Util.valid(str)) {
            return null;
        }

        List<Token> tokens = new ArrayList<Token>();
        // Working copy: find() blanks out every region a pattern has claimed.
        StringBuilder masked = new StringBuilder(str);
        for (TokenPattern tokenPattern : PREDEFINED_TOKEN_PATTERN) {
            tokens.addAll(find(masked, tokenPattern));
        }

        int length = str.length();
        boolean[] claimed = checkFound(length, tokens);

        StringBuilder run = new StringBuilder();
        CharSetType runType = CharSetType.ETC;
        int runStart = 0;
        char previous = 0;
        for (int pos = 0; pos < length; pos++) {
            char ch = masked.charAt(pos);
            // EMOTICON doubles as the "already claimed by a pattern" marker here,
            // whatever type the pattern token itself carries.
            CharSetType type = claimed[pos] ? CharSetType.EMOTICON : classify(ch);

            boolean sameCharOnlyType = type == CharSetType.ETC || type == CharSetType.SYMBOL;
            boolean boundary = pos != 0
                    && (type != runType || (sameCharOnlyType && ch != previous));
            if (boundary) {
                // Claimed runs are already in the list as pattern tokens.
                if (runType != CharSetType.EMOTICON) {
                    tokens.add(new Token(run.toString(), runType, runStart));
                }
                runStart = pos;
                run.setLength(0);
            }

            run.append(ch);
            runType = type;
            previous = ch;
        }

        // Flush the last run, applying the same "skip claimed text" rule as above.
        if (runType != CharSetType.EMOTICON) {
            String last = run.toString();
            if (Util.valid(last)) {
                tokens.add(new Token(last, runType, runStart));
            }
        }

        Collections.sort(tokens);
        return tokens;
    }

    /**
     * Determines the character class of a character that no predefined
     * pattern claimed.
     *
     * @param ch the character to classify
     * @return the matching type; {@code ETC} when no rule applies
     */
    private static CharSetType classify(char ch) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);
        if (block == Character.UnicodeBlock.HANGUL_SYLLABLES
                || block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) {
            return CharSetType.HANGUL;
        }
        if (block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
            return CharSetType.HANMUN;
        }
        if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')) {
            return CharSetType.ENGLISH;
        }
        if (ch >= '0' && ch <= '9') {
            return CharSetType.NUMBER;
        }
        if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
            return CharSetType.SPACE;
        }
        if (block == Character.UnicodeBlock.LETTERLIKE_SYMBOLS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY
                || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || block == Character.UnicodeBlock.BASIC_LATIN) {
            return CharSetType.SYMBOL;
        }
        return CharSetType.ETC;
    }

    /**
     * Collects every match of {@code tokenPattern} in {@code text} and
     * overwrites each matched region with spaces so that later patterns do not
     * see the same characters again.
     *
     * <p>NOTE(review): several emoticon patterns accept a space (for example
     * the {@code [ ]} alternative), so a later pattern can apparently match
     * across a region blanked by an earlier one - confirm whether that is
     * intended.
     *
     * @param text         working copy of the input; modified in place
     * @param tokenPattern the pattern to apply; may be {@code null}
     * @return the tokens found, never {@code null}; empty when
     *         {@code tokenPattern} is {@code null} or nothing matched
     */
    private static List<Token> find(StringBuilder text, TokenPattern tokenPattern) {
        List<Token> found = new ArrayList<Token>();
        if (tokenPattern == null) {
            // An empty list keeps the caller's addAll() safe.
            return found;
        }

        Matcher matcher = tokenPattern.pattern.matcher(text);
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            found.add(new Token(text.substring(start, end), tokenPattern.charSetType, start));
            // Same-length replacement, so all later offsets stay valid.
            for (int pos = start; pos < end; pos++) {
                text.setCharAt(pos, ' ');
            }
        }
        return found;
    }

    /**
     * Builds a per-character flag array marking the positions covered by the
     * given tokens.
     *
     * @param i    length of the tokenized text
     * @param list tokens whose {@code index} and {@code string} length define
     *             the covered ranges; may be {@code null}
     * @return an array of length {@code i}, {@code true} at covered positions
     */
    private static boolean[] checkFound(int i, List<Token> list) {
        // A new boolean[] is already all false.
        boolean[] covered = new boolean[i];
        if (list == null) {
            return covered;
        }
        for (Token token : list) {
            int end = token.index + token.string.length();
            for (int pos = token.index; pos < end; pos++) {
                covered[pos] = true;
            }
        }
        return covered;
    }
}
