package eus.ixa.ixa.pipe.tok;

import eus.ixa.ixa.pipe.seg.RuleBasedSegmenter;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:eus/ixa/ixa/pipe/tok/RuleBasedTokenizer.class */
/**
 * Rule-based tokenizer: every sentence is rewritten by an ordered cascade of
 * regular-expression substitutions that put single spaces around token
 * boundaries, and the result is split on spaces. Character offsets of the
 * tokens are recovered by searching each token in the text built by
 * {@link RuleBasedSegmenter#buildText(String)}.
 *
 * <p>NOTE(review): the pattern fields are public and non-final; they are left
 * that way so the visible interface does not change, but nothing in this class
 * reassigns them.
 */
public class RuleBasedTokenizer implements Tokenizer {

    /** Alternation of the dotted suffixes (TLDs and a few file extensions) that may end a link. */
    public static final String TLD = "\\.asp|\\.at|\\.au|\\.az|\\.be|\\.biz|\\.cat|\\.ch|\\.com|\\.cym|\\.cz|\\.de|\\.dk|\\.edu|\\.es|\\.eu|\\.eus|\\.fr|\\.gal|\\.gov|\\.hk|\\.hu|\\.ie|\\.il|\\.info|\\.htm|\\.html|\\.it|\\.jp|\\.pl|\\.pt|\\.net|\\.nl|\\.org|\\.ru|\\.se|\\.sg|\\.sv|\\.uk|\\.zw";

    /** Character class with every apostrophe-like character handled by the contraction rules. */
    private static final String APOSTROPHES = "['\u0091\u0092’‚‛›‘‹']";

    /** Placeholder that stands for a run of dots while the other rules are applied. */
    private static final String MULTIDOT_MARK = "DOTMULTI";

    /** Placeholder produced each time one more dot of a run is folded into the marker. */
    private static final String MULTIDOT_FOLDED_MARK = "DOTDOTMULTI";

    private final TokenFactory tokenFactory;
    private final NonPeriodBreaker nonBreaker;
    private final String lang;
    private final String originalText;

    /** When true, tokens consisting of the replacement character are kept instead of dropped. */
    private boolean unTokenizable;

    /** The replacement character, used to recognise tokens that may be dropped. */
    public static Pattern replacement = Pattern.compile("�", Pattern.UNICODE_CHARACTER_CLASS);
    /** One or more characters of the space class. */
    public static Pattern doubleSpaces = Pattern.compile("[\\  ]+");
    /** Control characters in the range 0x00-0x19. */
    public static Pattern asciiHex = Pattern.compile("[\\x00-\\x19]");
    /** Any character that is neither alphanumeric, whitespace, nor one of the listed punctuation marks. */
    public static Pattern specials = Pattern.compile("([^\\p{Alnum}\\p{Space}\\.—舒–\\-\\¿\\?\\¡\\!'`,/'\u0091\u0092’‚‛›‘‹])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Runs of question and exclamation marks (opening and closing). */
    public static Pattern qexc = Pattern.compile("([\\¿\\?\\¡\\!]+)");
    /** Dashes or slashes with spaces on at least one side. */
    public static Pattern spaceDashSpace = Pattern.compile("([\\ ]+[—舒–\\-/]+|[—舒–\\-/]+[\\ ]+)");
    /** A dot followed by one or more dots. */
    public static Pattern multiDots = Pattern.compile("\\.([\\.]+)");
    /** The multi-dot marker followed by a dot. */
    public static Pattern dotmultiDot = Pattern.compile("DOTMULTI\\.");
    /** The multi-dot marker followed by a dot and any non-dot character. */
    public static Pattern dotmultiDotAny = Pattern.compile("DOTMULTI\\.([^\\.])");
    /** A comma preceded by a non-digit. */
    public static Pattern noDigitComma = Pattern.compile("([^\\p{Digit}])(,)", Pattern.UNICODE_CHARACTER_CLASS);
    /** A comma followed by a non-digit. */
    public static Pattern commaNoDigit = Pattern.compile("(,)([^\\p{Digit}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** A comma between a digit and a non-digit. */
    public static Pattern digitCommaNoDigit = Pattern.compile("([\\p{Digit}])(,)([^\\p{Digit}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** A comma between a non-digit and a digit. */
    public static Pattern noDigitCommaDigit = Pattern.compile("([^\\p{Digit}])(,)(\\p{Digit})", Pattern.UNICODE_CHARACTER_CLASS);
    /** An http/ftp link that earlier rules have broken apart with whitespace, up to one of the TLD suffixes. */
    public static Pattern wrongLink = Pattern.compile("((http|ftp)\\s:\\s//\\s*[\\s\\p{Alpha}\\p{Digit}+&@#/%?=~_|!:,.;-]+(" + TLD + "))", Pattern.UNICODE_CHARACTER_CLASS);
    /** The scheme part of a link whose colon and slashes have been separated by whitespace. */
    public static Pattern beginLink = Pattern.compile("(http|ftp)(\\s:\\s)(/\\s*/\\s*)");
    /** A TLD suffix separated by whitespace from a following slash. */
    public static Pattern endLink = Pattern.compile("(" + TLD + ")\\s+(/)");
    /** An apostrophe between two non-alphabetic characters. */
    public static Pattern noAlphaAposNoAlpha = Pattern.compile("([^\\p{Alpha}])(" + APOSTROPHES + ")([^\\p{Alpha}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** An apostrophe preceded by a non-alphabetic, non-digit character and followed by a letter. */
    public static Pattern noAlphaDigitAposAlpha = Pattern.compile("([^\\p{Alpha}\\d])(" + APOSTROPHES + ")(\\p{Alpha})", Pattern.UNICODE_CHARACTER_CLASS);
    /** An apostrophe preceded by a letter and followed by a non-letter. */
    public static Pattern alphaAposNonAlpha = Pattern.compile("(\\p{Alpha})(" + APOSTROPHES + ")([^\\p{Alpha}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** An apostrophe between two letters. */
    public static Pattern AlphaAposAlpha = Pattern.compile("(\\p{Alpha})(" + APOSTROPHES + ")(\\p{Alpha})", Pattern.UNICODE_CHARACTER_CLASS);
    /** A letter, an apostrophe and one of the suffixes m, s, d (either case), re, ve, ll. */
    public static Pattern englishApos = Pattern.compile("(\\p{Alpha})(" + APOSTROPHES + ")([msdMSD]|re|ve|ll)", Pattern.UNICODE_CHARACTER_CLASS);
    /** A digit, an apostrophe and an s. */
    public static Pattern yearApos = Pattern.compile("([\\p{Digit}])(" + APOSTROPHES + ")([s])", Pattern.UNICODE_CHARACTER_CLASS);
    /** An apostrophe at the very end of the input, preceded by a non-letter. */
    public static Pattern endOfSentenceApos = Pattern.compile("([^\\p{Alpha}])(" + APOSTROPHES + ")$");
    /** Two paragraph marks separated only by spaces. */
    public static Pattern detokenParagraphs = Pattern.compile("(¶)[\\ ]*(¶)", Pattern.UNICODE_CHARACTER_CLASS);

    /** Compile-time switch for the diagnostic output written to standard error. */
    private static final boolean DEBUG = false;

    /**
     * Creates a tokenizer for the given text.
     *
     * @param str the text to tokenize; it is passed through
     *            {@link RuleBasedSegmenter#buildText(String)} and the result is
     *            used as the reference for token offsets
     * @param properties configuration; the {@code language} and
     *            {@code untokenizable} keys are read here, and the object is
     *            also handed to {@link NonPeriodBreaker}
     */
    public RuleBasedTokenizer(String str, Properties properties) {
        this.lang = properties.getProperty("language");
        printUntokenizable(properties);
        this.nonBreaker = new NonPeriodBreaker(properties);
        this.tokenFactory = new TokenFactory();
        this.originalText = RuleBasedSegmenter.buildText(str);
    }

    /**
     * Tokenizes every sentence and assigns each token an offset in the text
     * given to the constructor.
     *
     * <p>The throughput (tokens per second) is reported on standard error.
     *
     * @param strArr the sentences to tokenize, in document order
     * @return one list of tokens per input sentence, after normalization
     */
    @Override // eus.ixa.ixa.pipe.tok.Tokenizer
    public List<List<Token>> tokenize(String[] strArr) {
        final long startTime = System.nanoTime();
        int tokenCount = 0;
        // Position in originalText right after the previously located token.
        int searchFrom = 0;
        final List<List<Token>> sentences = new ArrayList<List<Token>>();
        for (final String sentence : strArr) {
            if (DEBUG) {
                System.err.println("-> Segmented:" + sentence);
            }
            final List<Token> sentenceTokens = new ArrayList<Token>();
            final String[] tokenStrings = getTokens(sentence);
            for (final String tokenString : tokenStrings) {
                int offset = this.originalText.indexOf(tokenString, searchFrom);
                if (offset == -1) {
                    // The rules may have altered the token so that it no longer
                    // occurs verbatim; fall back to the next character position.
                    offset = searchFrom + 1;
                }
                final Token token = this.tokenFactory.createToken(tokenString, offset, tokenString.length());
                addTokens(token, sentenceTokens);
                if (DEBUG) {
                    System.err.println("-> Token:" + tokenString + " curIndex: " + offset + " prev: " + searchFrom);
                }
                searchFrom = offset + token.tokenLength();
            }
            sentences.add(sentenceTokens);
            tokenCount += tokenStrings.length;
        }
        normalizeTokens(sentences, this.lang);
        final double elapsedSeconds = (System.nanoTime() - startTime) / 1.0E9d;
        System.err.printf("ixa-pipe-tok tokenized %d tokens at %.2f tokens per second.%n", tokenCount, tokenCount / elapsedSeconds);
        return sentences;
    }

    /**
     * Applies the whole rule cascade to one sentence and splits the result on
     * single spaces. The order of the steps matters: later rules rely on the
     * spacing introduced by earlier ones.
     *
     * @param str the sentence to tokenize
     * @return the token strings; may contain an empty string for empty input
     */
    private String[] getTokens(String str) {
        // Whitespace and control-character clean-up.
        String line = str.trim();
        line = doubleSpaces.matcher(line).replaceAll(" ");
        line = asciiHex.matcher(line).replaceAll(" ");
        // Isolate punctuation.
        line = qexc.matcher(line).replaceAll(" $1 ");
        line = spaceDashSpace.matcher(line).replaceAll(" $1 ");
        line = specials.matcher(line).replaceAll(" $1 ");
        // Protect runs of dots before commas and periods are handled.
        line = generateMultidots(line);
        // Separate commas unless they sit between two digits.
        line = noDigitComma.matcher(line).replaceAll("$1 $2");
        line = commaNoDigit.matcher(line).replaceAll("$1 $2");
        line = digitCommaNoDigit.matcher(line).replaceAll("$1 $2 $3");
        line = noDigitCommaDigit.matcher(line).replaceAll("$1 $2 $3");
        line = treatContractions(line);
        line = this.nonBreaker.TokenizerNonBreaker(line);
        line = restoreMultidots(line);
        // Glue back links that the rules above have split.
        line = detokenizeURLs(line);
        line = beginLink.matcher(line).replaceAll("$1://");
        line = endLink.matcher(line).replaceAll("$1$2");
        // Final whitespace normalization and paragraph-mark merging.
        line = line.trim();
        line = doubleSpaces.matcher(line).replaceAll(" ");
        line = detokenParagraphs.matcher(line).replaceAll("$1$2");
        if (DEBUG) {
            System.err.println("->Tokens:" + line);
        }
        return line.split(" ");
    }

    /**
     * Replaces every run of two or more dots by a space-delimited placeholder
     * so that the run survives the remaining rules as a single token. Each dot
     * after the first is folded into the placeholder as an extra "DOT" prefix.
     *
     * @param str the text to protect
     * @return the text with dot runs replaced by placeholders
     */
    private String generateMultidots(String str) {
        String line = multiDots.matcher(str).replaceAll(" DOTMULTI$1 ");
        final Matcher pendingDot = dotmultiDot.matcher(line);
        while (pendingDot.find()) {
            line = pendingDot.replaceAll(MULTIDOT_FOLDED_MARK);
            // Re-target the matcher at the rewritten text, otherwise the loop
            // would stop after a single pass.
            pendingDot.reset(line);
        }
        return line;
    }

    /**
     * Undoes {@link #generateMultidots(String)}: unfolds the placeholders back
     * into the original runs of dots.
     *
     * @param str text containing multi-dot placeholders
     * @return the text with the placeholders turned back into dots
     */
    private String restoreMultidots(String str) {
        // One replacement pass can expose a new folded marker (one per folded
        // dot), hence the loop.
        while (str.contains(MULTIDOT_FOLDED_MARK)) {
            str = str.replace(MULTIDOT_FOLDED_MARK, MULTIDOT_MARK + ".");
        }
        return str.replace(MULTIDOT_MARK, ".");
    }

    /**
     * Inserts spaces around apostrophes according to their context. The rules
     * are applied in a fixed order; each works on the output of the previous.
     *
     * @param str the text to process
     * @return the text with apostrophes separated or attached
     */
    private String treatContractions(String str) {
        String line = noAlphaAposNoAlpha.matcher(str).replaceAll("$1 $2 $3");
        line = noAlphaDigitAposAlpha.matcher(line).replaceAll("$1 $2 $3");
        line = alphaAposNonAlpha.matcher(line).replaceAll("$1 $2 $3");
        // Keep the apostrophe with the suffix: letter + 's, 're, ...
        line = englishApos.matcher(line).replaceAll("$1 $2$3");
        line = yearApos.matcher(line).replaceAll("$1 $2$3");
        // Remaining letter-apostrophe-letter cases: apostrophe stays on the left.
        line = AlphaAposAlpha.matcher(line).replaceAll("$1$2 $3");
        return endOfSentenceApos.matcher(line).replaceAll("$1 $2");
    }

    /**
     * Removes all whitespace inside every match of {@link #wrongLink}.
     *
     * @param str the text possibly containing split links
     * @return the text with those links glued back together
     */
    private String detokenizeURLs(String str) {
        final Matcher link = wrongLink.matcher(str);
        final StringBuffer rebuilt = new StringBuffer();
        while (link.find()) {
            final String compactLink = link.group().replaceAll("\\s", "");
            // Quote the text so it is inserted literally and never read as a
            // group reference or escape.
            link.appendReplacement(rebuilt, Matcher.quoteReplacement(compactLink));
        }
        link.appendTail(rebuilt);
        return rebuilt.toString();
    }

    /**
     * Reads the {@code untokenizable} property and stores whether it equals
     * {@code yes} (case-insensitively). A missing property counts as "no".
     *
     * @param properties the configuration to read
     */
    private void printUntokenizable(Properties properties) {
        // Constant-first comparison: getProperty returns null for a missing key.
        this.unTokenizable = "yes".equalsIgnoreCase(properties.getProperty("untokenizable"));
    }

    /**
     * Appends the token to the list unless it is empty or, when untokenizable
     * tokens are not requested, its value matches {@link #replacement}.
     *
     * @param token the candidate token
     * @param list the sentence token list to append to
     */
    private void addTokens(Token token, List<Token> list) {
        if (token.tokenLength() == 0) {
            return;
        }
        if (this.unTokenizable || !replacement.matcher(token.getTokenValue()).matches()) {
            list.add(token);
        }
    }

    /**
     * Runs the {@link Normalizer} conversions on every sentence, in this
     * order: non-canonical strings, quotes, double quotes.
     *
     * @param list the tokenized sentences, passed to the normalizer as-is
     * @param str the language code passed to each normalizer call
     */
    public static void normalizeTokens(List<List<Token>> list, String str) {
        for (final List<Token> sentence : list) {
            Normalizer.convertNonCanonicalStrings(sentence, str);
            Normalizer.normalizeQuotes(sentence, str);
            Normalizer.normalizeDoubleQuotes(sentence, str);
        }
    }
}
