package ivory.core.tokenize;

import edu.umd.hooka.VocabularyWritable;
import ivory.bloomir.util.OptionManager;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/* loaded from: input_file:ivory/core/tokenize/Tokenizer.class */
/**
 * Base class for language-specific tokenizers.
 *
 * <p>Subclasses implement {@link #configure(Configuration)},
 * {@link #configure(Configuration, FileSystem)} and {@link #processContent(String)}.
 * This class supplies shared helpers: stopword handling, token length filtering,
 * punctuation normalization, and a small command-line driver ({@link #main(String[])})
 * that tokenizes a text file line by line.
 */
public abstract class Tokenizer {
    private static final Logger LOG = Logger.getLogger(Tokenizer.class);

    /** Code point of the right single quotation mark, U+2019. */
    private static final char RIGHT_SINGLE_QUOTATION_MARK = 0x2019;

    /** Characters treated as delimiters; any token found inside this string is a stop word. */
    protected static String delims = "`~!@#^&*()-_=+]}[{\\|'\";:/?.>,<";

    /** Tokens shorter than this are discarded by {@link #isDiscard(String)}. */
    protected static int MIN_LENGTH = 2;

    /** Tokens longer than this are discarded by {@link #isDiscard(String)}. */
    protected static int MAX_LENGTH = 50;

    protected VocabularyWritable vocab;
    protected boolean isStopwordRemoval = false;
    protected boolean isStemming = false;

    /** Stopwords in surface (unstemmed) form; consulted when stemming is off. */
    protected Set<String> stopwords;

    /** Stopwords in stemmed form; consulted when stemming is on. */
    protected Set<String> stemmedStopwords;

    static {
        LOG.setLevel(Level.INFO);
    }

    public abstract void configure(Configuration configuration);

    public abstract void configure(Configuration configuration, FileSystem fileSystem);

    /**
     * Tokenizes the given text.
     *
     * @param text raw input text
     * @return the tokens produced for {@code text}
     */
    public abstract String[] processContent(String text);

    /** @return the value of the stemming flag */
    public boolean isStemming() {
        return this.isStemming;
    }

    /** @return the value of the stopword-removal flag */
    public boolean isStopwordRemoval() {
        return this.isStopwordRemoval;
    }

    public void setVocab(VocabularyWritable vocabularyWritable) {
        this.vocab = vocabularyWritable;
    }

    public VocabularyWritable getVocab() {
        return this.vocab;
    }

    /**
     * Reads a UTF-8 text file into a set, one entry per line.
     *
     * @param fileSystem file system used to open {@code file}
     * @param file       path of the file to read; {@code null} yields an empty set
     * @return the distinct lines of the file (empty when {@code file} is {@code null})
     * @throws RuntimeException if the file cannot be opened or read; the underlying
     *                          exception is attached as the cause
     */
    public Set<String> readInput(FileSystem fileSystem, String file) {
        Set<String> lines = new HashSet<String>();
        if (file == null) {
            return lines;
        }
        BufferedReader reader = null;
        try {
            Path path = new Path(file);
            LOG.info("File " + file + " exists? " + fileSystem.exists(path) + ", fs: " + fileSystem);
            reader = new BufferedReader(new InputStreamReader(fileSystem.open(path), "UTF8"));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
            return lines;
        } catch (Exception e) {
            LOG.warn("Problem reading stopwords from " + file, e);
            // Keep the cause so callers can see why the read failed.
            throw new RuntimeException("Problem reading stopwords from " + file, e);
        } finally {
            // Release the stream on both the success and the failure path.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    LOG.warn("Could not close " + file, ignored);
                }
            }
        }
    }

    /**
     * @param text raw input text
     * @return the number of tokens {@link #processContent(String)} produces for {@code text}
     */
    public int getNumberTokens(String text) {
        return processContent(text).length;
    }

    /**
     * Computes the fraction of tokens of {@code text} that are out of vocabulary, i.e.
     * for which {@code vocabularyWritable.get(token) <= 0}.
     *
     * @param text               raw input text, tokenized with {@link #processContent(String)}
     * @param vocabularyWritable vocabulary to look tokens up in; when {@code null} no
     *                           token is counted as out of vocabulary
     * @return the out-of-vocabulary rate in {@code [0, 1]}; {@code 0} when the text
     *         yields no tokens
     */
    public float getOOVRate(String text, VocabularyWritable vocabularyWritable) {
        String[] tokens = processContent(text);
        if (tokens.length == 0) {
            // Avoid dividing by zero: no tokens means no out-of-vocabulary tokens.
            return 0.0f;
        }
        int oovCount = 0;
        if (vocabularyWritable != null) {
            for (String token : tokens) {
                if (vocabularyWritable.get(token) <= 0) {
                    oovCount++;
                }
            }
        }
        // Divide as float; integer division would truncate the rate to 0 or 1.
        return oovCount / (float) tokens.length;
    }

    /**
     * Copies {@code text}, dropping every char for which
     * {@link Character#getNumericValue(char)} is below {@code -1}.
     *
     * @param text input string
     * @return the filtered copy of {@code text}
     */
    public static String removeNonUnicodeChars(String text) {
        StringBuilder kept = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (Character.getNumericValue(c) >= -1) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Replaces every right single quotation mark (U+2019) with an ASCII apostrophe
     * followed by a space; all other characters are copied unchanged.
     *
     * @param text input string
     * @return the normalized copy of {@code text}
     */
    public static String normalizeFrench(String text) {
        StringBuilder normalized = new StringBuilder(text.length() + 8);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == RIGHT_SINGLE_QUOTATION_MARK) {
                normalized.append("' ");
            } else {
                normalized.append(c);
            }
        }
        return normalized.toString();
    }

    /**
     * Maps a fixed set of typographic quote characters (and the backtick) to the ASCII
     * apostrophe or ASCII double quote. Every mapping is one character to one character.
     *
     * @param text input string
     * @return {@code text} with the quote variants replaced
     */
    public static String preNormalize(String text) {
        // Literal single-character replacements; no regular expression is needed.
        return text
                .replace('‘', '\'')
                .replace('\u2060', '\'')
                .replace('“', '"')
                .replace('”', '"')
                .replace('‛', '\'')
                .replace('‟', '"')
                .replace('„', '"')
                .replace('´', '\'')
                .replace('〟', '"')
                .replace('’', '\'')
                .replace('`', '\'');
    }

    /**
     * Applies a fixed sequence of regular-expression rewrites that put spaces around
     * parentheses, double quotes and hyphens, and re-attach {@code 's} and {@code l'}.
     * The rewrites are applied in order, each on the result of the previous one.
     *
     * @param text input string
     * @return the rewritten string
     */
    public static String postNormalize(String text) {
        return text
                .replaceAll("\\((\\S)", "( $1")
                .replaceAll("(\\S)\\)", "$1 )")
                .replaceAll("''(\\S)", "'' $1")
                .replaceAll("–", "-")
                .replaceAll("‑", "-")
                .replaceAll("(\\S)-(\\S)", "$1 - $2")
                .replaceAll("—", "——")
                .replaceAll(" ' s ", " 's ")
                .replaceAll(" l ' ", " l' ")
                .replaceAll("\"(\\S)", "\" $1")
                .replaceAll("(\\S)\"", "$1 \"");
    }

    /**
     * Drains a token stream into a single space-separated string.
     *
     * <p>If the stream fails with an {@link IOException}, the failure is logged and the
     * tokens read up to that point are returned.
     *
     * @param tokenStream stream to read; must expose a {@link CharTermAttribute}
     * @return the terms of the stream joined by single spaces, without a trailing space
     */
    public static String streamToString(TokenStream tokenStream) {
        CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.clearAttributes();
        StringBuilder tokenized = new StringBuilder();
        try {
            // incrementToken() is the call that can throw, so the try must wrap the loop.
            while (tokenStream.incrementToken()) {
                tokenized.append(termAttribute.toString()).append(' ');
            }
        } catch (IOException e) {
            LOG.warn("Problem reading token stream; returning the tokens read so far", e);
        }
        return tokenized.toString().trim();
    }

    /**
     * Tests a token against the delimiters and the stopword list selected by
     * {@link #isStemming()}.
     *
     * @param token token to test
     * @return {@code true} if {@code token} is a stop word
     * @see #isStopWord(boolean, String)
     */
    public boolean isStopWord(String token) {
        return isStopWord(isStemming(), token);
    }

    /**
     * Tests a token against the delimiters and one of the two stopword lists.
     *
     * @param isStemmed {@code true} to consult {@link #stemmedStopwords},
     *                  {@code false} to consult {@link #stopwords}
     * @param token     token to test
     * @return {@code true} if {@code token} occurs inside {@link #delims} or is in the
     *         selected list; a list that was never initialized is treated as empty
     */
    public boolean isStopWord(boolean isStemmed, String token) {
        if (delims.contains(token)) {
            return true;
        }
        Set<String> selected = isStemmed ? this.stemmedStopwords : this.stopwords;
        return selected != null && selected.contains(token);
    }

    /**
     * @param token token to test
     * @return {@code true} if {@code token} is shorter than {@link #MIN_LENGTH}, longer
     *         than {@link #MAX_LENGTH}, or a stop word per {@link #isStopWord(String)}
     */
    public boolean isDiscard(String token) {
        return isOutOfLengthRange(token) || isStopWord(token);
    }

    /**
     * @param isStemmed which stopword list to consult, see {@link #isStopWord(boolean, String)}
     * @param token     token to test
     * @return {@code true} if {@code token} is shorter than {@link #MIN_LENGTH}, longer
     *         than {@link #MAX_LENGTH}, or a stop word
     */
    public boolean isDiscard(boolean isStemmed, String token) {
        return isOutOfLengthRange(token) || isStopWord(isStemmed, token);
    }

    /** Returns whether the token length falls outside [MIN_LENGTH, MAX_LENGTH]. */
    private static boolean isOutOfLengthRange(String token) {
        return token.length() < MIN_LENGTH || token.length() > MAX_LENGTH;
    }

    /**
     * Strips leading and trailing stop words from space-separated text.
     *
     * <p>If every token is a stop word, no boundary is found and the whole input is
     * returned (re-joined and trimmed).
     *
     * @param tokenizedText tokens separated by single spaces
     * @return the tokens from the first to the last non-stop-word, joined by spaces
     */
    public String removeBorderStopWords(String tokenizedText) {
        String[] tokens = tokenizedText.split(" ");
        int start = 0;
        int end = tokens.length - 1;

        for (int i = 0; i < tokens.length; i++) {
            if (!isStopWord(tokens[i])) {
                start = i;
                break;
            }
        }
        for (int i = tokens.length - 1; i >= 0; i--) {
            if (!isStopWord(tokens[i])) {
                end = i;
                break;
            }
        }
        return joinWithSpaces(tokens, start, end);
    }

    /**
     * Stems a token. This base implementation returns the token unchanged.
     *
     * @param token token to stem
     * @return {@code token}
     */
    public String stem(String token) {
        return token;
    }

    /**
     * Renders each {@code char} of the input as at least four lowercase hex digits.
     * Despite the method name, the values are the string's {@code char} (UTF-16) values,
     * not UTF-8 bytes.
     *
     * @param text input string
     * @return the hex values separated by single spaces
     */
    public String getUTF8(String text) {
        StringBuilder hex = new StringBuilder(text.length() * 5);
        for (int i = 0; i < text.length(); i++) {
            hex.append(String.format("%04x", (int) text.charAt(i))).append(' ');
        }
        return hex.toString().trim();
    }

    /**
     * Command-line driver: tokenizes the {@code -input} file line by line and writes the
     * space-joined tokens of each line to the {@code -output} file, both as UTF-8.
     *
     * <p>On any failure the error is printed to standard error, followed by the usage
     * message, and the JVM exits with status {@code -1}.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        Options options = new Options();
        addOption(options, "model", "full path to model file or directory", "model file", false);
        addOption(options, "input", "full path to input file", "input file", true);
        addOption(options, OptionManager.OUTPUT_PATH, "full path to output file", "output file", true);
        addOption(options, "lang", "en | zh | de | fr | ar | tr | es", "2-character language code", true);
        addOption(options, "stopword", "path to stopwords list", "one stopword per line", false);
        addOption(options, "stemmed_stopword", "path to stemmed stopwords list",
                "one stemmed stopword per line", false);
        addOption(options, "stem", "true|false", "turn on/off stemming", false);
        addOption(options, "libjars", "jar packages", "Hadoop option to load external jars", false);

        try {
            CommandLine cmdline = new GnuParser().parse(options, args);
            String stopwordFile = cmdline.hasOption("stopword") ? cmdline.getOptionValue("stopword") : null;
            String stemmedStopwordFile =
                    cmdline.hasOption("stemmed_stopword") ? cmdline.getOptionValue("stemmed_stopword") : null;
            String modelFile = cmdline.hasOption("model") ? cmdline.getOptionValue("model") : null;
            // Stemming is on unless explicitly disabled.
            boolean stemming = !cmdline.hasOption("stem") || Boolean.parseBoolean(cmdline.getOptionValue("stem"));

            Tokenizer tokenizer = TokenizerFactory.createTokenizer(
                    cmdline.getOptionValue("lang"), modelFile, stemming, stopwordFile, stemmedStopwordFile, null);
            tokenizeFile(tokenizer, cmdline.getOptionValue("input"),
                    cmdline.getOptionValue(OptionManager.OUTPUT_PATH));
        } catch (Exception e) {
            // Report the actual failure; usage alone hides I/O and tokenizer errors.
            System.err.println("Tokenizer failed: " + e);
            new HelpFormatter().printHelp("Tokenizer", options);
            System.exit(-1);
        }
    }

    /** Registers a single-argument command-line option on {@code options}. */
    private static void addOption(Options options, String name, String argName, String description,
            boolean required) {
        OptionBuilder.withArgName(argName);
        OptionBuilder.hasArg();
        OptionBuilder.withDescription(description);
        if (required) {
            OptionBuilder.isRequired();
        }
        options.addOption(OptionBuilder.create(name));
    }

    /** Tokenizes each line of a UTF-8 input file and writes the joined tokens to the output file. */
    private static void tokenizeFile(Tokenizer tokenizer, String inputPath, String outputPath)
            throws IOException {
        BufferedReader reader = null;
        BufferedWriter writer = null;
        try {
            // Open the input first so a bad input path does not leave an empty output file.
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(inputPath), "UTF8"));
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputPath), "UTF8"));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] tokens = tokenizer.processContent(line);
                writer.write(joinWithSpaces(tokens, 0, tokens.length - 1) + "\n");
            }
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                }
            } finally {
                // Closing the writer flushes it; let a failure here propagate.
                if (writer != null) {
                    writer.close();
                }
            }
        }
    }

    /** Joins {@code tokens[from..to]} (inclusive) with single spaces and trims the result. */
    private static String joinWithSpaces(String[] tokens, int from, int to) {
        StringBuilder joined = new StringBuilder();
        for (int i = from; i <= to; i++) {
            joined.append(tokens[i]).append(' ');
        }
        return joined.toString().trim();
    }
}
