package ivory.core.tokenize;

import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.tartarus.snowball.SnowballStemmer;

/* loaded from: input_file:ivory/core/tokenize/OpenNLPTokenizer.class */
/**
 * {@link Tokenizer} implementation that splits text with an OpenNLP tokenizer model,
 * optionally stems each token with a Snowball stemmer, and optionally filters the
 * result against a {@link VocabularyWritable}.
 *
 * <p>Instances hold mutable tokenizer/stemmer state and perform no synchronization.
 */
public class OpenNLPTokenizer implements Tokenizer {
    // NOTE(review): the logger is keyed on DocumentProcessingUtils rather than on this
    // class, so the level set in the static initializer applies to that logger — confirm
    // whether this is intentional before changing it.
    private static final Logger sLogger = Logger.getLogger(DocumentProcessingUtils.class);

    /** OpenNLP tokenizer; stays {@code null} if the model could not be loaded. */
    opennlp.tools.tokenize.Tokenizer tokenizer;

    /** Snowball stemmer for {@link #lang}; {@code null} means tokens are not stemmed. */
    SnowballStemmer stemmer;

    /** Snowball language name ("english", "french", "german"), or {@code null} if unrecognized. */
    String lang;

    protected static int NUM_PREDS;

    /** Minimum length (inclusive) of a cleaned token kept by {@link #processContent(String)}. */
    protected static int MIN_LENGTH;

    /** Maximum length (inclusive) of a cleaned token kept by {@link #processContent(String)}. */
    protected static int MAX_LENGTH;

    /** Punctuation string; a cleaned token that is a substring of it is discarded. */
    String delims = "`~!@#$%^&*()-_=+]}[{\\|'\";:/?.>,<";

    /** Optional vocabulary filter; {@code null} disables vocabulary filtering. */
    VocabularyWritable vocab;

    /**
     * Configures this tokenizer from job configuration.
     *
     * <p>Reads {@code Ivory.TokenizerModel} (model path), {@code Ivory.Lang} (language code)
     * and {@code Ivory.CollectionVocab} (vocabulary path). A missing or unreadable vocabulary
     * is only logged as a warning; the current vocabulary is left untouched in that case.
     *
     * @param configuration job configuration to read settings from
     * @throws RuntimeException if the file system cannot be obtained
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration) {
        FileSystem fileSystem;
        try {
            fileSystem = FileSystem.get(configuration);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        setTokenizer(fileSystem, new Path(configuration.get("Ivory.TokenizerModel")));
        setLanguageAndStemmer(configuration.get("Ivory.Lang"));

        // An unset vocabulary path is handled like an unreadable one instead of being
        // passed on to the Path constructor.
        String vocabPath = configuration.get("Ivory.CollectionVocab");
        if (vocabPath == null) {
            sLogger.warn("VOCAB IS NULL!");
            return;
        }
        try {
            setVocab((VocabularyWritable) HadoopAlign.loadVocab(new Path(vocabPath), fileSystem));
        } catch (IOException e) {
            sLogger.warn("VOCAB IS NULL!", e);
        }
    }

    /**
     * Loads the OpenNLP tokenizer model found at {@code path}.
     *
     * <p>On an I/O failure the error is logged and the current tokenizer is left
     * unchanged (i.e. {@code null} on a fresh instance). The model stream is always closed.
     *
     * @param fileSystem file system used to open the model
     * @param path       location of the tokenizer model
     */
    public void setTokenizer(FileSystem fileSystem, Path path) {
        InputStream modelStream = null;
        try {
            modelStream = fileSystem.open(path);
            this.tokenizer = new TokenizerME(new TokenizerModel(modelStream));
        } catch (IOException e) {
            sLogger.error("Failed to load tokenizer model from " + path, e);
        } finally {
            if (modelStream != null) {
                try {
                    modelStream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the model stream fails.
                }
            }
        }
    }

    /**
     * Selects the stemming language from a language code and instantiates the matching
     * Snowball stemmer ({@code org.tartarus.snowball.ext.<lang>Stemmer}).
     *
     * <p>Codes starting with "en", "fr" or "de" (or the literal "german") are recognized.
     * For a {@code null} or unrecognized code a warning is logged and both the language
     * and the stemmer are cleared, so a previously configured stemmer is never reused
     * for a different language.
     *
     * @param str language code, e.g. "en", "fr", "de", "german"
     * @throws RuntimeException if the stemmer class exists but cannot be instantiated
     */
    public void setLanguageAndStemmer(String str) {
        String resolved = null;
        if (str != null) {
            if (str.startsWith("en")) {
                resolved = "english";
            } else if (str.startsWith("fr")) {
                resolved = "french";
            } else if (str.equals("german") || str.startsWith("de")) {
                resolved = "german";
            }
        }

        this.lang = resolved;
        if (resolved == null) {
            sLogger.warn("Language not recognized!");
            this.stemmer = null;
            return;
        }

        String stemmerClassName = "org.tartarus.snowball.ext." + resolved + "Stemmer";
        try {
            this.stemmer = (SnowballStemmer) Class.forName(stemmerClassName).newInstance();
        } catch (ClassNotFoundException e) {
            sLogger.warn("Stemmer class not recognized!\n" + stemmerClassName);
            this.stemmer = null;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Sets the vocabulary used to filter tokens in {@link #processContent(String)}.
     *
     * @param vocabularyWritable vocabulary to filter against, or {@code null} for no filtering
     */
    public void setVocab(VocabularyWritable vocabularyWritable) {
        this.vocab = vocabularyWritable;
    }

    /**
     * Tokenizes {@code str} and returns the normalized tokens that pass all filters.
     *
     * <p>Each raw token is cleaned with {@link #removeNonUnicodeChars(String)}, dropped if
     * its cleaned length is outside {@code [MIN_LENGTH, MAX_LENGTH]} or if it is a substring
     * of {@link #delims}, then stemmed and lower-cased when a stemmer is set. If a
     * vocabulary is set, only terms for which {@code vocab.get(term) > 0} are kept.
     *
     * @param str text to tokenize
     * @return surviving tokens in input order; never {@code null}
     */
    @Override // ivory.core.tokenize.Tokenizer
    public String[] processContent(String str) {
        String[] rawTokens = this.tokenizer.tokenize(str);
        List<String> kept = new ArrayList<String>(rawTokens.length);
        for (String rawToken : rawTokens) {
            String cleaned = removeNonUnicodeChars(rawToken);
            int length = cleaned.length();
            if (length < MIN_LENGTH || length > MAX_LENGTH || this.delims.contains(cleaned)) {
                continue;
            }
            String term = stemAndLowercase(cleaned);
            // presumably vocab.get returns a positive id for known terms — TODO confirm
            if (this.vocab == null || this.vocab.get(term) > 0) {
                kept.add(term);
            }
        }
        return kept.toArray(new String[kept.size()]);
    }

    /**
     * Tokenizes {@code str} and returns the cleaned (and, if a stemmer is set, stemmed and
     * lower-cased) tokens as one string, each token followed by a single space.
     *
     * <p>Unlike {@link #processContent(String)}, no length, delimiter or vocabulary
     * filtering is applied; only tokens that end up empty are skipped.
     *
     * @param str text to tokenize
     * @return space-terminated tokens, or the empty string if none remain
     */
    public String processContent2(String str) {
        StringBuilder joined = new StringBuilder();
        for (String rawToken : this.tokenizer.tokenize(str)) {
            String term = stemAndLowercase(removeNonUnicodeChars(rawToken));
            if (!term.isEmpty()) {
                joined.append(term).append(' ');
            }
        }
        return joined.toString();
    }

    /**
     * Stems and lower-cases {@code token} when a stemmer is configured; otherwise returns
     * the token unchanged (in particular, it is not lower-cased without a stemmer).
     */
    private String stemAndLowercase(String token) {
        if (this.stemmer == null) {
            return token;
        }
        this.stemmer.setCurrent(token);
        this.stemmer.stem();
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        return this.stemmer.getCurrent().toLowerCase(Locale.ROOT);
    }

    /**
     * Returns {@code str} with every character removed for which
     * {@link Character#getNumericValue(char)} is negative.
     *
     * <p>NOTE(review): per the JDK contract that method returns -1 for characters without
     * a numeric value, which covers punctuation and whitespace but also accented letters
     * such as 'ä' or 'é'. Left unchanged because altering it would change tokenization
     * output — confirm whether dropping those letters is intended.
     */
    private String removeNonUnicodeChars(String str) {
        StringBuilder kept = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (Character.getNumericValue(c) >= 0) {
                kept.append(c);
            }
        }
        return kept.toString();
    }

    /**
     * Manual smoke test: loads a German model and vocabulary from hard-coded local paths
     * and tokenizes a sample string.
     */
    public static void main(String[] strArr) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
        // Obtain the file system once and let a failure propagate instead of
        // continuing with a null file system.
        FileSystem fileSystem = FileSystem.get(new Configuration());

        OpenNLPTokenizer openNLPTokenizer = new OpenNLPTokenizer();
        openNLPTokenizer.setTokenizer(fileSystem, new Path("/Users/ferhanture/edu/research/programs/opennlp-tools-1.4.3/models/GermanTok.bin"));
        openNLPTokenizer.setLanguageAndStemmer("german");
        openNLPTokenizer.setVocab((VocabularyWritable) HadoopAlign.loadVocab(new Path("/Users/ferhanture/edu/research/data/de-en/eu-nc-wmt08/berkeleyaligner.vocab.ger"), fileSystem));
        openNLPTokenizer.processContent("965 v. Chr.\n#redirect [[10. Jahrhundert v. Chr.]]");
        System.out.println();
    }

    static {
        sLogger.setLevel(Level.WARN);
        MIN_LENGTH = 2;
        MAX_LENGTH = 50;
    }
}
