package ivory.core.tokenize;

import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;
import ivory.core.Constants;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.tartarus.snowball.SnowballStemmer;

/* loaded from: input_file:ivory/core/tokenize/OpenNLPTokenizer.class */
/**
 * {@link Tokenizer} implementation that splits text with an OpenNLP {@code TokenizerME} model and
 * optionally applies Snowball stemming, stopword removal and vocabulary filtering.
 *
 * <p>An instance must be configured (see {@link #configure(Configuration, FileSystem)}) before
 * any of the tokenizing methods are called; until then no tokenizer model is loaded.
 */
public class OpenNLPTokenizer extends Tokenizer {
    private static final Logger sLogger = Logger.getLogger(OpenNLPTokenizer.class);

    /** Language codes; each one is also the index of its stemmer in {@link #STEMMER_CLASSES}. */
    private static final int ENGLISH = 0;
    private static final int FRENCH = 1;
    private static final int GERMAN = 2;

    /** Fully-qualified Snowball stemmer class names, indexed by language code. */
    private static final String[] STEMMER_CLASSES = {
        "org.tartarus.snowball.ext.englishStemmer",
        "org.tartarus.snowball.ext.frenchStemmer",
        "org.tartarus.snowball.ext.germanStemmer"
    };

    static {
        sLogger.setLevel(Level.INFO);
    }

    /** OpenNLP tokenizer built from the model file; set by {@link #setTokenizer}. */
    private opennlp.tools.tokenize.Tokenizer tokenizer;

    /** Snowball stemmer for the configured language, or {@code null} when none is loaded. */
    private SnowballStemmer stemmer;

    /** Current language code; the field default (0) corresponds to {@link #ENGLISH}. */
    private int lang = ENGLISH;

    /**
     * Configures this tokenizer using the file system obtained from {@code configuration}.
     *
     * @param configuration job configuration holding the tokenizer settings
     * @throws RuntimeException if the file system cannot be obtained
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration) {
        FileSystem fileSystem;
        try {
            fileSystem = FileSystem.get(configuration);
        } catch (IOException e) {
            sLogger.error("Unable to obtain file system for tokenizer configuration", e);
            throw new RuntimeException(e);
        }
        configure(configuration, fileSystem);
    }

    /**
     * Loads the tokenizer model, language/stemmer, stopword lists and (if available) the
     * collection vocabulary named in {@code configuration}.
     *
     * <p>Stemming is enabled unless {@code Constants.Stemming} is explicitly set to false.
     * Stopword removal is enabled only when the loaded stopword list is non-empty.
     *
     * @param configuration job configuration holding the tokenizer settings
     * @param fileSystem file system from which the model, stopwords and vocabulary are read
     * @throws RuntimeException if the tokenizer model cannot be loaded
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration, FileSystem fileSystem) {
        setTokenizer(fileSystem, new Path(configuration.get(Constants.TokenizerData)));

        String language = configuration.get(Constants.Language);
        if (configuration.getBoolean(Constants.Stemming, true)) {
            setLanguageAndStemmer(language);
            this.isStemming = true;
        } else {
            setLanguage(language);
        }

        this.stopwords = readInput(fileSystem, configuration.get(Constants.StopwordList));
        this.stemmedStopwords = readInput(fileSystem, configuration.get(Constants.StemmedStopwordList));

        try {
            Path vocabPath = new Path(configuration.get(Constants.CollectionVocab));
            setVocab((VocabularyWritable) HadoopAlign.loadVocab(vocabPath, fileSystem));
        } catch (Exception e) {
            // Best effort: the vocabulary is optional, so any failure to load it (including a
            // missing configuration entry) only disables vocabulary filtering.
            sLogger.warn("No vocabulary provided to tokenizer.");
        }

        this.isStopwordRemoval = !this.stopwords.isEmpty();
        sLogger.info("Stemmer: " + this.stemmer + "\nStopword removal is " + this.isStopwordRemoval + "; number of stopwords: " + this.stopwords.size() + "; stemmed: " + this.stemmedStopwords.size());
    }

    /**
     * Loads the OpenNLP tokenizer model stored at {@code path}.
     *
     * @param fileSystem file system that contains the model file
     * @param path location of the serialized tokenizer model
     * @throws RuntimeException if the model cannot be opened or parsed; the underlying
     *     {@link IOException} is attached as the cause
     */
    public void setTokenizer(FileSystem fileSystem, Path path) {
        InputStream modelStream = null;
        try {
            modelStream = fileSystem.open(path);
            this.tokenizer = new TokenizerME(new TokenizerModel(modelStream));
        } catch (IOException e) {
            throw new RuntimeException("OpenNLPTokenizer model not available at " + path, e);
        } finally {
            // The model is read inside the constructor, so the stream is no longer needed.
            if (modelStream != null) {
                try {
                    modelStream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done: loading has already succeeded or failed.
                }
            }
        }
    }

    /**
     * Selects the language used for language-specific processing and stemmer lookup.
     *
     * <p>A {@code null} or unrecognized value logs a warning and falls back to English.
     *
     * @param str language identifier; matched by prefix against the English, French and German
     *     constants of {@code ivory.sqe.retrieval.Constants}, plus the literal {@code "german"}
     */
    public void setLanguage(String str) {
        if (str != null && str.startsWith(ivory.sqe.retrieval.Constants.English)) {
            this.lang = ENGLISH;
        } else if (str != null && str.startsWith(ivory.sqe.retrieval.Constants.French)) {
            this.lang = FRENCH;
        } else if (str != null
                && (str.equals("german") || str.startsWith(ivory.sqe.retrieval.Constants.German))) {
            this.lang = GERMAN;
        } else {
            sLogger.warn("Language not recognized, setting to English!");
            // Actually apply the fallback announced above, so that a previously configured
            // language does not silently persist.
            this.lang = ENGLISH;
        }
    }

    /**
     * Selects the language (see {@link #setLanguage(String)}) and instantiates the matching
     * Snowball stemmer.
     *
     * <p>If the stemmer class is not on the classpath a warning is logged and stemming becomes
     * a no-op ({@link #stem(String)} returns its input).
     *
     * @param str language identifier
     * @throws RuntimeException if the stemmer class exists but cannot be instantiated
     */
    public void setLanguageAndStemmer(String str) {
        setLanguage(str);
        String stemmerClass = STEMMER_CLASSES[this.lang];
        try {
            this.stemmer = (SnowballStemmer) Class.forName(stemmerClass).newInstance();
        } catch (ClassNotFoundException e) {
            sLogger.warn("Stemmer class not recognized!\n" + stemmerClass);
            this.stemmer = null;
        } catch (Exception e) {
            sLogger.error("Unable to instantiate stemmer " + stemmerClass, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Tokenizes {@code str} and returns the lower-cased, stemmed tokens that survive stopword
     * removal and vocabulary filtering.
     *
     * <p>For French, a space is inserted after every apostrophe before tokenizing so that the
     * text following an elision becomes a separate token.
     *
     * @param str raw text
     * @return the processed tokens; when no token survives, the result is a one-element array
     *     containing the empty string (the outcome of splitting an empty string)
     */
    @Override // ivory.core.tokenize.Tokenizer
    public String[] processContent(String str) {
        String text = preNormalize(str);
        if (this.lang == FRENCH) {
            text = text.replaceAll("'", "' ");
        }

        StringBuilder kept = new StringBuilder();
        for (String token : tokenizeAndNormalize(text)) {
            String lowerCase = token.toLowerCase();
            if (isStopwordRemoval() && isDiscard(false, lowerCase)) {
                continue;
            }
            String stemmed = stem(lowerCase);
            // Drop tokens the vocabulary does not map to a positive value.
            if (this.vocab != null && this.vocab.get(stemmed) <= 0) {
                continue;
            }
            kept.append(stemmed).append(' ');
        }
        return kept.toString().trim().split(" ");
    }

    /**
     * Returns the number of tokens the OpenNLP tokenizer produces for {@code str}, without any
     * normalization, stemming or filtering.
     *
     * @param str raw text
     * @return raw token count
     */
    @Override // ivory.core.tokenize.Tokenizer
    public int getNumberTokens(String str) {
        return this.tokenizer.tokenize(str).length;
    }

    /**
     * Stems a single token with the configured Snowball stemmer.
     *
     * @param str token to stem
     * @return the stemmed token, or {@code str} unchanged when no stemmer is loaded
     */
    @Override // ivory.core.tokenize.Tokenizer
    public String stem(String str) {
        if (this.stemmer == null) {
            return str;
        }
        this.stemmer.setCurrent(str);
        this.stemmer.stem();
        return this.stemmer.getCurrent();
    }

    /**
     * Computes the fraction of processed tokens of {@code str} that are out of vocabulary with
     * respect to {@code vocabularyWritable}.
     *
     * <p>Tokens are normalized, lower-cased, stopword-filtered and stemmed as in
     * {@link #processContent(String)}; a token counts as out of vocabulary when the vocabulary
     * does not map its stem to a positive value.
     *
     * @param str raw text
     * @param vocabularyWritable vocabulary to check against; when {@code null} no token is
     *     counted as out of vocabulary
     * @return out-of-vocabulary tokens divided by all counted tokens, or {@code 0} when no token
     *     is counted
     */
    @Override // ivory.core.tokenize.Tokenizer
    public float getOOVRate(String str, VocabularyWritable vocabularyWritable) {
        int oovCount = 0;
        int totalCount = 0;
        // NOTE(review): unlike processContent, the French apostrophe rewrite is not applied
        // here - confirm whether that difference is intentional.
        for (String token : tokenizeAndNormalize(preNormalize(str))) {
            String lowerCase = token.toLowerCase();
            if (isStopwordRemoval() && isDiscard(false, lowerCase)) {
                continue;
            }
            String stemmed = stem(lowerCase);
            if (vocabularyWritable != null && vocabularyWritable.get(stemmed) <= 0) {
                oovCount++;
            }
            totalCount++;
        }
        if (totalCount == 0) {
            // Avoid dividing by zero when every token was discarded.
            return 0.0f;
        }
        // Cast before dividing: int / int would truncate the rate to 0 or 1.
        return (float) oovCount / totalCount;
    }

    /**
     * Runs the OpenNLP tokenizer on already pre-normalized text, re-joins the tokens with single
     * spaces, applies post-normalization and splits the result on spaces again.
     */
    private String[] tokenizeAndNormalize(String preNormalized) {
        StringBuilder joined = new StringBuilder();
        for (String token : this.tokenizer.tokenize(preNormalized)) {
            joined.append(token).append(' ');
        }
        return postNormalize(joined.toString().trim()).split(" ");
    }
}
