package ivory.core.tokenize;

import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;
import ivory.core.Constants;
import java.io.IOException;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.ar.ArabicStemFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/* loaded from: input_file:ivory/core/tokenize/LuceneArabicAnalyzer.class */
/**
 * Arabic tokenizer built on Lucene 3.5 components.
 *
 * <p>Text is split with a {@link StandardTokenizer} and lower-cased; stopwords are
 * optionally dropped, and the surviving tokens are optionally normalized and stemmed
 * with {@link ArabicNormalizationFilter} and {@link ArabicStemFilter}.
 *
 * <p>Instances keep the most recently created Lucene tokenizer in a field, so a single
 * instance must not be shared between threads.
 */
public class LuceneArabicAnalyzer extends Tokenizer {
    private static final Logger LOG = Logger.getLogger(LuceneArabicAnalyzer.class);

    static {
        LOG.setLevel(Level.WARN);
    }

    /** Lucene tokenizer created by the most recent call that processed text. */
    private org.apache.lucene.analysis.Tokenizer tokenizer;

    /**
     * Configures this tokenizer without a file system handle.
     *
     * @param configuration job configuration holding the stopword, stemming and vocabulary settings
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration) {
        configure(configuration, null);
    }

    /**
     * Configures stopword lists, the stemming flag and (optionally) the collection vocabulary.
     *
     * <p>Stopword removal is enabled only when the loaded stopword list is non-empty.
     * Stemming defaults to {@code true} when the configuration does not set it.
     * Failing to load a vocabulary is not an error: the tokenizer simply runs without one.
     *
     * @param configuration job configuration to read settings from
     * @param fileSystem file system used to read the stopword lists and vocabulary; may be {@code null}
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration, FileSystem fileSystem) {
        this.stopwords = readInput(fileSystem, configuration.get(Constants.StopwordList));
        this.stemmedStopwords = readInput(fileSystem, configuration.get(Constants.StemmedStopwordList));
        this.isStopwordRemoval = !this.stopwords.isEmpty();
        this.isStemming = configuration.getBoolean(Constants.Stemming, true);
        try {
            Path vocabPath = new Path(configuration.get(Constants.CollectionVocab));
            setVocab((VocabularyWritable) HadoopAlign.loadVocab(vocabPath, fileSystem));
        } catch (Exception e) {
            // Deliberately best-effort: the vocabulary is optional.
            LOG.warn("No vocabulary provided to tokenizer.");
        }
        LOG.warn("Stemming is " + this.isStemming + "; Stopword removal is " + this.isStopwordRemoval + "; number of stopwords: " + this.stopwords.size() + "; stemmed: " + this.stemmedStopwords.size());
    }

    /**
     * Tokenizes {@code str}, removes stopwords if enabled, and stems the result if enabled.
     *
     * @param str raw text to process
     * @return the processed tokens, obtained by splitting the final token string on single spaces
     */
    @Override // ivory.core.tokenize.Tokenizer
    public String[] processContent(String str) {
        this.tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(preNormalize(str)));
        String normalized = postNormalize(streamToString(new LowerCaseFilter(Version.LUCENE_35, this.tokenizer)));
        StringBuilder kept = new StringBuilder();
        for (String token : normalized.split(" ")) {
            if (!isStopwordRemoval() || !isDiscard(false, token)) {
                kept.append(token).append(' ');
            }
        }
        String result = kept.toString().trim();
        if (isStemming()) {
            result = stem(result);
        }
        return result.split(" ");
    }

    /**
     * Normalizes and stems every token of {@code str}.
     *
     * <p>When a vocabulary has been set, only stems whose vocabulary lookup returns a
     * positive value are kept. If the token stream fails with an {@link IOException}, the
     * error is logged and the stems collected so far are returned.
     *
     * @param str space-separated text to stem
     * @return the kept stems joined by single spaces, without leading or trailing whitespace
     */
    @Override // ivory.core.tokenize.Tokenizer
    public String stem(String str) {
        ArabicStemFilter stemFilter = newStemFilter(str);
        CharTermAttribute term = stemFilter.getAttribute(CharTermAttribute.class);
        stemFilter.clearAttributes();
        StringBuilder stems = new StringBuilder();
        // incrementToken() is the call that throws IOException, so the try must wrap the loop itself.
        try {
            while (stemFilter.incrementToken()) {
                String stemmed = term.toString();
                if (this.vocab == null || this.vocab.get(stemmed) > 0) {
                    stems.append(stemmed).append(' ');
                }
            }
        } catch (IOException e) {
            LOG.warn("I/O error while stemming; returning the stems read so far.", e);
        }
        return stems.toString().trim();
    }

    /**
     * Computes the fraction of tokens in {@code str} that are out of vocabulary.
     *
     * <p>Tokens are lower-cased and filtered for stopwords (if enabled). With stemming
     * enabled the surviving tokens are stemmed before being looked up; otherwise the
     * surface tokens are looked up directly. A token counts as out of vocabulary when
     * {@code vocabularyWritable} is non-null and its lookup returns a non-positive value.
     *
     * @param str text to analyze
     * @param vocabularyWritable vocabulary to check tokens against; when {@code null} no token is counted as OOV
     * @return OOV tokens divided by total tokens, or {@code 0} when no tokens were counted
     */
    @Override // ivory.core.tokenize.Tokenizer
    public float getOOVRate(String str, VocabularyWritable vocabularyWritable) {
        int oovCount = 0;
        int tokenCount = 0;
        this.tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(str));
        String normalized = postNormalize(streamToString(new LowerCaseFilter(Version.LUCENE_35, new StandardFilter(Version.LUCENE_35, this.tokenizer))));
        StringBuilder toStem = new StringBuilder();
        for (String token : normalized.split(" ")) {
            if (isStopwordRemoval() && isDiscard(false, token)) {
                continue;
            }
            if (isStemming()) {
                // Counting is deferred until after stemming.
                toStem.append(token).append(' ');
            } else {
                if (vocabularyWritable != null && vocabularyWritable.get(token) <= 0) {
                    oovCount++;
                }
                tokenCount++;
            }
        }
        if (isStemming()) {
            ArabicStemFilter stemFilter = newStemFilter(toStem.toString().trim());
            CharTermAttribute term = stemFilter.getAttribute(CharTermAttribute.class);
            stemFilter.clearAttributes();
            try {
                while (stemFilter.incrementToken()) {
                    String stemmed = term.toString();
                    if (vocabularyWritable != null && vocabularyWritable.get(stemmed) <= 0) {
                        oovCount++;
                    }
                    tokenCount++;
                }
            } catch (IOException e) {
                LOG.warn("I/O error while computing OOV rate; using the tokens read so far.", e);
            }
        }
        if (tokenCount == 0) {
            // Avoid dividing by zero for empty or fully filtered input.
            return 0.0f;
        }
        // Cast before dividing: int / int would truncate the rate to 0 or 1.
        return (float) oovCount / tokenCount;
    }

    /**
     * Builds the normalization + stemming chain over {@code text} and records the underlying
     * tokenizer in {@link #tokenizer}.
     */
    private ArabicStemFilter newStemFilter(String text) {
        this.tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(text));
        return new ArabicStemFilter(new ArabicNormalizationFilter(this.tokenizer));
    }
}
