package ivory.core.tokenize;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import ivory.core.Constants;
import java.io.IOException;
import java.util.Locale;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger;

/* loaded from: input_file:ivory/core/tokenize/StanfordChineseTokenizer.class */
/**
 * {@link Tokenizer} that segments Chinese text with a Stanford {@link CRFClassifier}.
 *
 * <p>The model ({@code pku}) and dictionary ({@code dict-chris6.ser}) are read from the directory
 * named by the {@link Constants#TokenizerData} configuration entry. One of the {@code configure}
 * overloads must be called successfully before {@link #processContent(String)} is used.
 */
public class StanfordChineseTokenizer extends Tokenizer {
    private static final Logger LOG = Logger.getLogger(StanfordChineseTokenizer.class);

    /** Shared result for "nothing to return"; zero-length arrays are immutable, so sharing is safe. */
    private static final String[] NO_TOKENS = new String[0];

    /** Segmenter model; assigned by {@link #configure(Configuration, FileSystem)}. */
    CRFClassifier classifier;

    /** Reader/writer handed to the classifier on every call; assigned during configuration. */
    DocumentReaderAndWriter readerWriter;

    /**
     * Configures the tokenizer using the file system obtained from {@code configuration}.
     *
     * @param configuration job configuration holding the {@link Constants#TokenizerData} entry
     * @throws RuntimeException if the file system cannot be obtained or the tokenizer data cannot
     *     be loaded
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration) {
        try {
            configure(configuration, FileSystem.get(configuration));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Loads the segmenter model and dictionary from {@code fileSystem}.
     *
     * @param configuration job configuration holding the {@link Constants#TokenizerData} entry
     * @param fileSystem file system the model and dictionary files are opened on
     * @throws IllegalArgumentException if {@link Constants#TokenizerData} is not set
     * @throws RuntimeException if loading the model or dictionary fails; the underlying exception
     *     is attached as the cause
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration, FileSystem fileSystem) {
        String dataDir = configuration.get(Constants.TokenizerData);
        if (dataDir == null) {
            // Properties.setProperty rejects null values with a bare NullPointerException;
            // fail with a message that names the actual problem instead.
            throw new IllegalArgumentException(
                    "Tokenizer not configured properly! Tokenizer data path is not set in the configuration.");
        }
        String dictionaryPath = dataDir + "/dict-chris6.ser";

        Properties properties = new Properties();
        properties.setProperty("sighanCorporaDict", dataDir);
        properties.setProperty("serDictionary", dictionaryPath);
        properties.setProperty("inputEncoding", "UTF-8");
        properties.setProperty("sighanPostProcessing", "true");
        try {
            this.classifier = new CRFClassifier(properties);
            FSDataInputStream modelStream = fileSystem.open(new Path(dataDir + "/pku"));
            try {
                this.classifier.loadClassifier(modelStream, properties);
            } finally {
                // The model is only needed while loading; release the handle afterwards.
                modelStream.close();
            }
            this.classifier.flags.setConf(configuration);
            // NOTE(review): the dictionary stream is intentionally left open because it is not
            // visible from here whether the reader/writer keeps reading from it later — confirm
            // and close it if it is consumed eagerly.
            FSDataInputStream dictionaryStream = fileSystem.open(new Path(dictionaryPath));
            this.readerWriter = this.classifier.makeReaderAndWriter(dictionaryStream);
        } catch (Exception e) {
            LOG.error("Failed to load Chinese tokenizer data from " + dataDir, e);
            // Keep the original message but preserve the cause for diagnosis.
            throw new RuntimeException("Tokenizer not configured properly!", e);
        }
    }

    /**
     * Segments {@code str} into tokens.
     *
     * <p>The text is passed through {@code preNormalize}, lower-cased, passed through
     * {@code postNormalize}, and then segmented by the classifier. When a vocabulary is set, only
     * tokens for which {@code vocab.get(token) > 0} are kept.
     *
     * @param str text to tokenize
     * @return the tokens, or an empty array if segmentation failed or no token survived the
     *     vocabulary filter; never {@code null}
     */
    @Override // ivory.core.tokenize.Tokenizer
    public String[] processContent(String str) {
        String[] tokens = null;
        try {
            // Locale.ROOT keeps case folding independent of the JVM's default locale.
            String normalized = postNormalize(preNormalize(str).toLowerCase(Locale.ROOT));
            tokens = this.classifier.classifyStringAndReturnAnswers(normalized, this.readerWriter);
        } catch (IOException e) {
            LOG.error("Problem in tokenizing Chinese", e);
        }
        if (tokens == null) {
            // Best effort: a failed segmentation yields no tokens rather than a null that
            // would crash the vocabulary filter below or the caller.
            return NO_TOKENS;
        }
        if (this.vocab == null) {
            return tokens;
        }
        StringBuilder kept = new StringBuilder();
        for (String token : tokens) {
            if (this.vocab.get(token) > 0) {
                kept.append(token).append(' ');
            }
        }
        String joined = kept.toString().trim();
        if (joined.isEmpty()) {
            // "".split(...) would yield a single empty-string token; report no tokens instead.
            return NO_TOKENS;
        }
        return joined.split("\\s+");
    }

    /**
     * Returns {@code str} unchanged; this tokenizer performs no stop-word stripping.
     *
     * @param str input text
     * @return {@code str} itself
     */
    @Override // ivory.core.tokenize.Tokenizer
    public String removeBorderStopWords(String str) {
        return str;
    }

    /**
     * Always reports {@code false}; this tokenizer does not classify any token as a stop word.
     *
     * @param str token to test
     * @return {@code false}
     */
    @Override // ivory.core.tokenize.Tokenizer
    public boolean isStopWord(String str) {
        return false;
    }
}
