package ivory.core.tokenize;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.galagosearch.core.parse.TagTokenizer;
import org.tartarus.snowball.ext.englishStemmer;

/* loaded from: input_file:ivory/core/tokenize/GalagoTokenizer.class */
public class GalagoTokenizer extends Tokenizer {
    private static final String[] TERRIER_STOP_WORDS = {"a", "abaft", "abafter", "abaftest", "about", "abouter", "aboutest", "above", "abover", "abovest", "accordingly", "aer", "aest", "afore", "after", "afterer", "afterest", "afterward", "afterwards", "again", "against", "aid", "ain", "albeit", "all", "aller", "allest", "alls", "allyou", "almost", "along", "alongside", "already", "also", "although", "always", "amid", "amidst", "among", "amongst", "an", "and", "andor", "anear", "anent", "another", "any", "anybody", "anyhow", "anyone", "anything", "anywhere", "apart", "aparter", "apartest", "appear", "appeared", "appearing", "appears", "appropriate", "appropriated", "appropriater", "appropriates", "appropriatest", "appropriating", "are", "ares", "around", "as", "ases", "aside", "asides", "aslant", "astraddle", "astraddler", "astraddlest", "astride", "astrider", "astridest", "at", "athwart", "atop", "atween", "aught", "aughts", "available", "availabler", "availablest", "awfully", "b", "be", "became", "because", "become", "becomes", "becoming", "becominger", "becomingest", "becomings", "been", "before", "beforehand", "beforehander", "beforehandest", "behind", "behinds", "below", "beneath", "beside", "besides", "better", "bettered", "bettering", "betters", "between", "betwixt", "beyond", "bist", "both", "but", "buts", "by", "by-and-by", "byandby", "c", "cannot", "canst", "cant", "canted", "cantest", "canting", "cants", "cer", "certain", "certainer", "certainest", "cest", "chez", "circa", "co", "come-on", "come-ons", "comeon", "comeons", "concerning", "concerninger", "concerningest", "consequently", "considering", "could", "couldst", "cum", "d", "dday", "ddays", "describe", "described", "describes", "describing", "despite", "despited", "despites", "despiting", "did", "different", "differenter", "differentest", "do", "doe", "does", "doing", "doings", "done", "doner", "dones", "donest", "dos", "dost", "doth", "downs", "downward", "downwarder", "downwardest", "downwards", 
"during", "e", "each", "eg", "eight", "either", "else", "elsewhere", "enough", "ere", "et", "etc", "even", "evened", "evenest", "evens", "evenser", "evensest", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "excepted", "excepting", "excepts", "exes", "f", "fact", "facts", "failing", "failings", "few", "fewer", "fewest", "figupon", "figuponed", "figuponing", "figupons", "five", "followthrough", "for", "forby", "forbye", "fore", "forer", "fores", "forever", "former", "formerer", "formerest", "formerly", "formers", "fornenst", "forwhy", "four", "fourscore", "frae", "from", "fs", "further", "furthered", "furtherer", "furtherest", "furthering", "furthermore", "furthers", "g", "get", "gets", "getting", "go", "gone", "good", "got", "gotta", "gotten", "h", "had", "hadst", "hae", "hardly", "has", "hast", "hath", "have", "haves", "having", "he", "hence", "her", "hereafter", "hereafters", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "hither", "hitherer", "hitherest", "hoo", "hoos", "how", "how-do-you-do", "howbeit", "howdoyoudo", "however", "huh", "humph", "i", "idem", "idemer", "idemest", "ie", "if", "ifs", "immediate", "immediately", "immediater", "immediatest", "in", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "indicating", "info", "information", "insofar", "instead", "into", "inward", "inwarder", "inwardest", "inwards", "is", "it", "its", "itself", "j", "k", "l", "latter", "latterer", "latterest", "latterly", "latters", "layabout", "layabouts", "less", "lest", "lot", "lots", "lotted", "lotting", "m", "main", "make", "many", "mauger", "maugre", "mayest", "me", "meanwhile", "meanwhiles", "midst", "midsts", "might", "mights", "more", "moreover", "most", "mostly", "much", "mucher", "muchest", "must", "musth", "musths", "musts", "my", "myself", "n", "natheless", "nathless", "neath", "neaths", "necessarier", "necessariest", "necessary", "neither", "nethe", "nethermost", "never", 
"nevertheless", "nigh", "nigher", "nighest", "nine", "no", "no-one", "nobodies", "nobody", "noes", "none", "noone", "nor", "nos", "not", "nothing", "nothings", "notwithstanding", "nowhere", "nowheres", "o", "of", "off", "offest", "offs", "often", "oftener", "oftenest", "oh", "on", "one", "oneself", "onest", "ons", "onto", "or", "orer", "orest", "other", "others", "otherwise", "otherwiser", "otherwisest", "ought", "oughts", "our", "ours", "ourself", "ourselves", "out", "outed", "outest", "outs", "outside", "outwith", "over", "overall", "overaller", "overallest", "overalls", "overs", "own", "owned", "owning", "owns", "owt", "p", "particular", "particularer", "particularest", "particularly", "particulars", "per", "perhaps", "plaintiff", "please", "pleased", "pleases", "plenties", "plenty", "pro", "probably", "provide", "provided", "provides", "providing", "q", "qua", "que", "quite", "r", "rath", "rathe", "rather", "rathest", "re", "really", "regarding", "relate", "related", "relatively", "res", "respecting", "respectively", "s", "said", "saider", "saidest", "same", "samer", "sames", "samest", "sans", "sanserif", "sanserifs", "sanses", "saved", "sayid", "sayyid", "seem", "seemed", "seeminger", "seemingest", "seemings", "seems", "send", "sent", "senza", "serious", "seriouser", "seriousest", "seven", "several", "severaler", "severalest", "shall", "shalled", "shalling", "shalls", "she", "should", "shoulded", "shoulding", "shoulds", "since", "sine", "sines", "sith", "six", "so", "sobeit", "soer", "soest", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimer", "sometimes", "sometimest", "somewhat", "somewhere", "stop", "stopped", "such", "summat", "sup", "supped", "supping", "sups", "syn", "syne", "t", "ten", "than", "that", "the", "thee", "their", "theirs", "them", "themselves", "then", "thence", "thener", "thenest", "there", "thereafter", "thereby", "therefore", "therein", "therer", "therest", "thereupon", "these", "they", "thine", "thing", 
"things", "this", "thises", "thorough", "thorougher", "thoroughest", "thoroughly", "those", "thou", "though", "thous", "thouses", "three", "thro", "through", "througher", "throughest", "throughout", "thru", "thruer", "thruest", "thus", "thy", "thyself", "till", "tilled", "tilling", "tills", "to", "together", "too", "toward", "towarder", "towardest", "towards", "two", "u", "umpteen", "under", "underneath", "unless", "unlike", "unliker", "unlikest", "until", "unto", "up", "upon", "uponed", "uponing", "upons", "upped", "upping", "ups", "us", "use", "used", "usedest", "username", "usually", "v", "various", "variouser", "variousest", "verier", "veriest", "versus", "very", "via", "vis-a-vis", "vis-a-viser", "vis-a-visest", "viz", "vs", "w", "was", "wast", "we", "were", "wert", "what", "whatever", "whateverer", "whateverest", "whatsoever", "whatsoeverer", "whatsoeverest", "wheen", "when", "whenas", "whence", "whencesoever", "whenever", "whensoever", "where", "whereafter", "whereas", "whereby", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereupon", "wherever", "wherewith", "wherewithal", "whether", "which", "whichever", "whichsoever", "while", "whiles", "whilst", "whither", "whithersoever", "whoever", "whomever", "whose", "whoso", "whosoever", "why", "with", "withal", "within", "without", "would", "woulded", "woulding", "woulds", "x", "y", "ye", "yet", "yon", "yond", "yonder", "you", "your", "yours", "yourself", "yourselves", "z", "zillion"};
    // Snowball English stemmer instance, reused for every token this tokenizer stems.
    // NOTE(review): the stemmer is stateful (setCurrent/stem/getCurrent) and the cache below
    // is a plain HashMap, so an instance is presumably not safe for concurrent use — confirm.
    private final englishStemmer stemmer = new englishStemmer();
    // Memo of raw token -> stemmed form; populated (and emptied when it grows large) by processContent.
    private final Map<String, String> cache = Maps.newHashMap();
    // Stop-word lookup set built once from TERRIER_STOP_WORDS.
    private final Set<String> stopwords = Sets.newHashSet(TERRIER_STOP_WORDS);

    /**
     * Tells whether a token is one of this tokenizer's stop words.
     *
     * <p>The lookup is an exact match against the stop-word set; the argument is
     * not lower-cased, trimmed, or stemmed first.
     *
     * @param str the token to look up
     * @return {@code true} if {@code str} is contained in the stop-word set
     */
    @Override // ivory.core.tokenize.Tokenizer
    public boolean isStopWord(String str) {
        return stopwords.contains(str);
    }

    /**
     * Reports whether this tokenizer stems its output.
     *
     * @return always {@code true}; {@code processContent} runs every kept token through the stemmer
     */
    @Override // ivory.core.tokenize.Tokenizer
    public boolean isStemming() {
        return true;
    }

    /**
     * Reports whether this tokenizer removes stop words.
     *
     * @return always {@code true}; {@code processContent} drops every token found in the stop-word set
     */
    @Override // ivory.core.tokenize.Tokenizer
    public boolean isStopwordRemoval() {
        return true;
    }

    /** Largest number of memoized stems tolerated; the cache is emptied once it grows past this. */
    private static final int MAX_CACHED_STEMS = 50000;

    /**
     * Tokenizes {@code str} with a fresh Galago {@link TagTokenizer}, drops every
     * token found in the stop-word set, and replaces each remaining token with
     * its Snowball English stem.
     *
     * <p>Stop words are matched against the raw (unstemmed) token. A {@code null}
     * token, should the underlying tokenizer ever produce one, is passed through
     * unchanged.
     *
     * @param str the text to tokenize
     * @return the stemmed, stop-word-free tokens in their original order, or
     *         {@code null} if the underlying tokenizer throws an {@link IOException}
     *         (the stack trace is printed to standard error in that case)
     */
    @Override // ivory.core.tokenize.Tokenizer
    public String[] processContent(String str) {
        final List<String> terms;
        try {
            terms = new TagTokenizer().tokenize(str).terms;
        } catch (IOException e) {
            // Kept as-is for callers that rely on the null-on-failure contract.
            e.printStackTrace();
            return null;
        }

        // Filter and stem in a single pass; the order of stemmer calls (and thus
        // the cache contents) is the same as filtering first and stemming second.
        List<String> kept = new ArrayList<String>(terms.size());
        for (String term : terms) {
            if (stopwords.contains(term)) {
                continue;
            }
            kept.add(term == null ? null : stemWithCache(term));
        }
        return kept.toArray(new String[kept.size()]);
    }

    /** Returns the stem of {@code term}, consulting and updating the memo cache. */
    private String stemWithCache(String term) {
        String stem = cache.get(term);
        if (stem == null) {
            stemmer.setCurrent(term);
            // When the stemmer reports failure, the token is kept (and cached) as-is.
            stem = stemmer.stem() ? stemmer.getCurrent() : term;
            cache.put(term, stem);
            // The cache only grows on a miss, so this is the only place the bound can be crossed.
            if (cache.size() > MAX_CACHED_STEMS) {
                cache.clear();
            }
        }
        return stem;
    }

    /**
     * No-op: the body is empty, so the supplied configuration is ignored.
     *
     * @param configuration unused
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration) {
    }

    /**
     * No-op: the body is empty, so both arguments are ignored.
     *
     * @param configuration unused
     * @param fileSystem unused
     */
    @Override // ivory.core.tokenize.Tokenizer
    public void configure(Configuration configuration, FileSystem fileSystem) {
    }

    /**
     * Small demo driver: prints a header line, tokenizes a fixed sample string
     * with a new {@code GalagoTokenizer}, and prints each resulting token on its
     * own line.
     *
     * @param strArr command-line arguments; not read
     */
    public static void main(String[] strArr) {
        final String sample = " this is a the <test> for the teokenizer 101 546 345-543543545436-4656765865865 rgger <xml> ergtre 456435klj345lj34590";
        System.out.println("tokenization according to Galago: ");
        GalagoTokenizer tokenizer = new GalagoTokenizer();
        String[] tokens = tokenizer.processContent(sample);
        for (int idx = 0; idx < tokens.length; idx++) {
            System.out.println(tokens[idx]);
        }
    }
}
