package com.gengoai.hermes.en;

import com.gengoai.Tag;
import com.gengoai.collection.Iterables;
import com.gengoai.hermes.lexicon.TrieWordList;
import com.gengoai.hermes.lexicon.WordList;
import com.gengoai.hermes.morphology.StandardTokenizer;
import com.gengoai.hermes.morphology.TokenType;
import com.gengoai.hermes.morphology.Tokenizer;
import com.gengoai.string.Strings;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.NoSuchElementException;
import java.util.Set;
import lombok.NonNull;

/* loaded from: input_file:com/gengoai/hermes/en/ENTokenizer.class */
/**
 * English {@link Tokenizer} that post-processes the raw token stream of a {@link StandardTokenizer}.
 *
 * <p>On top of the standard tokens it splits URL-like tokens whose suffix is not a known top-level
 * domain, merges abbreviation/acronym pieces, recognises (possibly multi-token) emoticons, joins
 * adjacent currency/number tokens and collapses runs of hyphens.</p>
 */
public class ENTokenizer implements Tokenizer, Serializable {
    private static final long serialVersionUID = 1;
    private final TrieWordList abbreviations = ENLexicons.ALL_ABBREVIATION.get();
    private final WordList tlds = ENLexicons.TLDS.get();
    private final TrieWordList emoticons = ENLexicons.EMOTICONS.get();

    /**
     * Iterator over the post-processed tokens. Tokens read ahead from the underlying tokenizer (and
     * tokens produced by splitting) are kept in {@link #buffer} until they are consumed.
     */
    private class TokenIterator implements Iterator<Tokenizer.Token> {
        private final StandardTokenizer tokenizer;
        /** Look-ahead buffer; the head is the next token to be handed out. */
        private final LinkedList<Tokenizer.Token> buffer = new LinkedList<>();
        /** Index that will be assigned to the next token returned by {@link #next()}. */
        private int lastIndex = 0;

        private TokenIterator(Reader reader) {
            this.tokenizer = new StandardTokenizer(reader);
        }

        /**
         * Pushes a token to the FRONT of the look-ahead buffer so it is the next one consumed.
         * Null tokens and tokens with null/blank text are ignored.
         */
        private void addToBuffer(Tokenizer.Token token) {
            if (token == null || Strings.isNullOrBlank(token.text)) {
                return;
            }
            this.buffer.addFirst(token);
        }

        /**
         * Re-examines a URL-typed token that has no scheme ("://"). If the text between the last
         * '.' of the host part and the first '/' (or the end of the token) is not a known TLD, the
         * token is split into head, '.', and tail; the '.' and tail are pushed back onto the buffer
         * and the head is returned.
         *
         * @param token the URL-typed token
         * @return the token unchanged, or the head part when it was split
         */
        private Tokenizer.Token checkURL(Tokenizer.Token token) {
            if (token.text.contains("://")) {
                return token;
            }
            int hostEnd = token.text.indexOf('/');
            if (hostEnd == -1) {
                hostEnd = token.text.length();
            }
            int lastDot = token.text.substring(0, hostEnd).lastIndexOf('.');
            if (lastDot < 0) {
                // No '.' in the host part: nothing to split on (the substring calls below would
                // otherwise throw StringIndexOutOfBoundsException).
                return token;
            }
            String suffix = token.text.substring(lastDot + 1, hostEnd);
            if (ENTokenizer.this.tlds.contains(suffix.toLowerCase())) {
                return token;
            }

            // The tail token spans up to the end of the original token, so its text must cover
            // everything after the '.', including any "/..." part.
            String tail = token.text.substring(lastDot + 1);
            int tailStart = token.charStartIndex + lastDot + 1;
            Tokenizer.Token following = peek(0);
            if (following == null || following.charStartIndex != token.charEndIndex) {
                addToBuffer(new Tokenizer.Token(tail, TokenType.ALPHA_NUMERIC, tailStart, token.charEndIndex, token.index));
            } else {
                // The next token is directly adjacent: glue it onto the tail.
                consume();
                addToBuffer(new Tokenizer.Token(tail + following.text, TokenType.ALPHA_NUMERIC, tailStart, following.charEndIndex, token.index));
            }
            // addToBuffer pushes to the front, so the '.' ends up before the tail.
            addToBuffer(new Tokenizer.Token(token.text.substring(lastDot, lastDot + 1), TokenType.PUNCTUATION, token.charStartIndex + lastDot, token.charStartIndex + lastDot + 1, token.index));
            return new Tokenizer.Token(token.text.substring(0, lastDot), TokenType.ALPHA_NUMERIC, token.charStartIndex, token.charStartIndex + lastDot, token.index);
        }

        /**
         * Consumes {@code i + 1} tokens.
         *
         * @param i zero-based position of the last token to consume
         * @return the last token consumed, or {@code null} when {@code i} is negative
         */
        private Tokenizer.Token consume(int i) {
            Tokenizer.Token last = null;
            for (int remaining = i; remaining >= 0; remaining--) {
                last = consume();
            }
            return last;
        }

        /**
         * Removes and returns the next non-blank token, reading from the underlying tokenizer if
         * the buffer is empty.
         *
         * @return the next token, or {@code null} when no tokens remain
         */
        private Tokenizer.Token consume() {
            peek(0); // make sure the buffer holds the next token, if there is one
            while (!this.buffer.isEmpty()) {
                Tokenizer.Token head = this.buffer.remove();
                if (head != null && !Strings.isNullOrBlank(head.text)) {
                    return head;
                }
            }
            return null;
        }

        /**
         * Tries to extend the given token with following tokens into the longest emoticon found in
         * the emoticon lexicon. Gaps between tokens are represented as spaces in the lexicon lookup
         * key but are not added to the resulting token text.
         *
         * @param token the starting token
         * @return an EMOTICON token covering the match, or the original token when no emoticon
         *         starts here
         */
        private Tokenizer.Token handleEmoticon(Tokenizer.Token token) {
            String text = token.text;
            String lower = token.text.toLowerCase();
            if (!ENTokenizer.this.emoticons.isPrefixMatch(lower)) {
                return token;
            }
            Tokenizer.Token last = token;
            int merged = 0; // number of look-ahead tokens already accepted into text/lower
            Tokenizer.Token upcoming;
            while ((upcoming = peek(merged)) != null) {
                String candidate = lower;
                if (last.charEndIndex < upcoming.charStartIndex) {
                    candidate = candidate + Strings.repeat(' ', upcoming.charStartIndex - last.charEndIndex);
                }
                candidate = candidate + upcoming.text.toLowerCase();
                last = upcoming;
                Set<String> prefixes = ENTokenizer.this.emoticons.prefixes(candidate);
                boolean canGrow = prefixes.size() > 1 || (prefixes.size() == 1 && !prefixes.contains(candidate));
                if (!canGrow) {
                    if (ENTokenizer.this.emoticons.contains(candidate)) {
                        // The upcoming token completes an emoticon: swallow it as well and include
                        // its text so the token text matches the returned span.
                        consume(merged);
                        this.lastIndex = token.index;
                        return new Tokenizer.Token(text + upcoming.text, TokenType.EMOTICON, token.charStartIndex, upcoming.charEndIndex, token.index);
                    }
                    if (!ENTokenizer.this.emoticons.contains(lower)) {
                        return token;
                    }
                    this.lastIndex = token.index;
                    return emoticonFromMerged(token, text, merged);
                }
                text = text + upcoming.text;
                lower = candidate;
                merged++;
            }
            // Ran out of input while the candidate could still grow.
            if (!ENTokenizer.this.emoticons.contains(lower)) {
                return token;
            }
            return emoticonFromMerged(token, text, merged);
        }

        /**
         * Builds the EMOTICON token for {@code token} plus the first {@code merged} buffered
         * tokens, consuming those buffered tokens. When nothing was merged the token's own end
         * offset is used (calling {@code consume(-1)} would yield {@code null}).
         */
        private Tokenizer.Token emoticonFromMerged(Tokenizer.Token token, String text, int merged) {
            int end = merged == 0 ? token.charEndIndex : consume(merged - 1).charEndIndex;
            return new Tokenizer.Token(text, TokenType.EMOTICON, token.charStartIndex, end, token.index);
        }

        @Override // java.util.Iterator
        public boolean hasNext() {
            return peek(0) != null;
        }

        /**
         * Greedily merges directly adjacent following tokens onto {@code token} for as long as the
         * concatenation is contained in the abbreviation lexicon.
         *
         * @param token the starting token
         * @return an ACRONYM token covering the merged span; an ACRONYM copy of the token when
         *         nothing was merged but its lower-cased text is itself in the lexicon; otherwise
         *         the token unchanged
         */
        private Tokenizer.Token mergeAbbreviationAndAcronym(Tokenizer.Token token) {
            String merged = token.text;
            int end = token.charEndIndex;
            int mergedCount = 0;
            Tokenizer.Token last = token;
            Tokenizer.Token upcoming;
            // End of input is treated like any other stop condition so that merges already
            // validated against the lexicon are not thrown away.
            while ((upcoming = peek(mergedCount)) != null) {
                if (upcoming.charStartIndex != last.charEndIndex) {
                    break; // only directly adjacent tokens may be merged
                }
                String candidate = merged + upcoming.text;
                if (!ENTokenizer.this.abbreviations.contains(candidate)) {
                    break;
                }
                mergedCount++;
                end = upcoming.charEndIndex;
                merged = candidate;
                last = upcoming;
            }
            if (mergedCount == 0) {
                if (ENTokenizer.this.abbreviations.contains(token.text.toLowerCase())) {
                    return new Tokenizer.Token(token.text, TokenType.ACRONYM, token.charStartIndex, token.charEndIndex, token.index);
                }
                return token;
            }
            consume(mergedCount - 1);
            return new Tokenizer.Token(merged, TokenType.ACRONYM, token.charStartIndex, end, token.index);
        }

        /**
         * Joins {@code token} with the directly adjacent next token into a single MONEY token.
         * {@link #next()} calls this for MONEY followed by NUMBER and for NUMBER followed by MONEY,
         * so the look-ahead token may be of either type.
         *
         * @param token the current MONEY or NUMBER token
         * @return the merged MONEY token, or {@code token} when the next token is missing, of
         *         another type, or not adjacent
         */
        private Tokenizer.Token mergeMoneyNumber(Tokenizer.Token token) {
            Tokenizer.Token following = peek(0);
            if (following == null) {
                return token;
            }
            boolean mergeableType = following.type.isInstance(TokenType.NUMBER) || following.type.isInstance(TokenType.MONEY);
            if (!mergeableType || following.charStartIndex != token.charEndIndex) {
                return token;
            }
            Tokenizer.Token money = new Tokenizer.Token(token.text + following.text, TokenType.MONEY, token.charStartIndex, following.charEndIndex, token.index);
            consume();
            return money;
        }

        /**
         * Collapses a run of consecutive HYPHEN tokens starting at {@code token} into one HYPHEN
         * token. The index of the merged token is a placeholder; {@link #next()} overwrites it.
         */
        private Tokenizer.Token mergeMultiHyphens(Tokenizer.Token token) {
            String text = token.text;
            int end = token.charEndIndex;
            while (peekIsType(0, TokenType.HYPHEN)) {
                Tokenizer.Token hyphen = consume();
                end = hyphen.charEndIndex;
                text = text + hyphen.text;
            }
            if (end == token.charEndIndex) {
                return token;
            }
            return new Tokenizer.Token(text, TokenType.HYPHEN, token.charStartIndex, end, 0);
        }

        /**
         * Returns the next post-processed token and assigns it the next running index.
         *
         * @throws NoSuchElementException when no tokens remain
         */
        @Override // java.util.Iterator
        public Tokenizer.Token next() {
            if (peek(0) == null) {
                throw new NoSuchElementException();
            }
            Tokenizer.Token current = consume();
            if (current == null) {
                throw new NoSuchElementException();
            }
            if (current.type.isInstance(TokenType.URL)) {
                current = checkURL(current);
            } else if (ENTokenizer.this.abbreviations.isPrefixMatch(current.text)) {
                current = mergeAbbreviationAndAcronym(current);
            } else if (current.type.isInstance(new Tag[]{TokenType.PUNCTUATION, TokenType.HYPHEN, TokenType.EMOTICON})) {
                current = handleEmoticon(current);
            } else if (current.type.isInstance(TokenType.MONEY) && peekIsType(0, TokenType.NUMBER)) {
                current = mergeMoneyNumber(current);
            } else if (current.type.isInstance(TokenType.NUMBER) && peekIsType(0, TokenType.MONEY)) {
                current = mergeMoneyNumber(current);
            }
            // Checked separately: a handler above may have returned a token that is still a hyphen.
            if (current.type.isInstance(TokenType.HYPHEN)) {
                current = mergeMultiHyphens(current);
            }
            current.index = this.lastIndex;
            this.lastIndex++;
            return current;
        }

        /**
         * Looks ahead without consuming, filling the buffer from the underlying tokenizer as
         * needed. Tokens with null/blank text are skipped.
         *
         * @param i zero-based look-ahead distance
         * @return the token at that distance, or {@code null} when the input ends first
         * @throws RuntimeException wrapping any {@link IOException} from the underlying tokenizer
         */
        private Tokenizer.Token peek(int i) {
            while (this.buffer.size() <= i) {
                try {
                    Tokenizer.Token read = this.tokenizer.next();
                    if (read == null) {
                        return null;
                    }
                    if (!Strings.isNullOrBlank(read.text)) {
                        this.buffer.add(read);
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return this.buffer.get(i);
        }

        /**
         * @return {@code true} when a token exists at look-ahead distance {@code i} and its type is
         *         an instance of one of the given types
         */
        private boolean peekIsType(int i, TokenType... tokenTypeArr) {
            Tokenizer.Token ahead = peek(i);
            if (ahead == null) {
                return false;
            }
            return ahead.type.isInstance(tokenTypeArr);
        }
    }

    /**
     * Tokenizes the content supplied by the reader.
     *
     * @param reader the source of text to tokenize; must not be null
     * @return an iterable over the post-processed tokens, backed by a single token iterator
     * @throws NullPointerException when {@code reader} is null
     */
    @Override // com.gengoai.hermes.morphology.Tokenizer
    public Iterable<Tokenizer.Token> tokenize(@NonNull Reader reader) {
        if (reader == null) {
            throw new NullPointerException("reader is marked non-null but is null");
        }
        return Iterables.asIterable(new TokenIterator(reader));
    }
}
