package com.gengoai.hermes.annotator;

import com.gengoai.Tag;
import com.gengoai.collection.Maps;
import com.gengoai.hermes.AnnotatableType;
import com.gengoai.hermes.Annotation;
import com.gengoai.hermes.Document;
import com.gengoai.hermes.Fragments;
import com.gengoai.hermes.Types;
import com.gengoai.hermes.morphology.TokenType;
import com.gengoai.string.Strings;
import com.gengoai.tuple.Tuples;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/* loaded from: input_file:com/gengoai/hermes/annotator/DefaultSentenceAnnotator.class */
public class DefaultSentenceAnnotator extends Annotator {
    private static final long serialVersionUID = 1;
    // Characters that mark a token as a sentence terminator when they are the token's LAST
    // character (consulted by isEndOfSentenceMark). The table is searched with
    // Arrays.binarySearch, so the values must remain in ascending order; the repeated
    // entries are harmless for a membership test.
    private final char[] endOfSentence = {'!', '.', '.', '?', 1417, 1567, 1748, 1792, 1793, 1794, 2404, 4170, 4171, 4962, 4967, 4968, 5742, 6147, 6153, 8228, 8252, 8253, 8263, 8264, 8265, 12290, 65106, 65106, 65111, 65281, 65294, 65294, 65311, 65377};
    // Characters that mark a token as "sentence continues" when they are the token's LAST
    // character (consulted by isContinue). Also searched with Arrays.binarySearch, so the
    // values must remain in ascending order.
    private final char[] sContinue = {',', '-', ':', 1373, 1548, 1549, 2040, 6146, 6152, 8211, 8212, 12289, 65040, 65041, 65043, 65073, 65074, 65104, 65105, 65109, 65112, 65123, 65292, 65293, 65306, 65380};
    /**
     * Lower-cased tokens after which a sentence is never ended. {@code annotateImpl} looks up the
     * lower-cased text of the candidate sentence-final token in this set, so every entry must be a
     * literal, lower-case token.
     *
     * <p>Built from a plain {@link HashSet} rather than an anonymous subclass with an instance
     * initializer: the anonymous form creates an extra class that keeps a hidden reference to the
     * enclosing annotator.
     *
     * <p>The former entries {@code "fw?y."} and {@code "pde?."} were regular-expression fragments;
     * as literals they could not match a real token, so they are replaced by their expansions
     * ({@code "fwy."}/{@code "fy."} and {@code "pd."}/{@code "pde."}). The month abbreviations are
     * additionally listed with a trailing period, like every other abbreviation in the set; the
     * bare forms are kept for backward compatibility.
     */
    private final Set<String> noSentenceBreak = new HashSet<String>(Arrays.asList(
            "admr.", "al.", "ala.",
            // Month abbreviations ("may" is not listed in either form).
            "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec",
            "jan.", "feb.", "mar.", "apr.", "jun.", "jul.", "aug.", "sep.", "sept.", "oct.", "nov.", "dec.",
            "alta.", "arc.", "ariz.", "ark.", "atty.", "attys.", "ave.",
            "bld.", "blvd.",
            "cal.", "calif.", "cl.", "cmdr.", "co.", "col.", "colo.", "conn.", "corp.", "cpl.", "cres.", "ct.",
            "dak.", "del.", "det.", "dist.", "dr.",
            "esp.", "etc.", "exp.", "expy.",
            "fed.", "fla.", "ft.",
            // Expansion of the former regex-like entry "fw?y.".
            "fwy.", "fy.",
            "ga.", "gen.", "gov.",
            "hway.", "hwy.",
            "ia.", "id.", "ida.", "ill.", "inc.", "ind.", "is.",
            "jr.",
            "kan.", "kans.", "ken.", "ky.",
            "la.", "lt.", "ltd.",
            "maj.", "man.", "mass.", "md.", "me.", "mex.", "mich.", "minn.", "miss.", "mo.", "mont.",
            "mr.", "mrs.", "ms.", "mt.",
            "neb.", "nebr.", "nev.",
            "ok.", "okla.", "ont.", "ore.",
            "p.m.", "pa.",
            // Expansion of the former regex-like entry "pde?.".
            "pd.", "pde.",
            "penn.", "penna.", "pl.", "plz.", "prof.", "pvt.",
            "qué.",
            "rd.", "rep.", "reps.", "rev.",
            "sask.", "sen.", "sens.", "sgt.", "sr.", "supt.",
            "tce.", "tenn.", "tex.", "tx.",
            "u.s.", "u.s.a.", "us.", "usa.", "usafa.", "ut.",
            "va.", "vs.", "vt.",
            "wash.", "wis.", "wisc.", "wy.", "wyo.",
            "yuk.",
            "st."
    ));

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:com/gengoai/hermes/annotator/DefaultSentenceAnnotator$InternalType.class */
    /**
     * Coarse, sentence-splitting-specific classification of a token. A token may carry several of
     * these at once; {@code getTypes} computes the set for a token.
     */
    public enum InternalType {
        /** Token accepted by {@code isAbbreviation} (acronym, or a time token in sentence-final position). */
        ABBREVIATION,
        /** Token whose content is one of {@code *}, {@code +} or {@code >}. */
        LIST_MARKER,
        /** Single-character quotation mark token. */
        QUOTATION_MARK,
        /** Token whose last character is listed in {@code endOfSentence} and that is not an emoticon or person title. */
        END_OF_SENTENCE,
        /** Token whose last character is listed in {@code sContinue}. */
        CONTINUE_SENTENCE,
        /** Token whose TOKEN_TYPE attribute equals {@code TokenType.PERSON_TITLE}. */
        PERSON_TITLE,
        /** Token accepted by {@code isCapitalized}. */
        CAPITALIZED,
        /** Single-character closing bracket: {@code )}, {@code >} or {@code ]}. */
        END_BRACKET,
        /** Assigned only when none of the other types apply. */
        OTHER
    }

    /**
     * Creates a SENTENCE annotation on the document, skipping leading whitespace.
     *
     * @param document the document to annotate
     * @param i        candidate start offset; advanced past any whitespace characters
     * @param i2       end offset of the sentence
     * @param i3       value stored in the annotation's INDEX attribute
     * @return {@code true} if an annotation was created, {@code false} if the start offset moved
     *         beyond {@code i2} after skipping whitespace
     */
    private boolean addSentence(Document document, int i, int i2, int i3) {
        int begin = i;
        for (; begin < document.length(); begin++) {
            if (!Character.isWhitespace(document.charAt(begin))) {
                break;
            }
        }
        if (begin <= i2) {
            document.createAnnotation(Types.SENTENCE, begin, i2, Maps.hashMapOf(new Map.Entry[]{Tuples.$(Types.INDEX, Integer.valueOf(i3))}));
            return true;
        }
        return false;
    }

    /**
     * Walks the document's tokens once and creates a SENTENCE annotation for every detected
     * sentence. A boundary is proposed either by punctuation/abbreviation evidence or, failing
     * that, by line breaks between the current and the next token. Any tokens left over after the
     * last detected boundary form a final sentence.
     *
     * @param document the document whose tokens are split into sentences
     */
    @Override // com.gengoai.hermes.annotator.Annotator
    protected void annotateImpl(Document document) {
        final List<Annotation> tokens = document.tokens();
        int sentenceStart = -1;     // -1 means "no sentence currently open"
        int sentenceIndex = 0;
        int lastSentenceEnd = -1;
        int quoteCount = 0;         // quotation marks seen in the currently open sentence

        for (int pos = 0; pos < tokens.size(); pos++) {
            Annotation current = tokens.get(pos);
            Annotation following = getToken(tokens, pos + 1);
            if (sentenceStart == -1) {
                sentenceStart = current.start();
            }
            final Set<InternalType> currentTypes = getTypes(current);
            Set<InternalType> followingTypes = getTypes(following);
            if (currentTypes.contains(InternalType.QUOTATION_MARK)) {
                quoteCount++;
            }

            final boolean abbreviation = currentTypes.contains(InternalType.ABBREVIATION);
            // An abbreviation ends a sentence only when followed by a capitalized token and an
            // even number of quotation marks has been seen so far.
            final boolean boundaryAfterAbbreviation = abbreviation
                    && followingTypes.contains(InternalType.CAPITALIZED)
                    && quoteCount % 2 == 0;
            // A non-abbreviation ends a sentence when it carries an end-of-sentence mark and is
            // not directly followed by a person title.
            final boolean boundaryAfterPunctuation = !abbreviation
                    && currentTypes.contains(InternalType.END_OF_SENTENCE)
                    && !followingTypes.contains(InternalType.PERSON_TITLE);

            final boolean sentenceAdded;
            if (boundaryAfterAbbreviation || boundaryAfterPunctuation) {
                // Absorb any run of further end-of-sentence tokens.
                while (followingTypes.contains(InternalType.END_OF_SENTENCE)) {
                    pos++;
                    current = following;
                    following = getToken(tokens, pos + 1);
                    followingTypes = getTypes(following);
                }
                // Absorb a directly adjacent closing bracket or quotation mark.
                final boolean closerFollows = followingTypes.contains(InternalType.END_BRACKET)
                        || followingTypes.contains(InternalType.QUOTATION_MARK);
                if (closerFollows && distance(current, following) == 0) {
                    pos++;
                    current = following;
                    // NOTE: followingTypes is not refreshed here, so the CONTINUE_SENTENCE check
                    // below still inspects the types of the token that was just absorbed.
                }
                sentenceAdded = !this.noSentenceBreak.contains(current.toLowerCase())
                        && !followingTypes.contains(InternalType.CONTINUE_SENTENCE)
                        && addSentence(document, sentenceStart, current.end(), sentenceIndex);
            } else {
                // No punctuation evidence: fall back to line breaks between the two tokens.
                final int newLines = countNewLineBeforeNext(document, current, following);
                final boolean nextOpensLine = followingTypes.contains(InternalType.CAPITALIZED)
                        || followingTypes.contains(InternalType.LIST_MARKER);
                sentenceAdded = (newLines > 1 || (newLines == 1 && nextOpensLine))
                        && addSentence(document, sentenceStart, current.end(), sentenceIndex);
            }

            if (sentenceAdded) {
                sentenceIndex++;
                lastSentenceEnd = current.end();
                sentenceStart = -1;
                quoteCount = 0;
            }
        }

        // Emit whatever remains after the last detected boundary as a final sentence.
        if (!tokens.isEmpty()) {
            final int documentEnd = tokens.get(tokens.size() - 1).end();
            if (lastSentenceEnd < documentEnd) {
                addSentence(document, sentenceStart, documentEnd, sentenceIndex);
            }
        }
    }

    /**
     * Counts the line breaks in the document text between the end of one token and the start of
     * the next. A {@code \r} always counts; a {@code \n} counts unless it directly follows a
     * {@code \r}, so a {@code \r\n} pair is counted once.
     *
     * @param document    the document providing the characters
     * @param annotation  the earlier token
     * @param annotation2 the later token; if empty, the result is 0
     * @return the number of line breaks found in the gap
     */
    private int countNewLineBeforeNext(Document document, Annotation annotation, Annotation annotation2) {
        if (annotation2.isEmpty()) {
            return 0;
        }
        int lineBreaks = 0;
        char previous = 0;
        for (int pos = annotation.end(); pos < annotation2.start(); pos++) {
            final char ch = document.charAt(pos);
            final boolean carriageReturn = ch == '\r';
            final boolean bareLineFeed = ch == '\n' && previous != '\r';
            if (carriageReturn || bareLineFeed) {
                lineBreaks++;
            }
            previous = ch;
        }
        return lineBreaks;
    }

    /**
     * Returns the number of characters between the end of {@code annotation} and the start of
     * {@code annotation2}.
     */
    private int distance(Annotation annotation, Annotation annotation2) {
        final int gapStart = annotation.end();
        final int gapEnd = annotation2.start();
        return gapEnd - gapStart;
    }

    /**
     * Bounds-safe list access: returns the token at index {@code i}, or an orphaned TOKEN
     * annotation when {@code i} lies outside the list.
     */
    private Annotation getToken(List<Annotation> list, int i) {
        if (i >= 0 && i < list.size()) {
            return list.get(i);
        }
        return Fragments.orphanedAnnotation(Types.TOKEN);
    }

    /**
     * Classifies a token into the set of {@link InternalType}s that apply to it.
     *
     * <p>The result is a typed {@link EnumSet} (previously a raw {@code HashSet}, which compiled
     * only with an unchecked conversion).
     *
     * @param annotation the token to classify
     * @return a non-empty set; it contains only {@link InternalType#OTHER} when no specific type
     *         applies
     */
    private Set<InternalType> getTypes(Annotation annotation) {
        Set<InternalType> types = EnumSet.noneOf(InternalType.class);
        if (isQuotation(annotation)) {
            types.add(InternalType.QUOTATION_MARK);
        }
        if (isAbbreviation(annotation)) {
            types.add(InternalType.ABBREVIATION);
        }
        if (isListMarker(annotation)) {
            types.add(InternalType.LIST_MARKER);
        }
        if (isEndOfSentenceMark(annotation)) {
            types.add(InternalType.END_OF_SENTENCE);
        }
        if (isContinue(annotation)) {
            types.add(InternalType.CONTINUE_SENTENCE);
        }
        if (((TokenType) annotation.attribute(Types.TOKEN_TYPE, TokenType.UNKNOWN)).equals(TokenType.PERSON_TITLE)) {
            types.add(InternalType.PERSON_TITLE);
        }
        if (isCapitalized(annotation)) {
            types.add(InternalType.CAPITALIZED);
        }
        if (isEndBracket(annotation)) {
            types.add(InternalType.END_BRACKET);
        }
        // Guarantee a non-empty result so callers can always test membership meaningfully.
        if (types.isEmpty()) {
            types.add(InternalType.OTHER);
        }
        return types;
    }

    /**
     * Tells whether a token counts as an abbreviation: its TOKEN_TYPE is ACRONYM, or it is TIME
     * and the next annotation is either empty or starts with an upper-case character.
     */
    private boolean isAbbreviation(Annotation annotation) {
        final TokenType tokenType = (TokenType) annotation.attribute(Types.TOKEN_TYPE, TokenType.UNKNOWN);
        if (tokenType == null) {
            return false;
        }
        if (tokenType.equals(TokenType.ACRONYM)) {
            return true;
        }
        if (!tokenType.equals(TokenType.TIME)) {
            return false;
        }
        if (annotation.next().isEmpty()) {
            return true;
        }
        return Character.isUpperCase(annotation.next().charAt(0));
    }

    /* JADX WARN: Multi-variable type inference failed */
    /**
     * Tells whether a token counts as capitalized. Tokens longer than one character qualify when
     * they contain no letter at all or start with an upper-case character; among one-character
     * tokens only {@code "I"} qualifies.
     */
    private boolean isCapitalized(Annotation annotation) {
        final int size = annotation.length();
        if (size > 1) {
            if (!Strings.hasLetter(annotation)) {
                return true;
            }
            return Character.isUpperCase(annotation.charAt(0));
        }
        return size == 1 && annotation.contentEquals("I");
    }

    /**
     * Tells whether the token's last character is one of the sentence-continuation characters in
     * {@code sContinue}. An empty token is treated as a single space, which is not in the table.
     */
    private boolean isContinue(Annotation annotation) {
        char last = ' ';
        if (!annotation.isEmpty()) {
            last = annotation.charAt(annotation.length() - 1);
        }
        final int index = Arrays.binarySearch(this.sContinue, last);
        return index >= 0;
    }

    /**
     * Tells whether the token is exactly one character long and that character is one of
     * {@code )}, {@code >} or {@code ]}.
     */
    private boolean isEndBracket(Annotation annotation) {
        if (annotation.length() == 1) {
            final char ch = annotation.charAt(0);
            return ch == ')' || ch == '>' || ch == ']';
        }
        return false;
    }

    /**
     * Tells whether a token ends with one of the characters in {@code endOfSentence}. Empty
     * tokens, and tokens whose TOKEN_TYPE is an instance of EMOTICON or PERSON_TITLE, never
     * qualify.
     */
    private boolean isEndOfSentenceMark(Annotation annotation) {
        if (annotation.isEmpty()) {
            return false;
        }
        final TokenType tokenType = (TokenType) annotation.attribute(Types.TOKEN_TYPE, TokenType.UNKNOWN);
        if (tokenType.isInstance(new Tag[]{TokenType.EMOTICON, TokenType.PERSON_TITLE})) {
            return false;
        }
        final char last = annotation.charAt(annotation.length() - 1);
        return Arrays.binarySearch(this.endOfSentence, last) >= 0;
    }

    /**
     * Tells whether the token is a single character that is a straight double quote, a Unicode
     * final-quote punctuation character, or a Unicode end-punctuation character.
     */
    private boolean isEndPunctuation(Annotation annotation) {
        if (annotation.length() == 1) {
            final char ch = annotation.charAt(0);
            final int category = Character.getType(ch);
            if (ch == '\"') {
                return true;
            }
            return category == Character.FINAL_QUOTE_PUNCTUATION || category == Character.END_PUNCTUATION;
        }
        return false;
    }

    /**
     * Tells whether the token's content is one of the list-marker strings {@code *}, {@code +}
     * or {@code >}.
     */
    private boolean isListMarker(Annotation annotation) {
        if (annotation.contentEquals("*")) {
            return true;
        }
        if (annotation.contentEquals("+")) {
            return true;
        }
        return annotation.contentEquals(">");
    }

    /**
     * Tells whether the token is a single-character quotation mark: a straight double or single
     * quote, or any character in the Unicode initial-quote or final-quote punctuation categories.
     */
    private boolean isQuotation(Annotation annotation) {
        if (annotation.length() == 1) {
            final int category = Character.getType(annotation.charAt(0));
            final boolean straightQuote = annotation.contentEquals("\"") || annotation.contentEquals("'");
            return straightQuote
                    || category == Character.INITIAL_QUOTE_PUNCTUATION
                    || category == Character.FINAL_QUOTE_PUNCTUATION;
        }
        return false;
    }

    /**
     * Declares the annotation types this annotator needs as input.
     *
     * @return a singleton set containing {@code Types.TOKEN}
     */
    @Override // com.gengoai.hermes.annotator.Annotator
    public Set<AnnotatableType> requires() {
        final Set<AnnotatableType> required = Collections.singleton(Types.TOKEN);
        return required;
    }

    /**
     * Declares the annotation types this annotator produces.
     *
     * @return a singleton set containing {@code Types.SENTENCE}
     */
    @Override // com.gengoai.hermes.annotator.Annotator
    public Set<AnnotatableType> satisfies() {
        final Set<AnnotatableType> provided = Collections.singleton(Types.SENTENCE);
        return provided;
    }
}
