package de.julielab.jcore.ae.jsbd;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.propertyeditors.CustomBooleanEditor;

/* loaded from: input_file:de/julielab/jcore/ae/jsbd/Abstract2UnitPipe.class */
/**
 * Mallet pipe that turns the text lines carried by an {@link Instance} into a sequence of
 * feature-bearing units.
 *
 * <p>On input the instance data is expected to be a {@code List<String>} of lines and the source a
 * {@code String}. On output the data is a {@link TokenSequence} with one {@link Token} per unit,
 * the target is a {@link LabelSequence} holding {@code "EOS"} for the last unit of each line and
 * {@code "IS"} for every other unit, and the name is the list of all {@link Unit}s in order.
 *
 * <p>A unit is a whitespace-delimited token or, when splitting after punctuation is enabled, a
 * piece of such a token that ends right after a punctuation character.
 */
class Abstract2UnitPipe extends Pipe {
    private static final long serialVersionUID = 1;

    private static final Logger log = LoggerFactory.getLogger(Abstract2UnitPipe.class);

    /** A token: a maximal run of non-whitespace characters. */
    private static final Pattern splitPattern = Pattern.compile("[^\\s]+");
    /** A single punctuation character; tokens are optionally split after each match. */
    private static final Pattern punctuationPattern = Pattern.compile("\\p{P}");

    // Feature patterns, compiled once instead of on every String.matches()/replaceAll() call.
    // The expressions themselves are unchanged; each is applied to the whole unit text.
    private static final Pattern initCapsPattern = Pattern.compile("[\\p{Lu}\\p{M}].*");
    private static final Pattern oneCapsPattern = Pattern.compile("[\\p{Lu}\\p{M}]");
    private static final Pattern allCapsPattern = Pattern.compile("[\\p{Lu}\\p{M}]+");
    private static final Pattern alphanumericPattern =
            Pattern.compile("(.*[\\p{L}\\p{M}].*[0-9].*|.*[0-9].*[\\p{L}\\p{M}].*)");
    private static final Pattern romanPattern = Pattern.compile("[IVXDLCM]+");
    private static final Pattern hasRomanPattern = Pattern.compile(".*\\b[IVXDLCM]+\\b.*");
    private static final Pattern naturalNumberPattern = Pattern.compile("[0-9]+");
    private static final Pattern realNumberPattern = Pattern.compile("[-0-9]+[.,]+[0-9.,]+");
    private static final Pattern hasDigitsPattern = Pattern.compile(".*[0-9]+.*");
    private static final Pattern beginBracketsPattern = Pattern.compile("(\\(.*|\\[.*)");
    private static final Pattern insideBracketsPattern = Pattern.compile("(\\(.*\\)|\\[.*\\])");
    private static final Pattern beginQuotesPattern = Pattern.compile("(\".*|'.*)");
    private static final Pattern insideQuotesPattern = Pattern.compile("(\".*\"|'.*')");
    private static final Pattern abbr1Pattern = Pattern.compile("[A-Z]\\.");
    private static final Pattern abbr2Pattern = Pattern.compile("([A-Za-z]\\.)+");
    private static final Pattern abbr3Pattern = Pattern.compile("[abcdfghjklmnpqrstvwxyz]+\\.");
    private static final Pattern upperRunPattern = Pattern.compile("[\\p{Lu}\\p{M}]+");
    private static final Pattern lowerRunPattern = Pattern.compile("[\\p{Ll}\\p{M}]+");
    private static final Pattern digitRunPattern = Pattern.compile("[0-9]+");
    private static final Pattern otherRunPattern = Pattern.compile("[^\\p{L}\\p{M}0-9]+");

    /** Single-character strings that count as end-of-sentence symbols. */
    TreeSet<String> eosSymbols;
    /** Known abbreviations; a unit found in this set gets the {@code KNOWNABBR} feature. */
    TreeSet<String> abbrList;
    private boolean splitAfterPunctuation;

    /**
     * Creates the pipe with fresh data and target alphabets and loads the end-of-sentence symbols
     * and the abbreviation list.
     *
     * @param splitAfterPunctuation whether tokens are additionally split after every punctuation
     *        character
     */
    public Abstract2UnitPipe(boolean splitAfterPunctuation) {
        super(new Alphabet(), new LabelAlphabet());
        this.splitAfterPunctuation = splitAfterPunctuation;
        this.eosSymbols = new EOSSymbols().getSymbols();
        this.abbrList = new AbbreviationsMedical().getSet();
    }

    /**
     * Replaces the instance's list of lines by a token sequence of units with their features.
     *
     * @param instance instance whose data is a {@code List<String>} and whose source is a
     *        {@code String}
     * @return the same instance with data, target, name and source set as described on the class
     * @throws ClassCastException if the source is not a {@code String} or the data is not a list
     * @throws NullPointerException if the list contains a {@code null} line
     */
    @Override // cc.mallet.pipe.Pipe
    public Instance pipe(Instance instance) {
        String source = (String) instance.getSource();
        @SuppressWarnings("unchecked") // by contract of this pipe the data is a list of lines
        List<String> lines = (List<String>) instance.getData();

        // Tokenise every line exactly once; the result feeds both the frequency table and the
        // feature extraction below.
        List<List<Unit>> unitsPerLine = new ArrayList<List<Unit>>(lines.size());
        for (String line : lines) {
            unitsPerLine.add(getUnits(line));
        }
        Map<String, Integer> unitFrequency = getUnitFrequency(unitsPerLine);

        TokenSequence tokens = new TokenSequence();
        LabelSequence labels = new LabelSequence((LabelAlphabet) getTargetAlphabet());
        List<Unit> allUnits = new ArrayList<Unit>();
        for (List<Unit> lineUnits : unitsPerLine) {
            int lastIndex = lineUnits.size() - 1;
            for (int i = 0; i <= lastIndex; i++) {
                tokens.add(createToken(lineUnits.get(i), unitFrequency));
                // Only the final unit of a line is labelled as an end of sentence.
                labels.add(i == lastIndex ? "EOS" : "IS");
            }
            allUnits.addAll(lineUnits);
        }

        instance.setData(tokens);
        instance.setTarget(labels);
        instance.setName(allUnits);
        instance.setSource(source);
        return instance;
    }

    /**
     * Builds the token for one unit and attaches all of its features. The features are set in a
     * fixed order and with fixed names; do not rename them, they are runtime keys.
     */
    private Token createToken(Unit unit, Map<String, Integer> unitFrequency) {
        String rep = unit.rep;
        String plainUnit = getPlainUnit(rep);
        boolean endsWithEos = containsEOSSymbol(rep);
        Token token = new Token(rep);

        if (unit.isTokenInternal) {
            flag(token, "istokeninternal=");
        }
        if (endsWithEos) {
            flag(token, "endwithEOSSymb=" + getEOSSymbol(rep));
        }
        int innerEosCount = nrEOSSymbolsContained(plainUnit);
        if (innerEosCount > 0) {
            flag(token, "hasinnerEOSSymb=" + innerEosCount);
        }
        flag(token, "TOKEN=" + rep);

        flagIfMatches(token, initCapsPattern, rep, "INITCAPS");
        flagIfMatches(token, oneCapsPattern, rep, "ONECAPS");
        flagIfMatches(token, allCapsPattern, rep, "ALLCAPS");
        flagIfMatches(token, alphanumericPattern, rep, "ALPHANUMERIC");
        flagIfMatches(token, romanPattern, rep, "ROMAN");
        flagIfMatches(token, hasRomanPattern, rep, "HASROMAN");
        flagIfMatches(token, naturalNumberPattern, rep, "NATURALNUMBER");
        flagIfMatches(token, realNumberPattern, rep, "REALNUMBER");
        flagIfMatches(token, hasDigitsPattern, rep, "HASDIGITS");
        flagIfMatches(token, beginBracketsPattern, rep, "BEGINBRACKETS");
        flagIfMatches(token, insideBracketsPattern, rep, "INSIDEBRACKETS");
        flagIfMatches(token, beginQuotesPattern, rep, "BEGINQUOTES");
        flagIfMatches(token, insideQuotesPattern, rep, "INSIDEBQUOTES");

        // Length bucket: up to 3, up to 6, longer.
        if (rep.length() <= 3) {
            flag(token, "SIZE1");
        } else if (rep.length() <= 6) {
            flag(token, "SIZE2");
        } else {
            flag(token, "SIZE3");
        }

        flagIfMatches(token, abbr1Pattern, rep, "ABBR1");
        flagIfMatches(token, abbr2Pattern, rep, "ABBR2");
        flagIfMatches(token, abbr3Pattern, rep, "ABBR3");

        flag(token, "BWC=" + wordClass(plainUnit));

        // The unit text ends with an EOS symbol and occurs more than once in this instance.
        Integer frequency = unitFrequency.get(rep);
        if (endsWithEos && frequency != null && frequency.intValue() > 1) {
            flag(token, "FreqTokenEOSSymbol");
        }
        if (this.abbrList.contains(rep)) {
            flag(token, "KNOWNABBR");
        }
        return token;
    }

    /** Switches a binary feature on. */
    private static void flag(Token token, String feature) {
        token.setFeatureValue(feature, 1.0d);
    }

    /** Switches a binary feature on when the pattern matches the entire text. */
    private static void flagIfMatches(Token token, Pattern pattern, String text, String feature) {
        if (pattern.matcher(text).matches()) {
            flag(token, feature);
        }
    }

    /**
     * Collapses a string into a shape signature: each run of uppercase letters becomes {@code A},
     * then each run of lowercase letters {@code a}, each run of digits {@code 0} and each run of
     * anything else {@code x}. The replacements are applied in that order.
     */
    private static String wordClass(String text) {
        String shape = upperRunPattern.matcher(text).replaceAll("A");
        shape = lowerRunPattern.matcher(shape).replaceAll("a");
        shape = digitRunPattern.matcher(shape).replaceAll("0");
        return otherRunPattern.matcher(shape).replaceAll("x");
    }

    /** Counts the characters of {@code str} that are end-of-sentence symbols. */
    private int nrEOSSymbolsContained(String str) {
        int count = 0;
        for (char c : str.toCharArray()) {
            if (this.eosSymbols.contains(String.valueOf(c))) {
                count++;
            }
        }
        return count;
    }

    /** Tells whether the last character of {@code str} is an end-of-sentence symbol. */
    private boolean containsEOSSymbol(String str) {
        return !str.isEmpty() && this.eosSymbols.contains(str.substring(str.length() - 1));
    }

    /** Returns the trailing end-of-sentence symbol of {@code str}, or {@code ""} if none. */
    private String getEOSSymbol(String str) {
        return containsEOSSymbol(str) ? str.substring(str.length() - 1) : "";
    }

    /** Returns {@code str} without its trailing end-of-sentence symbol, if it has one. */
    private String getPlainUnit(String str) {
        return containsEOSSymbol(str) ? str.substring(0, str.length() - 1) : str;
    }

    /** Counts how often each unit text occurs across all lines. */
    private Map<String, Integer> getUnitFrequency(List<List<Unit>> unitsPerLine) {
        Map<String, Integer> frequencies = new HashMap<String, Integer>();
        for (List<Unit> lineUnits : unitsPerLine) {
            for (Unit unit : lineUnits) {
                Integer seen = frequencies.get(unit.rep);
                frequencies.put(unit.rep, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
            }
        }
        return frequencies;
    }

    /**
     * Splits a line into units.
     *
     * <p>Every whitespace-delimited token yields one unit. If splitting after punctuation is
     * enabled, a token is cut after each punctuation character, and any remainder after the last
     * punctuation character forms a final unit. A punctuation unit is marked token-internal when
     * it ends before the end of its token.
     *
     * @param str the line to split; must not be {@code null}
     * @return the units in order of appearance, empty if the line holds no non-whitespace text
     */
    private List<Unit> getUnits(String str) {
        List<Unit> units = new ArrayList<Unit>();
        Matcher tokenMatcher = splitPattern.matcher(str);
        while (tokenMatcher.find()) {
            int tokenStart = tokenMatcher.start();
            int tokenEnd = tokenMatcher.end();
            int unitStart = tokenStart;
            if (this.splitAfterPunctuation) {
                Matcher punctuation = punctuationPattern.matcher(tokenMatcher.group());
                while (punctuation.find()) {
                    int unitEnd = tokenStart + punctuation.end();
                    // NOTE(review): this flag is still derived from the token start, exactly as
                    // before, so every punctuation piece of a token shares it. Confirm against
                    // Unit whether it should describe the piece itself instead.
                    boolean whitespaceBefore =
                            tokenStart > 0 && Character.isWhitespace(str.charAt(tokenStart - 1));
                    boolean whitespaceAfter =
                            unitEnd < str.length() && Character.isWhitespace(str.charAt(unitEnd));
                    // The first offset is the start of this piece, matching both the substring
                    // taken for its text and the remainder unit below. Previously the token start
                    // was passed, which was wrong for every piece after the first.
                    units.add(new Unit(unitStart, unitEnd, str.substring(unitStart, unitEnd),
                            unitEnd < tokenEnd, whitespaceBefore, whitespaceAfter));
                    unitStart = unitEnd;
                }
            }
            // Whatever is left of the token (all of it when nothing was split off).
            if (unitStart < tokenEnd) {
                boolean whitespaceBefore =
                        unitStart > 0 && Character.isWhitespace(str.charAt(unitStart - 1));
                boolean whitespaceAfter =
                        tokenEnd < str.length() && Character.isWhitespace(str.charAt(tokenEnd));
                units.add(new Unit(unitStart, tokenEnd, str.substring(unitStart, tokenEnd), false,
                        whitespaceBefore, whitespaceAfter));
            }
        }
        return units;
    }

    /** Restores the serialised state and logs the punctuation-splitting mode of the model. */
    private void readObject(ObjectInputStream objectInputStream) throws IOException, ClassNotFoundException {
        objectInputStream.defaultReadObject();
        log.info("This sentence splitter model allows sentence splits after all punctuation: {}",
                this.splitAfterPunctuation);
    }
}
