package de.julielab.jcore.ae.jtbd;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import java.util.ArrayList;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* JADX INFO: Access modifiers changed from: package-private */
/* loaded from: input_file:de/julielab/jcore/ae/jtbd/Sentence2TokenPipe.class */
/**
 * Pipe that turns a sentence into a sequence of "units" with features and labels.
 *
 * <p>The instance data is expected to be the sentence as a {@code String}; the instance source is
 * expected to be a second {@code String} from which the labels are derived (or an empty/null value,
 * in which case every unit is labelled {@code "N"}). The sentence is cut into units at whitespace
 * and at every symbol contained in {@link TokenBoundarySymbols#getSymbols()}. For each unit a
 * {@link Token} carrying unit-level ({@code U_*}) and super-unit-level ({@code SU_*}) features is
 * produced, where the super unit is a whitespace-delimited chunk of the sentence.
 *
 * <p>All regular expressions are compiled once as constants; their text is deliberately identical to
 * the expressions used before so that the generated features do not change.
 */
public class Sentence2TokenPipe extends Pipe {
    private static final long serialVersionUID = 1;
    private static final Logger LOGGER = LoggerFactory.getLogger(Sentence2TokenPipe.class);

    /** Character-class body for upper-case letters (ASCII plus selected accented letters). */
    private static final String CAPS = "A-ZÁÉÍÓÚÀÈÌÒÙÇÑÏÄÖÜ";
    /** Character-class body for lower-case letters (ASCII plus selected accented letters). */
    private static final String LOW = "a-zàèìòùáéíóúçñïäöü";

    /** Label for a unit that is followed by whitespace in the label source string. */
    private static final String LABEL_P = "P";
    /** Label for a unit that is not followed by whitespace in the label source string. */
    private static final String LABEL_N = "N";
    /** Marker for a unit that is followed by whitespace in the sentence. */
    private static final String WS = "WS";
    /** Marker for a unit that is not followed by whitespace in the sentence. */
    private static final String NO_WS = "noWS";
    /** Value assigned to every (binary) feature. */
    private static final double ON = 1.0d;

    private static final Pattern WHITESPACE = Pattern.compile("\\s");
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    // --- unit level patterns ---
    private static final Pattern U_BWC_UPPER = Pattern.compile("[A-Z]+");
    private static final Pattern U_BWC_LOWER = Pattern.compile("[a-z]+");
    private static final Pattern U_BWC_OTHER = Pattern.compile("[^A-Za-z0-9]+");
    private static final Pattern U_ABBR1 = Pattern.compile("[" + CAPS + "]\\.");
    private static final Pattern U_ABBR2 = Pattern.compile("([" + CAPS + LOW + "]\\.)+");
    private static final Pattern U_ABBR3 = Pattern.compile("[" + LOW + "]+\\.");
    private static final Pattern U_INITCAPS = Pattern.compile("[" + CAPS + "].*");
    private static final Pattern U_ONECAPS = Pattern.compile("[" + CAPS + "]");
    private static final Pattern U_ALLCAPS = Pattern.compile("[" + CAPS + "]+");
    private static final Pattern U_ALPHANUMERIC =
            Pattern.compile("(.*[" + CAPS + LOW + "].*[0-9].*|.*[0-9].*[" + CAPS + LOW + "].*)");
    private static final Pattern U_ROMAN = Pattern.compile("[IVXDLCM]+");
    private static final Pattern U_HASROMAN = Pattern.compile(".*\\b[IVXDLCM]+\\b.*");
    private static final Pattern U_REALNUMBER = Pattern.compile("[-0-9]+[.,]+[0-9.,]+");
    private static final Pattern U_HASDIGITS = Pattern.compile(".*[0-9]+.*");
    private static final Pattern U_BEGINBRACKETS = Pattern.compile("(\\(.*|\\[.*)");

    // --- super unit level patterns ---
    // NOTE(review): this expression requires a word character followed by at least one literal ']'.
    // It looks like a typo for ".*[\\w]+.*", but it is kept unchanged because altering it would
    // change the features seen by already trained models - confirm before fixing.
    private static final Pattern SU_ALPHANUMERIC = Pattern.compile(".*[\\w]]+.*");
    private static final Pattern SU_IN_BRACKETS = Pattern.compile("\\(.*\\)|\\[.*\\]");
    private static final Pattern SU_CLOSED_BRACKETS = Pattern.compile(".*\\(.*\\).*|.*\\[.*\\].*");
    private static final Pattern SU_LEFT_BRACKET = Pattern.compile(".*\\(.*|.*\\[.*");
    private static final Pattern SU_RIGHT_BRACKET = Pattern.compile(".*\\).*|.*\\].*");
    private static final Pattern SU_ARROW = Pattern.compile(".*-->.*");
    private static final Pattern SU_IS_DOUBLE_DASH = Pattern.compile("----");
    private static final Pattern SU_HAS_DOUBLE_DASH = Pattern.compile(".*----.*");
    private static final Pattern SU_IS_DASH = Pattern.compile("--");
    private static final Pattern SU_HAS_DASH = Pattern.compile(".*--.*");
    private static final Pattern SU_PLUS_MINUS = Pattern.compile(".*[+-]/[+-].*");
    private static final Pattern SU_PM_BRACKETS = Pattern.compile(".*\\([+-]\\).*");
    private static final Pattern SU_ENUMERATION = Pattern.compile("\\(([0-9]|[a-h]|i|ii|iii|iv|v)\\)");
    private static final Pattern SU_BRACKETED_PLURAL = Pattern.compile(".*\\(s\\)");
    private static final Pattern SU_GENITIVE = Pattern.compile(".*'s");
    private static final Pattern SU_CHEMICAL = Pattern.compile("(.*[\\W].*){5,}");
    private static final Pattern SU_BWC_UPPER = Pattern.compile("[" + CAPS + "]+");
    private static final Pattern SU_BWC_LOWER = Pattern.compile("[" + LOW + "]+");
    private static final Pattern SU_BWC_OTHER = Pattern.compile("[^" + CAPS + LOW + "0-9]+");
    private static final Pattern SU_WWW_URL = Pattern.compile("\\(?www\\..*?\\)?");
    private static final Pattern SU_HTTP_URL = Pattern.compile("\\(?http:.*?\\)?");
    private static final Pattern SU_FTP_URL = Pattern.compile("\\(?ftp:.*?\\)?");

    private final Set<String> tbSymbols;
    private final Pattern splitPattern;

    /**
     * Creates the pipe with a fresh data alphabet and label alphabet and loads the token boundary
     * symbols from {@link TokenBoundarySymbols#getSymbols()}.
     */
    public Sentence2TokenPipe() {
        super(new Alphabet(), new LabelAlphabet());
        this.splitPattern = Pattern.compile("[^\\s]+");
        this.tbSymbols = TokenBoundarySymbols.getSymbols();
    }

    /**
     * Splits the given text into its maximal runs of non-whitespace characters.
     *
     * @param str the text to split
     * @return the non-whitespace chunks in order of appearance
     */
    private ArrayList<String> getSuperUnits(String str) {
        final Matcher matcher = this.splitPattern.matcher(str);
        final ArrayList<String> superUnits = new ArrayList<>();
        while (matcher.find()) {
            superUnits.add(matcher.group());
        }
        return superUnits;
    }

    /** Tells whether the given one-character string matches the regex class {@code \s}. */
    private static boolean isWhitespace(String character) {
        return WHITESPACE.matcher(character).matches();
    }

    /** Tells whether a character exists directly after {@code pos} and is whitespace. */
    private static boolean isFollowedByWhitespace(String text, int pos) {
        final int next = pos + 1;
        return next < text.length() && isWhitespace(String.valueOf(text.charAt(next)));
    }

    /** Tells whether the pattern matches the complete input (same semantics as {@code String.matches}). */
    private static boolean matches(Pattern pattern, String input) {
        return pattern.matcher(input).matches();
    }

    /**
     * Collapses runs of upper-case letters to {@code A}, lower-case letters to {@code a}, digits to
     * {@code 0} and everything else to {@code x}, applying the replacements in that order.
     */
    private static String collapseCharClasses(String text, Pattern upper, Pattern lower, Pattern other) {
        String collapsed = upper.matcher(text).replaceAll("A");
        collapsed = lower.matcher(collapsed).replaceAll("a");
        collapsed = DIGITS.matcher(collapsed).replaceAll("0");
        return other.matcher(collapsed).replaceAll("x");
    }

    /**
     * Derives one label per unit from a string in which whitespace marks the boundaries.
     *
     * <p>The text is cut into units exactly like in {@link #makeUnits}: at whitespace and at token
     * boundary symbols. A unit directly followed by whitespace is labelled {@code "P"}, every other
     * unit (including the last one) is labelled {@code "N"}.
     *
     * @param str the text to derive the labels from; must not be {@code null}
     * @return the labels, one per unit, in order of appearance
     */
    public ArrayList<String> makeLabels(String str) {
        LOGGER.trace("makeLabels()");
        final ArrayList<String> labels = new ArrayList<>();
        final int length = str.length();
        // true while characters of a not yet labelled (non-boundary) unit have been seen
        boolean pendingUnit = false;
        for (int pos = 0; pos < length; pos++) {
            final String current = String.valueOf(str.charAt(pos));
            LOGGER.trace("makeLabels() - {}", current);
            if (isWhitespace(current)) {
                LOGGER.trace("makeLabels() - found WS");
                if (pendingUnit) {
                    pendingUnit = false;
                    LOGGER.trace("makeLabels() - adding label P");
                    labels.add(LABEL_P);
                }
            } else if (this.tbSymbols.contains(current)) {
                LOGGER.trace("makeLabels() - found TB");
                if (pendingUnit) {
                    // the unit before the boundary symbol is not followed by whitespace
                    pendingUnit = false;
                    LOGGER.trace("makeLabels() - adding label N");
                    labels.add(LABEL_N);
                }
                // the boundary symbol is a unit of its own
                if (isFollowedByWhitespace(str, pos)) {
                    LOGGER.trace("makeLabels() - label P");
                    labels.add(LABEL_P);
                } else {
                    LOGGER.trace("makeLabels() - label N");
                    labels.add(LABEL_N);
                }
            } else {
                LOGGER.trace("makeLabels() - token");
                pendingUnit = true;
            }
        }
        LOGGER.trace("makeLabels() -  {}", str);
        if (pendingUnit) {
            labels.add(LABEL_N);
        }
        LOGGER.trace("makeLabels() - {}", labels);
        return labels;
    }

    /**
     * Cuts a sentence into units and records for each unit whether whitespace follows it.
     *
     * <p>Units are maximal runs of characters that are neither whitespace nor token boundary
     * symbols; each token boundary symbol forms a unit of its own. Offsets passed to {@link Unit}
     * are character offsets into {@code str} (begin inclusive, end exclusive).
     *
     * <p>NOTE(review): the super-unit index only advances when whitespace directly follows a
     * pending non-boundary unit. If a boundary symbol precedes the whitespace (as in
     * {@code "a, b"}), the index is not advanced and the following units are paired with the
     * previous super unit. This is kept unchanged because it determines the {@code SU_*} features
     * existing models were built with - confirm whether it is intended.
     *
     * @param str        the sentence to split; must not be {@code null}
     * @param arrayList  output list receiving the units in order of appearance
     * @param arrayList2 output list receiving {@code "WS"} or {@code "noWS"} for each added unit
     */
    public void makeUnits(String str, ArrayList<Unit> arrayList, ArrayList<String> arrayList2) {
        LOGGER.trace("makeUnits() - making units...");
        // evaluated once so that no trace message is assembled when tracing is off
        final boolean trace = LOGGER.isTraceEnabled();
        final ArrayList<String> superUnits = getSuperUnits(str);
        final StringBuilder pending = new StringBuilder();
        final int length = str.length();
        int superUnitIndex = 0;
        // begin offset of the unit currently being collected
        int start = 0;
        for (int pos = 0; pos < length; pos++) {
            final String current = String.valueOf(str.charAt(pos));
            LOGGER.trace("makeUnits() - {}", current);
            if (isWhitespace(current)) {
                LOGGER.trace("makeUnits() - WS");
                if (pending.length() > 0) {
                    final String rep = pending.toString();
                    arrayList.add(new Unit(start, pos, rep, superUnits.get(superUnitIndex)));
                    LOGGER.trace("makeUnits() -adding unit:{}!", rep);
                    pending.setLength(0);
                    arrayList2.add(WS);
                    superUnitIndex++;
                }
                if (trace) {
                    LOGGER.trace("makeUnits() - " + arrayList.toString() + " -- " + arrayList2.toString());
                }
                start = pos + 1;
            } else if (this.tbSymbols.contains(current)) {
                LOGGER.trace("makeUnits() - TB");
                if (pending.length() > 0) {
                    // close the unit that ends right before the boundary symbol
                    final String rep = pending.toString();
                    arrayList.add(new Unit(start, pos, rep, superUnits.get(superUnitIndex)));
                    LOGGER.trace("makeUnits() - Adding unit:{}!", rep);
                    pending.setLength(0);
                    arrayList2.add(NO_WS);
                    start = pos;
                    if (trace) {
                        LOGGER.trace("makeUnits() - SE:" + start + "." + pos);
                    }
                }
                // the boundary symbol itself becomes a one-character unit
                LOGGER.trace("makeUnits() - adding unit:{}!!", current);
                arrayList2.add(isFollowedByWhitespace(str, pos) ? WS : NO_WS);
                final int end = pos + 1;
                if (trace) {
                    LOGGER.trace("makeUnits() - SE:" + start + "." + end);
                }
                arrayList.add(new Unit(start, end, current, superUnits.get(superUnitIndex)));
                start = end;
                if (trace) {
                    LOGGER.trace("makeUnits() - " + arrayList.toString() + " -- " + arrayList2.toString());
                }
            } else {
                LOGGER.trace("makeUnits() - token");
                pending.append(current);
            }
        }
        LOGGER.trace("makeUnits() - {}", str);
        if (pending.length() > 0) {
            arrayList.add(new Unit(start, length, pending.toString(), superUnits.get(superUnitIndex)));
            arrayList2.add(NO_WS);
        }
        if (trace) {
            // rebuild the sentence from the units purely for diagnostic output
            final StringBuilder rebuilt = new StringBuilder();
            for (int i = 0; i < arrayList.size(); i++) {
                LOGGER.trace("makeUnits() - " + arrayList.get(i) + "\t" + arrayList2.get(i));
                rebuilt.append(arrayList.get(i).rep);
                if (arrayList2.get(i).equals(WS)) {
                    rebuilt.append(' ');
                }
            }
            LOGGER.trace("makeUnits() -org: " + str);
            LOGGER.trace("makeUnits() -new: " + rebuilt);
            LOGGER.trace("makeUnits() - " + arrayList.toString());
        }
    }

    /**
     * Converts the sentence held by the instance into a feature-bearing token sequence.
     *
     * <p>On return the instance data is the {@link TokenSequence}, the target is the
     * {@link LabelSequence} and the name is the list of {@link Unit}s. On success the source is
     * replaced by the list of {@code "WS"}/{@code "noWS"} markers. If the numbers of units, labels
     * and whitespace markers disagree, an error is logged and the instance is returned with empty
     * token and label sequences and its source left untouched.
     *
     * @param instance instance whose data is the sentence string and whose source is the string the
     *                 labels are derived from (empty or {@code null} means: label every unit
     *                 {@code "N"})
     * @return the same instance, modified in place
     */
    public Instance pipe(Instance instance) {
        final String sentence = (String) instance.getData();
        final String labelSource = (String) instance.getSource();
        final TokenSequence tokens = new TokenSequence();
        final LabelSequence labelSequence = new LabelSequence(getTargetAlphabet());
        final ArrayList<Unit> units = new ArrayList<>();
        final ArrayList<String> wSpaces = new ArrayList<>();
        makeUnits(sentence, units, wSpaces);

        final ArrayList<String> labels;
        if (labelSource != null && labelSource.length() > 0) {
            labels = makeLabels(labelSource);
        } else {
            // nothing to derive labels from: use the default label for every unit
            labels = new ArrayList<>(units.size());
            for (int i = 0; i < units.size(); i++) {
                labels.add(LABEL_N);
            }
        }

        if (units.size() != labels.size() || labels.size() != wSpaces.size()) {
            logUnitLabelMismatch(instance, sentence, labelSource, units.size(), labels.size(), wSpaces.size());
            instance.setData(tokens);
            instance.setTarget(labelSequence);
            instance.setName(units);
            return instance;
        }

        for (int i = 0; i < units.size(); i++) {
            final String unitRep = units.get(i).rep;
            final String superUnitRep = units.get(i).superUnitRep;
            final Token token = new Token(unitRep);
            addUnitFeatures(token, unitRep, wSpaces.get(i).equals(WS));
            addSuperUnitFeatures(token, unitRep, superUnitRep);
            tokens.add(token);
            labelSequence.add(labels.get(i));
        }
        instance.setData(tokens);
        instance.setTarget(labelSequence);
        instance.setName(units);
        instance.setSource(wSpaces);
        return instance;
    }

    /** Logs the size mismatch between units, labels and whitespace markers for one instance. */
    private static void logUnitLabelMismatch(Instance instance, String sentence, String labelSource,
            int unitCount, int labelCount, int wSpaceCount) {
        final Object name = instance.getName();
        // the name is only usable as a position if it is an Integer; -1 means "unknown"
        final int position = name instanceof Integer ? ((Integer) name).intValue() + 1 : -1;
        LOGGER.error("Something's wrong with unit creation. Number of units: {}; number of labels: {}; number of whitespaces: {}",
                Integer.valueOf(unitCount), Integer.valueOf(labelCount), Integer.valueOf(wSpaceCount));
        LOGGER.error("pipe() - Unit and label extraction produced failure (at position {}). Omitting sentences for feature generation...\n{}\n{}",
                position == -1 ? "unknown" : String.valueOf(position), sentence, labelSource);
    }

    /**
     * Adds the {@code U_*} features, which depend only on the unit itself and on whether whitespace
     * follows it.
     */
    private void addUnitFeatures(Token token, String unitRep, boolean hasRightWhitespace) {
        token.setFeatureValue("U_lex=" + unitRep, ON);
        if (hasRightWhitespace) {
            token.setFeatureValue("U_HasRightWhiteSpace", ON);
        }
        if (this.tbSymbols.contains(unitRep)) {
            token.setFeatureValue("U_isTokenBoundarySymbol", ON);
        }
        // unlike SU_BWC this uses the plain ASCII classes only
        token.setFeatureValue("U_BWC=" + collapseCharClasses(unitRep, U_BWC_UPPER, U_BWC_LOWER, U_BWC_OTHER), ON);
        if (unitRep.length() <= 3) {
            token.setFeatureValue("U_SIZE1", ON);
        } else if (unitRep.length() <= 6) {
            token.setFeatureValue("U_SIZE2", ON);
        } else {
            token.setFeatureValue("U_SIZE3", ON);
        }
        if (matches(U_ABBR1, unitRep)) {
            token.setFeatureValue("U_ABBR1", ON);
        }
        if (matches(U_ABBR2, unitRep)) {
            token.setFeatureValue("U_ABBR2", ON);
        }
        if (matches(U_ABBR3, unitRep)) {
            token.setFeatureValue("U_ABBR3", ON);
        }
        if (matches(U_INITCAPS, unitRep)) {
            token.setFeatureValue("U_INITCAPS", ON);
        }
        if (matches(U_ONECAPS, unitRep)) {
            token.setFeatureValue("U_ONECAPS", ON);
        }
        if (matches(U_ALLCAPS, unitRep)) {
            token.setFeatureValue("U_ALLCAPS", ON);
        }
        if (matches(U_ALPHANUMERIC, unitRep)) {
            token.setFeatureValue("U_ALPHANUMERIC", ON);
        }
        if (matches(U_ROMAN, unitRep)) {
            token.setFeatureValue("U_ROMAN", ON);
        }
        if (matches(U_HASROMAN, unitRep)) {
            token.setFeatureValue("U_HASROMAN", ON);
        }
        if (matches(DIGITS, unitRep)) {
            token.setFeatureValue("U_NATURALNUMBER", ON);
        }
        if (matches(U_REALNUMBER, unitRep)) {
            token.setFeatureValue("U_REALNUMBER", ON);
        }
        if (matches(U_HASDIGITS, unitRep)) {
            token.setFeatureValue("U_HASDIGITS", ON);
        }
        if (matches(U_BEGINBRACKETS, unitRep)) {
            token.setFeatureValue("U_BEGINBRACKETS", ON);
        }
    }

    /**
     * Adds the {@code SU_*} features, which describe the super unit the unit was paired with.
     */
    private static void addSuperUnitFeatures(Token token, String unitRep, String superUnitRep) {
        token.setFeatureValue("SU_lex=" + superUnitRep, ON);
        if (matches(SU_ALPHANUMERIC, superUnitRep)) {
            token.setFeatureValue("SU_isAlphanumeric", ON);
        }
        // bracket features are mutually exclusive; the first matching one wins
        if (matches(SU_IN_BRACKETS, superUnitRep)) {
            token.setFeatureValue("SU_inBrackets", ON);
        } else if (matches(SU_CLOSED_BRACKETS, superUnitRep)) {
            token.setFeatureValue("SU_hasClosedBrackets", ON);
        } else if (matches(SU_LEFT_BRACKET, superUnitRep)) {
            token.setFeatureValue("SU_hasLeftBracketOnly", ON);
        } else if (matches(SU_RIGHT_BRACKET, superUnitRep)) {
            token.setFeatureValue("SU_hasRightBracketOnly", ON);
        }
        if (matches(SU_ARROW, superUnitRep) && (unitRep.equals("-") || unitRep.equals(">"))) {
            token.setFeatureValue("SU_isPartOfArrow", ON);
        }
        // dash features are mutually exclusive; the first matching one wins
        if (matches(SU_IS_DOUBLE_DASH, superUnitRep)) {
            token.setFeatureValue("SU_isDoubleDash", ON);
        } else if (matches(SU_HAS_DOUBLE_DASH, superUnitRep)) {
            token.setFeatureValue("SU_hasDoubleDash", ON);
        } else if (matches(SU_IS_DASH, superUnitRep)) {
            token.setFeatureValue("SU_isDash", ON);
        } else if (matches(SU_HAS_DASH, superUnitRep)) {
            token.setFeatureValue("SU_hasDash", ON);
        }
        if (matches(SU_PLUS_MINUS, superUnitRep)) {
            token.setFeatureValue("SU_hasPlusMinus", ON);
        }
        if (matches(SU_PM_BRACKETS, superUnitRep)) {
            token.setFeatureValue("SU_PMwithBrackets", ON);
        }
        if (matches(SU_ENUMERATION, superUnitRep)) {
            token.setFeatureValue("SU_isEnumeration", ON);
        }
        if (matches(SU_BRACKETED_PLURAL, superUnitRep)) {
            token.setFeatureValue("SU_hasBracketedPlural", ON);
        }
        if (matches(SU_GENITIVE, superUnitRep)) {
            token.setFeatureValue("SU_hasGenitive", ON);
        }
        if (superUnitRep.length() <= 4) {
            token.setFeatureValue("SU_SIZE1", ON);
        } else if (superUnitRep.length() <= 8) {
            token.setFeatureValue("SU_SIZE2", ON);
        } else {
            token.setFeatureValue("SU_SIZE3", ON);
        }
        // the cheap substring test runs before the backtracking-heavy regex; the result is the same
        if (superUnitRep.length() > 6 && !superUnitRep.contains("-->") && matches(SU_CHEMICAL, superUnitRep)) {
            token.setFeatureValue("SU_isChemical", ON);
        }
        token.setFeatureValue("SU_BWC=" + collapseCharClasses(superUnitRep, SU_BWC_UPPER, SU_BWC_LOWER, SU_BWC_OTHER), ON);
        if (matches(SU_WWW_URL, superUnitRep)) {
            token.setFeatureValue("SU_wwwURL", ON);
        } else if (matches(SU_HTTP_URL, superUnitRep) || matches(SU_FTP_URL, superUnitRep)) {
            token.setFeatureValue("SU_httpURL", ON);
        }
    }
}
