package de.julielab.jcore.ae.jnet.tagger;

import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureVectorSequence;
import cc.mallet.pipe.tsf.LexiconMembership;
import cc.mallet.pipe.tsf.OffsetConjunctions;
import cc.mallet.pipe.tsf.RegexMatches;
import cc.mallet.pipe.tsf.TokenTextCharNGrams;
import cc.mallet.pipe.tsf.TokenTextCharPrefix;
import cc.mallet.pipe.tsf.TokenTextCharSuffix;
import cc.mallet.types.FeatureVectorSequence;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.LabelSequence;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Properties;
import java.util.regex.Pattern;

/* loaded from: input_file:de/julielab/jcore/ae/jnet/tagger/FeatureGenerator.class */
/**
 * Builds the Mallet feature pipeline for sentences and converts sequence-level
 * instances into token-level instances.
 *
 * <p>Which optional features are added is driven entirely by the
 * {@link Properties} object handed to {@link #createFeatureData}; the property
 * keys read here are {@code feat_bioregexp_enabled}, {@code prefix_sizes},
 * {@code suffix_sizes}, {@code offset_conjunctions}, {@code token_ngrams},
 * {@code char_ngrams} and whatever lexicon keys {@code FeatureConfiguration}
 * reports.
 */
class FeatureGenerator {
    private static final String GREEK = "(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)";
    private static final String UNICODE_UPPER = "\\p{Lu}";
    private static final String UNICODE_LOWER = "\\p{Ll}";

    /**
     * Assembles the feature-extraction pipeline according to {@code properties}
     * and runs all given sentences through it.
     *
     * @param arrayList  the sentences to convert into instances
     * @param properties the feature configuration
     * @return an instance list backed by the assembled {@link SerialPipes},
     *         containing the piped sentences
     */
    public InstanceList createFeatureData(ArrayList<Sentence> arrayList, Properties properties) {
        FeatureConfiguration featureConfiguration = new FeatureConfiguration();
        ArrayList<Pipe> pipes = new ArrayList<Pipe>();

        pipes.add(new BasePipe(properties));
        addOrthographicFeatures(pipes);
        if (featureConfiguration.featureActive(properties, "feat_bioregexp_enabled")) {
            addBioRegexFeatures(pipes);
        }
        addAffixFeatures(pipes, featureConfiguration, properties);
        addLexiconFeatures(pipes, featureConfiguration, properties);
        addContextFeatures(pipes, featureConfiguration, properties);

        // Must stay last: turns the accumulated token features into feature vectors.
        pipes.add(new TokenSequence2FeatureVectorSequence(true, true));

        Pipe[] pipeArray = pipes.toArray(new Pipe[pipes.size()]);
        InstanceList instanceList = new InstanceList(new SerialPipes(pipeArray));
        instanceList.addThruPipe(new SentencePipeIterator(arrayList));
        return instanceList;
    }

    /**
     * Adds the always-on regex features describing capitalization, digits,
     * dashes and punctuation of a token.
     */
    private static void addOrthographicFeatures(ArrayList<Pipe> pipes) {
        pipes.add(new RegexMatches("INITLOWCAPS_ANYTHING_NONUMBER", Pattern.compile("[" + UNICODE_LOWER + "][" + UNICODE_UPPER + "][^0-9]*")));
        pipes.add(new RegexMatches("INITLOWCAPS_ANYTHING_WITHNUMBER", Pattern.compile("[" + UNICODE_LOWER + "][" + UNICODE_UPPER + "].*[0-9].*")));
        pipes.add(new RegexMatches("INITCAPS", Pattern.compile("[" + UNICODE_UPPER + "].*")));
        pipes.add(new RegexMatches("INITCAPSALPHA", Pattern.compile("[" + UNICODE_UPPER + "][" + UNICODE_LOWER + "].*")));
        pipes.add(new RegexMatches("ALLCAPS", Pattern.compile("[" + UNICODE_UPPER + "]+")));
        pipes.add(new RegexMatches("CAPSMIX", Pattern.compile("[" + UNICODE_UPPER + UNICODE_LOWER + "]+")));
        pipes.add(new RegexMatches("HASDIGIT", Pattern.compile(".*[0-9].*")));
        pipes.add(new RegexMatches("SINGLEDIGIT", Pattern.compile("[0-9]")));
        pipes.add(new RegexMatches("DOUBLEDIGIT", Pattern.compile("[0-9][0-9]")));
        pipes.add(new RegexMatches("NATURALNUMBER", Pattern.compile("[0-9]+")));
        pipes.add(new RegexMatches("REALNUMBER", Pattern.compile("[-0-9]+[.,]+[0-9.,]+")));
        pipes.add(new RegexMatches("HASDASH", Pattern.compile(".*-.*")));
        pipes.add(new RegexMatches("INITDASH", Pattern.compile("-.*")));
        pipes.add(new RegexMatches("ENDDASH", Pattern.compile(".*-")));
        // Two pipes share the ALPHANUMERIC name on purpose: one per ordering
        // (letter before digit, digit before letter).
        pipes.add(new RegexMatches("ALPHANUMERIC", Pattern.compile(".*[" + UNICODE_UPPER + UNICODE_LOWER + "].*[0-9].*")));
        pipes.add(new RegexMatches("ALPHANUMERIC", Pattern.compile(".*[0-9].*[" + UNICODE_UPPER + UNICODE_LOWER + "].*")));
        pipes.add(new RegexMatches("IS_PUNCTUATION_MARK", Pattern.compile("[,.;:?!]")));
        pipes.add(new RegexMatches("IS_MINUSDASHSLASH", Pattern.compile("[-_/]")));
    }

    /**
     * Adds the regex features for Roman numerals and spelled-out Greek letters
     * (only used when {@code feat_bioregexp_enabled} is active).
     */
    private static void addBioRegexFeatures(ArrayList<Pipe> pipes) {
        pipes.add(new RegexMatches("ROMAN", Pattern.compile("[IVXDLCM]+")));
        pipes.add(new RegexMatches("HASROMAN", Pattern.compile(".*\\b[IVXDLCM]+\\b.*")));
        pipes.add(new RegexMatches("GREEK", Pattern.compile(GREEK)));
        pipes.add(new RegexMatches("HASGREEK", Pattern.compile(".*\\b" + GREEK + "\\b.*")));
    }

    /**
     * Adds one prefix pipe per configured {@code prefix_sizes} entry and one
     * suffix pipe per configured {@code suffix_sizes} entry.
     */
    private static void addAffixFeatures(ArrayList<Pipe> pipes, FeatureConfiguration featureConfiguration, Properties properties) {
        int[] prefixSizes = featureConfiguration.getIntArray(properties, "prefix_sizes");
        if (prefixSizes != null) {
            for (int size : prefixSizes) {
                pipes.add(new TokenTextCharPrefix("PREFIX=", size));
            }
        }
        int[] suffixSizes = featureConfiguration.getIntArray(properties, "suffix_sizes");
        if (suffixSizes != null) {
            for (int size : suffixSizes) {
                pipes.add(new TokenTextCharSuffix("SUFFIX=", size));
            }
        }
    }

    /**
     * Adds one lexicon-membership pipe per lexicon key; the value of each key
     * in {@code properties} is used as the path of the lexicon file.
     *
     * <p>A lexicon whose file cannot be found is skipped (best effort), but the
     * failure is reported with the key and path so that it is not mistaken for
     * a successfully configured feature.
     */
    private static void addLexiconFeatures(ArrayList<Pipe> pipes, FeatureConfiguration featureConfiguration, Properties properties) {
        for (Iterator<String> keys = featureConfiguration.getLexiconKeys(properties).iterator(); keys.hasNext();) {
            String lexiconKey = keys.next();
            String lexiconPath = properties.getProperty(lexiconKey);
            try {
                pipes.add(new LexiconMembership(lexiconKey + "_membership", new File(lexiconPath), true));
            } catch (FileNotFoundException e) {
                System.err.println("could not load lexicon '" + lexiconKey + "' from file '" + lexiconPath + "'; skipping this lexicon feature");
                e.printStackTrace();
            }
        }
    }

    /**
     * Adds the optional offset-conjunction, token n-gram and character n-gram
     * pipes, each only when its configuration value is present.
     */
    private static void addContextFeatures(ArrayList<Pipe> pipes, FeatureConfiguration featureConfiguration, Properties properties) {
        int[][] offsetConjunctions = featureConfiguration.offsetConjFromConfig(properties.getProperty("offset_conjunctions"));
        if (offsetConjunctions != null) {
            pipes.add(new OffsetConjunctions(offsetConjunctions));
        }
        int[] tokenNGramSizes = featureConfiguration.getIntArray(properties, "token_ngrams");
        if (tokenNGramSizes != null) {
            pipes.add(new TokenNGramPipe(tokenNGramSizes));
        }
        int[] charNGramSizes = featureConfiguration.getIntArray(properties, "char_ngrams");
        if (charNGramSizes != null) {
            pipes.add(new TokenTextCharNGrams("CHAR_NGRAM=", charNGramSizes));
        }
    }

    /**
     * Flattens every sequence instance of {@code instanceList} into one
     * instance per token (feature vector plus label).
     *
     * <p>Terminates the JVM via {@code System.exit(-1)} if an instance's label
     * sequence and feature vector sequence differ in length.
     *
     * @param pipe         the pipe the resulting instance list is created with
     * @param instanceList sequence instances whose data is a
     *                     {@link FeatureVectorSequence} and whose target is a
     *                     {@link LabelSequence}
     * @return a new instance list holding the token-level instances
     */
    public static InstanceList convertFeatsforClassifier(Pipe pipe, InstanceList instanceList) {
        InstanceList tokenInstances = new InstanceList(pipe);
        for (int i = 0; i < instanceList.size(); i++) {
            addTokenInstances(tokenInstances, instanceList.get(i));
        }
        return tokenInstances;
    }

    /**
     * Flattens a single sequence instance into one instance per token
     * (feature vector plus label).
     *
     * <p>Terminates the JVM via {@code System.exit(-1)} if the instance's label
     * sequence and feature vector sequence differ in length.
     *
     * @param pipe     the pipe the resulting instance list is created with
     * @param instance a sequence instance whose data is a
     *                 {@link FeatureVectorSequence} and whose target is a
     *                 {@link LabelSequence}
     * @return a new instance list holding the token-level instances
     */
    public static InstanceList convertFeatsforClassifier(Pipe pipe, Instance instance) {
        InstanceList tokenInstances = new InstanceList(pipe);
        addTokenInstances(tokenInstances, instance);
        return tokenInstances;
    }

    /**
     * Appends one token-level instance per position of {@code instance} to
     * {@code target}; every new instance reuses the name and source of the
     * sequence instance it came from.
     */
    private static void addTokenInstances(InstanceList target, Instance instance) {
        FeatureVectorSequence featureVectors = (FeatureVectorSequence) instance.getData();
        LabelSequence labels = (LabelSequence) instance.getTarget();
        LabelAlphabet labelAlphabet = (LabelAlphabet) labels.getAlphabet();
        Object source = instance.getSource();
        Object name = instance.getName();
        if (labels.size() != featureVectors.size()) {
            System.err.println("failed making token instances: size of labelsequence != size of featue vector sequence: " + labels.size() + " - " + featureVectors.size());
            // NOTE(review): hard JVM exit kept to preserve the existing contract;
            // consider replacing with an exception once callers are checked.
            System.exit(-1);
        }
        for (int position = 0; position < featureVectors.size(); position++) {
            target.add(new Instance(featureVectors.getFeatureVector(position), labelAlphabet.lookupLabel(labels.get(position)), name, source));
        }
    }
}
