package de.julielab.jules.ae.genemapping.scoring;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.Label;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.Token;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/jules/ae/genemapping/scoring/MaxEntScorerFeaturePipe.class */
public class MaxEntScorerFeaturePipe extends Pipe implements Serializable {
    private boolean lexicalize;
    private boolean debug;
    private static final long serialVersionUID = 1;
    private static final Logger LOGGER = LoggerFactory.getLogger(MaxEntScorerFeaturePipe.class);
    private final String GREEK = "(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)";
    private final String GREEK_ALPHA = "alpha";
    private final String NUMBER = "[0-9]+";
    private final String ONE = "1";
    private final String CHAR = "[a-z]";
    private final String ALPHA = "[a-z]+";
    private final String MOL_WEIGHT = "p [0-9][0-9]?";
    private String MODIFIER;
    private String NON_DESCRIPTIVE;
    private TokenJaroSimilarity jaroSim;

    public MaxEntScorerFeaturePipe() {
        super(new Alphabet(), new LabelAlphabet());
        this.lexicalize = true;
        this.debug = false;
        this.GREEK = "(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)";
        this.GREEK_ALPHA = "alpha";
        this.NUMBER = "[0-9]+";
        this.ONE = "1";
        this.CHAR = "[a-z]";
        this.ALPHA = "[a-z]+";
        this.MOL_WEIGHT = "p [0-9][0-9]?";
        this.MODIFIER = "(receptor|tranporter|regulator|inhibitor|activator|suppressor|enhancer|repressor|adaptor|interactor|modulator|mediator|inducer|effector|coactivator|supressor|integrator|facilitator|binder|terminator|acceptor|proactivator|exchanger|enhancer|adapter|responder|modifier|ligand|cofactor|tranporting|regulating|inhibiting|activating|suppressing|enhancing|repressing|adapting|interacting|modulating|mediating|inducing|effecting|coactivating|supressing|integrating|facilitating|binding|terminating|accepting|responding|proactivating|exchanging|enhancing|adapting|modifying|coreceptor|cotranporter|coregulator|coinhibitor|coactivator|cosuppressor|coenhancer|corepressor|coadaptor|cointeractor|comodulator|comediator|coinducer|coeffector|coactivator|cointegrator|cofacilitator|cobinder|coterminator|coacceptor|proactivator|coexchanger|coenhancer|coadapter|coresponder|comodifier|coligand|cofactor)";
        this.NON_DESCRIPTIVE = "(fragment|antigen|precursor|protein|chain|domain|gene|homolog|homologue|isoform|isolog|isotype|motif|ortholog|precursor|precursors|product|sequence|subtype|subunit)";
        this.jaroSim = null;
    }

    public Instance pipe(Instance instance) {
        if (this.jaroSim == null) {
            this.jaroSim = new TokenJaroSimilarity();
        }
        String[] strArr = (String[]) instance.getData();
        String str = strArr[0];
        String str2 = strArr[1];
        String str3 = strArr[2];
        MaxEntScorerPairExtractor maxEntScorerPairExtractor = new MaxEntScorerPairExtractor();
        Label lookupLabel = getTargetAlphabet().lookupLabel(str3);
        String[][] compareStrings = maxEntScorerPairExtractor.compareStrings(str, str2);
        String[] allBigrams = allBigrams(str);
        String[] allBigrams2 = allBigrams(str2);
        String[] differentBigrams = differentBigrams(str, str2);
        String[] commonBigrams = commonBigrams(str, str2);
        differentTrigrams(str, str2);
        String[] commonTrigrams = commonTrigrams(str, str2);
        Token token = new Token(str);
        token.setText(str);
        boolean z = false;
        boolean z2 = false;
        for (String str4 : allBigrams) {
            if (str4.matches("p [0-9][0-9]?")) {
                z = true;
            }
        }
        for (String str5 : allBigrams2) {
            if (str5.matches("p [0-9][0-9]?")) {
                z2 = true;
            }
        }
        for (String str6 : differentBigrams) {
            if (str6.matches("p [0-9][0-9]?") && z && z2) {
                token.setFeatureValue("DIFF_MOL_WEIGHT", 1.0d);
            }
        }
        for (String str7 : commonBigrams) {
            token.setFeatureValue("COMMON_BIGRAM=" + str7, 1.0d);
            if (str7.matches("p [0-9][0-9]?")) {
                token.setFeatureValue("SAME_MOL_WEIGHT", 1.0d);
            }
        }
        for (String str8 : commonTrigrams) {
            token.setFeatureValue("COMMON_TRIGRAM=" + str8, 1.0d);
        }
        double score = new SimpleScorer().getScore(str, str2);
        if (score == 1.0d) {
            token.setFeatureValue("SIMPLESCORE=1", 1.0d);
        } else if (score >= 0.9d) {
            token.setFeatureValue("SIMPLESCORE>=0.9", 1.0d);
        } else if (score >= 0.8d) {
            token.setFeatureValue("SIMPLESCORE>=0.8", 1.0d);
        } else if (score >= 0.7d) {
            token.setFeatureValue("SIMPLESCORE>=0.7", 1.0d);
        } else if (score >= 0.6d) {
            token.setFeatureValue("SIMPLESCORE>=0.6", 1.0d);
        } else if (score >= 0.5d) {
            token.setFeatureValue("SIMPLESCORE>=0.5", 1.0d);
        } else if (score >= 0.3d) {
            token.setFeatureValue("SIMPLESCORE>=0.3", 1.0d);
        }
        if (str.indexOf(str2) > -1 || str2.indexOf(str) > -1) {
            token.setFeatureValue("SUBSTRING", 1.0d);
        }
        token.setFeatureValue("TRANSPOSITIONS=" + this.jaroSim.getTokenTranspositions(str, str2), 1.0d);
        HashMap<String, Integer> hashMap = new HashMap<>();
        for (int i = 0; i < compareStrings[0].length; i++) {
            String str9 = compareStrings[0][i];
            if (str9.matches("[0-9]+")) {
                add2HashMap(hashMap, "SAME_NUM");
            } else if (str9.matches("(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)")) {
                add2HashMap(hashMap, "SAME_GREEK");
            } else if (!str9.matches("[a-z]")) {
                if (str9.matches("[a-z]+")) {
                    add2HashMap(hashMap, "SAME_ALPHA");
                } else if (str9.matches(this.MODIFIER)) {
                    add2HashMap(hashMap, "SAME_MODIFIER");
                } else if (!str9.matches(this.NON_DESCRIPTIVE) && this.lexicalize) {
                    hashMap.put("SAME_STRING=" + str9, 1);
                }
            }
        }
        for (String str10 : hashMap.keySet()) {
            token.setFeatureValue(str10 + "=" + hashMap.get(str10).intValue(), 1.0d);
        }
        token.setFeatureValue("NUM_OF_SAMES=" + compareStrings[0].length, 1.0d);
        if (compareStrings[0].length == 1) {
            String str11 = compareStrings[0][0];
            if (str11.matches("[0-9]+")) {
                token.setFeatureValue("ONLY_SAME_NUMBER", 1.0d);
            } else if (str11.matches("(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)")) {
                token.setFeatureValue("ONLY_SAME_GREEK", 1.0d);
            } else if (str11.matches("[a-z]")) {
                token.setFeatureValue("ONLY_SAME_CHAR", 1.0d);
            } else if (str11.matches("[a-z]+")) {
                token.setFeatureValue("ONLY_SAME_ALPHA", 1.0d);
            } else if (!str11.matches(this.MODIFIER) && !str11.matches(this.NON_DESCRIPTIVE) && this.lexicalize) {
                hashMap.put("ONLY_SAME_STRING=" + str11, 1);
            }
        }
        HashMap<String, Integer> hashMap2 = new HashMap<>();
        for (int i2 = 0; i2 < compareStrings[1].length; i2++) {
            String str12 = compareStrings[1][i2];
            if (str12.matches("[0-9]+")) {
                add2HashMap(hashMap2, "DIFF_NUM");
            } else if (str12.matches("(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)")) {
                add2HashMap(hashMap2, "DIFF_GREEK");
            } else if (str12.matches("[a-z]")) {
                add2HashMap(hashMap2, "DIFF_CHAR");
            } else if (str12.matches("[a-z]+")) {
                add2HashMap(hashMap2, "DIFF_ALPHA");
            } else if (str12.matches(this.MODIFIER)) {
                add2HashMap(hashMap2, "DIFF_MODIFIER");
            } else if (!str12.matches(this.NON_DESCRIPTIVE) && this.lexicalize) {
                hashMap2.put("DIFF_STRING=" + str12, 1);
            }
        }
        for (String str13 : hashMap2.keySet()) {
            token.setFeatureValue(str13 + "=" + hashMap2.get(str13).intValue(), 1.0d);
        }
        token.setFeatureValue("NUM_OF_DIFFS=" + compareStrings[1].length, 1.0d);
        if (compareStrings[1].length == 1) {
            String str14 = compareStrings[1][0];
            if (str14.matches("1")) {
                token.setFeatureValue("ONLY_DIFF_ONE", 1.0d);
            } else if (str14.matches("[0-9]+")) {
                token.setFeatureValue("ONLY_DIFF_NUMBER", 1.0d);
            } else if (str14.matches("alpha")) {
                token.setFeatureValue("ONLY_DIFF_GREEK_ALPHA", 1.0d);
            } else if (str14.matches("(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)")) {
                token.setFeatureValue("ONLY_DIFF_GREEK", 1.0d);
            } else if (str14.matches("[a-z]+")) {
                token.setFeatureValue("ONLY_DIFF_ALPHA", 1.0d);
            } else if (str14.matches(this.MODIFIER)) {
                token.setFeatureValue("ONLY_DIFF_MODIFIER", 1.0d);
            } else if (str14.matches(this.NON_DESCRIPTIVE)) {
                token.setFeatureValue("ONLY_DIFF_NON_DESCRIPTIVE", 1.0d);
            } else if (this.lexicalize) {
                hashMap.put("ONLY_DIFF_STRING=" + str14, 1);
            }
        }
        int abs = Math.abs(str.split(" ").length - str2.split(" ").length);
        token.setFeatureValue("LENGTHDIFF=" + abs, 1.0d);
        double max = 1.0d - (abs / Math.max(str.split(" ").length, str2.split(" ").length));
        if (max >= 0.9d) {
            token.setFeatureValue("RELLENGTHDIFF>=0.9", 1.0d);
        } else if (max >= 0.7d) {
            token.setFeatureValue("RELLENGTHDIFF>=0.7", 1.0d);
        } else if (max >= 0.5d) {
            token.setFeatureValue("RELLENGTHDIFF>=0.5", 1.0d);
        } else {
            token.setFeatureValue("RELLENGTHDIFF<0.5", 1.0d);
        }
        if (this.debug) {
            System.out.println("\n--------------------------------------------\nFeatures for: " + str + "\t" + str2 + "\t" + str3 + "\n" + token.toString());
        }
        instance.setData(token);
        instance.setTarget(lookupLabel);
        instance.setSource(str + " <-> " + str2);
        instance.setName(lookupLabel.toString());
        return instance;
    }

    private ArrayList<String> makeBigrams(String str) {
        String[] split = str.split(" ");
        ArrayList<String> arrayList = new ArrayList<>();
        for (int i = 1; i < split.length; i++) {
            arrayList.add((split[i - 1] + " " + split[i]).trim());
        }
        return arrayList;
    }

    private String[] allBigrams(String str) {
        return (String[]) makeBigrams(str).toArray(new String[0]);
    }

    private String[] commonBigrams(String str, String str2) {
        ArrayList arrayList = new ArrayList();
        ArrayList<String> makeBigrams = makeBigrams(str);
        String[] strArr = (String[]) makeBigrams.toArray(new String[0]);
        ArrayList<String> makeBigrams2 = makeBigrams(str2);
        String[] strArr2 = (String[]) makeBigrams2.toArray(new String[0]);
        for (String str3 : strArr) {
            if (makeBigrams2.contains(str3)) {
                arrayList.add(str3);
            }
        }
        for (String str4 : strArr2) {
            if (makeBigrams.contains(str4) && !arrayList.contains(str4)) {
                arrayList.add(str4);
            }
        }
        return (String[]) arrayList.toArray(new String[0]);
    }

    private ArrayList<String> makeCharTrigrams(String str) {
        StringBuilder sb = new StringBuilder(str);
        ArrayList<String> arrayList = new ArrayList<>();
        for (int i = 2; i < sb.length(); i++) {
            arrayList.add(sb.charAt(i - 2) + sb.charAt(i - 1) + sb.charAt(i));
        }
        return arrayList;
    }

    private String[] commonCharTrigrams(String str, String str2) {
        ArrayList arrayList = new ArrayList();
        ArrayList<String> makeCharTrigrams = makeCharTrigrams(str);
        String[] strArr = (String[]) makeCharTrigrams.toArray(new String[0]);
        ArrayList<String> makeCharTrigrams2 = makeCharTrigrams(str2);
        String[] strArr2 = (String[]) makeCharTrigrams2.toArray(new String[0]);
        for (String str3 : strArr) {
            if (makeCharTrigrams2.contains(str3)) {
                arrayList.add(str3);
            }
        }
        for (String str4 : strArr2) {
            if (makeCharTrigrams.contains(str4) && !arrayList.contains(str4)) {
                arrayList.add(str4);
            }
        }
        return (String[]) arrayList.toArray(new String[0]);
    }

    private ArrayList<String> makeTrigrams(String str) {
        String[] split = str.split(" ");
        ArrayList<String> arrayList = new ArrayList<>();
        for (int i = 2; i < split.length; i++) {
            arrayList.add((split[i - 2] + " " + split[i - 1] + " " + split[i]).trim());
        }
        return arrayList;
    }

    private String[] commonTrigrams(String str, String str2) {
        ArrayList arrayList = new ArrayList();
        ArrayList<String> makeTrigrams = makeTrigrams(str);
        String[] strArr = (String[]) makeTrigrams.toArray(new String[0]);
        ArrayList<String> makeTrigrams2 = makeTrigrams(str2);
        String[] strArr2 = (String[]) makeTrigrams2.toArray(new String[0]);
        for (String str3 : strArr) {
            if (makeTrigrams2.contains(str3)) {
                arrayList.add(str3);
            }
        }
        for (String str4 : strArr2) {
            if (makeTrigrams.contains(str4) && !arrayList.contains(str4)) {
                arrayList.add(str4);
            }
        }
        return (String[]) arrayList.toArray(new String[0]);
    }

    private String[] differentBigrams(String str, String str2) {
        ArrayList arrayList = new ArrayList();
        ArrayList<String> makeBigrams = makeBigrams(str);
        String[] strArr = (String[]) makeBigrams.toArray(new String[0]);
        ArrayList<String> makeBigrams2 = makeBigrams(str2);
        String[] strArr2 = (String[]) makeBigrams2.toArray(new String[0]);
        for (String str3 : strArr) {
            if (!makeBigrams2.contains(str3)) {
                arrayList.add(str3);
            }
        }
        for (String str4 : strArr2) {
            if (!makeBigrams.contains(str4) && !arrayList.contains(str4)) {
                arrayList.add(str4);
            }
        }
        return (String[]) arrayList.toArray(new String[0]);
    }

    private String[] differentTrigrams(String str, String str2) {
        ArrayList arrayList = new ArrayList();
        ArrayList<String> makeTrigrams = makeTrigrams(str);
        String[] strArr = (String[]) makeTrigrams.toArray(new String[0]);
        ArrayList<String> makeTrigrams2 = makeTrigrams(str2);
        String[] strArr2 = (String[]) makeTrigrams2.toArray(new String[0]);
        for (String str3 : strArr) {
            if (!makeTrigrams2.contains(str3)) {
                arrayList.add(str3);
            }
        }
        for (String str4 : strArr2) {
            if (!makeTrigrams.contains(str4) && !arrayList.contains(str4)) {
                arrayList.add(str4);
            }
        }
        return (String[]) arrayList.toArray(new String[0]);
    }

    private void add2HashMap(HashMap<String, Integer> hashMap, String str) {
        int i = 0;
        if (hashMap.containsKey(str)) {
            i = hashMap.get(str).intValue();
        }
        hashMap.put(str, Integer.valueOf(i + 1));
    }
}
