package de.julielab.jules.ae.genemapping;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import de.julielab.jules.ae.genemapping.utils.Utils;
import de.julielab.jules.ae.genemapping.utils.norm.TermNormalizer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/jules/ae/genemapping/CandidateFilter.class */
/**
 * Heuristic filter for gene/protein name candidates.
 *
 * <p>The class bundles a number of regular-expression based checks that decide whether two
 * (already normalized, whitespace-tokenized) terms overlap only trivially, whether a term is
 * non-descriptive or unspecified, and a few static helpers for token-level comparisons.
 *
 * <p>Two patterns are built at construction time from the classpath resources
 * {@code /unspecified_proteins} and {@code /premodifiers}; lines starting with {@code ##} in
 * these resources are skipped.
 *
 * <p>NOTE: {@link #isUnspecified(String)} and {@link #isNonDescriptive(String)} reuse the
 * {@link Matcher} instances stored in public fields. {@code Matcher} is not safe for concurrent
 * use, so a single instance of this class should not be shared between threads for those calls.
 */
public class CandidateFilter {
    private static final Logger LOGGER = LoggerFactory.getLogger(CandidateFilter.class);

    /** Spelled-out, lower-case Greek letter names from alpha to omega. */
    public static final String[] GREEK = {"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"};

    /** Roman numerals I through XX. */
    public static final String[] LAT_NUM = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX"};

    /** Single-group alternation over all names in {@link #GREEK}. */
    public static String GREEK_REGEX = "(" + Stream.of(GREEK).collect(Collectors.joining("|")) + ")";

    /**
     * Single-group alternation over {@link #LAT_NUM}, sorted in reverse natural order so that a
     * numeral is listed before any numeral that is a prefix of it (e.g. "XX" before "X").
     */
    public static String LAT_NUM_REGEX = "(" + Stream.of(LAT_NUM).sorted(Comparator.reverseOrder()).collect(Collectors.joining("|")) + ")";

    /**
     * Maps a single lower-case letter to the first name in {@link #GREEK} that starts with it
     * (so "e" maps to "epsilon", not "eta"). Filled by the static initializer below.
     */
    public static final Map<String, String> greekAbbrMap = new HashMap<>();

    /** Like {@link #GREEK_REGEX} but without "alpha". */
    public static final String SUB_GREEK = "(beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)";

    /** Alternation of functional modifier words (receptor, factor, inhibitor, ...), optional plural "s". */
    public static String MODIFIER = "(receptors?|cofactors?|factors?|tranporters?|regulators?|inhibitors?|activators?|suppressors?|enhancers?|repressors?|adaptors?|interactors?|modulators?|mediators?|inducers?|effectors?|coactivators?|supressors?|integrators?|facilitators?|binders?|terminators?|acceptors?|responders?|proactivators?|exchangers?|enhancers?|adapters?|responders?|modifiers?|ligands?)";

    /** Alternation of generic words (protein, gene, domain, ...), optional plural "s". */
    public static String NON_DESCRIPTIVE = "(constructs?|fragments?|antigens?|precursors?|proteins?|genes?|chains?|domains?|kinases?|homologues?|homologs?|isoforms?|isologs?|isotypes?|motifs?|orthologues?|orthologs?|products?|sequences?|subtypes?|subunits?)";

    /** Alternation of amino acid names. */
    public static String AMINO_ACIDS = "(alanine|arginine|asparagine|aspartic|cysteine|glutamine|glutamic|glycine|histidine|isoleucine|leucine|lysine|methionine|phenylalanine|proline|serine|threonine|tryptophan|tyrosine|valine)";

    private static final String UNSPECIFIEDS_FILE = "/unspecified_proteins";
    private static final String PREMOD_FILE = "/premodifiers";

    /** Compiled from {@code ".* " + NON_DESC}: any term ending in a space plus a non-descriptive suffix. */
    public Pattern patternNonDesc;
    /** Reusable matcher for {@link #patternNonDesc}; reset on every {@link #isNonDescriptive(String)} call. */
    public Matcher matcherNonDesc;
    /** Compiled from {@link #DOMAIN_FAMILIES}. */
    public Pattern patternDomainFamilies;
    /** Regex source built by {@link #initUnspecifieds()}. */
    public String UNSPECIFIEDS;
    /** Compiled from {@link #UNSPECIFIEDS}. */
    public Pattern patternUnspecifieds;
    /** Reusable matcher for {@link #patternUnspecifieds}; reset on every {@link #isUnspecified(String)} call. */
    public Matcher matcherUnspecifieds;
    /** Regex source built by {@link #initPreModifiers()} (without the trailing {@code .*}). */
    public String PREMODS;
    /** Compiled from {@code PREMODS + ".*"}. */
    public Pattern patternPreMods;

    /** Matches tokens consisting only of digits (also matches the empty string). */
    private Pattern num;
    /** Matches exactly one lower-case letter or one digit. */
    private Pattern singChar;
    /** Matches a Greek name, a modifier, a non-descriptive word, or the empty string. */
    private Pattern specWords;

    public String NON_DESC = "(promoter|onco protein|oncoprotein|proto oncogene|protooncogene|protease|binding site|transcript|element|construct|si rna|prem rna|pre m rna|m rna ?s?|rna|locus|gene product|product|reporter gene|reporter|gene|protein|c dna|molecule|pseudogene|autoantigen|peptide|polypeptide|enzyme)$";
    public String DOMAIN_FAMILIES = "^.*(acceptors|acid|activators|adapters|adaptors|antibodi|antibody|binders|binding|binding site|binding sites|box|boxe|channel|channels|chromosome|coactivators|cofactors|complex|domain|dyneins|effectors|element|enhancers|epitope|erythrocyte|exchangers|exon|facilitators|factors|familie|family|filament|finger|helicases|histone|histones|homeodomain|inducers|inhibitors|integrators|interactors|intron|kinases|kinesins|lectins|ligands|mediators|member|membrane|modifiers|modulators|motif|myosins|proactivators|proteases|proteasome|proteins|reductases|region|regulators|repeat|repressors|residue|responders|sequence|site|subdomain|subfamily|subunits|superfamily|suppressors|supressors|syndrome|tail|terminal|terminators|terminus|tranporters|transferases|zinc finger)e?s?";

    static {
        // First name wins for each initial letter, following the order of GREEK.
        for (String greekName : GREEK) {
            greekAbbrMap.putIfAbsent(greekName.substring(0, 1), greekName);
        }
    }

    /**
     * Loads the resource-backed patterns and compiles all fixed patterns.
     *
     * @throws IOException if one of the classpath resources is missing or cannot be read
     */
    public CandidateFilter() throws IOException {
        initUnspecifieds();
        initPreModifiers();
        this.patternDomainFamilies = Pattern.compile(this.DOMAIN_FAMILIES);
        this.patternNonDesc = Pattern.compile(".* " + this.NON_DESC);
        this.matcherNonDesc = this.patternNonDesc.matcher("");
        this.num = Pattern.compile("[0-9]*");
        this.singChar = Pattern.compile("([a-z]|[0-9])");
        // NOTE(review): the "||" below introduces an empty alternative, so this pattern also
        // matches the empty token. This looks like a typo for "|" but is kept as-is because
        // removing it would change which overlaps filterOut() rejects -- confirm intent.
        this.specWords = Pattern.compile("(" + GREEK_REGEX + "|" + MODIFIER + "||" + NON_DESCRIPTIVE + ")");
    }

    /**
     * Small manual check: prints the "unspecifieds" pattern and whether it matches "fos".
     *
     * @param strArr ignored
     * @throws IOException if the filter cannot be initialized
     */
    public static void main(String[] strArr) throws IOException {
        Pattern unspecifieds = new CandidateFilter().patternUnspecifieds;
        System.out.println(unspecifieds.pattern());
        System.out.println(unspecifieds.matcher("fos").matches() ? "yes" : "no");
    }

    /**
     * Checks whether the distinct tokens of one term are the distinct tokens of the other term
     * plus exactly one extra token, and that extra token fully matches {@code str3}.
     *
     * <p>The check is symmetric: either term may be the one carrying the extra token.
     *
     * @param str  first term, tokens separated by whitespace
     * @param str2 second term, tokens separated by whitespace
     * @param str3 regular expression the extra token has to match
     * @return {@code true} if the terms differ by exactly one token of the given type
     */
    private boolean differInTypeOfOneTerm(String str, String str2, String str3) {
        if (str.equals(str2)) {
            return false;
        }
        TreeSet<String> firstTokens = getSet(str.split("\\s+"));
        TreeSet<String> secondTokens = getSet(str2.split("\\s+"));
        TreeSet<String> extraTokens;
        if (firstTokens.size() == secondTokens.size() + 1) {
            extraTokens = firstTokens;
            extraTokens.removeAll(secondTokens);
        } else if (secondTokens.size() == firstTokens.size() + 1) {
            // Mirror case: the second term is the longer one.
            extraTokens = secondTokens;
            extraTokens.removeAll(firstTokens);
        } else {
            return false;
        }
        // The sizes may differ by one while the sets still share few tokens; only a single
        // leftover token qualifies.
        return extraTokens.size() == 1 && Pattern.compile(str3).matcher(extraTokens.first()).matches();
    }

    /**
     * Collects the given tokens into a sorted set, dropping duplicates.
     *
     * @param strArr tokens
     * @return a new, mutable set with the distinct tokens
     */
    private TreeSet<String> getSet(String[] strArr) {
        TreeSet<String> tokens = new TreeSet<>();
        for (String token : strArr) {
            tokens.add(token);
        }
        return tokens;
    }

    /**
     * Counts the whitespace-separated tokens of {@code str} that fully match {@code str2}.
     *
     * @param str  term to tokenize
     * @param str2 regular expression applied to each token
     * @return number of matching tokens
     */
    private int getNumberOfOccurrences(String str, String str2) {
        Pattern tokenPattern = Pattern.compile(str2);
        int count = 0;
        for (String token : str.split("\\s+")) {
            if (tokenPattern.matcher(token).matches()) {
                count++;
            }
        }
        return count;
    }

    /**
     * Checks whether both terms contain exactly one token of the given type and are identical
     * except for the value of that token.
     *
     * <p>The group indices used below assume that {@code str3} contributes exactly one capturing
     * group (as the regexes passed by {@link #filterOut(String, String)} do).
     *
     * @param str  first term
     * @param str2 second term
     * @param str3 regular expression describing the token type, wrapped in one capturing group
     * @return {@code true} if prefix and suffix are equal and only the typed token differs
     */
    private boolean onlyDifferentTypes(String str, String str2, String str3) {
        if (getNumberOfOccurrences(str, str3) != 1 || getNumberOfOccurrences(str2, str3) != 1) {
            return false;
        }
        // group(1) = text before the typed token, group(2) = the token, group(3) = text after it
        Pattern framePattern = Pattern.compile("([a-z0-9 ]*?) ?" + str3 + " ?([a-z0-9 ]*?)");
        Matcher first = framePattern.matcher(str);
        Matcher second = framePattern.matcher(str2);
        if (!first.matches() || !second.matches()) {
            return false;
        }
        return !first.group(2).equals(second.group(2))
                && first.group(1).equals(second.group(1))
                && first.group(3).equals(second.group(3));
    }

    /**
     * Decides whether the pairing of two terms should be discarded because their overlap or
     * their difference is uninformative.
     *
     * <p>The pairing is filtered out if any of the following holds, checked in this order:
     * <ol>
     *   <li>all common words are digit-only tokens (this includes the case of no common words),</li>
     *   <li>all common words are single lower-case letters or single digits,</li>
     *   <li>all common words are Greek names, modifiers or non-descriptive words,</li>
     *   <li>the terms are equal except for one number,</li>
     *   <li>the terms are equal except for one Greek name,</li>
     *   <li>one term has one extra token that is a number other than 1,</li>
     *   <li>one term has one extra token that is a Greek name other than alpha,</li>
     *   <li>one term has one extra token that is a modifier.</li>
     * </ol>
     *
     * @param str  first term, tokens separated by single spaces
     * @param str2 second term, tokens separated by single spaces
     * @return {@code true} if the pairing should be filtered out
     */
    public boolean filterOut(String str, String str2) {
        TreeSet<String> commonWords = Utils.getCommonWords(str.split(" "), str2.split(" "));
        if (allMatch(commonWords, this.num)) {
            LOGGER.debug("filtered out because: overlap only numbers: '{}' <-> '{}'", str, str2);
            return true;
        }
        if (allMatch(commonWords, this.singChar)) {
            LOGGER.debug("filtered out because: overlap only single characters or single digits: '{}' <-> '{}'", str, str2);
            return true;
        }
        if (allMatch(commonWords, this.specWords)) {
            LOGGER.debug("filtered out because: overlap consists only of special words (greek, modifiers, non-descriptive): '{}' <-> '{}'", str, str2);
            return true;
        }
        if (onlyDifferentTypes(str, str2, "([0-9]+)")) {
            LOGGER.debug("filtered out because: terms differ in one number only: '{}' <-> '{}'", str, str2);
            return true;
        }
        if (onlyDifferentTypes(str, str2, GREEK_REGEX)) {
            LOGGER.debug("filtered out because: terms differ in one greek token only: '{}' <-> '{}'", str, str2);
            return true;
        }
        if (differInTypeOfOneTerm(str, str2, "([02-9]|[1-9]{2,})")) {
            LOGGER.debug("filtered out because: one has a number and the other doesn't (1 is excluded): '{}' <-> '{}'", str, str2);
            return true;
        }
        if (differInTypeOfOneTerm(str, str2, SUB_GREEK)) {
            LOGGER.debug("filtered out because: one has a greek and the other doesn't (alpha is excluded): '{}' <-> '{}'", str, str2);
            return true;
        }
        if (differInTypeOfOneTerm(str, str2, MODIFIER)) {
            LOGGER.debug("filtered out because: one has a modifier and the other doesn't: '{}' <-> '{}'", str, str2);
            return true;
        }
        return false;
    }

    /**
     * Tells whether every word fully matches the pattern; vacuously {@code true} for no words.
     */
    private static boolean allMatch(Iterable<String> words, Pattern pattern) {
        for (String word : words) {
            if (!pattern.matcher(word).matches()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Opens a classpath resource relative to this class for line-wise reading.
     *
     * @throws IOException if the resource does not exist
     */
    private BufferedReader openResource(String resourcePath) throws IOException {
        java.io.InputStream stream = getClass().getResourceAsStream(resourcePath);
        if (stream == null) {
            throw new IOException("Classpath resource not found: " + resourcePath);
        }
        return new BufferedReader(new InputStreamReader(stream));
    }

    /**
     * Builds {@link #UNSPECIFIEDS}, {@link #patternUnspecifieds} and {@link #matcherUnspecifieds}
     * from the {@code /unspecified_proteins} resource.
     *
     * <p>Every line not starting with {@code ##} is trimmed, normalized with a
     * {@link TermNormalizer}, trimmed again and added as one alternative of the resulting regex
     * {@code ^(a|b|...)e?s?$}.
     *
     * @throws IOException if the resource is missing or reading it fails
     */
    public void initUnspecifieds() throws IOException {
        TermNormalizer termNormalizer = new TermNormalizer();
        StringBuilder alternatives = new StringBuilder();
        boolean first = true;
        try (BufferedReader reader = openResource(UNSPECIFIEDS_FILE)) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("##")) {
                    continue;
                }
                if (!first) {
                    alternatives.append('|');
                }
                alternatives.append(termNormalizer.normalize(line.trim()).trim());
                first = false;
            }
        }
        this.UNSPECIFIEDS = "^(" + alternatives + ")e?s?$";
        this.patternUnspecifieds = Pattern.compile(this.UNSPECIFIEDS);
        this.matcherUnspecifieds = this.patternUnspecifieds.matcher("");
        LOGGER.debug("Initializing unspecified proteins pattern from file: {}", this.patternUnspecifieds);
    }

    /**
     * Builds {@link #PREMODS} and {@link #patternPreMods} from the {@code /premodifiers} resource.
     *
     * <p>Every line not starting with {@code ##} is trimmed and added as one alternative;
     * {@code PREMODS} becomes {@code "^(a|b|...) "} and the compiled pattern additionally accepts
     * any trailing text.
     *
     * @throws IOException if the resource is missing or reading it fails
     */
    public void initPreModifiers() throws IOException {
        StringBuilder alternatives = new StringBuilder();
        boolean first = true;
        try (BufferedReader reader = openResource(PREMOD_FILE)) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("##")) {
                    continue;
                }
                if (!first) {
                    alternatives.append('|');
                }
                alternatives.append(line.trim());
                first = false;
            }
        }
        this.PREMODS = "^(" + alternatives + ") ";
        this.patternPreMods = Pattern.compile(this.PREMODS + ".*");
        LOGGER.debug("Initializing protein void premodifiers from file: {}", this.patternPreMods);
    }

    /**
     * Placeholder; no check is implemented.
     *
     * @param str  first term (unused)
     * @param str2 second term (unused)
     * @return always {@code false}
     */
    public boolean hasContradictingGreek(String str, String str2) {
        return false;
    }

    /**
     * Expands stand-alone single letters to Greek letter names using {@link #greekAbbrMap}.
     *
     * <p>A letter is stand-alone when it is delimited by word boundaries on both sides. Letters
     * without an entry in the map (for instance all upper-case letters, since the map keys are
     * lower-case) are left unchanged, as is all text between and after the matches.
     *
     * @param str the term to expand
     * @return the expanded term, or {@code str} itself if it contains no stand-alone letter
     */
    public static String expendGreek(String str) {
        Matcher singleLetter = Pattern.compile("\\b[a-zA-Z]\\b").matcher(str);
        StringBuilder expanded = new StringBuilder();
        int copiedUpTo = 0;
        while (singleLetter.find()) {
            expanded.append(str, copiedUpTo, singleLetter.start());
            copiedUpTo = singleLetter.end();
            String letter = singleLetter.group();
            String greekName = greekAbbrMap.get(letter);
            expanded.append(greekName != null ? greekName : letter);
        }
        if (copiedUpTo == 0) {
            // A match always ends at index >= 1, so 0 means nothing was found.
            return str;
        }
        // Keep everything after the last match, including a single trailing character.
        expanded.append(str, copiedUpTo, str.length());
        return expanded.toString();
    }

    /**
     * Tells whether both terms contain exactly the same digit-only tokens, counting repetitions.
     *
     * @param str  first term, tokens separated by single whitespace characters
     * @param str2 second term, tokens separated by single whitespace characters
     * @return {@code true} if the multisets of number tokens are equal
     */
    public static boolean isNumberCompatible(String str, String str2) {
        Multiset<String> firstNumbers = getNumbers(str.split("\\s"));
        Multiset<String> secondNumbers = getNumbers(str2.split("\\s"));
        return firstNumbers.size() == secondNumbers.size()
                && Multisets.intersection(firstNumbers, secondNumbers).size() == firstNumbers.size();
    }

    /**
     * Collects all tokens that consist of one or more digits.
     *
     * @param strArr tokens
     * @return multiset of the digit-only tokens
     */
    public static Multiset<String> getNumbers(String[] strArr) {
        Pattern digits = Pattern.compile("[0-9]+");
        Multiset<String> numbers = HashMultiset.create();
        for (String token : strArr) {
            if (digits.matcher(token).matches()) {
                numbers.add(token);
            }
        }
        return numbers;
    }

    /**
     * Collects all tokens that are a single ASCII letter, a number, or a Greek letter name.
     *
     * @param strArr tokens
     * @return multiset of the symbol tokens
     */
    public static Multiset<String> getSingleSymbols(String[] strArr) {
        Pattern symbol = Pattern.compile("[a-zA-Z]|[0-9]+|" + GREEK_REGEX);
        Multiset<String> symbols = HashMultiset.create();
        for (String token : strArr) {
            if (symbol.matcher(token).matches()) {
                symbols.add(token);
            }
        }
        return symbols;
    }

    /**
     * Collects all tokens that are not a single ASCII letter, a number, or a Greek letter name,
     * i.e. the complement of {@link #getSingleSymbols(String[])}.
     *
     * @param strArr tokens
     * @return multiset of the remaining tokens
     */
    public static Multiset<String> getContentTokens(String[] strArr) {
        Pattern symbol = Pattern.compile("[a-zA-Z]|[0-9]+|" + GREEK_REGEX);
        Multiset<String> contentTokens = HashMultiset.create();
        for (String token : strArr) {
            if (!symbol.matcher(token).matches()) {
                contentTokens.add(token);
            }
        }
        return contentTokens;
    }

    /**
     * Computes the multiset intersection of the tokens of both terms. Despite the method name,
     * the tokens themselves are returned; use {@code size()} on the result for the count.
     *
     * @param str  first term, tokens separated by single whitespace characters
     * @param str2 second term, tokens separated by single whitespace characters
     * @return the common tokens with their minimum multiplicity
     */
    public static Multiset<String> getNumberOfCommonTokens(String str, String str2) {
        Multiset<String> firstTokens = HashMultiset.create();
        Multiset<String> secondTokens = HashMultiset.create();
        for (String token : str.split("\\s")) {
            firstTokens.add(token);
        }
        for (String token : str2.split("\\s")) {
            secondTokens.add(token);
        }
        return Multisets.intersection(firstTokens, secondTokens);
    }

    /**
     * Tells whether the whole term matches the pattern built by {@link #initUnspecifieds()}.
     *
     * @param str the term to test
     * @return {@code true} on a full match
     */
    public boolean isUnspecified(String str) {
        return this.matcherUnspecifieds.reset(str).matches();
    }

    /**
     * Tells whether the term ends in a space followed by one of the {@link #NON_DESC} suffixes.
     *
     * @param str the term to test
     * @return {@code true} on a full match of {@link #patternNonDesc}
     */
    public boolean isNonDescriptive(String str) {
        return this.matcherNonDesc.reset(str).matches();
    }
}
