package de.julielab.geneexpbase;

import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.IOStreamUtilities;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.Range;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tartarus.snowball.SnowballProgram;

/* loaded from: input_file:de/julielab/geneexpbase/TermNormalizer.class */
public class TermNormalizer {
    /** Class-wide SLF4J logger. */
    private static final Logger log = LoggerFactory.getLogger(TermNormalizer.class);
    /**
     * Aho-Corasick matcher built once in the static initializer from the classpath resource
     * {@code specialistLexiconEmbeddedGreekLowHigh.txt.gz} plus the words "kinase" and "kinases".
     * Falls back to an empty dictionary when reading the resource raises an {@link IOException}.
     */
    private static AhoCorasickOptimized specialistLexEmbeddedGreekAC;
    /** Maps single Greek letters to spelled-out names; filled in the static initializer. */
    private static final Map<String, String> greekCharacterNormalizationMap;
    /** Spelled-out Greek letter names; assigned in the static initializer. */
    public static final String[] GREEK;
    /** Parenthesised alternation of all entries of {@link #GREEK}. */
    public static String GREEK_REGEX;
    /** Parenthesised alternation of the Roman numerals I to XX, joined in reverse lexicographic order. */
    public static String LAT_NUM_REGEX;
    /** Compiled form of {@link #LAT_NUM_REGEX}. */
    public static final Pattern ROMAN_NUMBERS_PATTERN;
    /** Letters followed by digits; group 1 is the letter run, group 2 the digit run. */
    private static final String NUMBERPATTERN = "([A-Za-z]+)([0-9]+)";
    /** Non-descriptive words; loaded on first construction by {@code initNonDescriptives()}. */
    private static TreeSet<String> nonDescriptives;
    /** Stop words; filled on first construction by {@code initStopwords()}. */
    private static TreeSet<String> stopwords;
    /** Plural-to-singular table; only assigned by {@code initPlurals()}, which nothing in this class calls. */
    private HashMap<String, String> plurals;
    /** Compiled form of {@link #NUMBERPATTERN}. */
    private static final Pattern NUMBER_SPECIFIER_PATTERN;
    /** Matcher over the entries of {@link #GREEK} plus "high", "low" and "kinase". */
    private final AhoCorasickOptimized greekHighLowKinaseAC;
    /** Matcher over the entries of {@link #GREEK} only. */
    private final AhoCorasickOptimized greekAC;
    /** Snowball stemmer instance, created reflectively in the constructor. */
    private final SnowballProgram stemmer;
    // The string constants below hold regex sources; the Pattern fields further down
    // repeat the same literals in compiled form.
    private final String GREEK_CHAR_PATTERN = "α|β|γ|δ|ε|ζ|η|θ|ι|κ|λ|μ|ν|ξ|ο|π|ρ|ς|σ|τ|υ|φ|χ|ψ";
    private final String NON_DESCRIPTIVES_FILE = "/non_descriptives";
    private final String SHORTFORMPATTERN = "((.*[0-9a-z]+)(L|R)|(.*[0-9]+)(l|r)|(r|l|R|L))";
    private final String SHORTFORMEND_WITH_NUMBER_PATTERN = "(.* )(ra|rb|rg|bp)( [0-9]*)?";
    private final String SHORTFORMEND_NO_NUMBER_PATTERN = "(.* )(a|b)";
    private final String TOKENSPLITPATTERN = "(.*[a-z])([A-Z0-9].*)|(.*[A-Z])([0-9].*)|(.*[0-9])([a-zA-Z].*)|(.*[A-Z][A-Z])([a-z].*)";
    private final String DOTREMOVAL = "(.*)([a-zA-Z])\\.([a-zA-Z0-9])(.*)";
    /** Token ending in L/R (or l/r after digits), or consisting of a single l/r letter. */
    private final Pattern shortFormPattern = Pattern.compile("((.*[0-9a-z]+)(L|R)|(.*[0-9]+)(l|r)|(r|l|R|L))");
    /** Case/digit boundaries inside a token; each alternative captures the text before and after the boundary. */
    private final Pattern tokenSplitPattern = Pattern.compile("(.*[a-z])([A-Z0-9].*)|(.*[A-Z])([0-9].*)|(.*[0-9])([a-zA-Z].*)|(.*[A-Z][A-Z])([a-z].*)");
    /** A dot between a letter and a letter or digit. */
    private final Pattern dotRemovalPattern = Pattern.compile("(.*)([a-zA-Z])\\.([a-zA-Z0-9])(.*)");
    /** Term ending in " ra", " rb", " rg" or " bp", optionally followed by a space and digits. */
    private final Pattern shortFormEndWithNumberPattern = Pattern.compile("(.* )(ra|rb|rg|bp)( [0-9]*)?");
    /** Term ending in " a" or " b". */
    private final Pattern shortFormEndNoNumberPattern = Pattern.compile("(.* )(a|b)");
    /** Any single Greek letter listed in the alternation. */
    private final Pattern greekCharPattern = Pattern.compile("α|β|γ|δ|ε|ζ|η|θ|ι|κ|λ|μ|ν|ξ|ο|π|ρ|ς|σ|τ|υ|φ|χ|ψ");

    /**
     * Creates a normalizer: builds the two Greek-name matchers, makes sure the shared stop word
     * and non-descriptive word sets are initialised, and instantiates the Snowball English
     * stemmer by class name.
     *
     * @throws RuntimeException if the stemmer class cannot be loaded or instantiated; the
     *                          reflective exception is attached as the cause
     */
    public TermNormalizer() {
        List<String> greekHighLowKinase = new ArrayList<>(Arrays.asList(GREEK));
        greekHighLowKinase.add("high");
        greekHighLowKinase.add("low");
        greekHighLowKinase.add("kinase");
        this.greekHighLowKinaseAC = new AhoCorasickOptimized(greekHighLowKinase);
        this.greekAC = new AhoCorasickOptimized(GREEK);
        initStopwords();
        initNonDescriptives();
        try {
            this.stemmer = (SnowballProgram) Class.forName("org.tartarus.snowball.ext.EnglishStemmer")
                    .getDeclaredConstructor()
                    .newInstance();
        } catch (ReflectiveOperationException e) {
            // RuntimeException(Throwable) uses e.toString() as its message, so the message is
            // the same as before while the stack trace of the real failure is preserved.
            throw new RuntimeException(e);
        }
    }

    /**
     * Command line entry point: normalizes the terms of an input file into an output file.
     *
     * @param strArr exactly two arguments, the input file path and the output file path;
     *               any other argument count prints a usage message and exits with -1
     */
    public static void main(String[] strArr) {
        if (strArr.length == 2) {
            File input = new File(strArr[0]);
            File output = new File(strArr[1]);
            new TermNormalizer().normalizeFile(input, output);
            return;
        }
        System.err.println("usage:\nTermNormalizer <inputFile> <outputFile>");
        System.exit(-1);
    }

    /**
     * Applies the four {@code CandidateFilter} removals in sequence (unspecifieds, domain
     * families, pre-modifiers, non-descriptives). Each step only fires when its pattern matches
     * the whole current term and removes the first occurrence of the corresponding regex; the
     * term is trimmed after every step.
     *
     * @param str the term to strip
     * @return the stripped, trimmed term
     */
    public static String removeModifiers(String str) {
        log.debug("TRYING to remove modifiers or even complete term: {}", str);
        String term = str;

        if (CandidateFilter.patternUnspecifieds.matcher(term).matches()) {
            log.debug("IS UNSPECIFIED: {}", term);
            term = term.replaceFirst(CandidateFilter.UNSPECIFIEDS, "");
            log.debug("UNSPECIFIED REMOVED: |{}|", term);
        }
        term = term.trim();

        if (CandidateFilter.patternDomainFamilies.matcher(term).matches()) {
            log.debug("IS DOMAIN: {}", term);
            term = term.replaceFirst(CandidateFilter.DOMAIN_FAMILIES, "");
            log.debug("DOMAIN REMOVED: |{}|", term);
        }
        term = term.trim();

        if (CandidateFilter.patternPreMods.matcher(term).matches()) {
            log.debug("PREMODIFIER: {}", term);
            term = term.replaceFirst(CandidateFilter.PREMODS, "");
        }
        term = term.trim();

        if (CandidateFilter.patternNonDesc.matcher(term).matches()) {
            log.debug("IS NONDESC: {}", term);
            term = term.replaceFirst(CandidateFilter.NON_DESCRIPTIVE, "");
            log.debug("NONDESC REMOVED: |{}|", term);
        }
        return term.trim();
    }

    /**
     * Removes the first match of {@code CandidateFilter.UNSPECIFIEDS} when the whole term
     * matches {@code CandidateFilter.patternUnspecifieds}.
     *
     * @param str the term to strip
     * @return the (possibly shortened) term, trimmed
     */
    public static String removeUnspecifieds(String str) {
        // Parameterized logging: no message strings are built when debug is disabled.
        log.debug("TRYING to remove modifiers or even complete term: {}", str);
        if (CandidateFilter.patternUnspecifieds.matcher(str).matches()) {
            log.debug("IS UNSPECIFIED: {}", str);
            str = str.replaceFirst(CandidateFilter.UNSPECIFIEDS, "");
            log.debug("UNSPECIFIED REMOVED: |{}|", str);
        }
        return str.trim();
    }

    /**
     * Cuts every occurrence of capture group 2 of {@code CandidateFilter.patternNonDesc} out of
     * the term and strips leading and trailing whitespace from what remains.
     *
     * @param str the term to clean
     * @return the term without the matched group-2 spans
     */
    public static String removeNondescriptives(String str) {
        Matcher nonDesc = CandidateFilter.patternNonDesc.matcher(str);
        StringBuilder kept = new StringBuilder();
        int cursor = 0;
        // Each search restarts at the end of the previously removed span.
        while (nonDesc.find(cursor)) {
            if (log.isDebugEnabled()) {
                log.debug("IS NONDESC: {} because of {}", str, nonDesc.group(2));
            }
            kept.append(str, cursor, nonDesc.start(2));
            if (log.isDebugEnabled()) {
                log.debug("NONDESC REMOVED: |{}|", nonDesc.group(2));
            }
            cursor = nonDesc.end(2);
        }
        kept.append(str, cursor, str.length());

        // Strip whitespace as defined by Character.isWhitespace at both ends.
        while (kept.length() > 0 && Character.isWhitespace(kept.charAt(0))) {
            kept.deleteCharAt(0);
        }
        for (int last = kept.length() - 1; last >= 0 && Character.isWhitespace(kept.charAt(last)); last--) {
            kept.deleteCharAt(last);
        }
        return kept.toString();
    }

    /**
     * Removes the first match of {@code CandidateFilter.DOMAIN_FAMILIES} when the whole term
     * matches {@code CandidateFilter.patternDomainFamilies}.
     *
     * @param str the term to strip
     * @return the (possibly shortened) term, trimmed
     */
    public static String removeDomainFamilies(String str) {
        if (CandidateFilter.patternDomainFamilies.matcher(str).matches()) {
            // Parameterized logging: no message strings are built when debug is disabled.
            log.debug("IS DOMAIN: {}", str);
            str = str.replaceFirst(CandidateFilter.DOMAIN_FAMILIES, "");
            log.debug("DOMAIN REMOVED: |{}|", str);
        }
        return str.trim();
    }

    /**
     * Removes the first match of {@code CandidateFilter.PREMODS} when the whole term matches
     * {@code CandidateFilter.patternPreMods}.
     *
     * @param str the term to strip
     * @return the (possibly shortened) term, trimmed
     */
    public static String removePremodifiers(String str) {
        if (CandidateFilter.patternPreMods.matcher(str).matches()) {
            // Parameterized logging: no message string is built when debug is disabled.
            log.debug("PREMODIFIER: {}", str);
            str = str.replaceFirst(CandidateFilter.PREMODS, "");
        }
        return str.trim();
    }

    /**
     * Runs the full normalization pipeline on one term: stop word removal, Greek letter
     * splitting, special character removal, lexicon-based decomposition, repeated number and
     * case-boundary splitting until the token list no longer changes, Roman numeral
     * replacement, splitting of embedded character strings, and lower-casing.
     *
     * @param str the raw term
     * @return the normalized tokens joined by single spaces
     */
    public String normalize(String str) {
        List<String> tokens = removeStopwords(str);
        tokens = splitAndNormalizeGreekCharacters(tokens);
        tokens = removeSpecialCharacters(tokens);
        tokens = decomposition(tokens);

        // Iterate to a fixed point: stop once a splitting round yields an equal token list.
        List<String> previous;
        do {
            previous = tokens;
            tokens = specialTokenSplit(splitAwayNumbers(tokens));
        } while (!tokens.equals(previous));

        tokens = replaceRomanNumbers(tokens);
        tokens = splitAwayCharacterStrings(tokens);
        tokens = toLowerCase(tokens);
        return ArrayList2String(tokens).trim();
    }

    /**
     * Splits a token in two when it is exactly the concatenation of two dictionary matches of
     * {@code specialistLexEmbeddedGreekAC}: one starting at index 0 and another starting where
     * the first ends and reaching the end of the token. Tokens without such a pair are copied
     * unchanged.
     *
     * @param list the tokens to decompose; not modified
     * @return a new list with the decomposed tokens
     */
    protected List<String> decomposition(List<String> list) {
        List<String> result = new ArrayList<>();
        for (String token : list) {
            // Every dictionary hit, stored as the range [start, end + 1] of the callback indices.
            LinkedHashSet<Range<Integer>> matches = new LinkedHashSet<>();
            specialistLexEmbeddedGreekAC.match(token, (begin, end, matched) -> {
                matches.add(Range.between(Integer.valueOf(begin), Integer.valueOf(end + 1)));
            });

            List<Range<Integer>> parts = new ArrayList<>();
            for (Range<Integer> head : matches) {
                // A plain for-each ends when no partner is found; the previous hand-rolled
                // iterator loop kept spinning in that case.
                for (Range<Integer> tail : matches) {
                    boolean adjacentPair = !tail.isBefore(head.getMinimum())
                            && head != tail
                            && head.getMinimum() == 0
                            // equals(), not ==: boxed Integers above 127 are distinct objects.
                            && head.getMaximum().equals(tail.getMinimum())
                            && tail.getMaximum() == token.length();
                    if (adjacentPair) {
                        parts.add(head);
                        parts.add(tail);
                        break;
                    }
                }
            }

            if (parts.isEmpty()) {
                result.add(token);
            } else {
                for (Range<Integer> part : parts) {
                    result.add(token.substring(part.getMinimum(), part.getMaximum()));
                }
            }
        }
        return result;
    }

    /**
     * Surrounds every Greek letter matched by {@code greekCharPattern} with spaces, splits each
     * token on whitespace and replaces parts that are keys of
     * {@code greekCharacterNormalizationMap} with the mapped name.
     *
     * <p>A token that starts with a Greek letter yields a leading empty string, because the
     * split keeps the empty part before the inserted space.
     *
     * @param list the tokens to process; not modified
     * @return a new list with the split and normalized parts
     */
    protected List<String> splitAndNormalizeGreekCharacters(List<String> list) {
        Matcher greek = this.greekCharPattern.matcher("");
        List<String> result = new ArrayList<>();
        for (String token : list) {
            greek.reset(token);
            for (String part : greek.replaceAll(" $0 ").split("\\s+")) {
                result.add(greekCharacterNormalizationMap.getOrDefault(part, part));
            }
        }
        return result;
    }

    /**
     * Produces spelling variants of a term: hyphens removed between non-digit neighbours, Roman
     * numerals split off, and alpha/beta/gamma/delta abbreviated to a/b/g/d with and without
     * swallowing one preceding whitespace character.
     *
     * @param str the term
     * @return the distinct variants in generation order
     */
    public List<String> generateVariants(String str) {
        String withoutHyphens = str.replaceAll("([^-0-9])\\-([^0-9])", "$1$2");
        String romanSplit = String.join(" ", splitAwayRomanNumbers(Arrays.asList(str.split("\\s+"))));

        String abbreviated = str;
        String abbreviatedGlued = str;
        String[][] greekToLatin = {{"alpha", "a"}, {"beta", "b"}, {"gamma", "g"}, {"delta", "d"}};
        for (String[] pair : greekToLatin) {
            abbreviated = abbreviated.replaceAll(pair[0], pair[1]);
            abbreviatedGlued = abbreviatedGlued.replaceAll("\\s?" + pair[0], pair[1]);
        }

        // The insertion-ordered set drops duplicates while keeping the first occurrence.
        LinkedHashSet<String> distinct = new LinkedHashSet<>();
        distinct.add(withoutHyphens);
        distinct.add(romanSplit);
        distinct.add(abbreviated);
        distinct.add(abbreviatedGlued);
        return new ArrayList<>(distinct);
    }

    /**
     * Splits the term at punctuation characters and, when the last resulting segment is a
     * single character, returns the term with its final character dropped and that segment
     * appended. All other terms are returned unchanged.
     *
     * <p>NOTE(review): for a term like {@code "abc-1"} this reproduces the input, because the
     * dropped character is the segment itself. Confirm whether the separating punctuation was
     * meant to be removed.
     *
     * @param str the term
     * @return the resulting term; {@code str} itself when it consists of punctuation only
     */
    public String concatenateTrailingSpecifier(String str) {
        String[] segments = str.split("\\p{P}");
        if (segments.length == 0) {
            // Punctuation-only input: split() drops all empty segments and returns an empty array.
            return str;
        }
        String last = segments[segments.length - 1];
        return last.length() == 1 ? str.substring(0, str.length() - 1) + last : str;
    }

    /**
     * Stems every whitespace-separated token of the given name with the instance's stemmer.
     *
     * @param str the name to stem
     * @return the stemmed tokens joined by single spaces
     * @throws IOException declared for callers; nothing in this method body throws it
     */
    public String stemNameTokens(String str) throws IOException {
        List<String> stems = Arrays.stream(str.split("\\s+"))
                .map(token -> {
                    this.stemmer.setCurrent(token);
                    this.stemmer.stem();
                    return this.stemmer.getCurrent();
                })
                .collect(Collectors.toList());
        return StringUtils.join(stems, " ");
    }

    /**
     * Collects Roman numerals that appear as a suffix of a whitespace-separated token (not at
     * its start), or, in the last token only, directly before one final upper-case character.
     *
     * @param str the term to scan
     * @return the matched numerals in order of appearance
     */
    public Stream<String> getRomanNumbers(String str) {
        Stream.Builder<String> found = Stream.builder();
        String[] tokens = str.split("\\s+");
        for (int idx = 0; idx < tokens.length; idx++) {
            String token = tokens[idx];
            boolean isLastToken = idx == tokens.length - 1;
            Matcher roman = ROMAN_NUMBERS_PATTERN.matcher(token);
            while (roman.find()) {
                boolean isSuffix = roman.start() != 0 && roman.end() == token.length();
                if (isSuffix
                        || (isLastToken
                                && token.length() == roman.end() + 1
                                && Character.isUpperCase(token.charAt(token.length() - 1)))) {
                    found.accept(roman.group());
                }
            }
        }
        return found.build();
    }

    /**
     * Reports every occurrence of a spelled-out Greek letter name (see {@code GREEK}) in the
     * given string, as found by the {@code greekAC} matcher.
     *
     * @param str the text to scan
     * @return the matched names in callback order
     */
    public Stream<String> getGreekCharacters(String str) {
        List<String> hits = new ArrayList<>();
        this.greekAC.match(str, (begin, end, matched) -> {
            hits.add(matched);
        });
        return hits.stream();
    }

    /**
     * Lazily extracts every letters-then-digits occurrence ({@code NUMBER_SPECIFIER_PATTERN})
     * from the whitespace-separated tokens of the given string.
     *
     * <p>The returned stream reuses one {@link Matcher}, so it must be consumed sequentially.
     *
     * @param str the term to scan
     * @return the matched substrings in order of appearance
     */
    public Stream<String> getNumbers(String str) {
        final Matcher specifier = NUMBER_SPECIFIER_PATTERN.matcher("");
        return Stream.of(str)
                .flatMap(term -> Arrays.stream(term.split("\\s+")))
                .flatMap(token -> {
                    Stream.Builder<String> hits = Stream.builder();
                    for (specifier.reset(token); specifier.find(); ) {
                        hits.accept(specifier.group());
                    }
                    return hits.build();
                });
    }

    /**
     * Normalizes a tab-separated term file. Each input line must have two or three columns; the
     * first column is the term. For every line the normalized term and the normalized forms of
     * its generated variants are written, each followed by the remaining columns. Duplicate
     * output lines are written once. Lines with another column count are reported on standard
     * error and counted. Lines are processed in parallel, so output order is not defined.
     *
     * <p>I/O failures are logged and do not propagate.
     *
     * @param file  the input file
     * @param file2 the output file; overwritten
     */
    public void normalizeFile(File file, File file2) {
        System.out.println("Normalizing file " + file.getAbsolutePath() + " and writing the result to " + file2.getAbsolutePath());
        AtomicInteger ignoredLines = new AtomicInteger(0);
        // try-with-resources closes writer and reader on every exit path.
        try (BufferedReader reader = new BufferedReader(new FileReader(file));
             FileWriter writer = new FileWriter(file2)) {
            reader.lines().parallel()
                    .map(line -> line.split("\t"))
                    .filter(columns -> {
                        if (columns.length == 2 || columns.length == 3) {
                            return true;
                        }
                        ignoredLines.incrementAndGet();
                        System.err.println("wrong line format, ignoring line: " + Arrays.toString(columns));
                        return false;
                    })
                    .flatMap(this::toNormalizedOutputLines)
                    .unordered()
                    .distinct()
                    .forEach(outputLine -> {
                        try {
                            // Worker threads share the writer; serialize the writes.
                            synchronized (writer) {
                                writer.write(outputLine);
                            }
                        } catch (IOException e) {
                            // Best effort: report the lost line and keep going.
                            System.err.println("Could not write line: " + outputLine);
                            log.error("Could not write line: {}", outputLine, e);
                        }
                    });
        } catch (IOException e) {
            log.error("I/O error while normalizing {} into {}", file, file2, e);
        }
        System.out.println("\n\n\ndone");
        System.out.println("number of ignored lines (due to wrong format): " + ignoredLines);
    }

    /**
     * Builds the output lines for one input record of two or three columns: the normalized term
     * first, then one line per normalized variant. Yields nothing when the term normalizes to
     * the empty string.
     */
    private Stream<String> toNormalizedOutputLines(String[] columns) {
        String normalized = normalize(columns[0]);
        if (normalized.isEmpty()) {
            return Stream.empty();
        }
        String trailingColumns = columns.length == 3
                ? "\t" + columns[1] + "\t" + columns[2] + "\n"
                : "\t" + columns[1] + "\n";
        Stream.Builder<String> lines = Stream.builder();
        lines.accept(normalized + trailingColumns);
        for (String variant : generateVariants(columns[0])) {
            lines.accept(normalize(variant) + trailingColumns);
        }
        return lines.build();
    }

    /**
     * Inserts spaces at the case and digit boundaries described by {@code tokenSplitPattern}
     * until a token no longer changes, then splits all tokens at single spaces.
     *
     * <p>The given list is updated in place with the spaced-out tokens (via {@code set}, so
     * fixed-size lists work as well); the returned list is a new one.
     *
     * @param list the tokens to split
     * @return a new list with the split tokens; empty tokens of {@code list} are dropped
     */
    public List<String> specialTokenSplit(List<String> list) {
        for (int i = 0; i < list.size(); i++) {
            String current = list.get(i);
            String next = insertSplitSpace(current);
            while (!next.equals(current)) {
                current = next;
                next = insertSplitSpace(current);
            }
            // One in-place write instead of remove + add on every pass.
            list.set(i, current);
        }

        List<String> result = new ArrayList<>();
        for (String spaced : list) {
            if (spaced.length() > 0) {
                result.addAll(Arrays.asList(spaced.split(" ")));
            }
        }
        return result;
    }

    /**
     * Inserts one space at a boundary matched by {@code tokenSplitPattern}, or returns the
     * token unchanged when the pattern does not match.
     */
    private String insertSplitSpace(String token) {
        Matcher boundary = this.tokenSplitPattern.matcher(token);
        if (boundary.matches()) {
            // Groups come in (before, after) pairs, one pair per alternative: 1/2, 3/4, 5/6, 7/8.
            for (int group = 1; group <= 7; group += 2) {
                if (boundary.group(group) != null && boundary.group(group + 1) != null) {
                    return boundary.group(group) + " " + boundary.group(group + 1);
                }
            }
        }
        return token;
    }

    /**
     * @return the matcher built in the constructor over the entries of {@code GREEK}
     */
    public AhoCorasickOptimized getGreekAC() {
        return this.greekAC;
    }

    /**
     * @return the matcher built in the constructor over the entries of {@code GREEK} plus
     *         "high", "low" and "kinase"
     */
    public AhoCorasickOptimized getGreekHighLowKinaseAC() {
        return this.greekHighLowKinaseAC;
    }

    /**
     * Splits tokens that embed one of the strings known to {@code greekHighLowKinaseAC} (Greek
     * letter names, "high", "low", "kinase") into separate list entries. A token is left alone
     * when its lower-cased form as a whole is among the longest matches of
     * {@code specialistLexEmbeddedGreekAC}, or when it consists of exactly one such embedded
     * string. All inserted fragments are taken from the lower-cased token.
     *
     * <p>The given list is modified in place and returned.
     *
     * @param list the tokens to split
     * @return the same list instance after splitting
     */
    protected List<String> splitAwayCharacterStrings(List<String> list) {
        AhoCorasickLongestMatchCallback ahoCorasickLongestMatchCallback = new AhoCorasickLongestMatchCallback();
        int i = 0;
        while (i < list.size()) {
            ahoCorasickLongestMatchCallback.clear();
            String lowerCase = list.get(i).toLowerCase();
            specialistLexEmbeddedGreekAC.match(lowerCase, ahoCorasickLongestMatchCallback);
            TreeMap<Range<Integer>, String> longestMatches = ahoCorasickLongestMatchCallback.getLongestMatches();
            // Skip tokens that are lexicon entries as a whole.
            if (!longestMatches.containsValue(lowerCase)) {
                ahoCorasickLongestMatchCallback.clear();
                this.greekHighLowKinaseAC.match(lowerCase, ahoCorasickLongestMatchCallback);
                TreeMap<Range<Integer>, String> longestMatches2 = ahoCorasickLongestMatchCallback.getLongestMatches();
                // Only split when there is a match and the token is not just that single match.
                if (!longestMatches2.isEmpty() && (longestMatches2.size() != 1 || !longestMatches2.firstEntry().getValue().equals(lowerCase))) {
                    // i2: position in lowerCase up to which text has already been emitted.
                    int i2 = 0;
                    // z: set once an embedded match lies strictly inside a longer lexicon match.
                    // NOTE(review): z is never reset, so after the first such match all later
                    // matches of this token are skipped as well - confirm this is intended.
                    boolean z = false;
                    for (Range<Integer> range : longestMatches2.keySet()) {
                        for (Range<Integer> range2 : longestMatches.keySet()) {
                            if (range2.containsRange(range) && (((Integer) range2.getMinimum()).intValue() < ((Integer) range.getMinimum()).intValue() || ((Integer) range2.getMaximum()).intValue() > ((Integer) range.getMaximum()).intValue())) {
                                z = true;
                            }
                        }
                        if (!z) {
                            // Text between the previous emit position and the start of this match.
                            Range between = Range.between(Integer.valueOf(i2), (Integer) range.getMinimum());
                            if (i2 != 0) {
                                // Not the first match: insert the gap text (if any), then the match.
                                if (((Integer) between.getMaximum()).intValue() > ((Integer) between.getMinimum()).intValue()) {
                                    list.add(i, lowerCase.substring(((Integer) between.getMinimum()).intValue(), ((Integer) between.getMaximum()).intValue()));
                                    i++;
                                }
                                list.add(i, longestMatches2.get(range));
                            } else if (((Integer) between.getMaximum()).intValue() > 0) {
                                // First match with leading text: the original entry becomes the
                                // leading text and the match is inserted after it.
                                list.set(i, lowerCase.substring(((Integer) between.getMinimum()).intValue(), ((Integer) between.getMaximum()).intValue()));
                                i++;
                                list.add(i, longestMatches2.get(range));
                            } else {
                                // First match starts at index 0: the original entry becomes the match.
                                list.set(i, longestMatches2.get(range));
                            }
                            i++;
                            i2 = ((Integer) range.getMaximum()).intValue() + 1;
                        }
                    }
                    // NOTE(review): with "< length - 1" a remainder of exactly one character is
                    // not appended - confirm against the end-index convention of the callback.
                    if (!z && i2 < lowerCase.length() - 1) {
                        list.add(i, lowerCase.substring(i2));
                    }
                }
            }
            i++;
        }
        return list;
    }

    /**
     * Expands a trailing or stand-alone L/R (l/r) in tokens matched by {@code shortFormPattern}
     * to "ligand"/"receptor". The matched token is replaced by the text before the letter
     * (empty for a stand-alone letter) and the expansion is inserted right after it.
     *
     * @param list the tokens; modified in place
     * @return the same list instance
     */
    private List<String> replaceShortForms(List<String> list) {
        for (int pos = 0; pos < list.size(); pos++) {
            Matcher shortForm = this.shortFormPattern.matcher(list.get(pos));
            if (!shortForm.matches()) {
                continue;
            }
            String stem = "";
            String expansion = "";
            if (shortForm.group(3) != null) {
                // Upper-case L/R after letters or digits.
                stem = shortForm.group(2);
                String letter = shortForm.group(3);
                expansion = letter.equals("L") ? "ligand" : letter.equals("R") ? "receptor" : letter;
            } else if (shortForm.group(5) != null) {
                // Lower-case l/r after digits.
                stem = shortForm.group(4);
                String letter = shortForm.group(5);
                expansion = letter.equals("l") ? "ligand" : letter.equals("r") ? "receptor" : letter;
            } else if (shortForm.group(6) != null) {
                // The token is a single l/r letter in either case.
                String whole = shortForm.group(1);
                if (whole.equalsIgnoreCase("l")) {
                    expansion = "ligand";
                } else if (whole.equalsIgnoreCase("r")) {
                    expansion = "receptor";
                }
            }
            list.set(pos, stem);
            // Step onto the inserted expansion; the loop increment then moves past it.
            list.add(++pos, expansion);
        }
        return list;
    }

    /**
     * Expands a short form at the end of a term: " ra"/" rb"/" rg"/" bp" (optionally followed
     * by a space and digits) become "receptor alpha"/"receptor beta"/"receptor gamma"/"binding
     * protein"; otherwise a final " a"/" b" becomes "alpha"/"beta".
     *
     * @param str the term
     * @return the expanded term, or {@code str} unchanged when neither pattern matches
     */
    private String replaceShortFormsAtEnd(String str) {
        Matcher withNumber = this.shortFormEndWithNumberPattern.matcher(str);
        if (withNumber.matches()) {
            // Group 2 can only be one of the four alternatives of the pattern; the former
            // "a"/"b" checks at this point could never be reached.
            String expansion;
            switch (withNumber.group(2)) {
                case "ra":
                    expansion = "receptor alpha";
                    break;
                case "rb":
                    expansion = "receptor beta";
                    break;
                case "rg":
                    expansion = "receptor gamma";
                    break;
                case "bp":
                    expansion = "binding protein";
                    break;
                default:
                    expansion = "";
                    break;
            }
            if (!expansion.isEmpty()) {
                String trailingNumber = withNumber.group(3);
                return withNumber.group(1) + expansion + (trailingNumber != null ? trailingNumber : "");
            }
        }

        Matcher noNumber = this.shortFormEndNoNumberPattern.matcher(str);
        if (noNumber.matches()) {
            String letter = noNumber.group(2);
            if (letter.equals("a")) {
                return noNumber.group(1) + "alpha";
            }
            if (letter.equals("b")) {
                return noNumber.group(1) + "beta";
            }
        }
        return str;
    }

    /**
     * Replaces the tokens "il" and "IL" with "interleukin".
     *
     * @param list the tokens; modified in place
     * @return the same list instance
     */
    private List<String> replaceKnownAcronyms(List<String> list) {
        int pos = 0;
        while (pos < list.size()) {
            String token = list.get(pos);
            boolean isInterleukinAcronym = token.equals("il") || token.equals("IL");
            if (isInterleukinAcronym) {
                list.set(pos, "interleukin");
            }
            pos++;
        }
        return list;
    }

    /**
     * Splits tokens that consist of letters followed by digits ({@code NUMBER_SPECIFIER_PATTERN})
     * into a letter token and a digit token.
     *
     * @param list the tokens; modified in place
     * @return the same list instance
     */
    public List<String> splitAwayNumbers(List<String> list) {
        for (int pos = 0; pos < list.size(); pos++) {
            Matcher specifier = NUMBER_SPECIFIER_PATTERN.matcher(list.get(pos));
            if (!specifier.matches()) {
                continue;
            }
            list.set(pos, specifier.group(1));
            // Step onto the inserted digit token; the loop increment then moves past it.
            list.add(++pos, specifier.group(2));
        }
        return list;
    }

    /**
     * Splits Roman numerals off tokens. A numeral that ends a token (and does not start it) is
     * moved into its own token; in the last list entry, a numeral followed by exactly one
     * upper-case character is split into prefix, numeral and that character.
     *
     * @param list the tokens; not modified
     * @return a new list with the split tokens
     */
    public List<String> splitAwayRomanNumbers(List<String> list) {
        List<String> tokens = new ArrayList<>(list);
        for (int pos = 0; pos < tokens.size(); pos++) {
            String token = tokens.get(pos);
            Matcher roman = ROMAN_NUMBERS_PATTERN.matcher(token);
            // The matcher keeps scanning the original token while pos advances past inserted entries.
            while (roman.find()) {
                boolean isSuffix = roman.start() != 0 && roman.end() == token.length();
                if (isSuffix) {
                    tokens.set(pos, token.substring(0, roman.start()));
                    tokens.add(++pos, roman.group());
                    continue;
                }
                boolean beforeFinalCapital = pos == tokens.size() - 1
                        && token.length() == roman.end() + 1
                        && Character.isUpperCase(token.charAt(token.length() - 1));
                if (beforeFinalCapital) {
                    tokens.set(pos, token.substring(0, roman.start()));
                    tokens.add(++pos, roman.group());
                    tokens.add(++pos, String.valueOf(token.charAt(token.length() - 1)));
                }
            }
        }
        return tokens;
    }

    /**
     * Replaces the tokens "I", "II", "III" and "IV" with "1" to "4". Lists with fewer than two
     * tokens are returned untouched.
     *
     * @param list the tokens; modified in place
     * @return the same list instance
     */
    protected List<String> replaceRomanNumbers(List<String> list) {
        if (list.size() <= 1) {
            return list;
        }
        for (int pos = 0; pos < list.size(); pos++) {
            switch (list.get(pos)) {
                case "I":
                    list.set(pos, "1");
                    break;
                case "II":
                    list.set(pos, "2");
                    break;
                case "III":
                    list.set(pos, "3");
                    break;
                case "IV":
                    list.set(pos, "4");
                    break;
                default:
                    break;
            }
        }
        return list;
    }

    /**
     * Replaces every token that is a key of the {@code plurals} table with its mapped value.
     *
     * @param list the tokens; modified in place
     * @return the same list instance
     */
    private List<String> transformPlurals(List<String> list) {
        int pos = 0;
        while (pos < list.size()) {
            String token = list.get(pos);
            if (this.plurals.containsKey(token)) {
                list.set(pos, this.plurals.get(token));
            }
            pos++;
        }
        return list;
    }

    /**
     * Trims and lower-cases every token.
     *
     * @param list the tokens; modified in place
     * @return the same list instance
     */
    protected List<String> toLowerCase(List<String> list) {
        int pos = 0;
        while (pos < list.size()) {
            String normalized = list.get(pos).trim().toLowerCase();
            list.set(pos, normalized);
            pos++;
        }
        return list;
    }

    /**
     * Replaces every non-word character and underscore, except the dot, with a space; replaces
     * one dot standing between a letter and a letter or digit with a space; collapses runs of
     * spaces; and splits the result at spaces.
     *
     * @param list the tokens; not modified
     * @return a new list of the resulting non-empty tokens
     */
    protected List<String> removeSpecialCharacters(List<String> list) {
        List<String> cleaned = new ArrayList<>();
        for (String token : list) {
            String text = token.replaceAll("[\\W_&&[^\\.]]", " ");
            Matcher dot = this.dotRemovalPattern.matcher(text);
            if (dot.matches()) {
                String withoutDot = dot.group(1) + dot.group(2) + " " + dot.group(3) + dot.group(4);
                text = dot.replaceFirst(withoutDot);
            }
            text = text.replaceAll("[ ]+", " ").trim();
            if (text.length() > 0) {
                cleaned.addAll(Arrays.asList(text.split(" ")));
            }
        }
        return cleaned;
    }

    /**
     * Replaces every hyphen in every token with a space. Despite the name, dots are left as
     * they are.
     *
     * @param list the tokens; not modified
     * @return a new list with the rewritten tokens
     */
    private List<String> removeDotAndHyphen(List<String> list) {
        List<String> rewritten = new ArrayList<>();
        for (String token : list) {
            rewritten.add(token.replaceAll("\\-", " "));
        }
        return rewritten;
    }

    /**
     * Splits the term at single spaces and drops tokens contained in the stop word set. A term
     * that yields exactly one token is kept as is, even when that token is a stop word.
     *
     * @param str the term
     * @return a new list with the remaining tokens
     */
    protected List<String> removeStopwords(String str) {
        String[] tokens = str.split(" ");
        List<String> kept = new ArrayList<>(tokens.length);
        boolean singleToken = tokens.length == 1;
        for (String token : tokens) {
            if (singleToken || !stopwords.contains(token)) {
                kept.add(token);
            }
        }
        return kept;
    }

    /**
     * Splits the term at single spaces, drops tokens contained in the non-descriptive word set
     * and joins the rest with single spaces.
     *
     * @param str the term
     * @return the term without non-descriptive tokens
     */
    public String removeNonDescriptives(String str) {
        String[] tokens = str.split(" ");
        List<String> descriptive = new ArrayList<>(tokens.length);
        for (String token : tokens) {
            if (nonDescriptives.contains(token)) {
                continue;
            }
            descriptive.add(token);
        }
        return ArrayList2String(descriptive);
    }

    /**
     * @param str the token to look up (exact match, no case folding)
     * @return {@code true} if the token is contained in the non-descriptive word set
     */
    public boolean isNonDescriptive(String str) {
        return nonDescriptives.contains(str);
    }

    /**
     * Fills the shared stop word set on first use.
     *
     * <p>The set is a static field, so the lock is taken on the class; an instance-level
     * {@code synchronized} would let two instances initialise it at the same time. The set is
     * published only after it has been filled.
     */
    private void initStopwords() {
        synchronized (TermNormalizer.class) {
            if (stopwords == null) {
                stopwords = new TreeSet<>(Arrays.asList("of", "for", "and", "or", "the"));
            }
        }
    }

    /**
     * Creates the plural-to-singular table used by {@code transformPlurals}.
     */
    private void initPlurals() {
        String[][] pluralToSingular = {
            {"receptors", "receptor"},
            {"proteins", "protein"},
            {"factors", "factor"},
            {"ligands", "ligand"},
            {"chains", "chain"},
            {"antigens", "antigen"},
            {"genes", "gene"},
            {"transcripts", "transcript"},
        };
        HashMap<String, String> table = new HashMap<>();
        for (String[] entry : pluralToSingular) {
            table.put(entry[0], entry[1]);
        }
        this.plurals = table;
    }

    /**
     * Loads the shared non-descriptive word set from the resource {@code /non_descriptives}
     * (one trimmed entry per line) on first use.
     *
     * <p>The set is a static field, so the lock is taken on the class rather than on the
     * instance. The set is assigned only after it has been read completely, so a failed read
     * does not leave a partially filled set behind. The reader is closed on every path.
     *
     * @throws IllegalStateException if the resource cannot be read
     */
    private void initNonDescriptives() {
        synchronized (TermNormalizer.class) {
            if (nonDescriptives != null) {
                return;
            }
            TreeSet<String> loaded = new TreeSet<>();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(FileUtilities.findResource("/non_descriptives")))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    loaded.add(line.trim());
                }
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            nonDescriptives = loaded;
        }
    }

    /**
     * Joins the tokens with single spaces and trims the result.
     *
     * @param list the tokens
     * @return the joined string; empty for an empty list
     */
    protected String ArrayList2String(List<String> list) {
        return String.join(" ", list).trim();
    }

    static {
        try {
            List list = (List) IOStreamUtilities.getReaderFromInputStream(FileUtilities.findResource("specialistLexiconEmbeddedGreekLowHigh.txt.gz")).lines().collect(Collectors.toList());
            list.add("kinase");
            list.add("kinases");
            specialistLexEmbeddedGreekAC = new AhoCorasickOptimized(list);
        } catch (IOException e) {
            log.error("Could not find the SPECIALIST Lexicon entries filtered for embedded mentions of greek symbols or the words 'low' and 'high' on the classpath as 'specialistLexiconEmbeddedGreekLowHigh.txt.gz'. Term normalization will split words containing greek symbols by coincidence.");
            specialistLexEmbeddedGreekAC = new AhoCorasickOptimized(new String[0]);
        }
        greekCharacterNormalizationMap = new HashMap();
        greekCharacterNormalizationMap.put("α", "alpha");
        greekCharacterNormalizationMap.put("β", "beta");
        greekCharacterNormalizationMap.put("γ", "gamma");
        greekCharacterNormalizationMap.put("δ", "delta");
        greekCharacterNormalizationMap.put("ε", "epsilon");
        greekCharacterNormalizationMap.put("ζ", "zeta");
        greekCharacterNormalizationMap.put("η", "eta");
        greekCharacterNormalizationMap.put("θ", "theta");
        greekCharacterNormalizationMap.put("ι", "iota");
        greekCharacterNormalizationMap.put("κ", "kappa");
        greekCharacterNormalizationMap.put("λ", "delta");
        greekCharacterNormalizationMap.put("μ", "mu");
        greekCharacterNormalizationMap.put("ν", "nu");
        greekCharacterNormalizationMap.put("ξ", "xi");
        greekCharacterNormalizationMap.put("ο", "omicron");
        greekCharacterNormalizationMap.put("π", "pi");
        greekCharacterNormalizationMap.put("ρ", "rho");
        greekCharacterNormalizationMap.put("σ", "sigma");
        greekCharacterNormalizationMap.put("τ", "tau");
        greekCharacterNormalizationMap.put("υ", "upsilon");
        greekCharacterNormalizationMap.put("φ", "phi");
        greekCharacterNormalizationMap.put("χ", "chi");
        greekCharacterNormalizationMap.put("ψ", "omega");
        GREEK = new String[]{"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega"};
        GREEK_REGEX = "(" + ((String) Stream.of((Object[]) GREEK).collect(Collectors.joining("|"))) + ")";
        LAT_NUM_REGEX = "(" + ((String) Stream.of((Object[]) new String[]{"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX"}).sorted(Comparator.reverseOrder()).collect(Collectors.joining("|"))) + ")";
        ROMAN_NUMBERS_PATTERN = Pattern.compile(LAT_NUM_REGEX);
        NUMBER_SPECIFIER_PATTERN = Pattern.compile(NUMBERPATTERN);
    }
}
