package de.julielab.jules.ae.genemapping.utils.norm;

import com.lahodiuk.ahocorasick.AhoCorasickOptimized;
import de.julielab.jules.ae.genemapping.AhoCorasickLongestMatchCallback;
import de.julielab.jules.ae.genemapping.CandidateFilter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.Range;
import org.apache.commons.lang3.StringUtils;
import org.tartarus.snowball.SnowballProgram;

/* loaded from: input_file:de/julielab/jules/ae/genemapping/utils/norm/TermNormalizer.class */
public class TermNormalizer {
    /** Terms treated as non-descriptive; filled from the classpath resource by {@code initNonDescriptives()}. */
    private TreeSet<String> nonDescriptives;
    /** Stopwords dropped from multi-token terms; filled by {@code initStopwords()}. */
    private TreeSet<String> stopwords;
    /** Plural-to-singular lookup used by {@code transformPlurals}; filled by {@code initPlurals()}. */
    private HashMap<String, String> plurals;
    /** Aho-Corasick matcher built in the constructor from the Greek-letter dictionary plus "high" and "low". */
    private AhoCorasickOptimized greekAC;
    /** Snowball stemmer instance created reflectively in the constructor. */
    private SnowballProgram stemmer;

    /** Classpath location of the non-descriptive term list. */
    private static final String NON_DESCRIPTIVES_FILE = "/non_descriptives";
    /** Letters immediately followed by digits, e.g. {@code abc12}. */
    private static final String NUMBERPATTERN = "([A-Za-z]+)([0-9]+)";
    /** Token that ends in (or consists only of) an {@code L}/{@code R} short-form marker. */
    private static final String SHORTFORMPATTERN = "((.*[0-9a-z]+)(L|R)|(.*[0-9]+)(l|r)|(r|l|R|L))";
    /** Term ending in {@code ra}, {@code rb}, {@code rg} or {@code bp}, optionally followed by a number. */
    private static final String SHORTFORMEND_WITH_NUMBER_PATTERN = "(.* )(ra|rb|rg|bp)( [0-9]*)?";
    /** Term ending in a single {@code a} or {@code b} token. */
    private static final String SHORTFORMEND_NO_NUMBER_PATTERN = "(.* )(a|b)";
    /** Positions inside a token where a case change or a letter/digit change occurs. */
    private static final String TOKENSPLITPATTERN = "(.*[a-z])([A-Z0-9].*)|(.*[A-Z])([0-9].*)|(.*[0-9])([a-zA-Z].*)|(.*[A-Z][A-Z])([a-z].*)";
    /** A dot enclosed by a letter on the left and a letter or digit on the right. */
    private static final String DOTREMOVAL = "(.*)([a-zA-Z])\\.([a-zA-Z0-9])(.*)";

    // The patterns are compiled from the constants above so that each regular expression is
    // written down exactly once. The field names are kept because the methods of this class
    // reference them.
    private final Pattern numberPattern = Pattern.compile(NUMBERPATTERN);
    private final Pattern shortFormPattern = Pattern.compile(SHORTFORMPATTERN);
    private final Pattern tokenSplitPattern = Pattern.compile(TOKENSPLITPATTERN);
    private final Pattern dotRemovalPattern = Pattern.compile(DOTREMOVAL);
    private final Pattern shortFormEndWithNumberPattern = Pattern.compile(SHORTFORMEND_WITH_NUMBER_PATTERN);
    private final Pattern shortFormEndNoNumberPattern = Pattern.compile(SHORTFORMEND_NO_NUMBER_PATTERN);

    /**
     * Creates a normalizer: builds the Aho-Corasick matcher from {@code CandidateFilter.GREEK}
     * extended by "high" and "low", fills the stopword, plural and non-descriptive lookups and
     * instantiates the English Snowball stemmer by class name.
     *
     * @throws IllegalStateException if the stemmer class cannot be loaded or instantiated; the
     *         reflective failure is attached as the cause
     */
    public TermNormalizer() {
        // An explicitly mutable list is required because two more entries are appended below.
        List<String> dictionary = Arrays.stream(CandidateFilter.GREEK).collect(Collectors.toCollection(ArrayList::new));
        dictionary.add("high");
        dictionary.add("low");
        this.greekAC = new AhoCorasickOptimized(dictionary);
        initStopwords();
        // Without this call the plurals map stays null and transformPlurals would fail with an NPE.
        initPlurals();
        initNonDescriptives();
        try {
            // Class#newInstance is deprecated; going through the constructor object is the replacement.
            this.stemmer = (SnowballProgram) Class.forName("org.tartarus.snowball.ext.EnglishStemmer")
                    .getDeclaredConstructor()
                    .newInstance();
        } catch (ReflectiveOperationException e) {
            // Keep the original exception as cause instead of flattening it to a string.
            throw new IllegalStateException("Could not instantiate org.tartarus.snowball.ext.EnglishStemmer", e);
        }
    }

    /**
     * Command line entry point: normalizes the file given as first argument and writes the
     * result to the file given as second argument.
     *
     * @param strArr exactly two entries: input file path and output file path
     * @throws IOException declared for compatibility with existing callers
     */
    public static void main(String[] strArr) throws IOException {
        if (strArr.length != 2) {
            System.err.println("usage:\nTermNormalizer <inputFile> <outputFile>");
            System.exit(-1);
            return;
        }
        new TermNormalizer().normalizeFile(new File(strArr[0]), new File(strArr[1]));
    }

    /**
     * Normalizes a single term.
     *
     * <p>Stopwords and special characters are removed first. Number splitting and case/digit
     * boundary splitting are then repeated until a pass returns a token list equal to the one it
     * was given. Finally dictionary strings are split off, the Roman numerals I-IV are replaced
     * by digits and everything is lower-cased and joined with single spaces.
     *
     * @param str the term to normalize
     * @return the normalized, trimmed term
     */
    public String normalize(String str) {
        ArrayList<String> tokens = removeSpecialCharacters(removeStopwords(str));
        ArrayList<String> before;
        do {
            // splitAwayNumbers edits "before" in place, specialTokenSplit returns a fresh list;
            // the comparison below therefore detects whether specialTokenSplit changed anything.
            before = tokens;
            ArrayList<String> numbersSplit = splitAwayNumbers(before);
            tokens = specialTokenSplit(numbersSplit);
        } while (!tokens.equals(before));

        ArrayList<String> dictionarySplit = splitAwayCharacterStrings(tokens);
        ArrayList<String> arabicNumbers = replaceRomanNumbers(dictionarySplit);
        ArrayList<String> lowerCased = toLowerCase(arabicNumbers);
        return ArrayList2String(lowerCased).trim();
    }

    /**
     * Produces four spelling variants of a term, in this order: hyphens removed where neither
     * neighbour is a digit; trailing Roman numerals split off from whitespace-separated tokens;
     * the words alpha/beta/gamma/delta abbreviated to a/b/g/d; and the same abbreviation with
     * one optional preceding whitespace character swallowed.
     *
     * @param str the term to derive variants from
     * @return a new mutable list holding the four variants
     */
    public List<String> generateVariants(String str) {
        String hyphenFree = str.replaceAll("([^-0-9])\\-([^0-9])", "$1$2");

        List<String> whitespaceTokens = Arrays.asList(str.split("\\s+"));
        String romanNumbersSplit = String.join(" ", splitAwayRomanNumbers(whitespaceTokens));

        String greekAbbreviated = str
                .replaceAll("alpha", "a")
                .replaceAll("beta", "b")
                .replaceAll("gamma", "g")
                .replaceAll("delta", "d");

        String greekAttached = str
                .replaceAll("\\s?alpha", "a")
                .replaceAll("\\s?beta", "b")
                .replaceAll("\\s?gamma", "g")
                .replaceAll("\\s?delta", "d");

        List<String> variants = new ArrayList<>();
        variants.add(hyphenFree);
        variants.add(romanNumbersSplit);
        variants.add(greekAbbreviated);
        variants.add(greekAttached);
        return variants;
    }

    /**
     * Stems every whitespace-separated token of a name with the stemmer created in the
     * constructor and joins the stems with single spaces.
     *
     * <p>NOTE(review): all calls share one stemmer instance and set/read its current word, so
     * concurrent use of this method looks unsafe - confirm before calling it from several threads.
     *
     * @param str the name whose tokens are stemmed
     * @return the stemmed tokens joined by a single space
     * @throws IOException declared by the signature; nothing in this method's own code throws it
     */
    public String stemNameTokens(String str) throws IOException {
        String[] tokens = str.split("\\s+");
        List<String> stems = new ArrayList<>(tokens.length);
        for (int pos = 0; pos < tokens.length; pos++) {
            this.stemmer.setCurrent(tokens[pos]);
            this.stemmer.stem();
            stems.add(this.stemmer.getCurrent());
        }
        return StringUtils.join(stems, " ");
    }

    /**
     * Normalizes the first column of a tab-separated file and writes the result to another file.
     *
     * <p>Every input line must split into exactly three tab-separated columns; other lines are
     * reported on {@code System.err}, counted and skipped. For each accepted line whose
     * normalized term is not empty, the normalized term and the normalized form of each of its
     * variants are written, each followed by the unchanged second and third column. Lines are
     * processed in parallel, duplicates are written once and the output order is unspecified.
     *
     * <p>An {@link IOException} raised while opening, writing or closing the files is printed
     * and not propagated. Both files are closed in every case.
     *
     * @param file  the input file
     * @param file2 the output file; it is opened with {@code new FileWriter(file2)}
     */
    public void normalizeFile(File file, File file2) {
        System.out.println("Normalizing file " + file.getAbsolutePath() + " and writing the result to " + file2.getAbsolutePath());
        AtomicInteger ignoredLines = new AtomicInteger(0);
        // try-with-resources closes the writer first and the reader second, also on failure.
        try (BufferedReader reader = new BufferedReader(new FileReader(file));
                FileWriter writer = new FileWriter(file2)) {
            reader.lines()
                    .parallel()
                    .map(line -> line.split("\t"))
                    .filter(columns -> {
                        if (columns.length == 3) {
                            return true;
                        }
                        ignoredLines.incrementAndGet();
                        System.err.println("wrong line format, ignoring line: " + Arrays.toString(columns));
                        return false;
                    })
                    .flatMap(this::toOutputLines)
                    .unordered()
                    .distinct()
                    .forEach(line -> {
                        try {
                            // The stream is parallel, so writes are serialized on the writer.
                            synchronized (writer) {
                                writer.write(line);
                            }
                        } catch (IOException e) {
                            // Best effort: report the lost line and continue with the others.
                            System.err.println("Could not write line: " + line);
                            e.printStackTrace();
                        }
                    });
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("\n\n\ndone");
        System.out.println("number of ignored lines (due to wrong format): " + ignoredLines);
    }

    /**
     * Builds the output lines for one accepted input line: the normalized term followed by the
     * normalized variants, each with the second and third column appended. Returns an empty
     * stream when the normalized term is empty.
     */
    private Stream<String> toOutputLines(String[] columns) {
        Stream.Builder<String> lines = Stream.builder();
        String normalizedTerm = normalize(columns[0]);
        if (!normalizedTerm.isEmpty()) {
            String trailingColumns = "\t" + columns[1] + "\t" + columns[2] + "\n";
            lines.accept(normalizedTerm + trailingColumns);
            for (String variant : generateVariants(columns[0])) {
                lines.accept(normalize(variant) + trailingColumns);
            }
        }
        return lines.build();
    }

    /**
     * Inserts a space at every position matched by the token split pattern and then breaks the
     * tokens apart at those spaces.
     *
     * <p>The given list is modified in place: each element is replaced by its space-separated
     * form. The returned list is a new list holding the individual pieces; empty elements are
     * dropped.
     *
     * @param arrayList the tokens to split; its elements are overwritten
     * @return a new list with the split tokens
     */
    private ArrayList<String> specialTokenSplit(ArrayList<String> arrayList) {
        for (int pos = 0; pos < arrayList.size(); pos++) {
            String current = arrayList.get(pos);
            while (true) {
                arrayList.set(pos, current);
                String candidate = current;
                Matcher boundary = this.tokenSplitPattern.matcher(current);
                if (boundary.matches()) {
                    // The pattern has four alternatives with two groups each: (1,2), (3,4), (5,6), (7,8).
                    for (int left = 1; left <= 7; left += 2) {
                        if (boundary.group(left) != null && boundary.group(left + 1) != null) {
                            candidate = boundary.group(left) + " " + boundary.group(left + 1);
                            break;
                        }
                    }
                }
                if (candidate.equals(current)) {
                    // Nothing left to split in this element.
                    break;
                }
                current = candidate;
            }
        }

        ArrayList<String> pieces = new ArrayList<>();
        for (String spaced : arrayList) {
            if (!spaced.isEmpty()) {
                pieces.addAll(Arrays.asList(spaced.split(" ")));
            }
        }
        return pieces;
    }

    /**
     * Splits tokens around dictionary strings found by {@code greekAC} (built in the
     * constructor from {@code CandidateFilter.GREEK} plus "high" and "low").
     *
     * <p>Each token is lower-cased and matched. A token is left untouched when nothing matches
     * or when the only match equals the whole lower-cased token. Otherwise the token is replaced
     * in place by the lower-cased text before, between and after the matches, interleaved with
     * the matched strings. The list is modified in place and returned.
     *
     * @param arrayList the tokens to process; modified in place
     * @return the same list instance
     */
    private ArrayList<String> splitAwayCharacterStrings(ArrayList<String> arrayList) {
        AhoCorasickLongestMatchCallback ahoCorasickLongestMatchCallback = new AhoCorasickLongestMatchCallback();
        // i is the list position currently being written; it advances past every inserted piece.
        int i = 0;
        while (i < arrayList.size()) {
            ahoCorasickLongestMatchCallback.clear();
            String lowerCase = arrayList.get(i).toLowerCase();
            this.greekAC.match(lowerCase, ahoCorasickLongestMatchCallback);
            TreeMap<Range<Integer>, String> longestMatches = ahoCorasickLongestMatchCallback.getLongestMatches();
            // Skip tokens without matches and tokens that consist of exactly one dictionary string.
            if (!longestMatches.isEmpty() && (longestMatches.size() != 1 || !longestMatches.firstEntry().getValue().equals(lowerCase))) {
                // i2 is the offset in lowerCase directly behind the previously handled match.
                // presumably range maxima are inclusive offsets (hence the "+ 1" below) - TODO confirm
                int i2 = 0;
                for (Range<Integer> range : longestMatches.keySet()) {
                    // Span of text between the previous match (or the token start) and this match.
                    Range between = Range.between(Integer.valueOf(i2), (Integer) range.getMinimum());
                    if (i2 != 0) {
                        // A previous match was already written: insert the gap text, if any, then the match.
                        if (((Integer) between.getMaximum()).intValue() > ((Integer) between.getMinimum()).intValue()) {
                            arrayList.add(i, lowerCase.substring(((Integer) between.getMinimum()).intValue(), ((Integer) between.getMaximum()).intValue()));
                            i++;
                        }
                        arrayList.add(i, longestMatches.get(range));
                    } else if (((Integer) between.getMaximum()).intValue() > 0) {
                        // First match does not start at offset 0: the prefix overwrites the original token.
                        arrayList.set(i, lowerCase.substring(((Integer) between.getMinimum()).intValue(), ((Integer) between.getMaximum()).intValue()));
                        i++;
                        arrayList.add(i, longestMatches.get(range));
                    } else {
                        // First match starts at offset 0: the match itself overwrites the original token.
                        arrayList.set(i, longestMatches.get(range));
                    }
                    i++;
                    i2 = ((Integer) range.getMaximum()).intValue() + 1;
                }
                // NOTE(review): with "length() - 1" a single trailing character behind the last
                // match is not re-added - confirm whether that is intended.
                if (i2 < lowerCase.length() - 1) {
                    arrayList.add(i, lowerCase.substring(i2));
                }
            }
            // NOTE(review): when no tail was added above, i already points behind the inserted
            // pieces, so this increment steps over the following token - verify.
            i++;
        }
        return arrayList;
    }

    /**
     * Expands the short-form markers L/l and R/r to "ligand" and "receptor".
     *
     * <p>A token matching the short form pattern is replaced by the part before the marker (the
     * empty string when the token is only the marker), and the expansion is inserted directly
     * after it. The list is modified in place.
     *
     * @param arrayList the tokens to process; modified in place
     * @return the same list instance
     */
    private ArrayList<String> replaceShortForms(ArrayList<String> arrayList) {
        for (int pos = 0; pos < arrayList.size(); pos++) {
            Matcher shortForm = this.shortFormPattern.matcher(arrayList.get(pos));
            if (!shortForm.matches()) {
                continue;
            }
            String stem = "";
            String expansion = "";
            if (shortForm.group(3) != null) {
                // Alternative one: upper-case marker after a letter or digit.
                stem = shortForm.group(2);
                String marker = shortForm.group(3);
                expansion = marker.equals("L") ? "ligand" : marker.equals("R") ? "receptor" : marker;
            } else if (shortForm.group(5) != null) {
                // Alternative two: lower-case marker after a digit.
                stem = shortForm.group(4);
                String marker = shortForm.group(5);
                expansion = marker.equals("l") ? "ligand" : marker.equals("r") ? "receptor" : marker;
            } else if (shortForm.group(6) != null) {
                // Alternative three: the token is nothing but the marker.
                String marker = shortForm.group(1).toLowerCase();
                if (marker.equals("l")) {
                    expansion = "ligand";
                } else if (marker.equals("r")) {
                    expansion = "receptor";
                }
            }
            arrayList.set(pos, stem);
            // Step onto the inserted expansion so the loop increment moves past it.
            arrayList.add(++pos, expansion);
        }
        return arrayList;
    }

    /**
     * Expands a short form at the end of a term.
     *
     * <p>A final token {@code ra}, {@code rb}, {@code rg} or {@code bp}, optionally followed by
     * a space and digits, becomes "receptor alpha", "receptor beta", "receptor gamma" or
     * "binding protein"; the trailing number part is kept. Otherwise a final token {@code a} or
     * {@code b} becomes "alpha" or "beta". The short form must be preceded by a space.
     *
     * @param str the term to inspect
     * @return the term with the trailing short form expanded, or {@code str} unchanged when
     *         neither pattern matches
     */
    private String replaceShortFormsAtEnd(String str) {
        Matcher withNumber = this.shortFormEndWithNumberPattern.matcher(str);
        if (withNumber.matches()) {
            String expansion;
            switch (withNumber.group(2)) {
                case "ra":
                    expansion = "receptor alpha";
                    break;
                case "rb":
                    expansion = "receptor beta";
                    break;
                case "rg":
                    expansion = "receptor gamma";
                    break;
                case "bp":
                    expansion = "binding protein";
                    break;
                default:
                    // The pattern admits no other value; fall through to the second pattern if it ever does.
                    expansion = null;
                    break;
            }
            if (expansion != null) {
                String number = withNumber.group(3) != null ? withNumber.group(3) : "";
                return withNumber.group(1) + expansion + number;
            }
        }

        Matcher noNumber = this.shortFormEndNoNumberPattern.matcher(str);
        if (noNumber.matches()) {
            // Group 2 is restricted to "a" or "b" by the pattern.
            String greek = noNumber.group(2).equals("a") ? "alpha" : "beta";
            return noNumber.group(1) + greek;
        }
        return str;
    }

    /**
     * Replaces every token that is exactly {@code il} or {@code IL} by "interleukin".
     *
     * @param arrayList the tokens to process; modified in place
     * @return the same list instance
     */
    private ArrayList<String> replaceKnownAcronyms(ArrayList<String> arrayList) {
        int pos = 0;
        while (pos < arrayList.size()) {
            String token = arrayList.get(pos);
            boolean isInterleukinAcronym = token.equals("il") || token.equals("IL");
            if (isInterleukinAcronym) {
                arrayList.set(pos, "interleukin");
            }
            pos++;
        }
        return arrayList;
    }

    /**
     * Splits tokens that consist of letters directly followed by digits into a letter token and
     * a digit token. The list is modified in place.
     *
     * @param arrayList the tokens to process; modified in place
     * @return the same list instance
     */
    private ArrayList<String> splitAwayNumbers(ArrayList<String> arrayList) {
        for (int pos = 0; pos < arrayList.size(); pos++) {
            Matcher lettersThenDigits = this.numberPattern.matcher(arrayList.get(pos));
            if (!lettersThenDigits.matches()) {
                continue;
            }
            arrayList.set(pos, lettersThenDigits.group(1));
            // Step onto the inserted digit token so the loop increment moves past it.
            arrayList.add(++pos, lettersThenDigits.group(2));
        }
        return arrayList;
    }

    /**
     * Splits a match of {@code CandidateFilter.LAT_NUM_REGEX} off the end of each token.
     *
     * <p>A match is split off when it ends at the end of the token and does not start at its
     * beginning. The token is then replaced by the text before the match and the match is
     * inserted as the following element. The input list is not modified.
     *
     * <p>NOTE(review): if the expression can match the empty string at the end of a token, the
     * loop below would act on that match as well - verify against the definition of
     * {@code LAT_NUM_REGEX}.
     *
     * @param list the tokens to process
     * @return a new list with the split tokens
     */
    public List<String> splitAwayRomanNumbers(List<String> list) {
        // Compiled once per call instead of once per token.
        Pattern latinNumber = Pattern.compile(CandidateFilter.LAT_NUM_REGEX);
        List<String> tokens = new ArrayList<>(list);
        for (int pos = 0; pos < tokens.size(); pos++) {
            String token = tokens.get(pos);
            Matcher matcher = latinNumber.matcher(token);
            while (matcher.find()) {
                if (matcher.start() != 0 && matcher.end() == token.length()) {
                    tokens.set(pos, token.substring(0, matcher.start()));
                    // Step onto the inserted element so the loop increment moves past it.
                    tokens.add(++pos, matcher.group());
                }
            }
        }
        return tokens;
    }

    /**
     * Replaces the upper-case tokens I, II, III and IV by 1, 2, 3 and 4. Lists with fewer than
     * two tokens are returned untouched.
     *
     * @param arrayList the tokens to process; modified in place
     * @return the same list instance
     */
    private ArrayList<String> replaceRomanNumbers(ArrayList<String> arrayList) {
        if (arrayList.size() <= 1) {
            return arrayList;
        }
        for (int pos = 0; pos < arrayList.size(); pos++) {
            switch (arrayList.get(pos)) {
                case "I":
                    arrayList.set(pos, "1");
                    break;
                case "II":
                    arrayList.set(pos, "2");
                    break;
                case "III":
                    arrayList.set(pos, "3");
                    break;
                case "IV":
                    arrayList.set(pos, "4");
                    break;
                default:
                    break;
            }
        }
        return arrayList;
    }

    /**
     * Replaces every token that is a key of the plurals map by the mapped value.
     *
     * @param arrayList the tokens to process; modified in place
     * @return the same list instance
     */
    private ArrayList<String> transformPlurals(ArrayList<String> arrayList) {
        int pos = 0;
        while (pos < arrayList.size()) {
            String token = arrayList.get(pos);
            if (this.plurals.containsKey(token)) {
                String singular = this.plurals.get(token);
                arrayList.set(pos, singular);
            }
            pos++;
        }
        return arrayList;
    }

    /**
     * Trims every token and converts it to lower case using the default locale.
     *
     * @param arrayList the tokens to process; modified in place
     * @return the same list instance
     */
    private ArrayList<String> toLowerCase(ArrayList<String> arrayList) {
        arrayList.replaceAll(token -> token.trim().toLowerCase());
        return arrayList;
    }

    /**
     * Replaces special characters by spaces and re-tokenizes at spaces.
     *
     * <p>Every character matched by the class {@code [\W_&&[^\.]]} (non-word characters and the
     * underscore, with the dot excluded) becomes a space. If the result matches the dot removal
     * pattern, one dot standing between a letter and a letter or digit is replaced by a space.
     * Runs of spaces are collapsed, and the non-empty results are split at spaces.
     *
     * @param arrayList the tokens to clean; not modified
     * @return a new list with the cleaned tokens
     */
    private ArrayList<String> removeSpecialCharacters(ArrayList<String> arrayList) {
        ArrayList<String> cleaned = new ArrayList<>();
        for (String token : arrayList) {
            String text = token.replaceAll("[\\W_&&[^\\.]]", " ");
            Matcher dot = this.dotRemovalPattern.matcher(text);
            if (dot.matches()) {
                // Groups are read before replaceFirst resets the matcher.
                String dotAsSpace = dot.group(1) + dot.group(2) + " " + dot.group(3) + dot.group(4);
                text = dot.replaceFirst(dotAsSpace);
            }
            String collapsed = text.replaceAll("[ ]+", " ").trim();
            if (collapsed.length() == 0) {
                continue;
            }
            cleaned.addAll(Arrays.asList(collapsed.split(" ")));
        }
        return cleaned;
    }

    /**
     * Replaces every hyphen in every token by a space. Despite the method name, the code does
     * not touch dots.
     *
     * @param arrayList the tokens to process; not modified
     * @return a new list with the converted tokens
     */
    private ArrayList<String> removeDotAndHyphen(ArrayList<String> arrayList) {
        ArrayList<String> converted = new ArrayList<>();
        for (String token : arrayList) {
            String hyphenFree = token.replaceAll("\\-", " ");
            converted.add(hyphenFree);
        }
        return converted;
    }

    /**
     * Splits a term at single spaces and drops the stopwords. A term that yields exactly one
     * token is returned as is, even when that token is a stopword.
     *
     * @param str the term to tokenize
     * @return a new list with the remaining tokens
     */
    private ArrayList<String> removeStopwords(String str) {
        String[] words = str.split(" ");
        ArrayList<String> kept = new ArrayList<>(words.length);
        if (words.length == 1) {
            kept.add(words[0]);
        } else {
            for (String word : words) {
                if (this.stopwords.contains(word)) {
                    continue;
                }
                kept.add(word);
            }
        }
        return kept;
    }

    /**
     * Removes every token contained in the non-descriptive term set from a term.
     *
     * @param str the term, with tokens separated by single spaces
     * @return the remaining tokens joined by single spaces
     */
    public String removeNonDescriptives(String str) {
        String[] words = str.split(" ");
        ArrayList<String> descriptive = new ArrayList<>(words.length);
        for (String word : words) {
            boolean nonDescriptive = this.nonDescriptives.contains(word);
            if (!nonDescriptive) {
                descriptive.add(word);
            }
        }
        return ArrayList2String(descriptive);
    }

    /**
     * Tells whether the given string is contained in the non-descriptive term set.
     *
     * @param str the string to look up; it is compared as given
     * @return {@code true} if the set contains {@code str}
     */
    public boolean isNonDescriptive(String str) {
        boolean listed = nonDescriptives.contains(str);
        return listed;
    }

    /** Fills the stopword set with "of", "for", "and", "or" and "the". */
    private void initStopwords() {
        TreeSet<String> words = new TreeSet<>();
        for (String word : new String[] {"of", "for", "and", "or", "the"}) {
            words.add(word);
        }
        this.stopwords = words;
    }

    /** Fills the plural lookup with the eight known plural-to-singular pairs. */
    private void initPlurals() {
        String[][] pluralAndSingular = {
            {"receptors", "receptor"},
            {"proteins", "protein"},
            {"factors", "factor"},
            {"ligands", "ligand"},
            {"chains", "chain"},
            {"antigens", "antigen"},
            {"genes", "gene"},
            {"transcripts", "transcript"},
        };
        HashMap<String, String> lookup = new HashMap<>();
        for (String[] pair : pluralAndSingular) {
            lookup.put(pair[0], pair[1]);
        }
        this.plurals = lookup;
    }

    /**
     * Loads the non-descriptive terms from the classpath resource {@code /non_descriptives},
     * one trimmed line per term.
     *
     * <p>An {@link IOException} while reading is printed and the terms read so far are kept.
     * The reader is closed in every case.
     *
     * @throws IllegalStateException if the resource is not on the classpath
     */
    private void initNonDescriptives() {
        this.nonDescriptives = new TreeSet<>();
        // Fully qualified because java.io.InputStream is not among the file's imports.
        java.io.InputStream resource = getClass().getResourceAsStream(NON_DESCRIPTIVES_FILE);
        if (resource == null) {
            // Fail with a clear message instead of the NullPointerException the reader would throw.
            throw new IllegalStateException("Classpath resource " + NON_DESCRIPTIVES_FILE + " not found");
        }
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(resource))) {
            String line;
            while ((line = reader.readLine()) != null) {
                this.nonDescriptives.add(line.trim());
            }
        } catch (IOException e) {
            // Best effort as before: report the problem and continue with what was read.
            e.printStackTrace();
        }
    }

    /**
     * Joins the tokens with single spaces and trims the result.
     *
     * @param arrayList the tokens to join
     * @return the joined and trimmed string; empty for an empty list
     */
    private String ArrayList2String(ArrayList<String> arrayList) {
        String joined = String.join(" ", arrayList);
        return joined.trim();
    }
}
