package uk.ac.man.entitytagger.matching.matchers;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import uk.ac.man.documentparser.dataholders.Document;
import uk.ac.man.entitytagger.Mention;
import uk.ac.man.entitytagger.matching.Matcher;

/* loaded from: input_file:uk/ac/man/entitytagger/matching/matchers/TaxonGrabMatcher.class */
/**
 * Matcher that proposes taxonomic-name mentions using token-shape heuristics
 * (capitalised word followed by lower-case word(s)) and a dictionary of words
 * that must not be part of a name.
 *
 * <p>Matching happens in two phases: the text is tokenised and scanned with a
 * small state machine that collects candidate name strings, and then every
 * distinct candidate is located in the original text to produce
 * {@link Mention}s.
 */
public class TaxonGrabMatcher extends Matcher {
    /**
     * Marker pushed onto the candidate list when a parenthesised subgenus is
     * seen, so that the following token can unconditionally pop "the previous
     * candidate". It is never a real name and is skipped when mentions are built.
     */
    private static final String SUBGENUS_PLACEHOLDER = "temporary, should be deleted";

    // All patterns below are applied with whole-token semantics (Matcher.matches()),
    // exactly as the String.matches() calls they replace.

    /**
     * Token ends with one of the listed symbols, or starts with a digit.
     * NOTE(review): because '|' binds looser than concatenation this is
     * "(.*[symbols])|([0-9].*)", which may not be what was intended - confirm before changing.
     */
    private static final Pattern RESET_TOKEN =
            Pattern.compile(".*[\\$\\%\\|\\{\\}\\*\\+\\?\\=\\-\\'\\^\\/\\@\\&]|[0-9].*");

    /** Token contains "(" directly followed by a lower-case letter (or whitespace). */
    private static final Pattern PAREN_BEFORE_LOWERCASE = Pattern.compile(".*[(][\\sa-z]+.*");

    /** Token starts with at least two letters or parentheses. */
    private static final Pattern STARTS_WITH_LETTERS = Pattern.compile(".*^[A-Za-z()]{2,}.*");

    /** Whole token is a capitalised word ("Homo") or an abbreviation ("H." / "Ho."). */
    private static final Pattern GENUS_TOKEN =
            Pattern.compile(".*\\A(?:((^[A-Z][a-z]{1,})|(^[A-Z][a-z]?\\.)))\\z.*");

    /**
     * Token ends with "var" or starts with "subsp".
     * NOTE(review): same alternation-precedence caveat as RESET_TOKEN.
     */
    private static final Pattern GENUS_EXCLUSION = Pattern.compile(".*var|subsp.*");

    /** Whole token is three or more lower-case letters followed by exactly one more character. */
    private static final Pattern SPECIES_TOKEN = Pattern.compile(".*^[a-z]{3,}.\\z.*");

    /** Whole token is a parenthesised capitalised word, e.g. "(Abcd)". */
    private static final Pattern SUBGENUS_TOKEN = Pattern.compile(".*\\A\\([A-Z][a-z]{3,}\\)\\z.*");

    /** Whole token consists of at least two letters or parentheses. */
    private static final Pattern THIRD_TOKEN = Pattern.compile("^[A-Za-z()]{2,}");

    /**
     * Token ends with "var", equals "subsp" or "subg", or starts with "ssp".
     * NOTE(review): same alternation-precedence caveat as RESET_TOKEN.
     */
    private static final Pattern RANK_TOKEN = Pattern.compile(".*var|subsp|subg|ssp.*");

    /** Lines of the dictionary file; a token found here (lower-cased, without '.' and ',') breaks a candidate. */
    private final Set<String> dict;

    /**
     * Creates a matcher and loads its dictionary, reporting progress on standard output.
     *
     * @param file dictionary file, one entry per line
     */
    public TaxonGrabMatcher(File file) {
        System.out.print("Loading TaxonGrab dictionary...");
        this.dict = loadDict(file);
        System.out.println(" done, loaded " + this.dict.size() + " entries.");
    }

    /**
     * Reads every line of {@code file} into a set.
     *
     * <p>On any failure the error is printed to standard error and the JVM is
     * terminated with exit code -1 (behaviour kept from the original).
     * The file is read with the platform default charset.
     *
     * @param file dictionary file, one entry per line
     * @return the distinct lines of the file
     */
    private static Set<String> loadDict(File file) {
        Set<String> entries = new HashSet<String>();
        try {
            BufferedReader reader = new BufferedReader(new FileReader(file));
            try {
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    entries.add(line);
                }
            } finally {
                // Close even when readLine() fails so the file handle is not leaked.
                reader.close();
            }
        } catch (Exception e) {
            System.err.println(e);
            e.printStackTrace();
            System.exit(-1);
        }
        return entries;
    }

    /**
     * Finds taxonomic-name mentions in {@code str}.
     *
     * @param str      text to scan
     * @param document document the text belongs to, or {@code null}; when present it
     *                 supplies the document id and filters mentions through
     *                 {@code document.isValid(start, end)}
     * @return one mention per occurrence of every distinct candidate name, possibly empty
     */
    @Override // uk.ac.man.entitytagger.matching.Matcher
    public List<Mention> match(String str, Document document) {
        String id = document != null ? document.getID() : null;
        List<String> candidates = collectCandidates(str);

        List<Mention> mentions = new ArrayList<Mention>();
        Set<String> seen = new HashSet<String>();
        for (String candidate : candidates) {
            if (SUBGENUS_PLACEHOLDER.equals(candidate)) {
                // Left-over internal marker, not a name.
                continue;
            }
            String name = normalizeCandidate(candidate);
            // De-duplicate on the normalised form; checking the raw form would let
            // "Homo sapiens." through repeatedly and duplicate every mention.
            if (!seen.add(name)) {
                continue;
            }
            if (!name.contains(". ")) {
                for (int start = str.indexOf(name); start != -1; start = str.indexOf(name, start + 1)) {
                    addMention(mentions, document, id, start, start + name.length(), name, name);
                }
            } else {
                // Abbreviated names: the space after the dot was inserted during
                // tokenisation and may be absent in the original text.
                java.util.regex.Matcher found = Pattern.compile(abbreviationRegex(name)).matcher(str);
                while (found.find()) {
                    int start = found.start();
                    int end = found.end();
                    addMention(mentions, document, id, start, end, str.substring(start, end), name);
                }
            }
        }
        return mentions;
    }

    /**
     * Tokenises {@code str} and runs the genus/species state machine over the tokens.
     *
     * @param str text to scan
     * @return candidate name strings in order of discovery (may contain duplicates
     *         and the subgenus placeholder)
     */
    private List<String> collectCandidates(String str) {
        // State deliberately survives line boundaries, as in the original.
        String genus = "";
        String species = "";
        String pendingRank = ""; // "genus species rank" waiting for the infraspecific epithet

        List<String> candidates = new ArrayList<String>();
        // Re-join words hyphenated across line breaks, drop tabs, then split into lines.
        String cleaned = str.replace(" -\r", " - ").replace(" -\n", " - ").replace("-\r", "").replace("-\n", "")
                .replace("\r", " ").replace("\t", "");
        for (String line : cleaned.split("\n")) {
            String spaced = line.replace(":", " ").replace(";", " ").replace(".", ". ");
            for (String token : spaced.split("\\s")) {
                if (RESET_TOKEN.matcher(token).matches()) {
                    genus = "";
                    species = "";
                    pendingRank = "";
                }
                if (PAREN_BEFORE_LOWERCASE.matcher(token).matches()) {
                    token = token.replaceAll("[()]", "");
                }
                String dictKey = token.toLowerCase().replace(".", "").replace(",", "");
                if (!pendingRank.equals("") && STARTS_WITH_LETTERS.matcher(token).matches()) {
                    candidates.add(pendingRank + " " + token);
                    pendingRank = "";
                }
                if (this.dict.contains(dictKey)) {
                    genus = "";
                    species = "";
                } else if (GENUS_TOKEN.matcher(token).matches() && !GENUS_EXCLUSION.matcher(token).matches()) {
                    genus = token;
                    species = "";
                } else if (!genus.equals("") && species.equals("")) {
                    String second = token.replace(",", "");
                    if (SPECIES_TOKEN.matcher(second).matches()) {
                        species = second;
                        candidates.add(genus + " " + species);
                    } else if (SUBGENUS_TOKEN.matcher(second).matches()) {
                        species = second;
                        // Keeps the "pop previous candidate" step below balanced.
                        candidates.add(SUBGENUS_PLACEHOLDER);
                    } else {
                        genus = "";
                        species = "";
                    }
                } else if (genus.equals("") || species.equals("") || token.length() <= 2) {
                    genus = "";
                    species = "";
                } else {
                    String third = token.replace(",", "");
                    if (THIRD_TOKEN.matcher(third).matches()) {
                        // Replace the two-part candidate (or placeholder) added for the
                        // previous token with the longer form.
                        candidates.remove(candidates.size() - 1);
                        if (RANK_TOKEN.matcher(third).matches()) {
                            pendingRank = genus + " " + species + " " + third;
                        } else if (!third.contains(".")) {
                            candidates.add(genus + " " + species + " " + third);
                        }
                    }
                    genus = "";
                    species = "";
                }
            }
        }
        return candidates;
    }

    /** Strips one trailing "." and then one trailing ")" that has no matching "(" in the candidate. */
    private static String normalizeCandidate(String candidate) {
        String name = candidate;
        if (name.endsWith(".")) {
            name = name.substring(0, name.length() - 1);
        }
        if (name.endsWith(")") && name.indexOf("(") == -1) {
            name = name.substring(0, name.length() - 1);
        }
        return name;
    }

    /**
     * Builds a regex that matches {@code name} literally, except that the space in
     * every ". " is optional. All other characters are quoted so that regex
     * metacharacters in a candidate can neither throw nor match unrelated text.
     */
    private static String abbreviationRegex(String name) {
        StringBuilder regex = new StringBuilder();
        int from = 0;
        for (int dot = name.indexOf(". "); dot != -1; dot = name.indexOf(". ", from)) {
            regex.append(Pattern.quote(name.substring(from, dot))).append("\\. ?");
            from = dot + 2;
        }
        regex.append(Pattern.quote(name.substring(from)));
        return regex.toString();
    }

    /** Creates a mention for {@code [start, end)} and appends it if the document (when given) accepts the span. */
    private static void addMention(List<Mention> mentions, Document document, String id, int start, int end,
            String text, String name) {
        Mention mention = new Mention(new String[0], start, end, text);
        mention.setComment("taxongrab (" + name + ")");
        mention.setDocid(id);
        if (document == null || document.isValid(mention.getStart(), mention.getEnd())) {
            mentions.add(mention);
        }
    }
}
