package uk.ac.cam.ch.wwmm.oscarMEMM.models;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import nu.xom.Document;
import nu.xom.Element;
import nu.xom.Nodes;
import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
import uk.ac.cam.ch.wwmm.oscar.document.XOMBasedProcessingDocumentFactory;
import uk.ac.cam.ch.wwmm.oscar.tools.InlineToSAF;
import uk.ac.cam.ch.wwmm.oscar.tools.StringTools;
import uk.ac.cam.ch.wwmm.oscar.types.NamedEntityType;
import uk.ac.cam.ch.wwmm.oscar.xmltools.XOMTools;
import uk.ac.cam.ch.wwmm.oscarrecogniser.ptcDataStruct.Bag;
import uk.ac.cam.ch.wwmm.oscartokeniser.HyphenTokeniser;
import uk.ac.cam.ch.wwmm.oscartokeniser.Tokeniser;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/oscarMEMM/models/TrainingDataExtractor.class */
/**
 * Builds word lists from annotated documents.
 *
 * <p>Each document is tokenised; tokens that fall inside named entities of type
 * {@code COMPOUND}, {@code ADJECTIVE}, {@code REACTION} or {@code ASE} are tallied
 * separately from tokens that fall outside any named entity, and the public sets below
 * are derived from those tallies. All sets are populated once, by the constructor.
 *
 * <p>NOTE(review): the constructors call {@code HyphenTokeniser.reinitialise()} before and
 * after extraction; that looks like global state, so concurrent construction is presumably
 * unsafe - confirm against {@code HyphenTokeniser}.
 */
public final class TrainingDataExtractor {
    /** Matches a token of the form {@code <digits>-<lowercase letters>}; group 1 is the letters. */
    private static final Pattern NOT_FOR_PREFIX = Pattern.compile("[0-9]+-([a-z]+)");
    /** Matches a token made only of lowercase ASCII letters, at least three of them. */
    private static final Pattern LOWERCASE_WORD = Pattern.compile("[a-z][a-z][a-z]+");
    /** Matches one uppercase ASCII letter followed by at least two lowercase ASCII letters. */
    private static final Pattern CAPITALISED_WORD = Pattern.compile("[A-Z][a-z][a-z]+");
    /** Matches a token containing two consecutive lowercase ASCII letters. */
    private static final Pattern HAS_TWO_LOWERCASE = Pattern.compile(".*[a-z][a-z].*");
    /** Matches a token containing at least one uppercase ASCII letter. */
    private static final Pattern HAS_UPPERCASE = Pattern.compile(".*[A-Z].*");
    /** Matches a token of two or more characters that starts with a lowercase ASCII letter. */
    private static final Pattern LOWERCASE_INITIAL = Pattern.compile("[a-z].+");

    /** Word-like entity tokens that never occur (lowercased) among the non-entity word tokens. */
    public Collection<String> chemicalWords;
    /** Lowercased non-entity word tokens that never occur among the word-like entity tokens. */
    public Collection<String> nonChemicalWords;
    /** Single-token entities that contain an uppercase letter but no two adjacent lowercase letters,
     *  and that were never tallied as a non-entity non-word. */
    public Set<String> chemicalNonWords;
    /** Non-entity tokens that contain an uppercase letter, are not capitalised words,
     *  and were never tallied as a chemical non-word. */
    public Set<String> nonChemicalNonWords;
    /** Everything reported by {@code TokenSequence.getAfterHyphens()} for the tokenised input. */
    public Set<String> afterHyphen;
    /** Lowercase suffixes of non-entity tokens shaped like {@code <digits>-<letters>}. */
    public Set<String> notForPrefix;
    /** Capitalised words seen outside entities or derived by capitalising the word lists,
     *  excluding capitalised words that lead a multi-token entity. */
    public Set<String> pnStops;
    /** Tokens tallied both inside and outside named entities. */
    public Set<String> polysemous;
    /** Final tokens of multi-token {@code REACTION} entities. */
    public Set<String> rnEnd;
    /** Interior tokens of multi-token {@code REACTION} entities that start with a lowercase letter. */
    public Set<String> rnMid;

    /**
     * Extracts training data from every document in the collection.
     *
     * @param collection annotated documents; {@code cmlPile} nodes are detached from each of them
     */
    public TrainingDataExtractor(Collection<Document> collection) {
        init(collection);
    }

    /**
     * Extracts training data from a single document.
     *
     * @param document annotated document; its {@code cmlPile} nodes are detached
     */
    public TrainingDataExtractor(Document document) {
        init(document);
    }

    /**
     * Serialises the extracted sets as an {@code etd} element with one child element per set,
     * each child holding its entries as newline-terminated text.
     *
     * @return a freshly built element
     */
    public Element toXML() {
        Element root = new Element("etd");
        root.appendChild(stringsToElement(this.chemicalWords, "chemicalWords"));
        root.appendChild(stringsToElement(this.nonChemicalWords, "nonChemicalWords"));
        root.appendChild(stringsToElement(this.chemicalNonWords, "chemicalNonWords"));
        root.appendChild(stringsToElement(this.nonChemicalNonWords, "nonChemicalNonWords"));
        root.appendChild(stringsToElement(this.afterHyphen, "afterHyphen"));
        root.appendChild(stringsToElement(this.notForPrefix, "notForPrefix"));
        root.appendChild(stringsToElement(this.pnStops, "pnStops"));
        root.appendChild(stringsToElement(this.polysemous, "polysemous"));
        root.appendChild(stringsToElement(this.rnEnd, "rnEnd"));
        root.appendChild(stringsToElement(this.rnMid, "rnMid"));
        return root;
    }

    /**
     * Hashes the XML serialisation produced by {@link #toXML()}.
     *
     * @return the {@code String.hashCode()} of the serialised XML text
     */
    public int makeHash() {
        // The serialisation follows the iteration order of the underlying sets.
        return toXML().toXML().hashCode();
    }

    /** Wraps the given strings, each followed by a newline, in an element called {@code str}. */
    private static Element stringsToElement(Collection<String> collection, String str) {
        StringBuilder text = new StringBuilder();
        for (String entry : collection) {
            text.append(entry).append("\n");
        }
        Element element = new Element(str);
        element.appendChild(text.toString());
        return element;
    }

    /** Replaces every public set with a new empty {@link HashSet}. */
    private void initSets() {
        this.chemicalWords = new HashSet<>();
        this.nonChemicalWords = new HashSet<>();
        this.afterHyphen = new HashSet<>();
        this.chemicalNonWords = new HashSet<>();
        this.nonChemicalNonWords = new HashSet<>();
        this.pnStops = new HashSet<>();
        this.notForPrefix = new HashSet<>();
        this.polysemous = new HashSet<>();
        this.rnEnd = new HashSet<>();
        this.rnMid = new HashSet<>();
    }

    /** Tallies tokens over all documents and then derives the public sets from the tallies. */
    private void init(Collection<Document> collection) {
        initSets();
        HyphenTokeniser.reinitialise();

        Set<String> entityNames = new HashSet<>();
        Bag<String> chemWords = new Bag<>();
        Bag<String> chemNonWords = new Bag<>();
        Bag<String> plainWords = new Bag<>();
        Bag<String> plainNonWords = new Bag<>();
        for (Document document : collection) {
            loadAnnotations(entityNames, chemWords, chemNonWords, plainWords, plainNonWords, document);
        }
        deriveSets(entityNames, chemWords, chemNonWords, plainWords, plainNonWords);

        HyphenTokeniser.reinitialise();
    }

    /** Single-document entry point; runs the same pipeline as the collection variant. */
    private void init(Document document) {
        List<Document> single = new ArrayList<>(1);
        single.add(document);
        init(single);
    }

    /**
     * Fills the word, non-word, polysemous and {@code pnStops} sets from the finished tallies.
     *
     * @param entityNames   capitalised words that lead multi-token entities
     * @param chemWords     word-like tokens seen inside entities
     * @param chemNonWords  non-word single-token entities
     * @param plainWords    lowercased word-like tokens seen outside entities
     * @param plainNonWords non-word tokens seen outside entities
     */
    private void deriveSets(Set<String> entityNames, Bag<String> chemWords, Bag<String> chemNonWords,
            Bag<String> plainWords, Bag<String> plainNonWords) {
        addExclusive(chemWords, plainWords, this.chemicalWords);
        addExclusive(plainWords, chemWords, this.nonChemicalWords);
        addExclusive(chemNonWords, plainNonWords, this.chemicalNonWords);
        addExclusive(plainNonWords, chemNonWords, this.nonChemicalNonWords);

        // Polysemous tokens are those tallied on both the entity and the non-entity side.
        Set<String> insideEntities = new HashSet<>();
        insideEntities.addAll(chemWords.getSet());
        insideEntities.addAll(chemNonWords.getSet());
        Set<String> outsideEntities = new HashSet<>();
        outsideEntities.addAll(plainWords.getSet());
        outsideEntities.addAll(plainNonWords.getSet());
        insideEntities.retainAll(outsideEntities);
        this.polysemous.addAll(insideEntities);

        // A capitalised word that leads an entity must not be treated as a stop word.
        this.pnStops.removeAll(entityNames);
        addCapitalisedStops(this.nonChemicalWords, entityNames);
        addCapitalisedStops(this.chemicalWords, entityNames);
    }

    /** Copies into {@code target} each entry of {@code own} with a positive count and a zero count in {@code other}. */
    private static void addExclusive(Bag<String> own, Bag<String> other, Collection<String> target) {
        for (String token : own.getSet()) {
            if (own.getCount(token) > 0 && other.getCount(token) == 0) {
                target.add(token);
            }
        }
    }

    /** Adds the capitalised form of every all-lowercase word to {@code pnStops}, unless it is an entity name. */
    private void addCapitalisedStops(Collection<String> words, Set<String> entityNames) {
        for (String word : words) {
            if (matches(LOWERCASE_WORD, word)) {
                // Locale.ROOT keeps the result independent of the JVM's default locale.
                String capitalised = word.substring(0, 1).toUpperCase(Locale.ROOT) + word.substring(1);
                if (!entityNames.contains(capitalised)) {
                    this.pnStops.add(capitalised);
                }
            }
        }
    }

    /**
     * Tokenises one document and adds its tokens to the running tallies.
     *
     * <p>Side effect: every {@code cmlPile} node is detached from {@code document} itself.
     *
     * @param set      receives capitalised words that lead multi-token entities
     * @param bag      receives word-like tokens found inside entities
     * @param bag2     receives non-word single-token entities
     * @param bag3     receives lowercased word-like tokens found outside entities
     * @param bag4     receives non-word tokens found outside entities
     * @param document the annotated document to read
     */
    private void loadAnnotations(Set<String> set, Bag<String> bag, Bag<String> bag2, Bag<String> bag3, Bag<String> bag4, Document document) {
        Nodes piles = document.query("//cmlPile");
        for (int i = 0; i < piles.size(); i++) {
            piles.get(i).detach();
        }

        // Work on a copy with the <ne> markup stripped (text kept); the original still
        // carries the annotations and is passed alongside the copy to extractSAFs.
        Document plainCopy = new Document(XOMTools.safeCopy(document.getRootElement()));
        Nodes annotations = plainCopy.query("//ne");
        for (int i = 0; i < annotations.size(); i++) {
            XOMTools.removeElementPreservingText(annotations.get(i));
        }

        for (TokenSequence tokenSequence : XOMBasedProcessingDocumentFactory.getInstance().makeTokenisedDocument(Tokeniser.getDefaultInstance(), plainCopy, true, false, InlineToSAF.extractSAFs(document, plainCopy, "foo")).getTokenSequences()) {
            this.afterHyphen.addAll(tokenSequence.getAfterHyphens());
            Map<?, ?> nes = tokenSequence.getNes();

            List<List<String>> entities = new ArrayList<>();
            entities.addAll(entitiesOfType(nes, NamedEntityType.COMPOUND));
            entities.addAll(entitiesOfType(nes, NamedEntityType.ADJECTIVE));
            entities.addAll(entitiesOfType(nes, NamedEntityType.REACTION));
            entities.addAll(entitiesOfType(nes, NamedEntityType.ASE));
            for (List<String> entity : entities) {
                tallyEntity(entity, set, bag, bag2);
            }

            for (List<String> reaction : entitiesOfType(nes, NamedEntityType.REACTION)) {
                if (reaction.size() > 1) {
                    int last = reaction.size() - 1;
                    this.rnEnd.add(reaction.get(last));
                    for (int i = 1; i < last; i++) {
                        String middle = reaction.get(i);
                        if (matches(LOWERCASE_INITIAL, middle)) {
                            this.rnMid.add(middle);
                        }
                    }
                }
            }

            for (String token : tokenSequence.getNonNes()) {
                boolean capitalisedWord = matches(CAPITALISED_WORD, token);
                if (matches(HAS_TWO_LOWERCASE, token)) {
                    // Locale.ROOT keeps the tally independent of the JVM's default locale.
                    bag3.add(token.toLowerCase(Locale.ROOT));
                }
                if (capitalisedWord) {
                    this.pnStops.add(token);
                }
                Matcher prefixed = NOT_FOR_PREFIX.matcher(token);
                if (prefixed.matches()) {
                    this.notForPrefix.add(prefixed.group(1));
                }
                if (!capitalisedWord && matches(HAS_UPPERCASE, token)) {
                    bag4.add(token);
                }
            }
        }
    }

    /**
     * Adds the tokens of one named entity to the entity-side tallies.
     *
     * @param entity      the entity's tokens, in order
     * @param entityNames receives capitalised words that lead a multi-token entity
     * @param chemWords   receives word-like tokens
     * @param chemNonWords receives non-word single-token entities
     */
    private static void tallyEntity(List<String> entity, Set<String> entityNames, Bag<String> chemWords, Bag<String> chemNonWords) {
        if (entity.isEmpty()) {
            // Nothing to tally; avoids an IndexOutOfBoundsException on get(0).
            return;
        }
        String first = entity.get(0);
        if (entity.size() == 1) {
            if (matches(HAS_TWO_LOWERCASE, first)) {
                chemWords.add(first);
            } else if (matches(HAS_UPPERCASE, first)) {
                chemNonWords.add(first);
            }
        } else if (matches(CAPITALISED_WORD, first)) {
            entityNames.add(first);
            // Follow hyphen-linked names: Name - Name - ... Each step drops "Name -" so that
            // the next name becomes the head. The hyphen is therefore the token at index 1
            // and the next name the token at index 2.
            List<String> rest = entity;
            while (rest.size() > 3 && StringTools.isHyphen(rest.get(1)) && matches(CAPITALISED_WORD, rest.get(2))) {
                rest = rest.subList(2, rest.size());
                entityNames.add(rest.get(0));
            }
        } else {
            for (String token : entity) {
                if (matches(HAS_TWO_LOWERCASE, token)) {
                    chemWords.add(token);
                }
            }
        }
    }

    /**
     * Looks up the token lists stored for one entity type.
     *
     * @return the stored lists, or an empty list when the type has no entry
     */
    @SuppressWarnings("unchecked")
    private static Collection<List<String>> entitiesOfType(Map<?, ?> nes, NamedEntityType type) {
        Object stored = nes.get(type);
        if (stored == null) {
            return new ArrayList<List<String>>();
        }
        // Unchecked: callers iterate the values as lists of token strings.
        return (Collection<List<String>>) stored;
    }

    /** Whole-string match, equivalent to {@code token.matches(regex)} for the compiled pattern. */
    private static boolean matches(Pattern pattern, String token) {
        return pattern.matcher(token).matches();
    }
}
