package de.julielab.topicmodeling.services;

import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.ArrayIterator;
import cc.mallet.topics.ParallelTopicModel;
import cc.mallet.topics.TopicInferencer;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.types.TokenSequence;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.jcore.types.Lemma;
import de.julielab.jcore.types.Token;
import de.julielab.jcore.types.pubmed.Header;
import de.julielab.jcore.utility.JCoReTools;
import de.julielab.topicmodeling.businessobjects.Document;
import de.julielab.topicmodeling.businessobjects.Model;
import de.julielab.topicmodeling.businessobjects.TMSearchResult;
import de.julielab.topicmodeling.businessobjects.Topic;
import de.julielab.xml.JulieXMLTools;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.commons.configuration2.HierarchicalConfiguration;
import org.apache.commons.configuration2.XMLConfiguration;
import org.apache.commons.configuration2.builder.BuilderParameters;
import org.apache.commons.configuration2.builder.FileBasedConfigurationBuilder;
import org.apache.commons.configuration2.builder.fluent.Parameters;
import org.apache.commons.configuration2.ex.ConfigurationException;
import org.apache.commons.configuration2.tree.ImmutableNode;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/topicmodeling/services/MalletTopicModeling.class */
public class MalletTopicModeling implements ITopicModeling {
    /** Logger shared by all methods of this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(MalletTopicModeling.class);
    // XPath of the element that readDocuments() iterates over; each match yields one Document.
    String forEach = "/PubmedArticleSet/PubmedArticle/MedlineCitation";
    // Field expression, evaluated per row by readDocuments(), whose value becomes Document.id.
    String idField = "PMID";
    // Field expression whose value becomes Document.text.
    String textField = "Article/Abstract/AbstractText";
    // Second text field expression; when it yields a value, readDocuments() lets it replace the
    // value obtained from textField.
    String alternativeTextField = "OtherAbstract/AbstractText";

    /**
     * Computes the cosine similarity of two vectors.
     *
     * <p>Only the first {@code dArr.length} components are considered, so {@code dArr2} must be
     * at least as long as {@code dArr}. If either considered vector has zero magnitude (this
     * includes empty input) the result is {@code NaN}.
     *
     * @param dArr  first vector; its length determines how many components are compared
     * @param dArr2 second vector
     * @return dot product divided by the product of the two Euclidean norms
     */
    public static double computeSimilarity(double[] dArr, double[] dArr2) {
        double dotProduct = 0.0d;
        double squaredNormFirst = 0.0d;
        double squaredNormSecond = 0.0d;
        int position = 0;
        while (position < dArr.length) {
            final double first = dArr[position];
            final double second = dArr2[position];
            dotProduct += first * second;
            squaredNormFirst += Math.pow(first, 2.0d);
            squaredNormSecond += Math.pow(second, 2.0d);
            position++;
        }
        final double normProduct = Math.sqrt(squaredNormFirst) * Math.sqrt(squaredNormSecond);
        return dotProduct / normProduct;
    }

    /**
     * Loads an XML configuration file.
     *
     * @param str name (path) of the XML configuration file
     * @return the parsed configuration
     * @throws ConfigurationException if the configuration cannot be built from the file
     */
    public XMLConfiguration loadConfig(String str) throws ConfigurationException {
        // The builder is parameterized so that getConfiguration() yields an XMLConfiguration
        // without an unchecked cast.
        FileBasedConfigurationBuilder<XMLConfiguration> builder =
                new FileBasedConfigurationBuilder<>(XMLConfiguration.class);
        builder.configure(new Parameters().xml().setFileName(str));
        return builder.getConfiguration();
    }

    /**
     * Trains a MALLET topic model on raw documents.
     *
     * <p>The documents are preprocessed via {@link #preprocess(List)}, the model is estimated
     * with the parameters read from {@code train.parameters.parameter.*}, and the position of
     * every document in {@code list} is mapped to its id.
     *
     * <p>Training failures are logged and not rethrown; in that case the returned model has no
     * MALLET model attached.
     *
     * <p>NOTE(review): {@link #jcorePreprocess(List)} skips documents whose text is
     * {@code null}, while the id mapping counts every document in {@code list}. If such
     * documents occur, MALLET instance positions and mapped positions may diverge - confirm.
     *
     * @param list              documents to train on
     * @param xMLConfiguration  configuration holding training parameters and model metadata
     * @return the model carrying id, version, document id mapping and, on success, the MALLET model
     */
    @Override // de.julielab.topicmodeling.services.ITopicModeling
    public Model train(List<Document> list, XMLConfiguration xMLConfiguration) {
        int numTopics = xMLConfiguration.getInt("train.parameters.parameter.numTopics");
        double alphaSum = xMLConfiguration.getDouble("train.parameters.parameter.alphaSum");
        double beta = xMLConfiguration.getDouble("train.parameters.parameter.beta");
        int numThreads = xMLConfiguration.getInt("train.parameters.parameter.numThreads");
        int numIterations = xMLConfiguration.getInt("train.parameters.parameter.numIterations");
        int optimizationInterval = xMLConfiguration.getInt("train.parameters.parameter.optimizationInterval");
        String modelId = xMLConfiguration.getString("model.meta.ID");
        String modelVersion = xMLConfiguration.getString("model.meta.Version");
        LOGGER.info("Chosen number of topics: {}", numTopics);
        LOGGER.info("Chosen Dirichlet-alpha: {}", alphaSum);
        LOGGER.info("Chosen Dirichlet-beta: {}", beta);
        LOGGER.info("Chosen training iterations: {}", numIterations);
        LOGGER.info("Chosen optimization interval (if 0, optim. is deactivated): {}", optimizationInterval);
        ParallelTopicModel parallelTopicModel = new ParallelTopicModel(numTopics, alphaSum, beta);
        Model model = new Model();
        model.modelId = modelId;
        model.modelVersion = modelVersion;
        try {
            LOGGER.info("Start preprocessing");
            parallelTopicModel.addInstances(preprocess(list));
            parallelTopicModel.setNumThreads(numThreads);
            parallelTopicModel.setNumIterations(numIterations);
            parallelTopicModel.setOptimizeInterval(optimizationInterval);
            parallelTopicModel.estimate();
            model.malletModel = parallelTopicModel;
            LOGGER.info("Model: {} Version: {} is trained", modelId, modelVersion);
        } catch (Exception e) {
            // Best effort as before: the caller still receives a Model, but the failure is
            // now visible in the log instead of on stderr.
            LOGGER.error("Training of model {} version {} failed", modelId, modelVersion, e);
        }
        mapMalletIdToPubmedId(list, model);
        LOGGER.info("PubMed citation IDs (PMIDs) are mapped to Mallet document IDs");
        return model;
    }

    /**
     * Trains a MALLET topic model on already prepared MALLET instances.
     *
     * <p>Training failures are logged and not rethrown; in that case the returned model has
     * neither a MALLET model nor id/version set.
     *
     * <p>NOTE(review): this overload reads the model metadata from {@code model.id} and
     * {@code model.version}, whereas {@code train(List, XMLConfiguration)} reads
     * {@code model.meta.ID} and {@code model.meta.Version} - confirm which keys are intended.
     *
     * @param instanceList      instances to train on
     * @param xMLConfiguration  configuration holding training parameters and model metadata
     * @return the trained model, or an empty {@link Model} if training failed
     */
    public Model train(InstanceList instanceList, XMLConfiguration xMLConfiguration) {
        String modelId = xMLConfiguration.getString("model.id");
        String modelVersion = xMLConfiguration.getString("model.version");
        int numTopics = xMLConfiguration.getInt("train.parameters.parameter.numTopics");
        double alphaSum = xMLConfiguration.getDouble("train.parameters.parameter.alphaSum");
        double beta = xMLConfiguration.getDouble("train.parameters.parameter.beta");
        int numThreads = xMLConfiguration.getInt("train.parameters.parameter.numThreads");
        int numIterations = xMLConfiguration.getInt("train.parameters.parameter.numIterations");
        int optimizationInterval = xMLConfiguration.getInt("train.parameters.parameter.optimizationInterval");
        LOGGER.info("Chosen number of topics: {}", numTopics);
        LOGGER.info("Chosen Dirichlet-alpha: {}", alphaSum);
        LOGGER.info("Chosen Dirichlet-beta: {}", beta);
        LOGGER.info("Chosen training iterations: {}", numIterations);
        LOGGER.info("Chosen optimization interval (if 0, optim. is deactivated): {}", optimizationInterval);
        ParallelTopicModel parallelTopicModel = new ParallelTopicModel(numTopics, alphaSum, beta);
        Model model = new Model();
        try {
            LOGGER.info("Start preprocessing");
            parallelTopicModel.addInstances(instanceList);
            parallelTopicModel.setNumThreads(numThreads);
            parallelTopicModel.setNumIterations(numIterations);
            parallelTopicModel.setOptimizeInterval(optimizationInterval);
            LOGGER.info("Start training");
            parallelTopicModel.estimate();
            model.malletModel = parallelTopicModel;
            model.modelId = modelId;
            model.modelVersion = modelVersion;
            LOGGER.info("Model is trained");
        } catch (Exception e) {
            // Best effort as before, but the failure goes to the log instead of stderr.
            LOGGER.error("Training of model {} version {} failed", modelId, modelVersion, e);
        }
        return model;
    }

    /**
     * Writes the MALLET model contained in {@code model} to {@code file}.
     *
     * <p>If the model holds no MALLET model, only an info message is logged. Failures are
     * logged and not rethrown.
     *
     * @param model model whose {@code malletModel} is written
     * @param file  target file
     */
    public void saveMalletModel(Model model, File file) {
        try {
            if (model.malletModel != null) {
                model.malletModel.write(file);
                LOGGER.info("Mallet model is saved in {}", file.getName());
            } else {
                LOGGER.info("No Mallet model was found in ModelID: {}, ModelVersion: {}", model.modelId, model.modelVersion);
            }
        } catch (Exception e) {
            LOGGER.error("Could not save Mallet model to {}", file, e);
        }
    }

    /**
     * Serializes {@code model} with Java object serialization into the file named {@code str}.
     *
     * <p>I/O failures are logged and not rethrown.
     *
     * @param model model to serialize
     * @param str   path of the target file
     */
    @Override // de.julielab.topicmodeling.services.ITopicModeling
    public void saveModel(Model model, String str) {
        // try-with-resources closes both streams even when writing fails.
        try (BufferedOutputStream fileStream = FileUtilities.getOutputStreamToFile(new File(str));
             ObjectOutputStream objectStream = new ObjectOutputStream(fileStream)) {
            objectStream.writeObject(model);
        } catch (IOException e) {
            LOGGER.error("Could not save serialized model to {}", str, e);
            return;
        }
        LOGGER.info("Serialized Model is saved in {}", str);
    }

    /**
     * Reads documents from an XML file or from all XML files of a directory.
     *
     * <p>For a directory, files ending in {@code .xml}, {@code .xml.gz}, {@code .xml.gzip} or
     * {@code .xml.zip} are read; the optional setting {@code evaluate.heldout.files.number}
     * limits how many of them are read (default: all, never more than were found).
     *
     * @param file              XML file or directory containing XML files
     * @param xMLConfiguration  configuration, consulted for the file limit
     * @return the documents found; empty if the directory could not be listed
     */
    @Override // de.julielab.topicmodeling.services.ITopicModeling
    public List<Document> readDocuments(File file, XMLConfiguration xMLConfiguration) {
        if (file.isDirectory()) {
            return readDocumentsFromDirectory(file, xMLConfiguration);
        }
        return readDocumentsFromFile(file);
    }

    /** Reads the XML files of {@code directory}, honoring the configured file limit. */
    private List<Document> readDocumentsFromDirectory(File directory, XMLConfiguration xMLConfiguration) {
        List<Document> documents = new ArrayList<>();
        File[] corpusFiles = directory.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".xml.gz") || name.endsWith(".xml.zip") || name.endsWith(".xml.gzip") || name.endsWith(".xml");
            }
        });
        if (corpusFiles == null) {
            // listFiles() returns null on I/O errors.
            LOGGER.warn("Could not list the files of directory {}", directory);
            return documents;
        }
        int requested = xMLConfiguration.getInteger("evaluate.heldout.files.number", Integer.valueOf(corpusFiles.length)).intValue();
        // Never read past the files that actually exist.
        int fileCount = Math.min(requested, corpusFiles.length);
        for (int i = 0; i < fileCount; i++) {
            LOGGER.info("Attempt to read {}, no. {} of total {}", corpusFiles[i].getName(), i + 1, fileCount);
            documents.addAll(readDocuments(corpusFiles[i], xMLConfiguration));
        }
        return documents;
    }

    /** Extracts one {@link Document} per row produced for the {@code forEach} expression. */
    private List<Document> readDocumentsFromFile(File file) {
        String[] fieldXpaths = {this.idField, this.textField, this.alternativeTextField};
        List<Map<String, String>> fields = new ArrayList<>();
        for (int i = 0; i < fieldXpaths.length; i++) {
            Map<String, String> field = new HashMap<>();
            field.put("name", "fieldvalue" + i);
            field.put("xpath", fieldXpaths[i]);
            fields.add(field);
        }
        Iterator<?> rows = JulieXMLTools.constructRowIterator(file.getAbsolutePath(), 1024, this.forEach, fields, false);
        List<Document> documents = new ArrayList<>();
        while (rows.hasNext()) {
            Map<?, ?> row = (Map<?, ?>) rows.next();
            String id = (String) row.get("fieldvalue0");
            String text = (String) row.get("fieldvalue1");
            String alternativeText = (String) row.get("fieldvalue2");
            Document document = new Document();
            if (id != null) {
                document.id = id;
            }
            if (text != null) {
                document.text = text;
            }
            // As before, a present alternative text replaces the primary text.
            if (alternativeText != null) {
                document.text = alternativeText;
            }
            documents.add(document);
        }
        LOGGER.info("Total citations found: {}", documents.size());
        return documents;
    }

    /**
     * Reads documents through the JCoRe XMI database reader and extracts id and lemmata.
     *
     * <p>Table, schema, reset flag and CoStoSys configuration file come from
     * {@code train.corpus.*} settings. The id and lemmata of each CAS are obtained via
     * {@code malletTopicModeling.getId} and {@code malletTopicModeling.getLemmata}.
     *
     * @param malletTopicModeling        instance used to extract lemmata and ids from each CAS
     * @param hierarchicalConfiguration  configuration holding the reader settings
     * @return one document per CAS delivered by the reader
     * @throws RuntimeException wrapping any exception raised while creating or running the reader
     */
    public List<Document> readXmiDb(MalletTopicModeling malletTopicModeling, HierarchicalConfiguration<ImmutableNode> hierarchicalConfiguration) {
        String table = hierarchicalConfiguration.getString("train.corpus.subset.table");
        String annotationSchema = hierarchicalConfiguration.getString("train.corpus.subset.annotationpgschema");
        boolean resetTable = hierarchicalConfiguration.getBoolean("train.corpus.subset.reset", false);
        String costosysConfig = hierarchicalConfiguration.getString("train.corpus.costosys.configurationFile");
        LOGGER.info("Start reading from DB table {} with CoStoSys configuration file {}", table, costosysConfig);
        ArrayList<String> additionalTables = new ArrayList<>();
        additionalTables.add(Token.class.getCanonicalName());
        List<Document> documents = new ArrayList<>();
        CollectionReader reader = null;
        try {
            reader = CollectionReaderFactory.createReader("de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-reader", new Object[]{"AdditionalTables", additionalTables, "Table", table, "CostosysConfigFile", costosysConfig, "ReadsBaseDocument", true, "ResetTable", Boolean.valueOf(resetTable), "AdditionalTablesPostgresSchema", annotationSchema});
            JCas jCas = JCasFactory.createJCas(new String[]{"de.julielab.jcore.types.jcore-all-types"});
            CAS cas = jCas.getCas();
            while (reader.hasNext()) {
                reader.getNext(cas);
                Document document = new Document();
                document.preprocessedData = malletTopicModeling.getLemmata(jCas);
                document.id = malletTopicModeling.getId(jCas);
                LOGGER.debug("Data for doc {}: {}", document.id, document.preprocessedData);
                documents.add(document);
                // The CAS is reused for every document, so clear it before the next read.
                cas.reset();
            }
            return documents;
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOGGER.warn("Could not close the XMI DB reader", e);
                }
            }
        }
    }

    /**
     * Searches both the training documents of the MALLET model and the documents in
     * {@code model.index} for documents similar to {@code document}.
     *
     * <p>The query document's topic distribution is inferred, the topics whose probability
     * reaches {@code search.parameters.parameter.probabilityThreshold} are selected, and every
     * candidate document is compared to the query by cosine similarity over those topics.
     * At most {@code search.results.displayedHits} results (default: all) are returned.
     *
     * <p>NOTE(review): results are sorted by ascending similarity, as before - confirm that
     * the least similar documents are really meant to come first.
     *
     * <p>NOTE(review): training documents are keyed by their MALLET document position and
     * indexed documents by their iteration position in {@code model.index}; both share one
     * integer key space, so index entries replace model entries with the same number, and
     * both are resolved through {@code model.ModelIdpubmedId} - confirm the intended keys.
     *
     * @param document          query document; preprocessed on demand if it has no preprocessed data
     * @param model             model providing the MALLET model, the index and the id mapping
     * @param xMLConfiguration  configuration holding search and inference parameters
     * @return the ranked hits
     * @throws IllegalArgumentException if the query document cannot be preprocessed
     */
    @Override // de.julielab.topicmodeling.services.ITopicModeling
    public TMSearchResult search(Document document, Model model, XMLConfiguration xMLConfiguration) {
        return runSearch(document, model, xMLConfiguration, true, true);
    }

    /**
     * Like {@link #search(Document, Model, XMLConfiguration)} but compares the query only to
     * the training documents of the MALLET model.
     *
     * @param document          query document
     * @param model             model providing the MALLET model and the id mapping
     * @param xMLConfiguration  configuration holding search and inference parameters
     * @return the ranked hits
     */
    public TMSearchResult searchModelOnly(Document document, Model model, XMLConfiguration xMLConfiguration) {
        return runSearch(document, model, xMLConfiguration, true, false);
    }

    /**
     * Like {@link #search(Document, Model, XMLConfiguration)} but compares the query only to
     * the documents stored in {@code model.index}.
     *
     * @param document          query document
     * @param model             model providing the index and the id mapping
     * @param xMLConfiguration  configuration holding search and inference parameters
     * @return the ranked hits
     */
    public TMSearchResult searchIndexOnly(Document document, Model model, XMLConfiguration xMLConfiguration) {
        return runSearch(document, model, xMLConfiguration, false, true);
    }

    /** Shared search pipeline; the flags choose which document sources are scored. */
    private TMSearchResult runSearch(Document document, Model model, XMLConfiguration xMLConfiguration, boolean useModel, boolean useIndex) {
        double threshold = xMLConfiguration.getDouble("search.parameters.parameter.probabilityThreshold");
        List<Topic> queryTopics = inferQueryTopics(document, model, xMLConfiguration);
        List<Integer> topicIds = selectTopicIds(queryTopics, threshold);
        double[] queryVector = buildQueryVector(queryTopics, topicIds);
        Map<Integer, Double> similarities = new HashMap<>();
        if (useModel) {
            scoreTrainingDocuments(model, topicIds, queryVector, similarities);
        }
        if (useIndex) {
            scoreIndexedDocuments(model, topicIds, queryVector, similarities);
        }
        return buildSearchResult(similarities, model, xMLConfiguration);
    }

    /** Preprocesses the query document if necessary and infers its topic distribution. */
    private List<Topic> inferQueryTopics(Document document, Model model, XMLConfiguration xMLConfiguration) {
        if (document.preprocessedData == null) {
            List<Document> single = new ArrayList<>();
            single.add(document);
            List<TokenSequence> preprocessed = jcorePreprocess(single);
            if (preprocessed.isEmpty()) {
                // jcorePreprocess skips documents without text and swallows processing errors.
                throw new IllegalArgumentException("Document " + document.id + " has no text or could not be preprocessed");
            }
            document.preprocessedData = preprocessed.get(0);
        }
        return inferLabel(document, model, xMLConfiguration).get(document.id);
    }

    /** Returns the positions of all topics whose probability is at least {@code threshold}. */
    private List<Integer> selectTopicIds(List<Topic> topics, double threshold) {
        List<Integer> topicIds = new ArrayList<>();
        for (int i = 0; i < topics.size(); i++) {
            if (topics.get(i).probability >= threshold) {
                topicIds.add(Integer.valueOf(i));
            }
        }
        return topicIds;
    }

    /** Builds the query vector from the probabilities of the selected topics, in selection order. */
    private double[] buildQueryVector(List<Topic> queryTopics, List<Integer> topicIds) {
        double[] queryVector = new double[topicIds.size()];
        for (int i = 0; i < queryVector.length; i++) {
            // Use the probability of the selected topic, not of the i-th topic overall.
            queryVector[i] = queryTopics.get(topicIds.get(i).intValue()).probability;
        }
        return queryVector;
    }

    /** Scores every training document of the MALLET model, keyed by its document position. */
    private void scoreTrainingDocuments(Model model, List<Integer> topicIds, double[] queryVector, Map<Integer, Double> similarities) {
        // First dimension: training documents, second dimension: topics.
        double[][] documentTopics = model.malletModel.getDocumentTopics(false, false);
        for (int doc = 0; doc < documentTopics.length; doc++) {
            double[] documentVector = new double[topicIds.size()];
            for (int i = 0; i < documentVector.length; i++) {
                documentVector[i] = documentTopics[doc][topicIds.get(i).intValue()];
            }
            similarities.put(Integer.valueOf(doc), Double.valueOf(computeSimilarity(queryVector, documentVector)));
        }
    }

    /** Scores every document of {@code model.index}, keyed by its iteration position. */
    private void scoreIndexedDocuments(Model model, List<Integer> topicIds, double[] queryVector, Map<Integer, Double> similarities) {
        Map<String, List<Topic>> index = model.index;
        if (index == null) {
            return;
        }
        int position = 0;
        for (Map.Entry<String, List<Topic>> indexed : index.entrySet()) {
            List<Topic> topics = indexed.getValue();
            if (topics != null) {
                double[] documentVector = new double[topicIds.size()];
                for (int i = 0; i < documentVector.length; i++) {
                    int wanted = topicIds.get(i).intValue();
                    for (Topic topic : topics) {
                        if (topic.id == wanted) {
                            documentVector[i] = topic.probability;
                        }
                    }
                }
                similarities.put(Integer.valueOf(position), Double.valueOf(computeSimilarity(queryVector, documentVector)));
            }
            position++;
        }
    }

    /** Sorts the scores ascending and copies at most the configured number of hits into a result. */
    private TMSearchResult buildSearchResult(Map<Integer, Double> similarities, Model model, XMLConfiguration xMLConfiguration) {
        List<Map.Entry<Integer, Double>> ranked = new ArrayList<>(similarities.entrySet());
        Collections.sort(ranked, new Comparator<Map.Entry<Integer, Double>>() {
            @Override
            public int compare(Map.Entry<Integer, Double> first, Map.Entry<Integer, Double> second) {
                return first.getValue().compareTo(second.getValue());
            }
        });
        int requested = xMLConfiguration.getInt("search.results.displayedHits", ranked.size());
        // A configured hit count above the number of results must not run past the list.
        int hits = Math.min(requested, ranked.size());
        TMSearchResult result = new TMSearchResult();
        result.malletId = new ArrayList<>();
        result.probabilities = new ArrayList<>();
        result.pubmedID = new ArrayList<>();
        for (int i = 0; i < hits; i++) {
            Map.Entry<Integer, Double> hit = ranked.get(i);
            result.malletId.add(hit.getKey());
            result.probabilities.add(hit.getValue());
            result.pubmedID.add(model.ModelIdpubmedId.get(hit.getKey()));
        }
        return result;
    }

    /**
     * Infers the topic distribution of a preprocessed document.
     *
     * <p>{@code document.preprocessedData} must hold a {@link TokenSequence}. The sampling
     * settings are read from {@code infer.parameters.parameter.*}. An empty token sequence
     * only triggers a warning; inference is still attempted.
     *
     * @param document          document with preprocessed data
     * @param model             model providing the MALLET model
     * @param xMLConfiguration  configuration holding the inference parameters
     * @return a map with a single entry: the document id mapped to one {@link Topic} per
     *         entry of the sampled distribution, where the topic id is the entry's position
     */
    @Override // de.julielab.topicmodeling.services.ITopicModeling
    public Map<String, List<Topic>> inferLabel(Document document, Model model, XMLConfiguration xMLConfiguration) {
        final int numIterations = xMLConfiguration.getInt("infer.parameters.parameter.numIterations");
        final int savingInterval = xMLConfiguration.getInt("infer.parameters.parameter.savingInterval");
        final int firstSavingInterval = xMLConfiguration.getInt("infer.parameters.parameter.firstSavingInterval");
        TopicInferencer topicInferencer = model.malletModel.getInferencer();

        TokenSequence tokens = (TokenSequence) document.preprocessedData;
        if (tokens.isEmpty()) {
            LOGGER.warn("Document tokens are empty");
        }
        List<TokenSequence> singleDocument = new ArrayList<>();
        singleDocument.add(tokens);
        InstanceList instances = malletPreprocess(singleDocument);
        double[] distribution = topicInferencer.getSampledDistribution((Instance) instances.get(0), numIterations, savingInterval, firstSavingInterval);

        List<Topic> topics = new ArrayList<>();
        int topicId = 0;
        for (double probability : distribution) {
            Topic topic = new Topic();
            topic.probability = probability;
            topic.id = topicId;
            topics.add(topic);
            topicId++;
        }
        Map<String, List<Topic>> labels = new HashMap<>();
        labels.put(document.id, topics);
        return labels;
    }

    /**
     * Infers the topic distribution for the document held in a CAS.
     *
     * <p>The lemmata of the CAS become the preprocessed data and the id is taken from
     * {@code JCoReTools.getDocId}; the work is then delegated to
     * {@link #inferLabel(Document, Model, XMLConfiguration)}.
     *
     * @param jCas              CAS carrying tokens with lemma annotations
     * @param model             model providing the MALLET model
     * @param xMLConfiguration  configuration holding the inference parameters
     * @return the document id mapped to its inferred topics
     */
    public Map<String, List<Topic>> inferLabel(JCas jCas, Model model, XMLConfiguration xMLConfiguration) {
        Document casDocument = new Document();
        casDocument.preprocessedData = getLemmata(jCas);
        casDocument.id = JCoReTools.getDocId(jCas);
        return inferLabel(casDocument, model, xMLConfiguration);
    }

    /**
     * Reads a MALLET model from {@code file} and wraps it in a new {@link Model}.
     *
     * <p>Failures are logged and not rethrown; the returned model then has no MALLET model.
     *
     * @param file file containing the MALLET model
     * @return a model holding the MALLET model that was read, if reading succeeded
     */
    public Model readMalletModel(File file) {
        Model model = new Model();
        try {
            model.malletModel = ParallelTopicModel.read(file);
        } catch (Exception e) {
            LOGGER.error("Could not read Mallet model from {}", file, e);
        }
        return model;
    }

    /**
     * Deserializes a {@link Model} from the file named {@code str}.
     *
     * <p>Failures are logged and not rethrown; in that case an empty {@link Model} is
     * returned.
     *
     * <p>NOTE(review): this uses Java object deserialization; only read files from trusted
     * sources.
     *
     * @param str path of the serialized model file
     * @return the model read from the file, or an empty model if reading failed
     */
    @Override // de.julielab.topicmodeling.services.ITopicModeling
    public Model readModel(String str) {
        Model model = new Model();
        // try-with-resources closes both streams even when reading fails.
        try (BufferedInputStream fileStream = FileUtilities.getInputStreamFromFile(new File(str));
             ObjectInputStream objectStream = new ObjectInputStream(fileStream)) {
            model = (Model) objectStream.readObject();
        } catch (IOException e) {
            LOGGER.error("Could not read serialized model from {}", str, e);
        } catch (ClassNotFoundException e) {
            LOGGER.error("Model class not found", e);
        }
        return model;
    }

    /**
     * Runs the full preprocessing chain: JCoRe lemmatization followed by the MALLET pipes.
     *
     * @param list documents to preprocess
     * @return the MALLET instances built from the documents' lemmata
     */
    public InstanceList preprocess(List<Document> list) {
        List<TokenSequence> lemmatized = jcorePreprocess(list);
        return malletPreprocess(lemmatized);
    }

    /**
     * Runs sentence splitting, tokenization, POS tagging and lemmatization on the documents'
     * text and collects the lemmata of each document.
     *
     * <p>Documents whose text is {@code null} are skipped, so the result can be shorter than
     * {@code list}. If an engine fails, the error is logged and the sequences collected so far
     * are returned.
     *
     * @param list documents whose {@code text} is processed
     * @return one token sequence per processed document, in input order
     */
    public List<TokenSequence> jcorePreprocess(List<Document> list) {
        List<TokenSequence> lemmaSequences = new ArrayList<>();
        // Engines in processing order; collected so that all created engines get destroyed.
        List<AnalysisEngine> engines = new ArrayList<>();
        try {
            engines.add(AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.jsbd.desc.jcore-jsbd-ae-biomedical-english", new Object[0]));
            engines.add(AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.jtbd.desc.jcore-jtbd-ae-biomedical-english", new Object[0]));
            engines.add(AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.opennlp.postag.desc.jcore-opennlp-postag-ae-biomedical-english", new Object[0]));
            engines.add(AnalysisEngineFactory.createEngine("de.julielab.jcore.ae.biolemmatizer.desc.jcore-biolemmatizer-ae", new Object[0]));
            JCas jCas = JCasFactory.createJCas();
            for (Document document : list) {
                String text = document.text;
                LOGGER.debug("Attempt to process document: {}", document.id);
                if (text != null) {
                    jCas.setDocumentText(text);
                    for (AnalysisEngine engine : engines) {
                        engine.process(jCas);
                    }
                    lemmaSequences.add(getLemmata(jCas));
                    jCas.reset();
                }
            }
        } catch (Exception e) {
            LOGGER.error("JCoRe preprocessing failed after {} processed documents", lemmaSequences.size(), e);
        } finally {
            // Release engine resources on the failure path as well.
            for (AnalysisEngine engine : engines) {
                engine.destroy();
            }
        }
        LOGGER.info("JCoRe preprocessing finished");
        return lemmaSequences;
    }

    /**
     * Converts token sequences into MALLET instances.
     *
     * <p>Each sequence is passed through a stop word removal pipe and then turned into a
     * feature sequence.
     *
     * @param list token sequences, one per document
     * @return the instance list produced by the pipes
     */
    public InstanceList malletPreprocess(List<TokenSequence> list) {
        ArrayList pipeChain = new ArrayList();
        TokenSequenceRemoveStopwords stopwordRemoval = new TokenSequenceRemoveStopwords(false, false);
        pipeChain.add(stopwordRemoval);
        TokenSequence2FeatureSequence toFeatures = new TokenSequence2FeatureSequence();
        pipeChain.add(toFeatures);
        SerialPipes pipeline = new SerialPipes(pipeChain);
        InstanceList instances = new InstanceList(pipeline);
        ArrayIterator source = new ArrayIterator(list);
        instances.addThruPipe(source);
        return instances;
    }

    /**
     * Collects the lemma values of all tokens in the CAS, leaving out values that
     * {@link #isNotNum(String)} or {@link #isNotPunctuation(String)} reject.
     *
     * <p>Lemmas without a value are skipped.
     *
     * @param jCas CAS whose tokens carry lemma annotations
     * @return the retained lemma values in index order
     * @throws IllegalArgumentException if a token has no lemma
     */
    public TokenSequence getLemmata(JCas jCas) {
        TokenSequence tokenSequence = new TokenSequence();
        FSIterator<?> it = jCas.getAnnotationIndex(Token.type).iterator();
        while (it.hasNext()) {
            // The index is restricted to Token.type, hence the cast.
            Token token = (Token) it.next();
            Lemma lemma = token.getLemma();
            if (lemma == null) {
                throw new IllegalArgumentException("The input UIMA CAS is missing lemma annotations set to the tokens as the lemma feature.");
            }
            String value = lemma.getValue();
            if (value != null && isNotNum(value) && isNotPunctuation(value)) {
                tokenSequence.add(value);
            }
        }
        return tokenSequence;
    }

    /**
     * Reads the document id from the {@link Header} annotations of the CAS.
     *
     * @param jCas CAS to inspect
     * @return the doc id of the last header in the index, or the empty string if there is none
     */
    public String getId(JCas jCas) {
        FSIterator headers = jCas.getAnnotationIndex(Header.type).iterator();
        String docId = "";
        while (headers.hasNext()) {
            Header header = (Header) headers.next();
            docId = header.getDocId();
            LOGGER.trace("Found id: " + docId);
        }
        return docId;
    }

    /**
     * Matches an optionally signed integer or decimal number with at most one surrounding
     * whitespace character on each side. The decimal point is escaped so that tokens such as
     * "3D" are not mistaken for numbers; the pattern is compiled once because this check runs
     * for every token.
     */
    private static final java.util.regex.Pattern NUMBER_PATTERN =
            java.util.regex.Pattern.compile("\\s?-?\\d+\\.?\\d*\\s?");

    /**
     * Tells whether {@code str} is something other than a plain number.
     *
     * @param str token to test; must not be {@code null}
     * @return {@code false} if the whole string is an integer or decimal number, {@code true} otherwise
     */
    public boolean isNotNum(String str) {
        return !NUMBER_PATTERN.matcher(str).matches();
    }

    /**
     * Tells whether {@code str} is something other than a single punctuation character.
     *
     * @param str token to test; must not be {@code null}
     * @return {@code false} if the string consists of exactly one of the listed punctuation
     *         characters, {@code true} otherwise
     */
    public boolean isNotPunctuation(String str) {
        boolean singlePunctuationMark = str.matches("[.,:;!?\\-\\/()<>\\[\\]%'+=]");
        if (singlePunctuationMark) {
            return false;
        }
        return true;
    }

    /**
     * Fills {@code model.pubmedIdModelId} with a mapping from each document's id to its
     * position in {@code list}. Any previous mapping is replaced.
     *
     * @param list  documents in the order used as MALLET document positions
     * @param model model that receives the mapping
     */
    public void mapPubmedIdToMalletId(List<Document> list, Model model) {
        model.pubmedIdModelId = new HashMap<>();
        for (int i = 0; i < list.size(); i++) {
            String pmid = list.get(i).id;
            LOGGER.trace("Attempting to map PMID {} to mallet doc {}", pmid, i);
            model.pubmedIdModelId.put(pmid, Integer.valueOf(i));
        }
        // Logged once after the loop, like mapMalletIdToPubmedId does, not once per document.
        LOGGER.debug("PubMed citation IDs (PMIDs) are mapped to Mallet document IDs");
    }

    /**
     * Fills {@code model.ModelIdpubmedId} with a mapping from each document's position in
     * {@code list} to its id. Any previous mapping is replaced.
     *
     * @param list  documents in the order used as MALLET document positions
     * @param model model that receives the mapping
     */
    public void mapMalletIdToPubmedId(List<Document> list, Model model) {
        model.ModelIdpubmedId = new HashMap<>();
        int malletDocId = 0;
        while (malletDocId < list.size()) {
            String pmid = list.get(malletDocId).id;
            LOGGER.trace("Attempting to map Mallet DocID " + malletDocId + " to PMID " + pmid);
            model.ModelIdpubmedId.put(Integer.valueOf(malletDocId), pmid);
            malletDocId++;
        }
        LOGGER.debug("Mallet document IDs are mapped to PubMed citation IDs (PMIDs)");
    }

    /**
     * Returns the entries of the MALLET model's alphabet.
     *
     * @param model model holding the MALLET model
     * @return the alphabet entries as an array
     */
    public Object[] getVocabulary(Model model) {
        Object[] vocabulary = model.malletModel.getAlphabet().toArray();
        return vocabulary;
    }
}
