package de.julielab.genemapper.resources;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.inject.Module;
import de.julielab.geneexpbase.configuration.Parameters;
import de.julielab.geneexpbase.data.DocumentLoader;
import de.julielab.geneexpbase.genemodel.GeneDocument;
import de.julielab.geneexpbase.genemodel.GeneOrthologs;
import de.julielab.geneexpbase.ioc.ServicesShutdownHub;
import de.julielab.genemapper.Configuration;
import de.julielab.genemapper.GeneMapper;
import de.julielab.genemapper.classification.TransformerDisambiguationDataUtils;
import de.julielab.genemapper.genemodel.GeneDocumentFactory;
import de.julielab.genemapper.ioc.GeneMappingModule;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.java.utilities.ProgressBar;
import de.julielab.jcore.ae.checkpoint.DBCheckpointAE;
import de.julielab.jcore.types.Gene;
import de.julielab.jcore.utility.JCoReTools;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.CasIterator;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.impl.ResourceManager_impl;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.resource.metadata.impl.ProcessingResourceMetaData_impl;
import org.apache.uima.util.CasPool;
import org.apache.uima.util.InvalidXMLException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/genemapper/resources/TransformerDisambiguationGene2PubmedDataWriter.class */
/**
 * Command line tool that reads documents from a database table through UIMA components, converts each
 * document into a {@link GeneDocument} annotated with document-level gene IDs taken from a gene2pubmed file
 * and writes the result via {@link TransformerDisambiguationDataUtils#writeData}.
 *
 * <p>Processing is split into two stages that communicate through the shared {@link #documentBuffer}:
 * <ul>
 *     <li>tasks on a fixed thread pool run the UIMA engines on each batch CAS and put the resulting
 *     {@link GeneDocument}s into the buffer,</li>
 *     <li>{@link WritingThread}s drain the buffer, each into its own output file. The per-thread files are
 *     concatenated into one file at the end.</li>
 * </ul>
 *
 * <p>Note that {@link #injector} and {@link #configuration} are only set by {@link #main(String[])};
 * {@link #createDisambiguationData} reads them and will fail if they have not been initialized.
 */
public class TransformerDisambiguationGene2PubmedDataWriter {
    private static final Logger log = LoggerFactory.getLogger(TransformerDisambiguationGene2PubmedDataWriter.class);

    /** Maximum number of documents held in {@link #documentBuffer}. */
    private static final int DOCUMENT_BUFFER_CAPACITY = 512;
    /**
     * A writing thread goes to sleep while more than this many buffer slots are free, i.e. while fewer than
     * {@code DOCUMENT_BUFFER_CAPACITY - IDLE_REMAINING_CAPACITY} documents are waiting.
     */
    private static final int IDLE_REMAINING_CAPACITY = 500;
    /**
     * Value of the reader's "BatchSize" parameter. The progress bar total is divided by the same value,
     * presumably because each CAS obtained from the reader stands for one batch - TODO confirm.
     */
    private static final int READER_BATCH_SIZE = 50;
    /** Timeout passed to {@link CasPool#getCas(long)}; 20 minutes if interpreted as milliseconds. */
    private static final long CAS_WAIT_TIMEOUT = 1200000L;
    /** Number of pooled CASes beyond the number of processing threads. */
    private static final int EXTRA_POOLED_CASES = 4;
    /** Entity type filter handed to the {@link GeneDocumentFactory}. Compiled once; matchers are created per call. */
    private static final Pattern GENE_TYPE_PATTERN = Pattern.compile("Gene|protein|protein_complex|protein_enum|protein_familiy_or_group");

    /** Hand-over buffer between the processing tasks (producers) and the writing threads (consumers). Also used as the monitor for wait/notify. */
    private static final BlockingDeque<GeneDocument> documentBuffer = new LinkedBlockingDeque<>(DOCUMENT_BUFFER_CAPACITY);
    private final List<WritingThread> writingThreads = new ArrayList<>();
    private static Injector injector;
    private static Configuration configuration;
    /** Set by processing tasks, read by the submitting thread; volatile so the stop signal becomes visible. */
    private volatile boolean errorOccurred;

    /**
     * Consumer thread that repeatedly drains {@link #documentBuffer} and writes the drained documents to
     * its own output file until {@link #finish()} has been called and the buffer is empty.
     */
    public class WritingThread extends Thread {
        private final GeneMapper geneMapper;
        private final File outputFile;
        /** Written by the coordinating thread, read by this thread; volatile because the loop head reads it without holding a lock. */
        private volatile boolean finishRequested = false;
        private final BufferedWriter bw;

        /**
         * Creates the thread and opens the writer. An already existing file at the given location is deleted first.
         *
         * @param geneMapper passed through to {@link TransformerDisambiguationDataUtils#writeData}
         * @param file       the file this thread writes to
         * @throws IOException if the writer cannot be opened
         */
        public WritingThread(GeneMapper geneMapper, File file) throws IOException {
            this.geneMapper = geneMapper;
            this.outputFile = file;
            if (file.exists() && !file.delete()) {
                log.warn("Could not delete existing output file {}", file);
            }
            this.bw = FileUtilities.getWriterToFile(file);
        }

        /** @return the file this thread writes to */
        public File getOutputFile() {
            return this.outputFile;
        }

        /**
         * Requests termination: the thread ends once the buffer is empty. The caller is responsible for waking
         * up a waiting thread by notifying on the document buffer.
         */
        public void finish() {
            this.finishRequested = true;
        }

        /**
         * Drain-and-write loop. Any failure is logged and rethrown wrapped in a {@link RuntimeException};
         * the writer is closed in every case.
         */
        @Override
        public void run() {
            try {
                while (!(this.finishRequested && documentBuffer.isEmpty())) {
                    synchronized (documentBuffer) {
                        // Loop instead of a single check so that a spurious wakeup does not end the wait early.
                        while (documentBuffer.remainingCapacity() > IDLE_REMAINING_CAPACITY && !this.finishRequested) {
                            log.trace("Waiting for notification.");
                            documentBuffer.wait();
                        }
                    }
                    log.debug("Draining document buffer of size {} to outbound list.", documentBuffer.size());
                    List<GeneDocument> outbound = new ArrayList<>(documentBuffer.size());
                    documentBuffer.drainTo(outbound);
                    log.debug("Writing document buffer to file {}", this.outputFile);
                    for (GeneDocument document : outbound) {
                        TransformerDisambiguationDataUtils.writeData(this.bw, this.geneMapper, document);
                    }
                    log.debug("Writing finished.");
                }
            } catch (Throwable t) {
                if (t instanceof InterruptedException) {
                    // Keep the interruption status observable for whoever handles the rethrown exception.
                    Thread.currentThread().interrupt();
                }
                log.error("Error in the data writing thread.", t);
                throw new RuntimeException(t);
            } finally {
                closeWriter();
            }
        }

        /** Closes the writer; a failure is logged (with its cause) but not propagated. */
        private void closeWriter() {
            if (this.bw == null) {
                return;
            }
            try {
                this.bw.close();
            } catch (IOException e) {
                log.error("Could not close writer to {}", this.outputFile, e);
            }
        }
    }

    /**
     * Entry point. Loads the configuration from a fixed path, builds the Guice injector and runs
     * {@link #createDisambiguationData} with fixed input/output locations and
     * {@code max(1, availableProcessors - 2)} threads. The gene mapper services are shut down afterwards,
     * also when the run fails.
     *
     * @param strArr command line arguments; not read
     * @throws Exception if the configuration cannot be loaded or the run fails
     */
    public static void main(String[] strArr) throws Exception {
        configuration = new Configuration(new File("configurations/genemapper_gene2pubmed.properties"));
        int availableCpus = Runtime.getRuntime().availableProcessors();
        log.info("Detected {} CPUs. Using this number minus 2.", availableCpus);
        int numThreads = Math.max(1, availableCpus - 2);
        injector = Guice.createInjector(new GeneMappingModule(configuration));
        try {
            new TransformerDisambiguationGene2PubmedDataWriter().createDisambiguationData(
                    new File("../jcore-gene-mapper-resources/gene2pubmed.gz"),
                    "../jcore-gene-mapper-resources/src/main/resources/costosys.xml",
                    "geno.gene2pubmed",
                    injector.getInstance(GeneMapper.class),
                    "transformerDisambiguationData-gene2pubmed-v23-goldTax-%s.tsv.gz",
                    numThreads);
        } finally {
            log.info("Shutting down gene mapper services.");
            injector.getInstance(ServicesShutdownHub.class).shutdown();
        }
        log.info("Application finished.");
    }

    /**
     * Reads all documents of the given table, processes them with {@code numThreads} parallel engine sets
     * and writes the resulting data. One output file per writing thread is created from
     * {@code outputFileNameTemplate}; finally, all of them are merged into the file obtained by formatting
     * the template with {@code "all"}.
     *
     * @param gene2pubmedFile        the gene2pubmed file providing the document-level gene IDs
     * @param costosysConfigFile     value of the "CostosysConfigFile" parameter of the reader and the checkpoint engine
     * @param tableName              value of the reader's "Table" parameter
     * @param geneMapper             the gene mapper used for label inference and data writing
     * @param outputFileNameTemplate {@link String#format} template with a single {@code %s} placeholder
     * @param numThreads             number of processing tasks, engine sets and writing threads
     * @throws IOException          if reading the input or writing/merging the output fails
     * @throws UIMAException        if a UIMA component cannot be created or fails outside of a processing task
     * @throws InterruptedException if the calling thread is interrupted while waiting for workers or writers
     */
    public void createDisambiguationData(File gene2pubmedFile, String costosysConfigFile, String tableName, GeneMapper geneMapper, String outputFileNameTemplate, int numThreads) throws IOException, UIMAException, InterruptedException {
        ExecutorService executor = Executors.newFixedThreadPool(numThreads);
        boolean completed = false;
        try {
            Multimap<String, String> gene2pubmed = readGene2pubmed(gene2pubmedFile);
            CollectionReader reader = CollectionReaderFactory.createReader(
                    "de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier-reader",
                    new Object[]{
                            "CostosysConfigFile", costosysConfigFile,
                            "AnnotationsToLoad", new String[]{"de.julielab.jcore.types.Sentence", "de.julielab.jcore.types.Token", "de.julielab.jcore.types.PennBioIEPOSTag", "de.julielab.jcore.types.Organism", "de.julielab.jcore.types.Abbreviation", "flair:de.julielab.jcore.types.Gene"},
                            "ReadsBaseDocument", true,
                            "Table", tableName,
                            "BatchSize", READER_BATCH_SIZE,
                            "ResetTable", true});
            TypeSystemDescription typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription(new String[]{"de.julielab.jcore.types.jcore-morpho-syntax-types", "de.julielab.jcore.types.jcore-document-structure-pubmed-types", "de.julielab.jcore.types.jcore-document-meta-pubmed-types", "de.julielab.jcore.ae.genemapper.desc.ProteinOffsetExpansionTypeSystem", "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types", "de.julielab.jcore.types.casmultiplier.jcore-dbtable-multiplier-types"});

            // One engine of each kind per worker; a worker only ever uses the engines at its own index.
            List<AnalysisEngine> multiplierEngines = new ArrayList<>(numThreads);
            List<AnalysisEngine> offsetExpansionEngines = new ArrayList<>(numThreads);
            List<AnalysisEngine> proteinsMergerEngines = new ArrayList<>(numThreads);
            List<AnalysisEngine> consistencyTaggerEngines = new ArrayList<>(numThreads);
            List<AnalysisEngine> checkpointEngines = new ArrayList<>(numThreads);
            for (int n = 0; n < numThreads; n++) {
                multiplierEngines.add(createEngineWithTs("de.julielab.jcore.reader.xmi.desc.jcore-xmi-db-multiplier", typeSystem));
                offsetExpansionEngines.add(createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.ProteinOffsetExpansionEngine", typeSystem));
                proteinsMergerEngines.add(createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.jcore-extended-proteins-merger", typeSystem));
                consistencyTaggerEngines.add(createEngineWithTs("de.julielab.jcore.ae.genemapper.desc.jcore-protein-consistency-tagger", typeSystem));
                checkpointEngines.add(AnalysisEngineFactory.createEngine(DBCheckpointAE.class, typeSystem, new Object[]{"CheckpointName", "end", "CostosysConfigFile", costosysConfigFile, "IndicateFinished", true}));
            }

            ProcessingResourceMetaData_impl casMetaData = new ProcessingResourceMetaData_impl();
            casMetaData.setTypeSystem(typeSystem);
            CasPool casPool = new CasPool(numThreads + EXTRA_POOLED_CASES, casMetaData, new ResourceManager_impl());
            Map<Thread, Integer> engineIndexByThread = new ConcurrentHashMap<>(numThreads);
            createWritingThreads(geneMapper, outputFileNameTemplate, numThreads);
            ProgressBar progressBar = new ProgressBar(reader.getProgress()[0].getTotal() / READER_BATCH_SIZE, 80, true);

            while (reader.hasNext() && !this.errorOccurred) {
                log.debug("Getting CAS. Available CASes: {}", casPool.getNumAvailable());
                CAS cas = casPool.getCas(CAS_WAIT_TIMEOUT);
                if (cas == null) {
                    log.warn("No CAS became available within the timeout, trying again.");
                    continue;
                }
                reader.getNext(cas);
                executor.submit(() -> {
                    String docId = null;
                    int engineIndex = engineIndexForCurrentThread(engineIndexByThread);
                    try {
                        try {
                            CasIterator documentCases = multiplierEngines.get(engineIndex).processAndOutputNewCASes(cas);
                            while (documentCases.hasNext()) {
                                CAS documentCas = documentCases.next();
                                try {
                                    docId = JCoReTools.getDocId(documentCas.getJCas());
                                    offsetExpansionEngines.get(engineIndex).process(documentCas);
                                    proteinsMergerEngines.get(engineIndex).process(documentCas);
                                    consistencyTaggerEngines.get(engineIndex).process(documentCas);
                                    writeTrainingData(gene2pubmed, documentCas.getJCas(), geneMapper, 256);
                                    checkpointEngines.get(engineIndex).process(documentCas);
                                } finally {
                                    // Must also happen for skipped documents, otherwise the CAS is never handed back.
                                    documentCas.release();
                                }
                            }
                        } finally {
                            // Return the batch CAS on every path; a leaked CAS eventually starves casPool.getCas().
                            casPool.releaseCas(cas);
                        }
                        callBatchProcessingComplete(offsetExpansionEngines);
                        callBatchProcessingComplete(proteinsMergerEngines);
                        callBatchProcessingComplete(consistencyTaggerEngines);
                    } catch (ClassCastException | AnalysisEngineProcessException e) {
                        log.warn("Got {} exception for document '{}': {}. Assuming that this is a JeDIS (de-)serialization issue, skipping the document.", e.getClass().getCanonicalName(), docId, e.getMessage());
                    } catch (Throwable t) {
                        if (t instanceof InterruptedException) {
                            Thread.currentThread().interrupt();
                        }
                        log.error("Could not process batch of CASes with Thread ID {} (name: {}) due to exception. The last seen document ID was '{}'.", engineIndex, Thread.currentThread().getName(), docId, t);
                        this.errorOccurred = true;
                    }
                });
                progressBar.incrementDone();
                progressBar.printProgressBar();
            }

            log.info("Shutting down ExecutorService.");
            executor.shutdown();
            log.info("Waiting 15 minutes for all processing threads to finish.");
            if (!executor.awaitTermination(15L, TimeUnit.MINUTES)) {
                log.warn("Not all processing threads have finished within 15 minutes; continuing anyway.");
            }
            // Only now is it safe to finalize the checkpoint engines: before this point the worker tasks
            // may still be calling process() on them.
            for (AnalysisEngine checkpointEngine : checkpointEngines) {
                checkpointEngine.collectionProcessComplete();
            }

            log.info("Processing threads have finished, signaling the to-disc-writing threads to finish.");
            log.info("Notifying the to-disc-writing thread to continue a last writing iteration.");
            signalWritingThreadsToFinish();
            log.info("Waiting for last data to be written to disc...");
            for (WritingThread writingThread : this.writingThreads) {
                writingThread.join();
            }
            log.info("WritingThreads have terminated.");
            log.info("Merging written files into {}", String.format(outputFileNameTemplate, "all"));
            mergeFiles(outputFileNameTemplate);
            log.info("Merging done, application finished.");
            if (this.errorOccurred) {
                log.error("Early termination due to error. Check the log messages above.");
            }
            completed = true;
        } finally {
            executor.shutdown();
            if (!completed) {
                // The writing threads are non-daemon threads that wait on the buffer; without this signal
                // they would never terminate after a failure and keep the JVM alive.
                signalWritingThreadsToFinish();
            }
        }
    }

    /**
     * Returns the engine slot of the calling thread, assigning the next free slot on first use.
     * The lookup and the assignment happen under one lock so that two threads can never obtain the same slot.
     */
    private static int engineIndexForCurrentThread(Map<Thread, Integer> engineIndexByThread) {
        synchronized (engineIndexByThread) {
            Integer index = engineIndexByThread.get(Thread.currentThread());
            if (index == null) {
                index = engineIndexByThread.size();
                engineIndexByThread.put(Thread.currentThread(), index);
            }
            return index;
        }
    }

    /** Marks all writing threads as finished and wakes up those currently waiting on the document buffer. */
    private void signalWritingThreadsToFinish() {
        synchronized (documentBuffer) {
            this.writingThreads.forEach(WritingThread::finish);
            documentBuffer.notifyAll();
        }
    }

    /**
     * Calls {@code collectionProcessComplete()} on every engine of the given list.
     *
     * <p>NOTE(review): despite the method name this does not call {@code batchProcessComplete()}, and it is
     * invoked by each worker for the engines of all workers - confirm that both are intended.
     *
     * @param list the engines to notify
     * @throws AnalysisEngineProcessException if an engine fails
     */
    private void callBatchProcessingComplete(List<AnalysisEngine> list) throws AnalysisEngineProcessException {
        for (AnalysisEngine engine : list) {
            engine.collectionProcessComplete();
        }
    }

    /**
     * Appends the content of all writing thread output files, in thread order, to the file obtained by
     * formatting the template with {@code "all"} and deletes each part after it has been copied.
     *
     * @param str the output file name template
     * @throws IOException if reading a part or writing the merged file fails
     */
    private void mergeFiles(String str) throws IOException {
        File mergedFile = new File(String.format(str, "all"));
        try (BufferedOutputStream mergedOut = FileUtilities.getOutputStreamToFile(mergedFile)) {
            for (WritingThread writingThread : this.writingThreads) {
                File partFile = writingThread.getOutputFile();
                try (BufferedReader partReader = FileUtilities.getReaderFromFile(partFile)) {
                    IOUtils.copy(partReader, mergedOut, StandardCharsets.UTF_8);
                }
                if (!partFile.delete()) {
                    log.warn("Could not delete merged part file {}", partFile);
                }
            }
        }
    }

    /**
     * Creates and starts {@code i} writing threads named {@code WritingThread-<index>}, each writing to the
     * file obtained by formatting {@code str} with its index.
     *
     * @param geneMapper passed on to each writing thread
     * @param str        the output file name template
     * @param i          the number of threads to start
     * @throws IOException if an output file cannot be opened
     */
    public void createWritingThreads(GeneMapper geneMapper, String str, int i) throws IOException {
        for (int index = 0; index < i; index++) {
            WritingThread writingThread = new WritingThread(geneMapper, new File(String.format(str, index)));
            writingThread.setName("WritingThread-" + index);
            writingThread.start();
            this.writingThreads.add(writingThread);
        }
    }

    /**
     * Reads a tab-separated file into a multimap from the third column to the second column. Lines
     * starting with {@code #} and blank lines are skipped. Presumably this maps PubMed IDs to gene IDs -
     * verify against the gene2pubmed column layout.
     *
     * @param file the file to read
     * @return the mapping from third-column values to the second-column values seen with them
     * @throws IOException if the file cannot be read or a data line has fewer than three columns
     */
    private Multimap<String, String> readGene2pubmed(File file) throws IOException {
        Multimap<String, String> mapping = HashMultimap.create();
        try (BufferedReader reader = FileUtilities.getReaderFromFile(file)) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("#") || line.isBlank()) {
                    continue;
                }
                String[] columns = line.split("\t");
                if (columns.length < 3) {
                    // Fail with context instead of a bare ArrayIndexOutOfBoundsException.
                    throw new IOException("Malformed line in " + file + ", expected at least 3 tab-separated columns: '" + line + "'");
                }
                mapping.put(columns[2], columns[1]);
            }
        }
        return mapping;
    }

    /**
     * Builds a {@link GeneDocument} from the CAS, adds the document-level gene annotations, infers mention
     * labels from them and hands the document to the writing threads through the document buffer. Blocks
     * while the buffer is full; each time the buffer is found full, one waiting writing thread is notified.
     *
     * @param multimap   the document-level gene annotations as read by {@link #readGene2pubmed(File)}
     * @param jCas       the document to convert
     * @param geneMapper provides the candidate retrieval for the label inference
     * @param i          not read by this method
     * @throws AnalysisEngineProcessException if the gene document cannot be created
     * @throws InterruptedException           if the thread is interrupted while waiting for buffer space
     */
    private void writeTrainingData(Multimap<String, String> multimap, JCas jCas, GeneMapper geneMapper, int i) throws AnalysisEngineProcessException, InterruptedException {
        GeneDocumentFactory documentFactory = injector.getInstance(GeneDocumentFactory.class);
        GeneDocument geneDocument = documentFactory.createGeneDocument(
                jCas,
                Map.of(Gene.class.getCanonicalName(), GENE_TYPE_PATTERN.matcher("")),
                entityMention -> ImmutablePair.nullPair(),
                new Parameters(configuration));
        TransformerDisambiguationDataUtils.addDocumentLevelGeneAnnotations(geneDocument, multimap);
        injector.getInstance(DocumentLoader.class).inferDocumentLevelLabelsToMentions(
                geneDocument,
                geneDocument.getGoldIds(),
                geneMapper.getMappingCore().getCandidateRetrieval(),
                injector.getInstance(GeneOrthologs.class),
                false);
        geneDocument.setCompletelyAnnotated(false);
        do {
            synchronized (documentBuffer) {
                if (documentBuffer.remainingCapacity() == 0) {
                    log.trace("Notifying writing thread that buffer is full.");
                    documentBuffer.notify();
                }
            }
            // Retry with a fresh notification whenever no slot became free within a minute.
        } while (!documentBuffer.offer(geneDocument, 1L, TimeUnit.MINUTES));
    }

    /**
     * Creates an analysis engine from the named descriptor after replacing the descriptor's type system
     * with the given one.
     *
     * @param str                   the descriptor name
     * @param typeSystemDescription the type system to set on the descriptor
     * @return the instantiated engine
     * @throws IOException                     if the descriptor cannot be read
     * @throws InvalidXMLException             if the descriptor is invalid
     * @throws ResourceInitializationException if the engine cannot be created
     */
    private AnalysisEngine createEngineWithTs(String str, TypeSystemDescription typeSystemDescription) throws IOException, InvalidXMLException, ResourceInitializationException {
        AnalysisEngineDescription description = AnalysisEngineFactory.createEngineDescription(str, new Object[0]);
        description.getAnalysisEngineMetaData().setTypeSystem(typeSystemDescription);
        return AnalysisEngineFactory.createEngine(description, new Object[0]);
    }
}
