package de.julielab.jcore.ae.jsbd.main;

import de.julielab.jcore.ae.jsbd.SentenceSplitter;
import de.julielab.jcore.ae.jsbd.Unit;
import de.julielab.jcore.types.Sentence;
import de.julielab.jcore.utility.JCoReAnnotationIndexMerger;
import de.julielab.jcore.utility.JCoReCondensedDocumentText;
import de.julielab.jcore.utility.JCoReTools;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.AntPathMatcher;

/* loaded from: input_file:de/julielab/jcore/ae/jsbd/main/SentenceAnnotator.class */
/**
 * UIMA analysis engine that creates {@link Sentence} annotations from the predictions of a
 * {@link SentenceSplitter} model.
 *
 * <p>The splitter runs on a {@link JCoReCondensedDocumentText}, built from the CAS and the
 * configured cut-away types. If sentence delimiter types are configured, the text is segmented
 * separately between each pair of adjacent delimiter annotation borders so that no sentence
 * crosses such a border.
 *
 * <p>An instance keeps a reusable {@link Matcher}; {@code Matcher} objects are not safe for
 * concurrent use, so a single instance must not process several CASes at the same time.
 */
public class SentenceAnnotator extends JCasAnnotator_ImplBase {
    public static final String PARAM_MODEL_FILE = "ModelFilename";
    public static final String PARAM_POSTPROCESSING = "Postprocessing";
    public static final String PARAM_SENTENCE_DELIMITER_TYPES = "SentenceDelimiterTypes";
    public static final String PARAM_CUT_AWAY_TYPES = "CutAwayTypes";

    private static final Logger LOGGER = LoggerFactory.getLogger(SentenceAnnotator.class);

    /** Unit label that marks the last unit of a sentence. */
    private static final String EOS_LABEL = "EOS";

    /** Marker for "the begin of the next sentence has not been seen yet". */
    private static final int NO_OPEN_SENTENCE = -1;

    /** A letter followed by any number of combining marks. */
    private static final Pattern LETTER_PATTERN = Pattern.compile("\\p{L}\\p{M}*");

    /** Reused to test whether a sentence candidate contains at least one letter. */
    private final Matcher letterMatcher = LETTER_PATTERN.matcher("");

    @ConfigurationParameter(name = PARAM_POSTPROCESSING, mandatory = false, defaultValue = {"false"}, description = "One of 'biomed' or 'medical'. Does some post processing to e.g. respect parenthesis and don't put a sentence boundary withing in a pair of opening and closing parenthesis.")
    private String postprocessingFilter = null;

    @ConfigurationParameter(name = PARAM_SENTENCE_DELIMITER_TYPES, mandatory = false, description = "An array of annotation types that should never begin or end within a sentence. For example, sentences should never reach out of a paragraph or a section heading.")
    private Set<String> sentenceDelimiterTypes;

    @ConfigurationParameter(name = PARAM_MODEL_FILE, mandatory = true)
    private String modelFilename;

    @ConfigurationParameter(name = PARAM_CUT_AWAY_TYPES, mandatory = false, description = "An array of fully qualified type names. Document text covered by annotations of these types will be ignored from sentence splitting. This means that sentence splitting happens as if the covered text of these annotations would not exist in the text. This helps for references, for example, which otherwise might confuse the sentence splitting. A post-processing step tries to extend sentences include such annotations if they appear directly after the sentence (e.g. references: '...as Smith et al. have shown.1 Further text follows...').")
    private Set<String> cutAwayTypes;

    private SentenceSplitter sentenceSplitter;

    /**
     * Loads the sentence splitter model and reads the optional configuration parameters.
     *
     * @param uimaContext the context providing the configuration parameter values
     * @throws ResourceInitializationException if reading the model fails
     * @throws IllegalArgumentException if the model is found neither as a file nor on the classpath
     */
    @Override
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);
        try {
            this.sentenceSplitter = new SentenceSplitter();
            LOGGER.info("initializing JSBD Annotator ...");
            this.modelFilename = (String) uimaContext.getConfigParameterValue(PARAM_MODEL_FILE);
            // The stream is only needed while the model is read; close it afterwards so that no
            // file handle stays open for the lifetime of the component.
            try (InputStream modelStream = openModelStream(this.modelFilename)) {
                this.sentenceSplitter.readModel(modelStream);
            }
        } catch (IOException | ClassNotFoundException e) {
            throw new ResourceInitializationException(e);
        }

        Object postprocessing = uimaContext.getConfigParameterValue(PARAM_POSTPROCESSING);
        if (postprocessing != null) {
            this.postprocessingFilter = (String) postprocessing;
        }
        String[] delimiterTypes = (String[]) uimaContext.getConfigParameterValue(PARAM_SENTENCE_DELIMITER_TYPES);
        if (delimiterTypes != null) {
            this.sentenceDelimiterTypes = new LinkedHashSet<>(Arrays.asList(delimiterTypes));
        }
        String[] cutAway = (String[]) uimaContext.getConfigParameterValue(PARAM_CUT_AWAY_TYPES);
        if (cutAway != null) {
            this.cutAwayTypes = Stream.of(cutAway).collect(Collectors.toSet());
        }
    }

    /**
     * Opens the model, preferring an existing file and falling back to a classpath resource.
     *
     * @param location file path or classpath location of the model
     * @return an open stream on the model; the caller is responsible for closing it
     * @throws IOException if the file exists but cannot be opened
     * @throws IllegalArgumentException if the model is found in neither place
     */
    private InputStream openModelStream(String location) throws IOException {
        File file = new File(location);
        if (file.exists()) {
            return new FileInputStream(file);
        }
        LOGGER.debug("File \"{}\" does not exist. Searching for the model as a classpath resource.", location);
        // Class#getResourceAsStream resolves names without a leading slash relative to this
        // class' package, so make the name absolute.
        String resourceName = location.startsWith("/") ? location : "/" + location;
        InputStream stream = getClass().getResourceAsStream(resourceName);
        if (stream == null) {
            throw new IllegalArgumentException("The model file \"" + location + "\" could be found neither in the file system nor in the classpath.");
        }
        return stream;
    }

    /**
     * Adds sentence annotations to the CAS. Documents with blank text are skipped with a warning.
     *
     * @param jCas the CAS to annotate
     * @throws AnalysisEngineProcessException if a configured type name cannot be resolved
     */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        if (StringUtils.isBlank(jCas.getDocumentText())) {
            LOGGER.warn("The document text is empty.");
            return;
        }
        try {
            JCoReCondensedDocumentText documentText = new JCoReCondensedDocumentText(jCas, this.cutAwayTypes);
            if (this.sentenceDelimiterTypes == null) {
                // The blank check above guarantees a non-empty text at this point.
                doSegmentation(documentText, documentText.getCodensedText(), 0);
            } else {
                segmentBetweenDelimiters(jCas, documentText);
            }
        } catch (ClassNotFoundException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Collects the borders of all delimiter annotations and runs the sentence segmentation
     * separately on each stretch of condensed text between two adjacent borders.
     */
    private void segmentBetweenDelimiters(JCas jCas, JCoReCondensedDocumentText documentText) throws ClassNotFoundException {
        String condensedText = documentText.getCodensedText();
        JCoReAnnotationIndexMerger indexMerger = new JCoReAnnotationIndexMerger(this.sentenceDelimiterTypes, false, null, jCas);

        // All borders are offsets into the condensed text because the spans below are cut from
        // it. This includes the final border: the condensed text can be shorter than the
        // original document text.
        List<Integer> borders = new ArrayList<>();
        borders.add(0);
        borders.add(condensedText.length());
        while (indexMerger.incrementAnnotation()) {
            Annotation delimiter = (Annotation) indexMerger.getAnnotation();
            borders.add(documentText.getCondensedOffsetForOriginalOffset(delimiter.getBegin()));
            borders.add(documentText.getCondensedOffsetForOriginalOffset(delimiter.getEnd()));
        }
        borders.sort(null);

        for (int i = 1; i < borders.size(); i++) {
            int start = borders.get(i - 1);
            int end = borders.get(i);
            // Skip leading whitespace of the span, looking at the same text the span is cut from.
            while (start < end && Character.isWhitespace(condensedText.charAt(start))) {
                start++;
            }
            String span = condensedText.substring(start, end);
            if (!StringUtils.isBlank(span)) {
                doSegmentation(documentText, span, start);
            }
        }
    }

    /**
     * Runs the sentence splitter on one piece of condensed text and turns the result into
     * annotations.
     *
     * @param jCoReCondensedDocumentText the condensed view of the document
     * @param str the text to segment
     * @param i offset of {@code str} within the condensed text
     */
    private void doSegmentation(JCoReCondensedDocumentText jCoReCondensedDocumentText, String str, int i) {
        // Kept as a concrete ArrayList: the parameter type of SentenceSplitter#predict is not
        // visible from this file.
        ArrayList<String> lines = new ArrayList<>(1);
        lines.add(str);
        addAnnotations(jCoReCondensedDocumentText, this.sentenceSplitter.predict(lines, this.postprocessingFilter), i);
    }

    /**
     * Creates one {@link Sentence} per run of units that ends with an {@code EOS} unit or with
     * the last unit of the list. Surrounding whitespace is trimmed, and candidates that are
     * empty or contain no letter are not added to the indexes.
     *
     * @param jCoReCondensedDocumentText the condensed view used to map offsets back to the original text
     * @param list the units predicted by the sentence splitter
     * @param i offset of the segmented text within the condensed text
     */
    private void addAnnotations(JCoReCondensedDocumentText jCoReCondensedDocumentText, List<Unit> list, int i) {
        JCas cas = jCoReCondensedDocumentText.getCas();
        String originalText = cas.getDocumentText();
        int lastIndex = list.size() - 1;
        // The first sentence starts at the beginning of the segmented text.
        int sentenceStart = 0;
        for (int unitIndex = 0; unitIndex <= lastIndex; unitIndex++) {
            Unit unit = list.get(unitIndex);
            if (sentenceStart == NO_OPEN_SENTENCE) {
                sentenceStart = unit.begin;
            }
            if (!EOS_LABEL.equals(unit.label) && unitIndex != lastIndex) {
                continue;
            }

            int begin = jCoReCondensedDocumentText.getOriginalOffsetForCondensedOffset(sentenceStart + i);
            int end = jCoReCondensedDocumentText.getOriginalOffsetForCondensedOffset(unit.end + i);
            while (begin < originalText.length() && Character.isWhitespace(originalText.charAt(begin))) {
                begin++;
            }
            while (end > 0 && Character.isWhitespace(originalText.charAt(end - 1))) {
                end--;
            }
            if (begin < end) {
                Sentence sentence = new Sentence(cas);
                sentence.setBegin(begin);
                sentence.setEnd(end);
                sentence.setComponentId(getClass().getName());
                try {
                    this.letterMatcher.reset(sentence.getCoveredText());
                    if (this.letterMatcher.find()) {
                        sentence.addToIndexes();
                    }
                } catch (StringIndexOutOfBoundsException e) {
                    LOGGER.error("Document {}. Invalid sentence offsets: {}-{}. Document text length: {}.", JCoReTools.getDocId(cas), begin, end, originalText.length());
                    throw e;
                }
            }
            sentenceStart = NO_OPEN_SENTENCE;
        }
    }
}
