package de.julielab.jcore.ae.lingpipegazetteer.uima;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.Chunking;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;
import com.ibm.icu.text.Transliterator;
import de.julielab.java.utilities.spanutils.OffsetSet;
import de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider;
import de.julielab.jcore.ae.lingpipegazetteer.chunking.OverlappingChunk;
import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking;
import de.julielab.jcore.types.Abbreviation;
import de.julielab.jcore.types.AbbreviationLongform;
import de.julielab.jcore.types.ConceptMention;
import de.julielab.jcore.types.PennBioIEPOSTag;
import de.julielab.jcore.types.mantra.Entity;
import de.julielab.jcore.utility.JCoReAnnotationTools;
import de.julielab.jcore.utility.index.JCoReHashMapAnnotationIndex;
import de.julielab.jcore.utility.index.TermGenerators;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Spliterators;
import java.util.Stack;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.Range;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceAccessException;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/jcore/ae/lingpipegazetteer/uima/GazetteerAnnotator.class */
/**
 * UIMA annotator that runs the dictionary {@link Chunker} supplied by a {@link ChunkerProvider}
 * external resource over the CAS document text and adds one annotation per accepted match.
 *
 * <p>Depending on the provider settings the text is transliterated, normalized and/or lower-cased
 * before chunking; when normalization is active, chunk offsets are mapped back to the original
 * document text before annotations are created. In "mantra mode" the annotator creates
 * {@link Entity} annotations, otherwise it creates annotations of the configured output type.
 */
public class GazetteerAnnotator extends JCasAnnotator_ImplBase {
    /** Key of the external resource that delivers the {@link ChunkerProvider}. */
    public static final String CHUNKER_RESOURCE_NAME = "DictionaryChunkerProvider";
    /** Name of the boolean parameter that switches the acronym check on or off. */
    public static final String PARAM_CHECK_ACRONYMS = "CheckAcronyms";
    /** Name of the parameter holding the fully qualified class name of the annotation type to create. */
    public static final String PARAM_OUTPUT_TYPE = "OutputType";
    private static final String PARAM_USE_MANTRA_MODE = "MantraMode";

    @ExternalResource(key = CHUNKER_RESOURCE_NAME, mandatory = true)
    private ChunkerProvider provider;
    private Transliterator transliterator;
    private TokenizerFactory normalizationTokenFactory;
    private Set<String> stopWords;
    private static final String COMPONENT_ID = GazetteerAnnotator.class.getCanonicalName();
    private static final Logger LOGGER = LoggerFactory.getLogger(GazetteerAnnotator.class);
    // NOTE(review): this counter is never incremented in this class, so the value logged in
    // initialize() is always 0 - confirm whether an increment was intended.
    private static int initializeCount = 0;

    @ConfigurationParameter(name = PARAM_USE_MANTRA_MODE, defaultValue = {"false"})
    private boolean mantraMode = false;

    @ConfigurationParameter(name = PARAM_CHECK_ACRONYMS, defaultValue = {"true"})
    private boolean checkAcronyms = true;

    @ConfigurationParameter(name = PARAM_OUTPUT_TYPE)
    private String outputType = null;
    private Chunker gazetteer = null;

    /** Kinds of parentheses distinguished when checking whether a match is well-balanced. */
    public enum ParenthesisType {
        ROUND,
        BRACKET,
        CURLY,
        NONE
    }

    /**
     * Tells whether a string should be rejected because its parentheses are not balanced.
     *
     * @param str the text to inspect
     * @return {@code true} if a closing parenthesis has no matching opening parenthesis of the same
     *         kind, or if an opening parenthesis is never closed; {@code false} otherwise
     */
    static boolean filterParenthesis(String str) {
        Stack<Character> openParentheses = new Stack<>();
        for (int i = 0; i < str.length(); i++) {
            char current = str.charAt(i);
            if (!isParentheses(current)) {
                continue;
            }
            if (isOpenedParentheses(current)) {
                openParentheses.push(current);
            } else if (openParentheses.isEmpty() || !isParenthesisCounterpart(openParentheses.pop(), current)) {
                return true;
            }
        }
        return !openParentheses.isEmpty();
    }

    /**
     * Checks whether two parenthesis characters are of the same kind (round, bracket or curly).
     *
     * @throws IllegalArgumentException if at least one of the characters is not a parenthesis
     */
    private static boolean isParenthesisCounterpart(char opening, char closing) {
        ParenthesisType closingType = getParenthesisType(closing);
        ParenthesisType openingType = getParenthesisType(opening);
        if (closingType == ParenthesisType.NONE || openingType == ParenthesisType.NONE) {
            throw new IllegalArgumentException("The two characters '" + opening + "' and '" + closing + "' were given in order to determine whether they are compatible parenthesis counterparts, but at least one of those characters is no parentheses.");
        }
        return closingType == openingType;
    }

    /**
     * Maps a character to the kind of parenthesis it represents.
     *
     * @param c the character to classify
     * @return the parenthesis kind, or {@link ParenthesisType#NONE} if {@code c} is no parenthesis
     */
    static ParenthesisType getParenthesisType(char c) {
        switch (c) {
            case '(':
            case ')':
                return ParenthesisType.ROUND;
            case '[':
            case ']':
                return ParenthesisType.BRACKET;
            case '{':
            case '}':
                return ParenthesisType.CURLY;
            default:
                return ParenthesisType.NONE;
        }
    }

    /** Returns {@code true} if {@code c} is an opening or closing round, square or curly parenthesis. */
    static boolean isParentheses(char c) {
        return isOpenedParentheses(c) || isClosedParentheses(c);
    }

    /** Returns {@code true} if {@code c} is one of {@code (}, {@code [} or <code>{</code>. */
    static boolean isOpenedParentheses(char c) {
        return c == '(' || c == '[' || c == '{';
    }

    /** Returns {@code true} if {@code c} is one of {@code )}, {@code ]} or <code>}</code>. */
    static boolean isClosedParentheses(char c) {
        return c == ')' || c == ']' || c == '}';
    }

    /**
     * Groups chunks whose spans overlap.
     *
     * <p>The given list is sorted in place by chunk start offset. Each chunk is added to every
     * already existing group it overlaps with; a chunk that overlaps no group opens a new one.
     *
     * @param list the chunks to group; reordered by this method
     * @param str  the text the chunk offsets refer to, handed to each new group
     * @return the groups in order of creation
     */
    static List<OverlappingChunk> groupOverlappingChunks(List<Chunk> list, String str) {
        list.sort(Comparator.comparingInt(Chunk::start));
        List<OverlappingChunk> groups = new ArrayList<>();
        for (Chunk chunk : list) {
            boolean addedToGroup = false;
            for (OverlappingChunk group : groups) {
                if (group.isOverlappingSpan(chunk.start(), chunk.end())) {
                    group.addChunk(chunk.start(), chunk.end(), chunk);
                    addedToGroup = true;
                }
            }
            if (!addedToGroup) {
                groups.add(new OverlappingChunk(chunk.start(), chunk.end(), chunk, str));
            }
        }
        return groups;
    }

    /**
     * Fetches the chunker provider resource and reads the configuration parameters.
     *
     * @param uimaContext the UIMA context giving access to resources and parameters
     * @throws ResourceInitializationException if the chunker provider resource cannot be obtained or
     *                                         the output type parameter is not set
     */
    @Override
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        LOGGER.info("calls to initialize: " + initializeCount);
        super.initialize(uimaContext);
        LOGGER.info("initialize() - initializing GazetteerAnnotator...");
        try {
            this.provider = (ChunkerProvider) getContext().getResourceObject(CHUNKER_RESOURCE_NAME);
            if (this.provider == null) {
                // Everything below dereferences the provider, so fail with a descriptive error.
                throw new ResourceInitializationException(new IllegalStateException("The external resource \"" + CHUNKER_RESOURCE_NAME + "\" is not available."));
            }
            this.gazetteer = this.provider.getChunker();
            this.stopWords = this.provider.getStopWords();
        } catch (ResourceAccessException e) {
            LOGGER.error("Exception while initializing", e);
            throw new ResourceInitializationException(e);
        }
        // An unset parameter falls back to the default declared on the field (true).
        Object checkAcronymsValue = uimaContext.getConfigParameterValue(PARAM_CHECK_ACRONYMS);
        this.checkAcronyms = checkAcronymsValue == null || ((Boolean) checkAcronymsValue).booleanValue();
        LOGGER.info("Check for acronyms (found dictionary entries that are abbreviations are only accepted if their long form is an abbreviation of the same type, too): {}", Boolean.valueOf(this.checkAcronyms));
        if (this.provider.getNormalize()) {
            this.normalizationTokenFactory = new IndoEuropeanTokenizerFactory();
        }
        LOGGER.info("Normalize CAS document text (i.e. do stemming and remove possessive 's): {}", Boolean.valueOf(this.provider.getNormalize()));
        // The transliterator is always created because it is also handed to the string normalizer.
        this.transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC");
        LOGGER.info("Transliterate CAS document text (i.e. transform accented characters to their base forms): {}", Boolean.valueOf(this.provider.getTransliterate()));
        this.outputType = (String) uimaContext.getConfigParameterValue(PARAM_OUTPUT_TYPE);
        if (this.outputType == null) {
            LOGGER.error("initialize() - output type not specified.");
            throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[]{PARAM_OUTPUT_TYPE});
        }
        Object mantraModeValue = uimaContext.getConfigParameterValue(PARAM_USE_MANTRA_MODE);
        this.mantraMode = mantraModeValue != null && ((Boolean) mantraModeValue).booleanValue();
    }

    /**
     * Chunks the document text with the gazetteer and adds annotations for the accepted chunks.
     *
     * <p>With approximate matching, chunks are filtered, grouped by overlap and only the best
     * chunks of each group are annotated; otherwise every chunk is annotated. Unless mantra mode is
     * active, abbreviations are afterwards annotated according to their full forms if the acronym
     * check is enabled.
     *
     * @param jCas the CAS to annotate; nothing happens if it has no document text
     * @throws AnalysisEngineProcessException if an annotation cannot be created
     * @throws IllegalStateException          if no chunker is available
     */
    @Override
    public void process(JCas jCas) throws AnalysisEngineProcessException {
        if (this.gazetteer == null) {
            throw new IllegalStateException("The actual gazetteer object is null. Check previous log messages pointing to the error (most probably the dictionary file could not be found).");
        }
        String documentText = jCas.getDocumentText();
        if (documentText == null || documentText.isEmpty()) {
            return;
        }
        final boolean normalize = this.provider.getNormalize();
        final boolean approximateMatching = this.provider.getUseApproximateMatching();
        // When normalizing, transliteration is done by the normalizer itself.
        if (this.provider.getTransliterate() && !normalize) {
            documentText = this.transliterator.transform(documentText);
        }
        StringNormalizerForChunking.NormalizedString normalizedText = null;
        if (normalize) {
            if (this.provider.getNormalizePlural()) {
                normalizedText = StringNormalizerForChunking.normalizeString(documentText, this.normalizationTokenFactory, true, collectPluralNounOffsets(jCas), this.transliterator);
            } else {
                normalizedText = StringNormalizerForChunking.normalizeString(documentText, this.normalizationTokenFactory, this.transliterator);
            }
        }
        if (!this.provider.getCaseSensitive() && approximateMatching) {
            if (normalize) {
                normalizedText.string = normalizedText.string.toLowerCase();
            } else {
                documentText = documentText.toLowerCase();
            }
        }
        TermGenerators.LongOffsetIndexTermGenerator offsetTermGenerator = TermGenerators.longOffsetTermGenerator();
        JCoReHashMapAnnotationIndex<Long, ConceptMention> conceptMentionIndex = new JCoReHashMapAnnotationIndex<>(offsetTermGenerator, offsetTermGenerator, jCas, ConceptMention.type);
        JCoReHashMapAnnotationIndex<Long, Abbreviation> abbreviationIndex = new JCoReHashMapAnnotationIndex<>(offsetTermGenerator, offsetTermGenerator, jCas, Abbreviation.type);
        LOGGER.debug("Performing actual Gazetteer annotation...");
        Chunking chunking = this.gazetteer.chunk(normalize ? normalizedText.string : documentText);
        LOGGER.debug("Gazetteer annotation done.");
        if (approximateMatching) {
            List<OverlappingChunk> overlapGroups = groupOverlappingChunks(filterChunking(chunking), chunking.charSequence().toString());
            LOGGER.debug("all overlapping chunks:\n");
            for (OverlappingChunk overlapGroup : overlapGroups) {
                LOGGER.debug(overlapGroup.toStringAll());
                List<Chunk> bestChunks = overlapGroup.getBestChunks();
                LOGGER.debug("Found {} best chunks.", Integer.valueOf(bestChunks.size()));
                for (int i = 0; i < bestChunks.size(); i++) {
                    Chunk bestChunk = bestChunks.get(i);
                    if (LOGGER.isDebugEnabled()) {
                        // Take the text from the chunked sequence: the chunk offsets always refer to it.
                        CharSequence chunkText = chunking.charSequence().subSequence(bestChunk.start(), bestChunk.end());
                        LOGGER.debug("Nr. " + i + " best chunk: " + bestChunk.start() + " - " + bestChunk.end() + ": " + bestChunk.score() + " ; type: " + bestChunk.type() + " ; text: " + chunkText);
                    }
                    add2Cas(jCas, bestChunk, normalizedText, conceptMentionIndex, abbreviationIndex);
                }
            }
        } else {
            for (Chunk chunk : chunking.chunkSet()) {
                add2Cas(jCas, chunk, normalizedText, conceptMentionIndex, abbreviationIndex);
            }
        }
        if (!this.checkAcronyms || this.mantraMode) {
            return;
        }
        LOGGER.debug("process() - checking acronyms");
        annotateAcronymsWithFullFormEntity(jCas, conceptMentionIndex);
    }

    /** Collects the offsets of all {@link PennBioIEPOSTag} annotations in the CAS whose value is "NNS". */
    private static OffsetSet collectPluralNounOffsets(JCas jCas) {
        OffsetSet pluralOffsets = new OffsetSet();
        Iterator<?> posTags = jCas.getAnnotationIndex(PennBioIEPOSTag.type).iterator();
        while (posTags.hasNext()) {
            PennBioIEPOSTag posTag = (PennBioIEPOSTag) posTags.next();
            // Constant first so that a tag without a value is skipped instead of raising an NPE.
            if ("NNS".equals(posTag.getValue())) {
                pluralOffsets.add(Range.between(Integer.valueOf(posTag.getBegin()), Integer.valueOf(posTag.getEnd())));
            }
        }
        return pluralOffsets;
    }

    /** Returns the chunks that pass the parenthesis, punctuation and stop word filters. */
    private List<Chunk> filterChunking(Chunking chunking) {
        List<Chunk> accepted = new ArrayList<>(chunking.chunkSet().size());
        for (Chunk chunk : chunking.chunkSet()) {
            String chunkText = chunking.charSequence().subSequence(chunk.start(), chunk.end()).toString();
            if (filterParenthesis(chunkText) || filterPunctuationArtifacts(chunkText) || filterStopwords(chunkText)) {
                continue;
            }
            accepted.add(chunk);
        }
        return accepted;
    }

    /** Returns {@code true} if the text starts or ends with a hyphen. */
    private boolean filterPunctuationArtifacts(String str) {
        return str.startsWith("-") || str.endsWith("-");
    }

    /**
     * Returns {@code true} if the text is a stop word itself or if, for text containing spaces, at
     * least half (rounded up) of its space-separated tokens are stop words.
     */
    private boolean filterStopwords(String str) {
        if (this.stopWords == null || this.stopWords.isEmpty()) {
            // Without stop words nothing can be filtered.
            return false;
        }
        if (this.stopWords.contains(str.toLowerCase())) {
            return true;
        }
        if (!str.contains(" ")) {
            return false;
        }
        String[] tokens = str.split(" ");
        int stopWordCount = 0;
        for (String token : tokens) {
            if (this.stopWords.contains(token.toLowerCase())) {
                stopWordCount++;
            }
        }
        if (Math.ceil(tokens.length / 2.0d) > stopWordCount) {
            return false;
        }
        LOGGER.debug("Filtering due to high stop word occurrences: {}", str);
        return true;
    }

    /**
     * Decides whether a chunk may be annotated with respect to the acronym check.
     *
     * @return {@code true} if no {@link Abbreviation} is indexed for the chunk offsets, or if the
     *         ConceptMention found for the abbreviation's full form has the configured output
     *         type; {@code false} if the full form has no ConceptMention or one of another type
     */
    private boolean isAcronymWithSameFullFormSpecificType(JCas jCas, Chunk chunk, StringNormalizerForChunking.NormalizedString normalizedString, JCoReHashMapAnnotationIndex<Long, ConceptMention> jCoReHashMapAnnotationIndex, JCoReHashMapAnnotationIndex<Long, Abbreviation> jCoReHashMapAnnotationIndex2) {
        int begin;
        int end;
        if (this.provider.getNormalize()) {
            try {
                begin = normalizedString.getOriginalOffset(chunk.start()).intValue();
                end = normalizedString.getOriginalOffset(chunk.end()).intValue();
            } catch (Exception e) {
                LOGGER.error("Could not map chunk offsets back to the original text. Text: " + normalizedString + " ; Chunk: " + chunk + " ; Chunk end: " + chunk.end(), e);
                throw e;
            }
        } else {
            begin = chunk.start();
            end = chunk.end();
        }
        Abbreviation abbreviation = jCoReHashMapAnnotationIndex2.getFirst(TermGenerators.longOffsetTermGenerator().forOffsets(begin, end));
        // The covered text is only needed for debug messages.
        String chunkText = null;
        if (LOGGER.isDebugEnabled()) {
            chunkText = jCas.getDocumentText().substring(begin, end);
        }
        if (abbreviation == null) {
            LOGGER.debug("{} chunk \"{}\" is not an abbreviation\n", chunk, chunkText);
            return true;
        }
        AbbreviationLongform longform = abbreviation.getTextReference();
        ConceptMention fullFormMention = longform == null ? null : jCoReHashMapAnnotationIndex.getFirst(longform);
        if (fullFormMention == null) {
            LOGGER.debug("{} chunk \"{}\" is an abbreviation but respective full \"{}\" form is no ConceptMention\n", new Object[]{chunk, chunkText, longform == null ? null : longform.getCoveredText()});
            return false;
        }
        String fullFormType = fullFormMention.getClass().getCanonicalName();
        if (fullFormType.equals(this.outputType)) {
            LOGGER.debug("{} chunk \"{}\" is an abbreviation and respective full form \"{}\" is ConceptMention with same type as OutputType\n", new Object[]{chunk, chunkText, fullFormMention.getCoveredText()});
            return true;
        }
        LOGGER.debug("{} chunk \"{}\" is an abbreviation but respective full form \"{}\" is ConceptMention without the correct OutputType (is: {}; OutputType: {})\n", new Object[]{chunk, chunkText, fullFormMention.getCoveredText(), fullFormType, this.outputType});
        return false;
    }

    /**
     * Creates the annotation(s) for a chunk unless the acronym check rejects it.
     *
     * <p>In mantra mode the chunk type is split at "@@TERM@@" and one {@link Entity} is created per
     * part, whose "@@"-separated fields are used as source, CUI, semantic type and semantic group.
     * Otherwise a single annotation of the output type is created and added to the given
     * ConceptMention index.
     *
     * @throws AnalysisEngineProcessException if creating an annotation fails
     */
    private void add2Cas(JCas jCas, Chunk chunk, StringNormalizerForChunking.NormalizedString normalizedString, JCoReHashMapAnnotationIndex<Long, ConceptMention> jCoReHashMapAnnotationIndex, JCoReHashMapAnnotationIndex<Long, Abbreviation> jCoReHashMapAnnotationIndex2) throws AnalysisEngineProcessException {
        if (this.checkAcronyms && !isAcronymWithSameFullFormSpecificType(jCas, chunk, normalizedString, jCoReHashMapAnnotationIndex, jCoReHashMapAnnotationIndex2)) {
            return;
        }
        final boolean normalize = this.provider.getNormalize();
        final int documentLength = jCas.getDocumentText().length();
        int rawBegin = normalize ? normalizedString.getOriginalOffset(chunk.start()).intValue() : chunk.start();
        int rawEnd = normalize ? normalizedString.getOriginalOffset(chunk.end()).intValue() : chunk.end();
        // Keep both offsets within the bounds of the document text.
        int begin = Math.min(documentLength, Math.max(0, rawBegin));
        int end = Math.min(documentLength, Math.max(0, rawEnd));
        try {
            if (this.mantraMode) {
                for (String term : chunk.type().split("@@TERM@@")) {
                    String[] fields = term.split("@@");
                    Entity entity = (Entity) JCoReAnnotationTools.getAnnotationByClassName(jCas, "de.julielab.jcore.types.mantra.Entity");
                    entity.setBegin(begin);
                    entity.setEnd(end);
                    entity.setComponentId(COMPONENT_ID);
                    entity.setConfidence(chunk.score());
                    entity.setSource(fields[0]);
                    entity.setCui(fields[1]);
                    entity.setSemanticType(fields[2]);
                    entity.setSemanticGroup(fields[3]);
                    entity.addToIndexes();
                }
            } else {
                ConceptMention mention = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(jCas, this.outputType);
                mention.setBegin(begin);
                mention.setEnd(end);
                mention.setSpecificType(chunk.type());
                mention.setComponentId(COMPONENT_ID);
                mention.setConfidence(chunk.score());
                mention.addToIndexes();
                jCoReHashMapAnnotationIndex.index(mention);
            }
        } catch (Exception e) {
            LOGGER.error("process() - could not generate output type: " + e.getMessage(), e);
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Annotates abbreviations according to the ConceptMention of their full form.
     *
     * <p>For each {@link Abbreviation} whose full form has a ConceptMention of the output type that
     * was created by this component, a new annotation of the output type is created on the
     * abbreviation span - unless the abbreviation already has a ConceptMention of the same class.
     * The new annotation takes over specific type and confidence of the full form mention.
     *
     * @throws AnalysisEngineProcessException if creating an annotation fails
     */
    private void annotateAcronymsWithFullFormEntity(JCas jCas, JCoReHashMapAnnotationIndex<Long, ConceptMention> jCoReHashMapAnnotationIndex) throws AnalysisEngineProcessException {
        Iterator<?> abbreviations = jCas.getJFSIndexRepository().getAnnotationIndex(Abbreviation.type).iterator();
        while (abbreviations.hasNext()) {
            Abbreviation abbreviation = (Abbreviation) abbreviations.next();
            AbbreviationLongform longform = abbreviation.getTextReference();
            LOGGER.debug("annotateAcronymsWithFullFormEntity() - checking abbreviation: " + abbreviation.getCoveredText());
            ConceptMention fullFormMention = longform == null ? null : jCoReHashMapAnnotationIndex.getFirst(longform);
            if (fullFormMention == null) {
                LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation has no ConceptMention\n");
                continue;
            }
            String fullFormType = fullFormMention.getClass().getCanonicalName();
            if (fullFormType == null || !fullFormType.equals(this.outputType)) {
                continue;
            }
            ConceptMention acronymMention = jCoReHashMapAnnotationIndex.getFirst(abbreviation);
            boolean acronymHasFullFormType = acronymMention != null && acronymMention.getClass().getName().equals(fullFormMention.getClass().getName());
            if (COMPONENT_ID.equals(fullFormMention.getComponentId()) && !acronymHasFullFormType) {
                try {
                    LOGGER.debug("annotateAcronymsWithFullFormEntity() - fullform of abbreviation (" + abbreviation.getCoveredText() + " [begin=" + abbreviation.getBegin() + "; end=" + abbreviation.getEnd() + "]) has ConceptMention: " + fullFormMention.toString());
                    ConceptMention acronymAnnotation = (ConceptMention) JCoReAnnotationTools.getAnnotationByClassName(jCas, this.outputType);
                    acronymAnnotation.setBegin(abbreviation.getBegin());
                    acronymAnnotation.setEnd(abbreviation.getEnd());
                    acronymAnnotation.setTextualRepresentation(acronymAnnotation.getCoveredText());
                    acronymAnnotation.setSpecificType(fullFormMention.getSpecificType());
                    acronymAnnotation.setComponentId(COMPONENT_ID + "+acronym");
                    acronymAnnotation.setConfidence(fullFormMention.getConfidence());
                    acronymAnnotation.addToIndexes();
                } catch (Exception e) {
                    LOGGER.error("process() - could not generate output type: " + e.getMessage(), e);
                    // Same message key as before, but the cause is no longer dropped.
                    throw new AnalysisEngineProcessException("annotator_exception", (Object[]) null, e);
                }
            } else if (acronymMention == null) {
                // NOTE(review): the message text says "!= null" although this branch is taken when the
                // acronym has no ConceptMention - confirm the intended wording.
                LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcronym != null");
            } else if (acronymHasFullFormType) {
                LOGGER.debug("annotateAcronymsWithFullFormEntity() - emAcroType=" + acronymMention.getClass().getCanonicalName() + " == emFullformType=" + fullFormMention.getClass().getCanonicalName());
            }
        }
    }
}
