package de.julielab.jcore.ae.lingpipegazetteer.chunking;

import com.aliasi.chunk.Chunker;
import com.aliasi.dict.AbstractDictionary;
import com.aliasi.dict.ApproxDictionaryChunker;
import com.aliasi.dict.DictionaryEntry;
import com.aliasi.dict.ExactDictionaryChunker;
import com.aliasi.dict.MapDictionary;
import com.aliasi.dict.TrieDictionary;
import com.aliasi.spell.WeightedEditDistance;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.ibm.icu.text.Transliterator;
import de.julielab.java.utilities.UriUtilities;
import de.julielab.jcore.ae.lingpipegazetteer.utils.StringNormalizerForChunking;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.lang.NotImplementedException;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.resource.DataResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.SharedResourceObject;
import org.apache.uima.resource.metadata.ConfigurationParameterSettings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/julielab/jcore/ae/lingpipegazetteer/chunking/ConfigurableChunkerProviderImplAlt.class */
public class ConfigurableChunkerProviderImplAlt implements ChunkerProvider, SharedResourceObject {
    public static final String PARAM_USE_APPROXIMATE_MATCHING = "UseApproximateMatching";
    public static final String PARAM_CASE_SENSITIVE = "CaseSensitive";
    public static final String PARAM_MAKE_VARIANTS = "MakeVariants";
    public static final String PARAM_STOPWORD_FILE = "StopWordFile";
    public static final String PARAM_NORMALIZE_TEXT = "NormalizeText";
    public static final String PARAM_NORMALIZE_PLURAL = "NormalizePlural";
    public static final String PARAM_TRANSLITERATE_TEXT = "TransliterateText";
    private static final Logger LOGGER = LoggerFactory.getLogger(ConfigurableChunkerProviderImplAlt.class);
    private Boolean generateVariants;
    private Boolean caseSensitive;
    private Boolean useApproximateMatching;
    private Boolean transliterate;
    private Boolean normalize;
    private Boolean normalizePlural;
    private InputStream dictFile;
    private InputStream stopFile;
    private AbstractDictionary<String> dict;
    private String stopwordFilePath;
    private URI resourceUri;
    private final double CHUNK_SCORE = 1.0d;
    private final int MIN_TERM_LENGTH = 3;
    private final double APPROX_MATCH_THRESHOLD_SCORE = 100.0d;
    private Chunker dictChunker = null;
    private Set<String> stopWords = new HashSet();

    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public Chunker getChunker() {
        return this.dictChunker;
    }

    public void load(DataResource dataResource) throws ResourceInitializationException {
        this.resourceUri = dataResource.getUri();
        LOGGER.info("Creating dictionary chunker with dictionary loaded from " + this.resourceUri);
        ConfigurationParameterSettings configurationParameterSettings = dataResource.getMetaData().getConfigurationParameterSettings();
        this.stopwordFilePath = (String) configurationParameterSettings.getParameterValue("StopWordFile");
        if (this.stopwordFilePath == null) {
            throw new ResourceInitializationException("config_setting_absent", new Object[]{"StopWordFile"});
        }
        this.generateVariants = (Boolean) configurationParameterSettings.getParameterValue("MakeVariants");
        LOGGER.info("Generate variants: {}", this.generateVariants);
        this.normalize = (Boolean) configurationParameterSettings.getParameterValue("NormalizeText");
        LOGGER.info("Normalize dictionary entries (i.e. completely strip dashes, parenthesis etc): {}", this.normalize);
        this.normalizePlural = Boolean.valueOf(((Boolean) Optional.ofNullable((Boolean) configurationParameterSettings.getParameterValue("NormalizePlural")).orElse(false)).booleanValue() && this.normalize.booleanValue());
        if (this.normalize.booleanValue()) {
            LOGGER.info("Also normalize plural forms to singular: {}", this.normalizePlural);
        }
        this.transliterate = (Boolean) configurationParameterSettings.getParameterValue("TransliterateText");
        LOGGER.info("Transliterate dictionary entries (i.e. transform accented characters to their base forms): {}", this.transliterate);
        this.caseSensitive = (Boolean) configurationParameterSettings.getParameterValue("CaseSensitive");
        LOGGER.info("Case sensitive: {}", this.caseSensitive);
        this.useApproximateMatching = (Boolean) configurationParameterSettings.getParameterValue("UseApproximateMatching");
        LOGGER.info("Use approximate matching: {}", this.useApproximateMatching);
        if (this.normalize.booleanValue() && this.generateVariants.booleanValue()) {
            throw new ResourceInitializationException(new IllegalStateException("MakeVariants and NormalizeText are both activated which is invalid. The two options work towards the same goal in two different ways, i.e. to recognize dictionary entry variants not given explicitly. However, the approaches are not compatible and you have to choose a single one."));
        }
        try {
            try {
                this.dictFile = UriUtilities.getInputStreamFromUri(dataResource.getUri());
                this.stopFile = readStreamFromFileSystemOrClassPath(this.stopwordFilePath);
                initStopWords(this.stopFile);
                readDictionary(this.dictFile);
                LOGGER.info("Now creating chunker.");
                long currentTimeMillis = System.currentTimeMillis();
                if (this.useApproximateMatching.booleanValue()) {
                    final HashSet hashSet = new HashSet();
                    hashSet.add('-');
                    WeightedEditDistance weightedEditDistance = ApproxDictionaryChunker.TT_DISTANCE;
                    this.dictChunker = new ApproxDictionaryChunker(this.dict, IndoEuropeanTokenizerFactory.INSTANCE, new WeightedEditDistance() { // from class: de.julielab.jcore.ae.lingpipegazetteer.chunking.ConfigurableChunkerProviderImplAlt.1
                        public double deleteWeight(char c) {
                            return c == '-' ? -5.0d : (c == ' ' || hashSet.contains(Character.valueOf(c))) ? -10.0d : -110.0d;
                        }

                        public double insertWeight(char c) {
                            return deleteWeight(c);
                        }

                        public double matchWeight(char c) {
                            return 0.0d;
                        }

                        public double substituteWeight(char c, char c2) {
                            if (c == ' ' && c2 == '-') {
                                return -2.0d;
                            }
                            if (c == '-' && c2 == ' ') {
                                return -2.0d;
                            }
                            if (c == ' ' && hashSet.contains(Character.valueOf(c2))) {
                                return -10.0d;
                            }
                            return (hashSet.contains(Character.valueOf(c)) && c2 == ' ') ? -10.0d : -110.0d;
                        }

                        public double transposeWeight(char c, char c2) {
                            return Double.NEGATIVE_INFINITY;
                        }
                    }, 100.0d);
                } else {
                    this.dictChunker = new ExactDictionaryChunker(this.dict, IndoEuropeanTokenizerFactory.INSTANCE, false, this.caseSensitive.booleanValue());
                }
                long currentTimeMillis2 = System.currentTimeMillis() - currentTimeMillis;
                LOGGER.info("Building the actual chunker from the dictionary took {}ms ({}s).", Long.valueOf(currentTimeMillis2), Long.valueOf(currentTimeMillis2 / 1000));
            } catch (Exception e) {
                LOGGER.error("Could not load the dictionary from {}, see the following exception for details.", dataResource.getUri());
                throw e;
            }
        } catch (Exception e2) {
            LOGGER.error("Exception while creating chunker instance from dictionary file {} with stopwords from {}", new Object[]{dataResource.getUri(), this.stopwordFilePath, e2});
        }
    }

    private void readDictionary(InputStream inputStream) throws IOException, AnalysisEngineProcessException {
        long currentTimeMillis = System.currentTimeMillis();
        if (this.useApproximateMatching.booleanValue()) {
            this.dict = new TrieDictionary();
        } else {
            this.dict = new MapDictionary();
        }
        LOGGER.info("readDictionary() - adding entries from " + this.resourceUri.toString() + " to dictionary...");
        BufferedReader bufferedReader = null;
        try {
            BufferedReader bufferedReader2 = new BufferedReader(new InputStreamReader(inputStream));
            Transliterator transliterator = Transliterator.getInstance("NFD; [:Nonspacing Mark:] Remove; NFC");
            IndoEuropeanTokenizerFactory indoEuropeanTokenizerFactory = null;
            if (this.normalize.booleanValue()) {
                indoEuropeanTokenizerFactory = new IndoEuropeanTokenizerFactory();
            }
            while (true) {
                String readLine = bufferedReader2.readLine();
                if (readLine == null) {
                    long currentTimeMillis2 = System.currentTimeMillis() - currentTimeMillis;
                    LOGGER.info("Reading dictionary took {}ms ({}s)", Long.valueOf(currentTimeMillis2), Long.valueOf(currentTimeMillis2 / 1000));
                    if (null != bufferedReader2) {
                        bufferedReader2.close();
                        return;
                    }
                    return;
                }
                if (!readLine.startsWith("#")) {
                    String[] split = readLine.split("\t");
                    if (split.length != 2) {
                        LOGGER.error("readDictionary() - wrong format of line: " + readLine);
                        throw new AnalysisEngineProcessException("annotator_exception", (Object[]) null);
                    }
                    String trim = split[0].trim();
                    if (!this.stopWords.contains(trim.toLowerCase())) {
                        if (this.normalize.booleanValue()) {
                            trim = StringNormalizerForChunking.normalizeString(trim, indoEuropeanTokenizerFactory, transliterator).string;
                        }
                        if (this.transliterate.booleanValue()) {
                            trim = transliterator.transform(trim);
                        }
                        if (this.useApproximateMatching.booleanValue() && !this.caseSensitive.booleanValue()) {
                            trim = trim.toLowerCase();
                        }
                        String trim2 = split[1].trim();
                        if (trim.length() >= 3) {
                            if (this.generateVariants.booleanValue()) {
                                throw new NotImplementedException("In this alternative ChunkerProvider, generating variants will currently fail to adequately filter out stop words due to the transliteration and/or normalization algorithms. If you don't need those algorithms, just stick to the original ChunkerProviderImpl. Otherwise, this issue must be fixed (shouldnt be too difficult). Variants are also currently not treated with normalization/transliteration (but this is deemed to be two alternative ways to achieve a similar thing anyway)");
                            }
                            if (!this.stopWords.contains(trim.toLowerCase())) {
                                this.dict.addEntry(new DictionaryEntry(trim, trim2, 1.0d));
                            }
                        }
                    }
                }
            }
        } catch (Throwable th) {
            if (0 != 0) {
                bufferedReader.close();
            }
            throw th;
        }
    }

    private void initStopWords(InputStream inputStream) throws IOException {
        this.stopWords = new HashSet();
        LOGGER.info("readDictionary() - adding entries from " + this.stopwordFilePath + " to dictionary...");
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
        while (true) {
            try {
                String readLine = bufferedReader.readLine();
                if (readLine == null) {
                    bufferedReader.close();
                    return;
                } else if (!readLine.startsWith("#")) {
                    this.stopWords.add(readLine.trim().toLowerCase());
                }
            } catch (IOException e) {
                e.printStackTrace();
                return;
            }
        }
    }

    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public Set<String> getStopWords() {
        return this.stopWords;
    }

    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getUseApproximateMatching() {
        return this.useApproximateMatching.booleanValue();
    }

    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getNormalize() {
        return this.normalize.booleanValue();
    }

    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getNormalizePlural() {
        return this.normalizePlural.booleanValue();
    }

    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getTransliterate() {
        return this.transliterate.booleanValue();
    }

    @Override // de.julielab.jcore.ae.lingpipegazetteer.chunking.ChunkerProvider
    public boolean getCaseSensitive() {
        return this.caseSensitive.booleanValue();
    }

    private InputStream readStreamFromFileSystemOrClassPath(String str) throws FileNotFoundException {
        InputStream inputStream = null;
        File file = new File(str);
        if (file.exists()) {
            try {
                inputStream = new FileInputStream(file);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        } else {
            inputStream = getClass().getResourceAsStream(str.startsWith("/") ? str : "/" + str);
        }
        if (str.endsWith(".gz") || str.endsWith(".gzip")) {
            try {
                inputStream = new GZIPInputStream(inputStream);
            } catch (IOException e2) {
                e2.printStackTrace();
            }
        }
        if (inputStream == null) {
            throw new FileNotFoundException("Could not read contents from " + str);
        }
        return inputStream;
    }
}
