package eus.ixa.ixa.pipe.tok;

import eus.ixa.ixa.pipe.seg.RuleBasedSegmenter;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:eus/ixa/ixa/pipe/tok/NonPeriodBreaker.class */
/**
 * Decides which periods do NOT end a sentence or need to be split off a token.
 *
 * <p>The class loads a per-language list of "non-breaker" entries from a
 * classpath resource and combines them into one regular expression. That
 * expression, together with the static patterns below, is used by
 * {@link #SegmenterNonBreaker(String)} and {@link #TokenizerNonBreaker(String)}.
 *
 * <p>The public static fields are intentionally left non-final so that existing
 * code assigning to them keeps compiling. Note that the patterns derived from
 * {@link #SECTION} and {@link #NON_BREAKER_DIGITS} are compiled once and do not
 * follow later reassignments of those strings.
 */
public class NonPeriodBreaker {
    /** Marker inserted at candidate sentence boundaries while segmenting. */
    public static String SECTION = "§";

    /** Matches the {@link #SECTION} marker. */
    public static Pattern section = Pattern.compile(SECTION);

    /**
     * A token ending in one or more periods, optional spaces, then optional
     * initial punctuation followed by an uppercase letter.
     */
    public static Pattern segmentAll = Pattern.compile(
            "([\\p{Alnum}\\.-]*" + RuleBasedSegmenter.FINAL_PUNCT + "*[\\.]+)[\\ ]*("
                    + RuleBasedSegmenter.INITIAL_PUNCT + "*[\\ ]*[\\p{Lu}])",
            Pattern.UNICODE_CHARACTER_CLASS);

    /** Abbreviations that do not break when they are followed by a digit. */
    public static String NON_BREAKER_DIGITS = "(al|[Aa]rt|ca|figs?|[Nn]os?|[Nn]rs?|op|p|pp|[Pp]ág)";

    /** A {@link #NON_BREAKER_DIGITS} abbreviation, the section marker, then a digit. */
    public static Pattern nonBreakerDigits = Pattern.compile(
            "(" + NON_BREAKER_DIGITS + "[\\ ]*[\\.-]*)" + SECTION + "([\\ ]*\\p{Digit})",
            Pattern.UNICODE_CHARACTER_CLASS);

    /** Dotted uppercase sequences (e.g. {@code U.§ S.}) that contain section markers. */
    public static Pattern acronym = Pattern.compile(
            "(\\p{Lu})(\\.(§)[\\ ]*\\p{Lu})+([\\.])", Pattern.UNICODE_CHARACTER_CLASS);

    /** Digits and a period, a section marker, then more digits. */
    public static Pattern numbers = Pattern.compile(
            "(\\p{Digit}+[\\.])[\\ ]*[§][\\ ]*(\\p{Digit}+)", Pattern.UNICODE_CHARACTER_CLASS);

    /** A whole token that ends with a period; group 1 is the part before it. */
    public static Pattern wordDot = Pattern.compile("^(\\S+)\\.$");

    /** Any alphabetic character. */
    public static Pattern alphabetic = Pattern.compile("\\p{Alpha}", Pattern.UNICODE_CHARACTER_CLASS);

    /** Token starting with lowercase letters. */
    public static Pattern startLower = Pattern.compile("^\\p{Lower}+", Pattern.UNICODE_CHARACTER_CLASS);

    /** Token starting with one of the listed punctuation characters. */
    public static Pattern startPunct = Pattern.compile("^[\\!#\\$%&\\(\\)\\*\\+,-\\/:;=>\\?@\\[\\\\\\]\\^\\{\\|\\}~]");

    /** Token starting with digits. */
    public static Pattern startDigit = Pattern.compile("^\\p{Digit}+", Pattern.UNICODE_CHARACTER_CLASS);

    /** Whole-string form of {@link #NON_BREAKER_DIGITS}, compiled once instead of per token. */
    private static final Pattern NON_BREAKER_DIGITS_EXACT = Pattern.compile(NON_BREAKER_DIGITS);

    /** Language codes for which a {@code /<code>-nonbreaker.txt} resource is looked up. */
    private static final String[] SUPPORTED_LANGUAGES = {"de", "en", "es", "eu", "fr", "gl", "it", "nl"};

    /** Disjunctive regex built from the language's non-breaker list. */
    private String NON_BREAKER = null;

    /** Space + non-breaker + optional spaces/periods + section marker (segmenter step). */
    private Pattern nonBreakerBeforeSection;

    /** Whole-string match against the non-breaker list (tokenizer step). */
    private Pattern nonBreakerExact;

    /**
     * Builds a breaker for the language named by the {@code language} property.
     *
     * @param properties configuration; its {@code language} entry selects the
     *                   non-breaker resource to load
     */
    public NonPeriodBreaker(Properties properties) {
        loadNonBreaker(properties);
    }

    /** Reads the {@code language} property and loads the non-breaker list once. */
    private void loadNonBreaker(Properties properties) {
        String language = properties.getProperty("language");
        if (this.NON_BREAKER == null) {
            createNonBreaker(language);
        }
    }

    /**
     * Loads the non-breaker resource for {@code str}, skipping lines that start
     * with {@code #}, and compiles the patterns that depend on it.
     *
     * <p>If no resource is found the process is terminated with exit code 1.
     * A read error is reported and whatever was read so far is used.
     */
    private void createNonBreaker(String str) {
        ArrayList<String> entries = new ArrayList<String>();
        InputStream nonBreakerInputStream = getNonBreakerInputStream(str);
        if (nonBreakerInputStream == null) {
            System.err.println("ERROR: Not nonbreaker file for language " + str + " in src/main/resources!!");
            System.exit(1);
        }
        // NOTE(review): the reader uses the platform default charset, as before.
        // If the resource files are UTF-8, an explicit charset would be safer - confirm.
        // The try block wraps the whole loop so a failing stream ends the read
        // instead of being retried forever, and the reader is always closed.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(nonBreakerInputStream))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (!trimmed.startsWith("#")) {
                    entries.add(trimmed);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        this.NON_BREAKER = StringUtils.createDisjunctRegexFromList(entries);
        // NON_BREAKER never changes after this point, so compile its patterns once.
        this.nonBreakerBeforeSection =
                Pattern.compile("([\\ ](" + this.NON_BREAKER + ")[\\ ]*[\\.]*)[\\ ]*" + SECTION);
        this.nonBreakerExact = Pattern.compile("(" + this.NON_BREAKER + ")");
    }

    /**
     * Opens the non-breaker resource for a language code (case-insensitive).
     *
     * @param str language code, may be {@code null}
     * @return the resource stream, or {@code null} when the code is {@code null},
     *         unsupported, or the resource is missing
     */
    private final InputStream getNonBreakerInputStream(String str) {
        if (str == null) {
            return null;
        }
        for (String code : SUPPORTED_LANGUAGES) {
            if (str.equalsIgnoreCase(code)) {
                return getClass().getResourceAsStream("/" + code + "-nonbreaker.txt");
            }
        }
        return null;
    }

    /**
     * Marks sentence boundaries in {@code str} and returns the text with each
     * remaining boundary replaced by a newline.
     *
     * @param str the text to segment
     * @return the text with a newline at every boundary that survived the exceptions
     */
    public String SegmenterNonBreaker(String str) {
        // 1. Put a section marker after every period-terminated token that is
        //    followed by an uppercase start.
        String text = segmentAll.matcher(str).replaceAll("$1§$2");
        // 2. Drop the marker between a NON_BREAKER_DIGITS abbreviation and a digit.
        text = nonBreakerDigits.matcher(text).replaceAll("$1$3");
        // 3. Drop the marker after a listed non-breaker, padding it with spaces.
        text = nonBreakerBeforeSection.matcher(text).replaceAll(" $1 ");
        // 4. Replace markers inside dotted acronyms with spaces.
        text = deSegmentAcronyms(text);
        // 5. Drop the marker between "digits." and following digits.
        text = numbers.matcher(text).replaceAll("$1$2");
        // 6. Every marker still present becomes a line break.
        return section.matcher(text).replaceAll("\n");
    }

    /**
     * Replaces the section markers inside every {@link #acronym} match with a space.
     *
     * @param str text that may contain section markers
     * @return the text with acronym-internal markers turned into spaces
     */
    public static String deSegmentAcronyms(String str) {
        Matcher matcher = acronym.matcher(str);
        // StringBuffer is kept because Matcher.appendReplacement only accepts a
        // StringBuilder from Java 9 onwards.
        StringBuffer out = new StringBuffer();
        while (matcher.find()) {
            String joined = matcher.group().replaceAll(SECTION, " ");
            // Quote so the matched text is never interpreted as a group reference.
            matcher.appendReplacement(out, Matcher.quoteReplacement(joined));
        }
        matcher.appendTail(out);
        return out.toString();
    }

    /**
     * Splits a trailing period off each space-separated token unless the token
     * is recognised as non-breaking.
     *
     * @param str the text to process
     * @return the tokens joined by single spaces, each followed by a space
     */
    public String TokenizerNonBreaker(String str) {
        String normalized = RuleBasedTokenizer.doubleSpaces.matcher(str.trim()).replaceAll(" ");
        String[] words = normalized.split(" ");
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < words.length; i++) {
            Matcher matcher = wordDot.matcher(words[i]);
            if (matcher.find()) {
                String prefix = matcher.replaceAll("$1");
                String next = i < words.length - 1 ? words[i + 1] : null;
                if (!isNonBreaking(prefix, next)) {
                    words[i] = prefix + " .";
                }
            }
            sb.append(words[i]).append(" ");
        }
        // Build the result once instead of on every iteration.
        return sb.toString();
    }

    /**
     * Tells whether the period after {@code prefix} must stay attached.
     *
     * @param prefix the token without its trailing period
     * @param next   the following token, or {@code null} if {@code prefix} is last
     * @return {@code true} if the period should not be split off
     */
    private boolean isNonBreaking(String prefix, String next) {
        // Inner period plus a letter, e.g. a dotted abbreviation.
        if (prefix.contains(".") && alphabetic.matcher(prefix).find()) {
            return true;
        }
        // Listed in the language's non-breaker resource.
        if (nonBreakerExact.matcher(prefix).matches()) {
            return true;
        }
        // The remaining rules all need a following token.
        if (next == null) {
            return false;
        }
        if (startLower.matcher(next).find() || startPunct.matcher(next).find()) {
            return true;
        }
        return NON_BREAKER_DIGITS_EXACT.matcher(prefix).matches() && startDigit.matcher(next).find();
    }
}
