package eu.stratosphere.sopremo.tokenizer;

import eu.stratosphere.sopremo.type.CachingArrayNode;
import eu.stratosphere.sopremo.type.TextNode;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javolution.text.TextFormat;

/* loaded from: input_file:eu/stratosphere/sopremo/tokenizer/RegexTokenizer.class */
/**
 * Tokenizer that splits a character sequence at every match of a separator
 * regular expression and emits the non-empty stretches of text in between.
 *
 * <p>The separator defaults to {@link #WHITESPACE_PATTERN} and can be replaced
 * through {@link #setPattern(Pattern)}. The pattern is never {@code null}.
 */
public class RegexTokenizer extends AbstractTokenizer {
    /**
     * Default separator: one or more consecutive characters for which
     * {@link Character#isWhitespace(char)} holds.
     */
    public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\p{javaWhitespace}+");

    /** Separator pattern; every match is treated as a gap between two tokens. */
    private Pattern pattern;

    /** Creates a tokenizer that splits at {@link #WHITESPACE_PATTERN}. */
    public RegexTokenizer() {
        this.pattern = WHITESPACE_PATTERN;
    }

    /**
     * Creates a tokenizer that splits at matches of the given pattern.
     *
     * @param pattern the separator pattern
     * @throws NullPointerException if {@code pattern} is {@code null}
     */
    public RegexTokenizer(Pattern pattern) {
        // Fail fast here, exactly like setPattern, instead of deferring the
        // failure to the first tokenizeInto call.
        this.pattern = requirePattern(pattern);
    }

    /**
     * Writes a textual description of this tokenizer in the form
     * {@code RegexTokenizer [pattern=...]}; the pattern itself is rendered by
     * the {@link TextFormat} instance registered for {@link Pattern}.
     *
     * @param appendable the sink to write to
     * @throws IOException if writing to {@code appendable} fails
     */
    public void appendAsString(Appendable appendable) throws IOException {
        appendable.append("RegexTokenizer [pattern=");
        TextFormat.getInstance(Pattern.class).format(this.pattern, appendable);
        appendable.append("]");
    }

    /**
     * Returns the separator pattern currently in use.
     *
     * @return the separator pattern
     */
    public Pattern getPattern() {
        return this.pattern;
    }

    /**
     * Replaces the separator pattern.
     *
     * @param pattern the new separator pattern
     * @throws NullPointerException if {@code pattern} is {@code null}
     */
    public void setPattern(Pattern pattern) {
        this.pattern = requirePattern(pattern);
    }

    /**
     * Clears {@code cachingArrayNode} and then hands every non-empty stretch of
     * {@code charSequence} that lies outside a separator match to
     * {@code addToken}, in order of appearance.
     *
     * <p>Empty stretches are never reported: separators at the very start or
     * end of the text, as well as adjacent separator matches, do not produce
     * tokens. Empty text, or text consisting solely of separators, therefore
     * yields no tokens. Text without any separator match is reported as a
     * single token.
     *
     * @param charSequence the text to split
     * @param cachingArrayNode the node that is cleared and then receives the tokens
     */
    @Override
    public void tokenizeInto(CharSequence charSequence, CachingArrayNode<TextNode> cachingArrayNode) {
        final Matcher matcher = this.pattern.matcher(charSequence);
        final int length = charSequence.length();
        cachingArrayNode.clear();

        // Offset just behind the most recent separator, i.e. where the next
        // token would begin.
        int tokenStart = 0;
        while (matcher.find()) {
            final int separatorStart = matcher.start();
            if (separatorStart > tokenStart) {
                addToken(cachingArrayNode, charSequence, tokenStart, separatorStart);
            }
            tokenStart = matcher.end();
        }

        // Remainder after the last separator; guarded so that text ending in a
        // separator does not produce a spurious empty token.
        if (tokenStart < length) {
            addToken(cachingArrayNode, charSequence, tokenStart, length);
        }
    }

    /** Returns {@code pattern} unchanged, or throws if it is {@code null}. */
    private static Pattern requirePattern(Pattern pattern) {
        if (pattern == null) {
            throw new NullPointerException("pattern must not be null");
        }
        return pattern;
    }
}
