package com.digitalpebble.stormcrawler.parse;

import com.digitalpebble.stormcrawler.util.ConfUtils;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.jetbrains.annotations.Contract;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

/* loaded from: input_file:com/digitalpebble/stormcrawler/parse/TextExtractor.class */
public class TextExtractor {
    public static final String INCLUDE_PARAM_NAME = "textextractor.include.pattern";
    public static final String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags";
    public static final String NO_TEXT_PARAM_NAME = "textextractor.no.text";
    private final List<String> inclusionPatterns;
    private final HashSet<String> excludedTags = new HashSet<>();
    private final boolean noText;

    public TextExtractor(Map<String, Object> map) {
        this.noText = ConfUtils.getBoolean(map, NO_TEXT_PARAM_NAME, false);
        this.inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, map);
        ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, map).forEach(str -> {
            this.excludedTags.add(str.toLowerCase());
        });
    }

    public String text(Element element) {
        if (this.noText) {
            return "";
        }
        if (this.inclusionPatterns.size() == 0 && this.excludedTags.size() == 0) {
            return _text(element);
        }
        Elements elements = new Elements();
        Iterator<String> it = this.inclusionPatterns.iterator();
        while (it.hasNext()) {
            elements = element.select(it.next());
            if (!elements.isEmpty()) {
                break;
            }
        }
        if (elements.isEmpty()) {
            elements.add(element);
        }
        StringBuilder sb = new StringBuilder();
        Iterator it2 = elements.iterator();
        while (it2.hasNext()) {
            sb.append(_text((Element) it2.next())).append("\n");
        }
        return sb.toString().trim();
    }

    private String _text(Node node) {
        final StringBuilder sb = new StringBuilder();
        NodeTraversor.traverse(new NodeVisitor() { // from class: com.digitalpebble.stormcrawler.parse.TextExtractor.1
            private Node excluded = null;

            public void head(Node node2, int i) {
                if (this.excluded == null && (node2 instanceof TextNode)) {
                    TextExtractor.appendNormalisedText(sb, (TextNode) node2);
                    return;
                }
                if (node2 instanceof Element) {
                    Element element = (Element) node2;
                    if (TextExtractor.this.excludedTags.contains(element.tagName())) {
                        this.excluded = element;
                    }
                    if (sb.length() > 0) {
                        if ((element.isBlock() || element.tag().getName().equals("br")) && !TextExtractor.lastCharIsWhitespace(sb)) {
                            sb.append(' ');
                        }
                    }
                }
            }

            public void tail(Node node2, int i) {
                if (node2 instanceof Element) {
                    Node node3 = (Element) node2;
                    if (node3 == this.excluded) {
                        this.excluded = null;
                    }
                    if (node3.isBlock() && (node2.nextSibling() instanceof TextNode) && !TextExtractor.lastCharIsWhitespace(sb)) {
                        sb.append(' ');
                    }
                }
            }
        }, node);
        return sb.toString().trim();
    }

    private static void appendNormalisedText(StringBuilder sb, TextNode textNode) {
        String wholeText = textNode.getWholeText();
        if (preserveWhitespace(textNode.parent()) || (textNode instanceof CDataNode)) {
            sb.append(wholeText);
        } else {
            StringUtil.appendNormalisedWhitespace(sb, wholeText, lastCharIsWhitespace(sb));
        }
    }

    @Contract("null -> false")
    static boolean preserveWhitespace(Node node) {
        if (node == null || !(node instanceof Element)) {
            return false;
        }
        Element element = (Element) node;
        int i = 0;
        while (!element.tag().preserveWhitespace()) {
            element = element.parent();
            i++;
            if (i >= 6 || element == null) {
                return false;
            }
        }
        return true;
    }

    static boolean lastCharIsWhitespace(StringBuilder sb) {
        return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
    }
}
