package epic.preprocess;

import de.l3s.boilerpipe.extractors.ArticleExtractor;
import epic.preprocess.TextExtractor;
import epic.slab.Slab;
import epic.slab.Slab$;
import epic.slab.Source;
import epic.trees.Span;
import epic.trees.Span$;
import java.net.URL;
import java.util.Locale;
import org.apache.tika.Tika;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.ToTextContentHandler;
import org.xml.sax.Attributes;
import scala.Predef$;
import scala.Predef$ArrowAssoc$;
import scala.Tuple2;
import scala.collection.Seq;
import scala.collection.immutable.Set;
import scala.reflect.ClassTag$;
import scala.xml.Elem;

/* compiled from: TextExtractor.scala */
/* loaded from: input_file:epic/preprocess/TextExtractor$.class */
public final class TextExtractor$ {
    // Singleton reference for this class; the private constructor assigns `this` to it.
    // NOTE(review): the field is declared final with a null initializer yet is reassigned in
    // the constructor, which javac rejects — presumably a decompilation artifact; confirm
    // before trying to compile this file.
    public static final TextExtractor$ MODULE$ = null;

    static {
        // Instantiating the class runs the private constructor, which stores the instance in
        // MODULE$ and throws a RuntimeException when hasTika() returns false.
        new TextExtractor$();
    }

    /**
     * Returns the text content of the slab that {@link #loadSlab(URL, boolean)} builds for
     * the given URL.
     *
     * @param url location of the document to load
     * @param z   forwarded unchanged as the second argument of {@code loadSlab}
     * @return the {@code content()} of the loaded slab
     */
    public String extractText(URL url, boolean z) {
        Slab<String, Span, Object> slab = loadSlab(url, z);
        return slab.content();
    }

    /**
     * Always returns {@code true}.
     *
     * <p>NOTE(review): the name matches the pattern the Scala compiler uses for default
     * arguments, so this is presumably the default for the second parameter of
     * {@code extractText} — confirm against TextExtractor.scala.
     *
     * @return {@code true}
     */
    public boolean extractText$default$2() {
        return true;
    }

    /**
     * Parses the document at {@code url} with Tika and wraps the extracted text in a slab.
     *
     * <p>Text is collected by an anonymous {@code ToTextContentHandler} subclass that emits a
     * newline at the start and at the end of every element whose qualified name (lower-cased
     * with {@link Locale#ROOT}) is in a fixed set of HTML tag names. The trimmed text becomes
     * the slab content, and one layer is added that pairs the span from {@code 0} to the
     * content length with a {@code Source} built from {@code url}.
     *
     * @param url location of the document to parse
     * @param z   when {@code true}, parse events are sent to a {@code BoilerpipeContentHandler}
     *            wrapped around the text handler; when {@code false}, they are sent to the
     *            text handler directly
     * @return a slab over the trimmed extracted text carrying a single {@code Source} layer
     */
    public Slab<String, Span, Object> loadSlab(URL url, boolean z) {
        // Tag names that get a newline written at both their start and end events.
        final Set newLineTags = Predef$.MODULE$.Set().apply(Predef$.MODULE$.wrapRefArray(new String[]{"address", "blockquote", "div", "dl", "fieldset", "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "noscript", "ol", "p", "pre", "table", "ul", "dd", "dt", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "article", "aside", "audio", "canvas", "figcaption", "figure", "header", "hgroup", "output", "section", "video"}));
        // NOTE(review): the declared type and the constructor argument below are reproduced as
        // the decompiler emitted them; the anonymous class extends ToTextContentHandler, so
        // both look like decompilation artifacts — confirm against TextExtractor.scala.
        final BoilerpipeContentHandler textHandler = new ToTextContentHandler(newLineTags) { // from class: epic.preprocess.TextExtractor$$anon$1
            private final Set newLineTags$1;

            // Treat ignorable whitespace as ordinary character data so it ends up in the text.
            public void ignorableWhitespace(char[] cArr, int i, int i2) {
                characters(cArr, i, i2);
            }

            /* JADX WARN: Multi-variable type inference failed */
            public void startElement(String str, String str2, String str3, Attributes attributes) {
                super/*org.xml.sax.helpers.DefaultHandler*/.startElement(str, str2, str3, attributes);
                // Locale.ROOT keeps the tag lookup independent of the JVM default locale
                // (e.g. a Turkish locale would lower-case "DIV" to "dıv" and miss the set).
                if (this.newLineTags$1.apply(str3.toLowerCase(Locale.ROOT))) {
                    ignorableWhitespace(new char[]{'\n'}, 0, 1);
                }
            }

            /* JADX WARN: Multi-variable type inference failed */
            public void endElement(String str, String str2, String str3) {
                super/*org.xml.sax.helpers.DefaultHandler*/.endElement(str, str2, str3);
                // Same locale-independent lookup as in startElement.
                if (this.newLineTags$1.apply(str3.toLowerCase(Locale.ROOT))) {
                    ignorableWhitespace(new char[]{'\n'}, 0, 1);
                }
            }

            {
                this.newLineTags$1 = newLineTags;
            }
        };
        BoilerpipeContentHandler handler = z ? new BoilerpipeContentHandler(textHandler) { // from class: epic.preprocess.TextExtractor$$anon$2
            {
                // NOTE(review): this local is never read in the visible code; it may be the
                // remnant of a constructor argument lost in decompilation — confirm against
                // the Scala source before removing the call.
                ArticleExtractor articleExtractor = ArticleExtractor.getInstance();
                setIncludeMarkup(true);
            }
        } : textHandler;
        Parser parser = new Tika().getParser();
        Metadata metadata = new Metadata();
        TikaInputStream tikaInputStream = TikaInputStream.get(url, metadata);
        try {
            ParseContext parseContext = new ParseContext();
            // Register the parser in its own context under the Parser class key.
            parseContext.set(Parser.class, parser);
            parser.parse(tikaInputStream, handler, metadata, parseContext);
        } finally {
            // Close the stream exactly once, whether or not parsing succeeded.
            tikaInputStream.close();
        }
        // The text is always read from the text handler, even when parsing was routed
        // through the wrapping handler.
        String content = textHandler.toString().trim();
        Slab<String, Span, Object> slab = Slab$.MODULE$.apply(content);
        return slab.addLayer((Seq<Tuple2<Span, A>>) Predef$.MODULE$.wrapRefArray(new Tuple2[]{new Tuple2(Predef$.MODULE$.ArrowAssoc(new Span(Span$.MODULE$.apply(0, content.length()))), new Source(url))}), ClassTag$.MODULE$.apply(Source.class));
    }

    /**
     * Always returns {@code true}.
     *
     * <p>NOTE(review): the name matches the pattern the Scala compiler uses for default
     * arguments, so this is presumably the default for the second parameter of
     * {@code loadSlab} — confirm against TextExtractor.scala.
     *
     * @return {@code true}
     */
    public boolean loadSlab$default$2() {
        return true;
    }

    /**
     * Parses the document at {@code url} with Tika, feeding the parse events to a
     * {@code TextExtractor.Loader}, and returns that loader's {@code value()}.
     *
     * <p>The input stream opened for the URL is closed before this method returns, whether
     * parsing succeeds or throws.
     *
     * @param url location of the document to parse
     * @return the element produced by the loader after parsing
     */
    public Elem extractXHTML(URL url) {
        Metadata metadata = new Metadata();
        TikaInputStream tikaInputStream = TikaInputStream.get(url, metadata);
        try {
            TextExtractor.Loader loader = new TextExtractor.Loader();
            new Tika().getParser().parse(tikaInputStream, loader, metadata, new ParseContext());
            return loader.value();
        } finally {
            // Previously the stream was never closed; release it on every exit path,
            // consistent with how loadSlab handles its stream.
            tikaInputStream.close();
        }
    }

    /**
     * Delegates to {@code ArticleExtractor.INSTANCE.getText(url)} and returns its result.
     *
     * <p>NOTE(review): presumably this is the main article text of the page at {@code url};
     * confirm against the boilerpipe documentation.
     *
     * @param url location handed to the extractor
     * @return the string returned by the extractor
     */
    public String foo(URL url) {
        ArticleExtractor extractor = ArticleExtractor.INSTANCE;
        return extractor.getText(url);
    }

    /**
     * Reports whether the {@code Tika} class can be resolved and loaded by name.
     *
     * <p>Any {@code Throwable} raised during the lookup is treated as "not available"
     * rather than propagated.
     *
     * @return {@code true} if the lookup completes normally, {@code false} otherwise
     */
    public boolean hasTika() {
        boolean available;
        try {
            Class.forName(Tika.class.getName());
            available = true;
        } catch (Throwable ignored) {
            // Deliberately broad: every failure here just means "not available".
            available = false;
        }
        return available;
    }

    /**
     * Stores this instance in {@code MODULE$} and then verifies that Tika is available.
     *
     * @throws RuntimeException if {@link #hasTika()} returns {@code false}
     */
    private TextExtractor$() {
        // The assignment happens first, before the availability check can throw.
        MODULE$ = this;
        if (hasTika()) {
            return;
        }
        throw new RuntimeException("Apache Tika is an optional dependency and is not on the classpath");
    }
}
