package biz.neumann.ocr.format;

import biz.neumann.HTMLParser;
import biz.neumann.ocr.Block;
import biz.neumann.ocr.Block$;
import biz.neumann.ocr.Document;
import biz.neumann.ocr.Line;
import biz.neumann.ocr.Line$;
import biz.neumann.ocr.Page;
import biz.neumann.ocr.Page$;
import biz.neumann.ocr.Word;
import biz.neumann.ocr.package$;
import java.io.File;
import scala.Array$;
import scala.Option;
import scala.Predef$;
import scala.Predef$DummyImplicit$;
import scala.Some;
import scala.Symbol;
import scala.Symbol$;
import scala.Tuple2;
import scala.collection.IndexedSeq;
import scala.collection.LinearSeqOptimized;
import scala.collection.TraversableLike;
import scala.collection.TraversableOnce;
import scala.collection.immutable.Seq$;
import scala.collection.immutable.StringOps;
import scala.collection.mutable.StringBuilder;
import scala.runtime.BoxesRunTime;
import scala.util.matching.Regex;
import scala.xml.Node;
import scala.xml.NodeSeq;

/* compiled from: HOCR.scala */
/* loaded from: input_file:biz/neumann/ocr/format/HOCR$.class */
/**
 * Singleton holder for the hOCR reading routines (the Java view of the Scala
 * object declared in {@code HOCR.scala}).
 *
 * <p>The methods walk a parsed HTML tree ({@link NodeSeq}) and build the
 * project's {@link Document} / {@link Page} / {@link Block} / {@link Line} /
 * {@link Word} objects from it. Geometry and page numbers are read from the
 * {@code title} attribute of the visited elements; the element kind is read
 * from the {@code class} attribute.
 *
 * <p>Access the single instance through {@link #MODULE$}.
 */
public final class HOCR$ {
    /** Value of the {@code class} attribute that marks a word element. */
    private static final Symbol OCRX_WORD = Symbol$.MODULE$.apply("ocrx_word");

    /**
     * The one and only instance. Initialised here (rather than from inside the
     * constructor) because a {@code final} field cannot be assigned there.
     */
    public static final HOCR$ MODULE$ = new HOCR$();

    private final Regex coordinatesExtractor;
    private final Regex styleExtractor;
    private final Regex pagePathNumberExtractor;
    private final Regex pageNumberExtractor;

    private HOCR$() {
        // Whole-string pattern: "bbox", four space-separated integers, then anything.
        this.coordinatesExtractor = regex("bbox (\\d+) (\\d+) (\\d+) (\\d+).*");
        // "key:value;" pair as found in an inline style attribute.
        this.styleExtractor = regex("([^:]+):([^;]+);");
        // Text up to the first '_', optional leading zeros, then the captured number.
        this.pagePathNumberExtractor = regex("[^_]+_(?:0)*(\\d+).*");
        // Text without ';', then ";ppageno", optional whitespace, then the captured number.
        this.pageNumberExtractor = regex("[^;]+;ppageno\\s*(\\d+)");
    }

    /** Compiles {@code pattern} into a Scala {@link Regex}. */
    private static Regex regex(String pattern) {
        return new StringOps(Predef$.MODULE$.augmentString(pattern)).r();
    }

    /**
     * Matches {@code input} against {@code pattern} (whole-string extractor
     * match) and returns the captured groups.
     *
     * @return the captured groups, or {@code null} when the input does not
     *         match or does not yield exactly {@code groupCount} groups
     */
    private static LinearSeqOptimized<?, ?> captureGroups(Regex pattern, String input, int groupCount) {
        Option<?> match = pattern.unapplySeq(input);
        if (match.isEmpty() || match.get() == null) {
            return null;
        }
        LinearSeqOptimized<?, ?> groups = (LinearSeqOptimized<?, ?>) match.get();
        return groups.lengthCompare(groupCount) == 0 ? groups : null;
    }

    /** Parses capture group {@code index} of {@code groups} as a decimal int. */
    private static int intGroup(LinearSeqOptimized<?, ?> groups, int index) {
        return Integer.parseInt((String) groups.apply(index));
    }

    /** Builds an int pair from capture groups {@code first} and {@code first + 1}. */
    private static Tuple2<Object, Object> intPair(LinearSeqOptimized<?, ?> groups, int first) {
        int a = intGroup(groups, first);
        int b = intGroup(groups, first + 1);
        return new Tuple2.mcII.sp(a, b);
    }

    /**
     * Lists the entries of {@code folder}.
     *
     * @throws IllegalArgumentException if {@link File#listFiles()} reports
     *         {@code null}, i.e. the path is not a directory or cannot be read
     */
    private static File[] listFiles(String folder) {
        File[] entries = new File(folder).listFiles();
        if (entries == null) {
            throw new IllegalArgumentException("Not a readable directory: " + folder);
        }
        return entries;
    }

    /** @return the pattern used by {@link #extractCoordinates(NodeSeq)} */
    public Regex coordinatesExtractor() {
        return this.coordinatesExtractor;
    }

    /** @return the pattern for a single {@code key:value;} style entry */
    public Regex styleExtractor() {
        return this.styleExtractor;
    }

    /** @return the pattern used by {@link #getPageNumberFromImagePath(String)} */
    public Regex pagePathNumberExtractor() {
        return this.pagePathNumberExtractor;
    }

    /** @return the pattern used by {@link #extractPageNumber(NodeSeq)} */
    public Regex pageNumberExtractor() {
        return this.pageNumberExtractor;
    }

    /**
     * Reads the {@code class} attribute of {@code nodeSeq} as a {@link Symbol}.
     */
    public Symbol extractOCRClass(NodeSeq nodeSeq) {
        return Symbol$.MODULE$.apply(nodeSeq.$bslash("@class").text());
    }

    /**
     * Reads the bounding box from the {@code title} attribute of {@code nodeSeq}.
     *
     * @return a pair of int pairs: (first, second) and (third, fourth) number
     *         following {@code bbox}
     * @throws IllegalArgumentException if the title does not match
     *         {@link #coordinatesExtractor()}
     */
    public Tuple2<Tuple2<Object, Object>, Tuple2<Object, Object>> extractCoordinates(NodeSeq nodeSeq) {
        String title = nodeSeq.$bslash("@title").text();
        LinearSeqOptimized<?, ?> groups = captureGroups(coordinatesExtractor(), title, 4);
        if (groups == null) {
            throw new IllegalArgumentException("No valid coordiantes string in" + title);
        }
        return new Tuple2<>(intPair(groups, 0), intPair(groups, 2));
    }

    /**
     * Builds a document from every entry of the folder {@code str}.
     *
     * @throws IllegalArgumentException if {@code str} is not a readable directory
     */
    public Document fromFolder(String str) {
        return new Document(package$.MODULE$.Pages(pagesFromFolder(str)));
    }

    /**
     * Like {@link #fromFolder(String)}, then runs
     * {@link #docWithImages(Document, String)} with the image folder {@code str2}.
     */
    public Document fromFolderWithImage(String str, String str2) {
        return docWithImages(new Document(package$.MODULE$.Pages(pagesFromFolder(str))), str2);
    }

    /**
     * Filters the entries of folder {@code str} and applies a per-file action
     * to the survivors through the array's parallel view. Both the filter and
     * the action are function classes defined outside this file.
     *
     * @return the {@code document} argument itself
     * @throws IllegalArgumentException if {@code str} is not a readable directory
     */
    public Document docWithImages(Document document, String str) {
        File[] selected = (File[]) Predef$.MODULE$.refArrayOps(listFiles(str)).filter(new HOCR$$anonfun$1());
        Predef$.MODULE$.refArrayOps(selected).par().foreach(new HOCR$$anonfun$docWithImages$1());
        return document;
    }

    /**
     * Extracts the page number encoded in an image path via
     * {@link #pagePathNumberExtractor()}.
     *
     * @throws Error if the path does not match; the type is kept as-is so that
     *         existing callers observe the same failure (the German message
     *         reads "could not assign a page to image file")
     */
    public int getPageNumberFromImagePath(String str) {
        LinearSeqOptimized<?, ?> groups = captureGroups(pagePathNumberExtractor(), str, 1);
        if (groups == null) {
            throw new Error("Konnte Bilddatei: " + str + " keine Seite zuordnen: ");
        }
        return intGroup(groups, 0);
    }

    /**
     * Maps every entry of the folder {@code str} to a page; the mapping
     * function is defined outside this file.
     *
     * @throws IllegalArgumentException if {@code str} is not a readable directory
     */
    public IndexedSeq<Page> pagesFromFolder(String str) {
        // Raw cast kept: the result type is chosen by the Scala builder argument.
        return (IndexedSeq) Predef$.MODULE$.refArrayOps(listFiles(str)).map(new HOCR$$anonfun$pagesFromFolder$1(), Array$.MODULE$.fallbackCanBuildFrom(Predef$DummyImplicit$.MODULE$.dummyImplicit()));
    }

    /** Parses {@code file} with {@link HTMLParser} and converts the result to a page. */
    public Page pageFromFile(File file) {
        return pageFromHTML(new HTMLParser().fromFile(file));
    }

    /**
     * Builds a page from the first {@code div} directly below {@code body}:
     * page number and bounding box come from that element's title, blocks from
     * its child {@code div}s. The fourth constructor argument is the default
     * supplied by {@link Page}'s companion.
     */
    public Page pageFromHTML(NodeSeq nodeSeq) {
        Node pageDiv = (Node) nodeSeq.$bslash("body").$bslash("div").head();
        return new Page(extractPageNumber(pageDiv), extractCoordinates(pageDiv), blocksFromHTML(pageDiv), Page$.MODULE$.$lessinit$greater$default$4());
    }

    /**
     * Reads the page number from the {@code title} attribute of {@code nodeSeq}.
     *
     * @throws IllegalArgumentException if the title does not match
     *         {@link #pageNumberExtractor()}
     */
    public int extractPageNumber(NodeSeq nodeSeq) {
        String title = nodeSeq.$bslash("@title").text();
        LinearSeqOptimized<?, ?> groups = captureGroups(pageNumberExtractor(), title, 1);
        if (groups == null) {
            throw new IllegalArgumentException(title + " is no valid title string for a pageFromXML");
        }
        return intGroup(groups, 0);
    }

    /** Converts each child {@code div} of {@code nodeSeq}; the mapping function is defined outside this file. */
    public IndexedSeq<Block> blocksFromHTML(NodeSeq nodeSeq) {
        return ((TraversableOnce) nodeSeq.$bslash("div").map(new HOCR$$anonfun$blocksFromHTML$1(), Seq$.MODULE$.canBuildFrom())).toIndexedSeq();
    }

    /**
     * Builds a block from the bounding box and lines of {@code nodeSeq}; the
     * third argument is the default supplied by {@link Block}'s companion.
     */
    public Block blockFromHTML(NodeSeq nodeSeq) {
        return new Block(extractCoordinates(nodeSeq), linesFromHTML(nodeSeq), Block$.MODULE$.apply$default$3());
    }

    /** Converts each {@code p > span} below {@code nodeSeq}; the mapping function is defined outside this file. */
    public IndexedSeq<Line> linesFromHTML(NodeSeq nodeSeq) {
        return ((TraversableOnce) nodeSeq.$bslash("p").$bslash("span").map(new HOCR$$anonfun$linesFromHTML$1(), Seq$.MODULE$.canBuildFrom())).toIndexedSeq();
    }

    /**
     * Builds a line from the bounding box and words of {@code nodeSeq}; the
     * third argument is the default supplied by {@link Line}'s companion.
     */
    public Line lineFromHTML(NodeSeq nodeSeq) {
        return new Line(extractCoordinates(nodeSeq), buildWordSeq(nodeSeq), Line$.MODULE$.apply$default$3());
    }

    /** @return {@code true} when the {@code class} attribute of {@code nodeSeq} is {@code ocrx_word} */
    public boolean isOCRWord(NodeSeq nodeSeq) {
        // OCRX_WORD is never null, so it is the safe receiver for equals.
        return OCRX_WORD.equals(extractOCRClass(nodeSeq));
    }

    /**
     * Filters the child {@code span}s of {@code nodeSeq} and maps the survivors
     * to words; filter and mapping functions are defined outside this file.
     */
    public IndexedSeq<Word> buildWordSeq(NodeSeq nodeSeq) {
        return ((TraversableOnce) ((TraversableLike) nodeSeq.$bslash("span").filter(new HOCR$$anonfun$buildWordSeq$1())).map(new HOCR$$anonfun$buildWordSeq$2(), Seq$.MODULE$.canBuildFrom())).toIndexedSeq();
    }

    /**
     * Builds a word from the bounding box and text of {@code node}.
     *
     * @param i value wrapped in {@code Some} and passed as the word's third
     *          constructor argument
     */
    public Word wordfromHTML(Node node, int i) {
        return new Word(extractCoordinates(node), node.text(), new Some<Object>(BoxesRunTime.boxToInteger(i)));
    }

    /** @return the default for the second parameter of {@link #wordfromHTML(Node, int)} */
    public int wordfromHTML$default$2() {
        return 0;
    }
}
