package de.citec.scie.pdf;

import de.citec.scie.pdf.structure.Document;
import de.citec.scie.pdf.structure.Page;
import de.citec.scie.pdf.structure.Paragraph;
import de.citec.scie.pdf.structure.Text;
import de.citec.scie.pdf.structure.TextBlock;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFStreamEngine;
import org.apache.pdfbox.util.ResourceLoader;
import org.apache.pdfbox.util.TextPosition;

/* loaded from: input_file:de/citec/scie/pdf/PDFStructuredTextExtractor.class */
/**
 * Extracts the text of a PDF into the project's structured model
 * ({@link Document} &rarr; {@link Page} &rarr; {@link TextBlock} &rarr; {@link Paragraph}
 * &rarr; {@link Text}).
 *
 * <p>Every page is run through a {@link PDPagePreprocessor} that collects the page's
 * {@link TextPosition}s into a {@code PreTextBlock}. That block is split into sub-blocks, and
 * each sub-block is turned into paragraphs and uniformly styled text runs (same font name,
 * font size and vertical alignment).
 */
public class PDFStructuredTextExtractor {
    /**
     * Threshold used by the paragraph sanity check: if the average number of characters per
     * paragraph of a text block is below this value, the block's paragraphs are merged into one.
     */
    public static final int MINIMUMPARSIZE = 80;

    /**
     * Stream engine that collects every text position of a single PDF page into one
     * {@code PreTextBlock}.
     *
     * <p>NOTE: the decompiler reported that this class's access modifier was widened from
     * {@code private}.
     */
    public static class PDPagePreprocessor extends PDFStreamEngine {
        private static final String propertiesPath = "org/apache/pdfbox/resources/PDFTextStripper.properties";
        private final PDPage page;
        private final PreTextBlock preTextBlock;

        /**
         * Creates a preprocessor for the given page, configured from the text stripper
         * properties resource.
         *
         * @param pDPage the page whose content stream will be processed
         * @throws IOException if the properties resource cannot be loaded
         */
        public PDPagePreprocessor(PDPage pDPage) throws IOException {
            super(ResourceLoader.loadProperties(propertiesPath, true));
            this.preTextBlock = new PreTextBlock();
            this.page = pDPage;
        }

        /**
         * Processes the content stream of the page, using the page's resources.
         *
         * @throws IOException if processing the stream fails
         */
        public void process() throws IOException {
            processStream(this.page, this.page.findResources(), this.page.getContents().getStream());
        }

        /**
         * Adds the given text position to the collected block. Presumably this is the hook the
         * stream engine invokes for each piece of text it encounters.
         *
         * @param textPosition the text position to record
         */
        protected void processTextPosition(TextPosition textPosition) {
            this.preTextBlock.addTextPosition(textPosition);
        }

        /**
         * @return the block holding all text positions recorded so far
         */
        public PreTextBlock getPreTextBlock() {
            return this.preTextBlock;
        }
    }

    /**
     * Parses a PDF from the given stream and converts it into a structured {@link Document}.
     *
     * <p>The parsed PDF document (if any) and the input stream are closed before this method
     * returns, whether it completes normally or throws.
     *
     * @param inputStream stream providing the PDF data; always closed by this method
     * @return the extracted document
     * @throws IOException if parsing or extraction fails, if the PDF has no pages, or if the
     *                     document is empty after block cleanup
     */
    public static Document importAsDocument(InputStream inputStream) throws IOException {
        PDDocument pdDocument = null;
        try {
            PDFParser parser = new PDFParser(inputStream);
            parser.parse();
            pdDocument = parser.getPDDocument();

            Document document = new Document();
            List<?> allPages = pdDocument.getDocumentCatalog().getAllPages();
            if (allPages.isEmpty()) {
                throw new IOException("PDFBox did not find any pages!");
            }

            int pageNumber = 0;
            for (Object pdPage : allPages) {
                pageNumber++;
                Page page = new Page();
                page.setPageNumber(pageNumber);
                document.content.add(page);
                fillPage(page, (PDPage) pdPage);
            }

            new DocumentBlockCleaner().blockCleanup(document);
            if (document.content.isEmpty()) {
                throw new IOException("After cleanup the document contained nothing!");
            }
            return document;
        } finally {
            // Release the PDF document on every path, including failures after parsing, and
            // make sure the stream is closed even if closing the document throws.
            try {
                if (pdDocument != null) {
                    pdDocument.close();
                }
            } finally {
                inputStream.close();
            }
        }
    }

    /**
     * Extracts the PDF as a structured document and renders it via
     * {@code Document.indexedToString(0)}.
     *
     * @param inputStream stream providing the PDF data; closed by this method
     * @return the string rendering of the extracted document
     * @throws IOException if the extraction fails
     */
    public static String importAsString(InputStream inputStream) throws IOException {
        return importAsDocument(inputStream).indexedToString(0);
    }

    /**
     * Extracts the PDF as a string and exposes its UTF-8 bytes as an in-memory stream.
     *
     * @param inputStream stream providing the PDF data; closed by this method
     * @return a stream over the UTF-8 encoded string rendering
     * @throws IOException if the extraction fails
     */
    public static InputStream importAsInputStream(InputStream inputStream) throws IOException {
        return new ByteArrayInputStream(importAsString(inputStream).getBytes("UTF-8"));
    }

    /**
     * Fills {@code page} with one text block per sub-block found on {@code pdPage}, then runs
     * the paragraph sanity check and assigns the relative font size of every block.
     */
    private static void fillPage(Page page, PDPage pdPage) throws IOException {
        PDPagePreprocessor preprocessor = new PDPagePreprocessor(pdPage);
        preprocessor.process();

        TextBlockRankEstimator rankEstimator = new TextBlockRankEstimator();
        for (PreTextBlock preBlock : preprocessor.getPreTextBlock().split()) {
            TextBlock textBlock = new TextBlock();
            page.content.add(textBlock);
            rankEstimator.addBlock(textBlock, preBlock);
            fillTextBlock(textBlock, preBlock);
        }

        // Relative font sizes are assigned only after all blocks of the page were registered
        // with the rank estimator.
        for (TextBlock textBlock : page.content) {
            paragraphSanityCheck(textBlock);
            textBlock.setRelativeFontSize(rankEstimator.getRelativeFontSize(textBlock));
        }
    }

    /**
     * Converts the lines of {@code preBlock} into paragraphs and styled text runs appended to
     * {@code textBlock}. A new text run starts whenever font name, font size or vertical
     * alignment changes; a new paragraph starts whenever the paragraph estimator says so.
     */
    private static void fillTextBlock(TextBlock textBlock, PreTextBlock preBlock) throws IOException {
        Paragraph paragraph = new Paragraph();
        textBlock.content.add(paragraph);
        Text text = new Text();
        paragraph.content.add(text);

        ParagraphEstimator paragraphEstimator = new ParagraphEstimator(preBlock);
        WhiteSpaceEstimator whiteSpaceEstimator = new WhiteSpaceEstimator();

        // The first text run takes its style from the first position of the first line.
        // NOTE(review): assumes every block has at least one line with at least one position.
        VerticalAlignmentEstimator firstLineAlignment = new VerticalAlignmentEstimator(preBlock.lines.get(0));
        TextPosition firstPosition = preBlock.lines.get(0).content.get(0);
        applyFontStyle(text, firstPosition);
        text.setVerticalAlignment(firstLineAlignment.calculateAlignment(firstPosition));

        StringBuilder buffer = new StringBuilder();
        for (PreTextLine line : preBlock.lines) {
            VerticalAlignmentEstimator alignmentEstimator = new VerticalAlignmentEstimator(line);

            if (paragraphEstimator.isNewParagraph(line)) {
                paragraph = new Paragraph();
                textBlock.content.add(paragraph);
                // Finish the run that ends the previous paragraph, dropping only the line
                // separator space (never a real character, and never from an empty buffer).
                stripTrailingSpace(buffer);
                text.setText(buffer.toString());

                text = new Text();
                paragraph.content.add(text);
                buffer = new StringBuilder();
                // NOTE(review): unlike the very first run, no vertical alignment is set here.
                applyFontStyle(text, line.content.get(0));
            }

            for (TextPosition position : line.content) {
                String fontName = fontNameOf(position);
                boolean sameFont = sameFontName(fontName, text.getFontName());
                float fontSize = position.getFontSizeInPt();
                Text.VerticalAlignment alignment = alignmentEstimator.calculateAlignment(position);

                if (!sameFont || fontSize != text.getFontSize() || alignment != text.getVerticalAlignment()) {
                    // Style changed: close the current run and start a new one.
                    text.setText(buffer.toString());
                    text = new Text();
                    paragraph.content.add(text);
                    buffer = new StringBuilder();
                    text.setFontName(fontName);
                    text.setFontSize(fontSize);
                    text.setVerticalAlignment(alignment);
                    whiteSpaceEstimator = new WhiteSpaceEstimator();
                }

                if (whiteSpaceEstimator.hasWhiteSpace(position)) {
                    buffer.append(' ');
                }
                buffer.append(position.getCharacter());
            }

            // Separate lines by a space unless the line ends with a hyphen, so that the
            // hyphenated text is directly followed by the next line.
            if (buffer.length() > 0 && buffer.charAt(buffer.length() - 1) != '-') {
                buffer.append(' ');
            }
        }
        text.setText(buffer.toString());
    }

    /**
     * Copies the font size of {@code position} to {@code text}, and the font name as well if
     * the position has a font with a font descriptor.
     */
    private static void applyFontStyle(Text text, TextPosition position) throws IOException {
        text.setFontSize(position.getFontSizeInPt());
        if (position.getFont() != null && position.getFont().getFontDescriptor() != null) {
            text.setFontName(position.getFont().getFontDescriptor().getFontName());
        }
    }

    /**
     * Returns the font name of {@code position}, or {@code null} if it has no font or the font
     * has no descriptor.
     */
    private static String fontNameOf(TextPosition position) throws IOException {
        if (position.getFont() == null || position.getFont().getFontDescriptor() == null) {
            return null;
        }
        return position.getFont().getFontDescriptor().getFontName();
    }

    /** Null-safe equality of two font names; two {@code null} names count as equal. */
    private static boolean sameFontName(String first, String second) {
        return first == null ? second == null : first.equals(second);
    }

    /** Removes the last character of {@code buffer} if, and only if, it is a space. */
    private static void stripTrailingSpace(StringBuilder buffer) {
        int length = buffer.length();
        if (length > 0 && buffer.charAt(length - 1) == ' ') {
            buffer.setLength(length - 1);
        }
    }

    /**
     * Merges all paragraphs of {@code textBlock} into a single paragraph if the block has more
     * than one paragraph and the average paragraph length (in characters, integer division) is
     * below {@link #MINIMUMPARSIZE}.
     *
     * @param textBlock the block to check; modified in place
     */
    private static void paragraphSanityCheck(TextBlock textBlock) {
        int paragraphCount = textBlock.content.size();
        if (paragraphCount <= 1) {
            return;
        }

        int totalCharacters = 0;
        for (Paragraph paragraph : textBlock.content) {
            for (Text text : paragraph.content) {
                String value = text.getText();
                if (value != null) {
                    totalCharacters += value.length();
                }
            }
        }

        if (totalCharacters / paragraphCount < MINIMUMPARSIZE) {
            Paragraph merged = new Paragraph();
            for (Paragraph paragraph : textBlock.content) {
                merged.content.addAll(paragraph.content);
            }
            textBlock.content.clear();
            textBlock.content.add(merged);
        }
    }
}
