package io.annot8.components.documents.processors;

import com.drew.metadata.Metadata;
import com.drew.metadata.xmp.XmpReader;
import io.annot8.api.components.annotations.ComponentDescription;
import io.annot8.api.components.annotations.ComponentName;
import io.annot8.api.components.annotations.ComponentTags;
import io.annot8.api.components.annotations.SettingsClass;
import io.annot8.api.context.Context;
import io.annot8.api.exceptions.Annot8RuntimeException;
import io.annot8.api.exceptions.ProcessingException;
import io.annot8.api.settings.Description;
import io.annot8.common.data.content.DefaultRow;
import io.annot8.common.data.content.FileContent;
import io.annot8.common.data.content.InputStreamContent;
import io.annot8.common.data.content.Row;
import io.annot8.common.data.content.Table;
import io.annot8.common.utils.java.ConversionUtils;
import io.annot8.components.documents.data.ExtractionWithProperties;
import java.awt.image.BufferedImage;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.poifs.filesystem.FileMagic;
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
import technology.tabula.PageIterator;
import technology.tabula.Rectangle;
import technology.tabula.detectors.DetectionAlgorithm;
import technology.tabula.detectors.NurminenDetectionAlgorithm;
import technology.tabula.detectors.SpreadsheetDetectionAlgorithm;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

@ComponentDescription("Extracts image and text from PDF (*.pdf) files")
@ComponentTags({"documents", "pdf", "extractor", "text", "images", "metadata", "tables"})
@ComponentName("PDF Extractor")
@SettingsClass(Settings.class)
/* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor.class */
public class PdfExtractor extends AbstractDocumentExtractorDescriptor<Processor, Settings> {

    /* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor$DetectionAlgorithmType.class */
    /**
     * Selects how table regions are located on a page. The {@code Processor} constructor in this
     * file maps each constant to a Tabula detection algorithm.
     */
    public enum DetectionAlgorithmType {
        /** Mapped to {@code NurminenDetectionAlgorithm}; also the fallback branch of the switch. */
        NURMINEN,
        /** Mapped to {@code SpreadsheetDetectionAlgorithm}. */
        LATTICE
    }

    /* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor$ExtractionAlgorithmType.class */
    /**
     * Selects how the content of a detected table region is extracted. {@code Processor.extractTables}
     * in this file switches on this value for every detected region.
     */
    public enum ExtractionAlgorithmType {
        /** Always use {@code BasicExtractionAlgorithm}. */
        STREAM,
        /** Always use {@code SpreadsheetExtractionAlgorithm}. */
        LATTICE,
        /**
         * Use {@code SpreadsheetExtractionAlgorithm} when its {@code isTabular} check accepts the
         * region, otherwise {@code BasicExtractionAlgorithm}; also the fallback branch of the switch.
         */
        DETERMINE
    }

    /* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor$PdfTable.class */
    /**
     * Read-only {@link Table} built from a Tabula table.
     *
     * <p>Every cell's text is passed through {@code ConversionUtils.parseString}. When the source
     * table has more than one row, the first row is consumed as the column names and the remaining
     * rows become data rows; otherwise there are no column names and all rows (zero or one) are
     * data rows.
     */
    public static class PdfTable implements Table {
        private final List<Row> rows;
        private final List<String> columnNames;

        /**
         * Converts the given Tabula table.
         *
         * @param table the Tabula table to convert; may contain no rows at all
         */
        public PdfTable(technology.tabula.Table table) {
            List<List<Object>> parsedRows = table.getRows().stream()
                    .map(cells -> cells.stream()
                            .map(cell -> cell.getText())
                            .<Object>map(ConversionUtils::parseString)
                            .collect(Collectors.toList()))
                    .collect(Collectors.toList());

            // A header is only assumed when at least one data row would remain after removing it.
            List<List<Object>> dataRows;
            if (parsedRows.size() > 1) {
                List<String> header = parsedRows.get(0).stream()
                        .map(value -> value.toString())
                        .collect(Collectors.toList());
                this.columnNames = Collections.unmodifiableList(header);
                dataRows = parsedRows.subList(1, parsedRows.size());
            } else {
                this.columnNames = Collections.emptyList();
                dataRows = parsedRows;
            }

            // Size from the actual data rows: the previous "rowCount - 1" capacity was negative for
            // an empty table and made ArrayList throw IllegalArgumentException.
            List<Row> converted = new ArrayList<>(dataRows.size());
            for (int index = 0; index < dataRows.size(); index++) {
                converted.add(new DefaultRow(index, this.columnNames, dataRows.get(index)));
            }
            this.rows = Collections.unmodifiableList(converted);
        }

        /** @return the number of column names (zero when no header row was taken) */
        public int getColumnCount() {
            return this.columnNames.size();
        }

        /** @return the number of data rows, excluding any header row */
        public int getRowCount() {
            return this.rows.size();
        }

        /** @return the column names, always present but possibly empty */
        public Optional<List<String>> getColumnNames() {
            return Optional.of(this.columnNames);
        }

        /** @return a fresh stream over the data rows */
        public Stream<Row> getRows() {
            return this.rows.stream();
        }
    }

    /* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor$Processor.class */
    public static class Processor extends AbstractDocumentExtractorProcessor<PDDocument, Settings> {
        /** Extractor used for the STREAM setting and as the DETERMINE fallback. */
        private static final BasicExtractionAlgorithm basicExtractionAlgorithm = new BasicExtractionAlgorithm();
        /** Extractor used for the LATTICE setting and for regions DETERMINE finds tabular. */
        private static final SpreadsheetExtractionAlgorithm spreadsheetExtractionAlgorithm = new SpreadsheetExtractionAlgorithm();
        /** Configured text stripper, or {@code null} when text extraction is disabled in the settings. */
        private final PDFTextStripper stripper;
        /** Table detector chosen from the settings at construction time. */
        private final DetectionAlgorithm detectionAlgorithm;

        /**
         * Creates the processor, configuring the text stripper (only when text extraction is enabled)
         * and selecting the table detection algorithm.
         *
         * @param context the context handed to the superclass
         * @param settings the extractor settings
         * @throws Annot8RuntimeException if the {@code PDFTextStripper} cannot be created
         */
        public Processor(Context context, Settings settings) {
            super(context, settings);
            this.stripper = settings.isExtractText() ? createStripper(settings) : null;
            this.detectionAlgorithm = createDetector(settings.getTableDetectionAlgorithm());
        }

        /** Builds a text stripper carrying the page/paragraph/article delimiters from the settings. */
        private static PDFTextStripper createStripper(Settings settings) {
            try {
                PDFTextStripper configured = new PDFTextStripper();
                configured.setPageStart(settings.getPageStart());
                configured.setPageEnd(settings.getPageEnd());
                configured.setParagraphStart(settings.getParagraphStart());
                configured.setParagraphEnd(settings.getParagraphEnd());
                configured.setArticleStart(settings.getArticleStart());
                configured.setArticleEnd(settings.getArticleEnd());
                return configured;
            } catch (IOException e) {
                throw new Annot8RuntimeException("Unable to create PDFTextStripper", e);
            }
        }

        /** Maps the configured detection type to its Tabula implementation. */
        private static DetectionAlgorithm createDetector(DetectionAlgorithmType type) {
            switch (type) {
                case LATTICE:
                    return new SpreadsheetDetectionAlgorithm();
                case NURMINEN:
                default:
                    return new NurminenDetectionAlgorithm();
            }
        }

        /**
         * Always returns {@code true}; this processor implements {@code extractMetadata}.
         *
         * @return {@code true}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isMetadataSupported() {
            return true;
        }

        /**
         * Always returns {@code true}; this processor implements {@code extractText}.
         *
         * @return {@code true}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isTextSupported() {
            return true;
        }

        /**
         * Always returns {@code true}; this processor implements {@code extractImages}.
         *
         * @return {@code true}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isImagesSupported() {
            return true;
        }

        /**
         * Always returns {@code true}; this processor implements {@code extractTables}.
         *
         * @return {@code true}
         */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isTablesSupported() {
            return true;
        }

        /**
         * Checks whether the file's leading bytes identify it as a PDF.
         *
         * @param fileContent content whose data is the file to inspect
         * @return {@code true} if {@code FileMagic} reports PDF; {@code false} otherwise, including
         *     when the file cannot be read
         */
        @Override
        public boolean acceptFile(FileContent fileContent) {
            FileMagic detected;
            try {
                detected = FileMagic.valueOf((File) fileContent.getData());
            } catch (IOException ignored) {
                // An unreadable file is simply not accepted.
                return false;
            }
            return FileMagic.PDF == detected;
        }

        /**
         * Checks whether the stream's leading bytes identify it as a PDF.
         *
         * <p>The stream obtained from the content is wrapped in a {@link BufferedInputStream} before
         * being handed to {@code FileMagic}, and is always closed before returning, including when
         * sniffing fails.
         *
         * @param inputStreamContent content supplying the stream to inspect
         * @return {@code true} if {@code FileMagic} reports PDF; {@code false} otherwise, including
         *     when the stream cannot be read or closed
         */
        @Override
        public boolean acceptInputStream(InputStreamContent inputStreamContent) {
            // try-with-resources closes the stream on the exception path as well; previously a
            // failure inside FileMagic.valueOf skipped the close() call and leaked the stream.
            try (InputStream buffered = new BufferedInputStream((InputStream) inputStreamContent.getData())) {
                return FileMagic.valueOf(buffered) == FileMagic.PDF;
            } catch (IOException ignored) {
                // An unreadable stream is simply not accepted.
                return false;
            }
        }

        /**
         * Loads the PDF document held by the given file content.
         *
         * @param fileContent content whose data is the PDF file
         * @return the loaded document
         * @throws IOException if PDFBox cannot load the file
         */
        @Override
        public PDDocument extractDocument(FileContent fileContent) throws IOException {
            File pdfFile = (File) fileContent.getData();
            return PDDocument.load(pdfFile);
        }

        /**
         * Loads the PDF document from the stream supplied by the given content.
         *
         * <p>The stream is closed once loading finishes or fails; previously it was never closed.
         * NOTE(review): this relies on PDFBox 2.x {@code PDDocument.load(InputStream)} buffering the
         * whole stream during the call and leaving the close to the caller — confirm against the
         * PDFBox version in use.
         *
         * @param inputStreamContent content supplying the PDF bytes
         * @return the loaded document
         * @throws IOException if the stream cannot be read or PDFBox cannot parse it
         */
        @Override
        public PDDocument extractDocument(InputStreamContent inputStreamContent) throws IOException {
            try (InputStream pdfStream = (InputStream) inputStreamContent.getData()) {
                return PDDocument.load(pdfStream);
            }
        }

        /**
         * Collects the document information fields, every metadata key exposed by PDFBox (prefixed
         * with {@code "custom."}), and the page count into one map.
         *
         * @param pDDocument the loaded document
         * @return a mutable map of property name to value; values may be {@code null} when the
         *     document does not define the field
         */
        @Override
        public Map<String, Object> extractMetadata(PDDocument pDDocument) {
            PDDocumentInformation info = pDDocument.getDocumentInformation();
            Map<String, Object> properties = new HashMap<>();

            properties.put(DocumentProperties.AUTHOR, info.getAuthor());
            properties.put(DocumentProperties.CREATION_DATE, toTemporal(info.getCreationDate()));
            properties.put(DocumentProperties.CREATOR, info.getCreator());
            properties.put(DocumentProperties.KEYWORDS, info.getKeywords());
            properties.put(DocumentProperties.LAST_MODIFIED_DATE, toTemporal(info.getModificationDate()));
            properties.put(DocumentProperties.PRODUCER, info.getProducer());
            properties.put(DocumentProperties.SUBJECT, info.getSubject());
            properties.put("title", info.getTitle());

            for (String key : info.getMetadataKeys()) {
                properties.put("custom." + key, info.getCustomMetadataValue(key));
            }

            properties.put(DocumentProperties.PAGE_COUNT, pDDocument.getNumberOfPages());
            return properties;
        }

        /**
         * Runs the configured text stripper over the whole document.
         *
         * @param pDDocument the loaded document
         * @return a single-element collection holding the document text
         * @throws ProcessingException if the stripper fails to read the document
         */
        @Override
        public Collection<ExtractionWithProperties<String>> extractText(PDDocument pDDocument) throws ProcessingException {
            final String text;
            try {
                text = this.stripper.getText(pDDocument);
            } catch (IOException e) {
                throw new ProcessingException("Unable to extract text from PDF", e);
            }
            return List.of(new ExtractionWithProperties<>(text));
        }

        /**
         * Extracts every image XObject from the document.
         *
         * <p>Images found in the resources dictionary of the page tree root are collected first,
         * without a page number. Each page is then scanned in order (page numbers start at 1),
         * skipping any resource whose name was already collected from the root dictionary. Image
         * indices continue across the root dictionary and all pages.
         *
         * @param pDDocument the loaded document
         * @return the extracted images with their properties, in discovery order
         */
        @Override
        public Collection<ExtractionWithProperties<BufferedImage>> extractImages(PDDocument pDDocument) {
            List<ExtractionWithProperties<BufferedImage>> images = new ArrayList<>();
            List<COSName> rootImageNames = new ArrayList<>();

            COSDictionary rootResources = pDDocument.getPages().getCOSObject().getCOSDictionary(COSName.RESOURCES);
            if (rootResources != null) {
                Collection<ExtractionWithProperties<BufferedImage>> rootImages =
                        getImagesFromResources(new PDResources(rootResources), null, 0, Collections.emptyList());
                for (ExtractionWithProperties<BufferedImage> rootImage : rootImages) {
                    // Check for null BEFORE converting: the previous check ran after toString(),
                    // so it could never be false and a missing name raised a NullPointerException.
                    Object name = rootImage.getProperties().get("name");
                    if (name != null) {
                        rootImageNames.add(COSName.getPDFName(name.toString()));
                    }
                }
                images.addAll(rootImages);
            }

            int pageNumber = 0;
            for (PDPage page : pDDocument.getPages()) {
                pageNumber++;
                // images.size() is the count of images found so far, i.e. the index offset.
                images.addAll(getImagesFromResources(page.getResources(), pageNumber, images.size(), rootImageNames));
            }
            return images;
        }

        /**
         * Collects the image XObjects of one resources dictionary.
         *
         * <p>Each image gets the properties {@code "name"} (the resource name), {@code "index"}
         * (running counter, first image is {@code i + 1}) and, when {@code num} is non-null,
         * {@code "page"}. If the image carries XMP metadata it is merged into the properties before
         * those keys are set. Resources that fail to load are logged and skipped.
         *
         * @param pDResources the resources to scan; {@code null} yields an empty result
         * @param num the page number to record, or {@code null} for resources not tied to a page
         * @param i the number of images already extracted, used as the index offset
         * @param collection resource names to skip
         * @return the images found, possibly empty
         */
        private Collection<ExtractionWithProperties<BufferedImage>> getImagesFromResources(PDResources pDResources, Integer num, int i, Collection<COSName> collection) {
            List<ExtractionWithProperties<BufferedImage>> found = new ArrayList<>();
            if (pDResources == null) {
                // A page without a resources dictionary has no images; avoid the NPE below.
                return found;
            }
            int index = i;
            for (COSName resourceName : pDResources.getXObjectNames()) {
                if (collection.contains(resourceName)) {
                    continue;
                }
                try {
                    // getXObject returns the general XObject type; only image XObjects are wanted.
                    Object xObject = pDResources.getXObject(resourceName);
                    if (!(xObject instanceof PDImageXObject)) {
                        continue;
                    }
                    PDImageXObject image = (PDImageXObject) xObject;
                    index++;

                    Map<String, Object> properties = new HashMap<>();
                    PDMetadata xmpStream = image.getMetadata();
                    if (xmpStream != null) {
                        Metadata xmp = new Metadata();
                        new XmpReader().extract(xmpStream.toByteArray(), xmp);
                        properties.putAll(toMap(xmp));
                    }
                    // Set after the XMP merge so these keys always win over metadata entries.
                    properties.put("name", resourceName.getName());
                    properties.put("index", index);
                    if (num != null) {
                        properties.put("page", num);
                    }
                    found.add(new ExtractionWithProperties<>(image.getImage(), properties));
                } catch (IOException e) {
                    log().warn("Unable to read resource {}", resourceName.getName(), e);
                }
            }
            return found;
        }

        /**
         * Detects table regions on every page and extracts each region with the configured
         * extraction algorithm.
         *
         * @param pDDocument the loaded document
         * @return one entry per extracted table, carrying its position, size and extraction method
         * @throws ProcessingException declared by the overridden method; not thrown directly here
         */
        @Override
        public Collection<ExtractionWithProperties<Table>> extractTables(PDDocument pDDocument) throws ProcessingException {
            List<ExtractionWithProperties<Table>> results = new ArrayList<>();
            // NOTE(review): the ObjectExtractor is not closed here; presumably closing it would also
            // close the document, which this method does not own — confirm.
            PageIterator pages = new ObjectExtractor(pDDocument).extract();
            while (pages.hasNext()) {
                Page page = pages.next();
                List<Rectangle> regions = this.detectionAlgorithm.detect(page);
                log().debug("{} tables found on page {}", regions.size(), page.getPageNumber());
                for (Rectangle region : regions) {
                    for (technology.tabula.Table table : extractFromArea(page.getArea(region))) {
                        results.add(new ExtractionWithProperties<>(new PdfTable(table), describeTable(table)));
                    }
                }
            }
            return results;
        }

        /** Applies the extraction algorithm selected in the settings to one detected region. */
        private List<? extends technology.tabula.Table> extractFromArea(Page area) {
            switch (((Settings) this.settings).getTableExtractionAlgorithm()) {
                case STREAM:
                    return basicExtractionAlgorithm.extract(area);
                case LATTICE:
                    return spreadsheetExtractionAlgorithm.extract(area);
                case DETERMINE:
                default:
                    if (spreadsheetExtractionAlgorithm.isTabular(area)) {
                        return spreadsheetExtractionAlgorithm.extract(area);
                    }
                    return basicExtractionAlgorithm.extract(area);
            }
        }

        /** Builds the property map (position, size, extraction method) for one extracted table. */
        private static Map<String, Object> describeTable(technology.tabula.Table table) {
            Map<String, Object> properties = new HashMap<>();
            properties.put("x", table.getX());
            properties.put("y", table.getY());
            properties.put("width", table.getWidth());
            properties.put("height", table.getHeight());
            properties.put("extractionMethod", table.getExtractionMethod());
            return properties;
        }
    }

    /* loaded from: input_file:io/annot8/components/documents/processors/PdfExtractor$Settings.class */
    /**
     * Settings for the PDF extractor: the delimiter strings handed to the text stripper and the
     * algorithms used for table detection and table extraction.
     *
     * <p>Defaults: all delimiters are empty except the paragraph end, which is a blank line; both
     * table algorithms default to {@code LATTICE}.
     */
    public static class Settings extends DocumentExtractorSettings {
        // Defaults live on the fields so both constructors share them.
        private String articleStart = "";
        private String articleEnd = "";
        private String pageStart = "";
        private String pageEnd = "";
        private String paragraphStart = "";
        private String paragraphEnd = "\n\n";
        private DetectionAlgorithmType tableDetectionAlgorithm = DetectionAlgorithmType.LATTICE;
        private ExtractionAlgorithmType tableExtractionAlgorithm = ExtractionAlgorithmType.LATTICE;

        /** Creates settings with the default delimiters and table algorithms. */
        public Settings() {
        }

        /**
         * Creates settings that take the generic document-extractor options from the given settings
         * and use the defaults for the PDF-specific ones.
         *
         * @param documentExtractorSettings the settings passed to the superclass constructor
         */
        public Settings(DocumentExtractorSettings documentExtractorSettings) {
            super(documentExtractorSettings);
        }

        /**
         * Validates the settings.
         *
         * @return {@code true} only if the superclass validation passes and no delimiter or table
         *     algorithm is {@code null}
         */
        @Override // io.annot8.components.documents.processors.DocumentExtractorSettings
        public boolean validate() {
            // The algorithm enums are switched on by the processor; a null there would fail with a
            // NullPointerException, so reject it here instead.
            return super.validate()
                    && this.articleStart != null
                    && this.articleEnd != null
                    && this.pageStart != null
                    && this.pageEnd != null
                    && this.paragraphStart != null
                    && this.paragraphEnd != null
                    && this.tableDetectionAlgorithm != null
                    && this.tableExtractionAlgorithm != null;
        }

        @Description("String to add at the start of each article")
        public String getArticleStart() {
            return this.articleStart;
        }

        public void setArticleStart(String str) {
            this.articleStart = str;
        }

        @Description("String to add at the end of each article")
        public String getArticleEnd() {
            return this.articleEnd;
        }

        public void setArticleEnd(String str) {
            this.articleEnd = str;
        }

        // Description previously said "article", copied from getArticleStart.
        @Description("String to add at the start of each page")
        public String getPageStart() {
            return this.pageStart;
        }

        public void setPageStart(String str) {
            this.pageStart = str;
        }

        @Description("String to add at the end of each page")
        public String getPageEnd() {
            return this.pageEnd;
        }

        public void setPageEnd(String str) {
            this.pageEnd = str;
        }

        @Description("String to add at the start of each paragraph")
        public String getParagraphStart() {
            return this.paragraphStart;
        }

        public void setParagraphStart(String str) {
            this.paragraphStart = str;
        }

        @Description("String to add at the end of each paragraph")
        public String getParagraphEnd() {
            return this.paragraphEnd;
        }

        public void setParagraphEnd(String str) {
            this.paragraphEnd = str;
        }

        @Description("The algorithm to use for detecting table content")
        public DetectionAlgorithmType getTableDetectionAlgorithm() {
            return this.tableDetectionAlgorithm;
        }

        public void setTableDetectionAlgorithm(DetectionAlgorithmType detectionAlgorithmType) {
            this.tableDetectionAlgorithm = detectionAlgorithmType;
        }

        @Description("The algorithm to use for extracting table content")
        public ExtractionAlgorithmType getTableExtractionAlgorithm() {
            return this.tableExtractionAlgorithm;
        }

        public void setTableExtractionAlgorithm(ExtractionAlgorithmType extractionAlgorithmType) {
            this.tableExtractionAlgorithm = extractionAlgorithmType;
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Creates the {@link Processor} for this descriptor.
     *
     * @param context the context passed to the processor
     * @param settings the PDF extractor settings passed to the processor
     * @return a new processor instance
     */
    public Processor createComponent(Context context, Settings settings) {
        return new Processor(context, settings);
    }
}
