package io.annot8.components.documents.processors;

import io.annot8.api.components.annotations.ComponentDescription;
import io.annot8.api.components.annotations.ComponentName;
import io.annot8.api.components.annotations.ComponentTags;
import io.annot8.api.components.annotations.SettingsClass;
import io.annot8.api.context.Context;
import io.annot8.api.exceptions.ProcessingException;
import io.annot8.api.settings.Description;
import io.annot8.common.data.content.FileContent;
import io.annot8.common.data.content.InputStreamContent;
import io.annot8.common.data.content.Table;
import io.annot8.components.documents.data.ExtractionWithProperties;
import io.annot8.components.documents.processors.DocExtractor;
import io.annot8.components.documents.processors.DocxExtractor;
import io.annot8.components.documents.processors.HtmlExtractor;
import io.annot8.components.documents.processors.OdtExtractor;
import io.annot8.components.documents.processors.PdfExtractor;
import io.annot8.components.documents.processors.PlainTextExtractor;
import io.annot8.components.documents.processors.PptExtractor;
import io.annot8.components.documents.processors.PptxExtractor;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jsoup.nodes.Document;
import org.odftoolkit.odfdom.doc.OdfTextDocument;

@ComponentDescription("Extracts images, tables, metadata and text from Document files by delegating to type-specific extractors")
@ComponentTags({"documents", "extractor", "text", "images", "tables", "metadata", "doc", "docx", "html", "odt", "pdf", "ppt", "pptx"})
@ComponentName("Document Extractor")
@SettingsClass(Settings.class)
/* loaded from: input_file:io/annot8/components/documents/processors/DocumentExtractor.class */
public class DocumentExtractor extends AbstractDocumentExtractorDescriptor<Processor, Settings> {

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:io/annot8/components/documents/processors/DocumentExtractor$DocumentObjectWithType.class */
    /**
     * Immutable pairing of a parsed document object with the {@link DocumentType} that says which
     * concrete class that object is expected to be.
     *
     * <p>The document is held as a plain {@code Object}; consumers switch on {@link #getType()} and
     * cast the value returned by {@link #getDocument()} accordingly.
     */
    public static class DocumentObjectWithType {
        private final Object document;
        private final DocumentType type;

        /**
         * Private so that instances are only created from within the enclosing top-level class.
         *
         * @param document the parsed document object
         * @param type the format that {@code document} was parsed as
         */
        private DocumentObjectWithType(Object document, DocumentType type) {
            this.type = type;
            this.document = document;
        }

        /**
         * @return the format tag stored alongside the document
         */
        public DocumentType getType() {
            return type;
        }

        /**
         * @return the wrapped document object, untyped
         */
        public Object getDocument() {
            return document;
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:io/annot8/components/documents/processors/DocumentExtractor$DocumentType.class */
    /**
     * Formats that {@code Processor} can tag a content item with. Each constant corresponds to one
     * of the type-specific delegate processors held by {@code Processor}.
     */
    public enum DocumentType {
        /** Content accepted by the {@code DocExtractor} delegate. */
        DOC,
        /** Content accepted by the {@code DocxExtractor} delegate. */
        DOCX,
        /** Content accepted by the {@code HtmlExtractor} delegate. */
        HTML,
        /** Content accepted by the {@code OdtExtractor} delegate. */
        ODT,
        /** Content accepted by the {@code PdfExtractor} delegate. */
        PDF,
        /** Content accepted by the {@code PptExtractor} delegate. */
        PPT,
        /** Content accepted by the {@code PptxExtractor} delegate. */
        PPTX,
        /**
         * Fallback assigned when none of the format-specific delegates accepts the content and the
         * {@code extractPlainText} setting is enabled; handled by the {@code PlainTextExtractor}
         * delegate.
         */
        PLAIN_TEXT
    }

    /* loaded from: input_file:io/annot8/components/documents/processors/DocumentExtractor$Processor.class */
    public static class Processor extends AbstractDocumentExtractorProcessor<DocumentObjectWithType, Settings> {
        private final DocExtractor.Processor docProcessor;
        private final DocxExtractor.Processor docxProcessor;
        private final HtmlExtractor.Processor htmlProcessor;
        private final OdtExtractor.Processor odtProcessor;
        private final PdfExtractor.Processor pdfProcessor;
        private final PptExtractor.Processor pptProcessor;
        private final PptxExtractor.Processor pptxProcessor;
        private final PlainTextExtractor.Processor plainTextProcessor;
        private final Map<String, DocumentType> contentToType;

        public Processor(Context context, Settings settings) {
            super(context, settings);
            this.contentToType = new HashMap();
            this.docProcessor = new DocExtractor.Processor(context, new DocumentExtractorSettings(settings));
            this.docxProcessor = new DocxExtractor.Processor(context, new DocumentExtractorSettings(settings));
            this.htmlProcessor = new HtmlExtractor.Processor(context, new HtmlExtractor.Settings(settings));
            this.odtProcessor = new OdtExtractor.Processor(context, new DocumentExtractorSettings(settings));
            this.pdfProcessor = new PdfExtractor.Processor(context, new PdfExtractor.Settings(settings));
            this.pptProcessor = new PptExtractor.Processor(context, new DocumentExtractorSettings(settings));
            this.pptxProcessor = new PptxExtractor.Processor(context, new DocumentExtractorSettings(settings));
            this.plainTextProcessor = new PlainTextExtractor.Processor(context, new DocumentExtractorSettings(settings));
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isMetadataSupported() {
            return true;
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isTextSupported() {
            return true;
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isImagesSupported() {
            return true;
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean isTablesSupported() {
            return true;
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public void reset() {
            this.contentToType.clear();
            this.docProcessor.reset();
            this.docxProcessor.reset();
            this.htmlProcessor.reset();
            this.odtProcessor.reset();
            this.pdfProcessor.reset();
            this.pptProcessor.reset();
            this.pptxProcessor.reset();
            this.plainTextProcessor.reset();
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean acceptFile(FileContent fileContent) {
            DocumentType documentType = DocumentType.PLAIN_TEXT;
            if (this.docProcessor.acceptFile(fileContent)) {
                documentType = DocumentType.DOC;
            } else if (this.docxProcessor.acceptFile(fileContent)) {
                documentType = DocumentType.DOCX;
            } else if (this.htmlProcessor.acceptFile(fileContent)) {
                documentType = DocumentType.HTML;
            } else if (this.odtProcessor.acceptFile(fileContent)) {
                documentType = DocumentType.ODT;
            } else if (this.pdfProcessor.acceptFile(fileContent)) {
                documentType = DocumentType.PDF;
            } else if (this.pptProcessor.acceptFile(fileContent)) {
                documentType = DocumentType.PPT;
            } else if (this.pptxProcessor.acceptFile(fileContent)) {
                documentType = DocumentType.PPTX;
            } else if (!((Settings) this.settings).isExtractPlainText()) {
                return false;
            }
            this.contentToType.put(fileContent.getId(), documentType);
            return true;
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public boolean acceptInputStream(InputStreamContent inputStreamContent) {
            DocumentType documentType = DocumentType.PLAIN_TEXT;
            if (this.docProcessor.acceptInputStream(inputStreamContent)) {
                documentType = DocumentType.DOC;
            } else if (this.docxProcessor.acceptInputStream(inputStreamContent)) {
                documentType = DocumentType.DOCX;
            } else if (this.htmlProcessor.acceptInputStream(inputStreamContent)) {
                documentType = DocumentType.HTML;
            } else if (this.odtProcessor.acceptInputStream(inputStreamContent)) {
                documentType = DocumentType.ODT;
            } else if (this.pdfProcessor.acceptInputStream(inputStreamContent)) {
                documentType = DocumentType.PDF;
            } else if (this.pptProcessor.acceptInputStream(inputStreamContent)) {
                documentType = DocumentType.PPT;
            } else if (this.pptxProcessor.acceptInputStream(inputStreamContent)) {
                documentType = DocumentType.PPTX;
            } else if (!((Settings) this.settings).isExtractPlainText()) {
                return false;
            }
            this.contentToType.put(inputStreamContent.getId(), documentType);
            return true;
        }

        /* JADX WARN: Can't rename method to resolve collision */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public DocumentObjectWithType extractDocument(FileContent fileContent) throws IOException {
            DocumentType documentType = this.contentToType.get(fileContent.getId());
            if (documentType == null) {
                throw new ProcessingException("FileContent type has not been recorded");
            }
            switch (documentType) {
                case DOC:
                    return new DocumentObjectWithType(this.docProcessor.extractDocument(fileContent), documentType);
                case DOCX:
                    return new DocumentObjectWithType(this.docxProcessor.extractDocument(fileContent), documentType);
                case HTML:
                    return new DocumentObjectWithType(this.htmlProcessor.extractDocument(fileContent), documentType);
                case ODT:
                    return new DocumentObjectWithType(this.odtProcessor.extractDocument(fileContent), documentType);
                case PDF:
                    return new DocumentObjectWithType(this.pdfProcessor.extractDocument(fileContent), documentType);
                case PPT:
                    return new DocumentObjectWithType(this.pptProcessor.extractDocument(fileContent), documentType);
                case PPTX:
                    return new DocumentObjectWithType(this.pptxProcessor.extractDocument(fileContent), documentType);
                case PLAIN_TEXT:
                    return new DocumentObjectWithType(this.plainTextProcessor.extractDocument(fileContent), documentType);
                default:
                    throw new ProcessingException("Unsupported type " + documentType);
            }
        }

        /* JADX WARN: Can't rename method to resolve collision */
        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public DocumentObjectWithType extractDocument(InputStreamContent inputStreamContent) throws IOException {
            DocumentType documentType = this.contentToType.get(inputStreamContent.getId());
            if (documentType == null) {
                throw new ProcessingException("InputStreamContent type has not been recorded");
            }
            switch (documentType) {
                case DOC:
                    return new DocumentObjectWithType(this.docProcessor.extractDocument(inputStreamContent), documentType);
                case DOCX:
                    return new DocumentObjectWithType(this.docxProcessor.extractDocument(inputStreamContent), documentType);
                case HTML:
                    return new DocumentObjectWithType(this.htmlProcessor.extractDocument(inputStreamContent), documentType);
                case ODT:
                    return new DocumentObjectWithType(this.odtProcessor.extractDocument(inputStreamContent), documentType);
                case PDF:
                    return new DocumentObjectWithType(this.pdfProcessor.extractDocument(inputStreamContent), documentType);
                case PPT:
                    return new DocumentObjectWithType(this.pptProcessor.extractDocument(inputStreamContent), documentType);
                case PPTX:
                    return new DocumentObjectWithType(this.pptxProcessor.extractDocument(inputStreamContent), documentType);
                case PLAIN_TEXT:
                    return new DocumentObjectWithType(this.plainTextProcessor.extractDocument(inputStreamContent), documentType);
                default:
                    throw new ProcessingException("Unsupported type " + documentType);
            }
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Map<String, Object> extractMetadata(DocumentObjectWithType documentObjectWithType) {
            switch (documentObjectWithType.getType()) {
                case DOC:
                    return this.docProcessor.extractMetadata((HWPFDocument) documentObjectWithType.getDocument());
                case DOCX:
                    return this.docxProcessor.extractMetadata((XWPFDocument) documentObjectWithType.getDocument());
                case HTML:
                    return this.htmlProcessor.extractMetadata((Document) documentObjectWithType.getDocument());
                case ODT:
                    return this.odtProcessor.extractMetadata((OdfTextDocument) documentObjectWithType.getDocument());
                case PDF:
                    return this.pdfProcessor.extractMetadata((PDDocument) documentObjectWithType.getDocument());
                case PPT:
                    return this.pptProcessor.extractMetadata((HSLFSlideShow) documentObjectWithType.getDocument());
                case PPTX:
                    return this.pptxProcessor.extractMetadata((XMLSlideShow) documentObjectWithType.getDocument());
                case PLAIN_TEXT:
                    return this.plainTextProcessor.extractMetadata((String) documentObjectWithType.getDocument());
                default:
                    return Collections.emptyMap();
            }
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Collection<ExtractionWithProperties<String>> extractText(DocumentObjectWithType documentObjectWithType) {
            switch (documentObjectWithType.getType()) {
                case DOC:
                    return this.docProcessor.extractText((HWPFDocument) documentObjectWithType.getDocument());
                case DOCX:
                    return this.docxProcessor.extractText((XWPFDocument) documentObjectWithType.getDocument());
                case HTML:
                    return this.htmlProcessor.extractText((Document) documentObjectWithType.getDocument());
                case ODT:
                    return this.odtProcessor.extractText((OdfTextDocument) documentObjectWithType.getDocument());
                case PDF:
                    return this.pdfProcessor.extractText((PDDocument) documentObjectWithType.getDocument());
                case PPT:
                    return this.pptProcessor.extractText((HSLFSlideShow) documentObjectWithType.getDocument());
                case PPTX:
                    return this.pptxProcessor.extractText((XMLSlideShow) documentObjectWithType.getDocument());
                case PLAIN_TEXT:
                    return this.plainTextProcessor.extractText((String) documentObjectWithType.getDocument());
                default:
                    return Collections.emptyList();
            }
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Collection<ExtractionWithProperties<BufferedImage>> extractImages(DocumentObjectWithType documentObjectWithType) {
            switch (documentObjectWithType.getType()) {
                case DOC:
                    return this.docProcessor.extractImages((HWPFDocument) documentObjectWithType.getDocument());
                case DOCX:
                    return this.docxProcessor.extractImages((XWPFDocument) documentObjectWithType.getDocument());
                case HTML:
                    return this.htmlProcessor.extractImages((Document) documentObjectWithType.getDocument());
                case ODT:
                    return this.odtProcessor.extractImages((OdfTextDocument) documentObjectWithType.getDocument());
                case PDF:
                    return this.pdfProcessor.extractImages((PDDocument) documentObjectWithType.getDocument());
                case PPT:
                    return this.pptProcessor.extractImages((HSLFSlideShow) documentObjectWithType.getDocument());
                case PPTX:
                    return this.pptxProcessor.extractImages((XMLSlideShow) documentObjectWithType.getDocument());
                case PLAIN_TEXT:
                    return this.plainTextProcessor.extractImages((String) documentObjectWithType.getDocument());
                default:
                    return Collections.emptyList();
            }
        }

        @Override // io.annot8.components.documents.processors.AbstractDocumentExtractorProcessor
        public Collection<ExtractionWithProperties<Table>> extractTables(DocumentObjectWithType documentObjectWithType) throws ProcessingException {
            switch (documentObjectWithType.getType()) {
                case DOC:
                    return this.docProcessor.extractTables((HWPFDocument) documentObjectWithType.getDocument());
                case DOCX:
                    return this.docxProcessor.extractTables((XWPFDocument) documentObjectWithType.getDocument());
                case HTML:
                    return this.htmlProcessor.extractTables((Document) documentObjectWithType.getDocument());
                case ODT:
                    return this.odtProcessor.extractTables((OdfTextDocument) documentObjectWithType.getDocument());
                case PDF:
                    return this.pdfProcessor.extractTables((PDDocument) documentObjectWithType.getDocument());
                case PPT:
                    return this.pptProcessor.extractTables((HSLFSlideShow) documentObjectWithType.getDocument());
                case PPTX:
                    return this.pptxProcessor.extractTables((XMLSlideShow) documentObjectWithType.getDocument());
                case PLAIN_TEXT:
                    return this.plainTextProcessor.extractTables((String) documentObjectWithType.getDocument());
                default:
                    return Collections.emptyList();
            }
        }
    }

    /* loaded from: input_file:io/annot8/components/documents/processors/DocumentExtractor$Settings.class */
    /**
     * Settings for the document extractor: everything from {@code DocumentExtractorSettings} plus a
     * switch controlling the plain-text fallback.
     */
    public static class Settings extends DocumentExtractorSettings {
        /** Plain-text fallback flag; stays {@code false} until set through the setter. */
        private boolean extractPlainText;

        /**
         * @return whether content that no other processor accepts should be extracted as plain text
         */
        @Description("If true, then any files that can't be extracted via a different processor will be extracted as plain text")
        public boolean isExtractPlainText() {
            return extractPlainText;
        }

        /**
         * @param extractPlainText {@code true} to enable the plain-text fallback, {@code false} to disable it
         */
        public void setExtractPlainText(boolean extractPlainText) {
            this.extractPlainText = extractPlainText;
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Builds the {@link Processor} for this descriptor.
     *
     * @param context the context passed straight through to the processor
     * @param settings the settings passed straight through to the processor
     * @return a newly constructed processor
     */
    public Processor createComponent(Context context, Settings settings) {
        final Processor processor = new Processor(context, settings);
        return processor;
    }
}
