package com.gengoai.hermes.corpus;

import com.gengoai.apollo.math.statistics.measure.Association;
import com.gengoai.apollo.math.statistics.measure.ContingencyTable;
import com.gengoai.apollo.math.statistics.measure.ContingencyTableCalculator;
import com.gengoai.apollo.ml.DataSet;
import com.gengoai.collection.counter.Counter;
import com.gengoai.collection.counter.Counters;
import com.gengoai.collection.multimap.ArrayListMultimap;
import com.gengoai.collection.multimap.Multimap;
import com.gengoai.function.SerializableConsumer;
import com.gengoai.function.SerializableFunction;
import com.gengoai.function.SerializablePredicate;
import com.gengoai.hermes.AnnotatableType;
import com.gengoai.hermes.AnnotationPipeline;
import com.gengoai.hermes.Document;
import com.gengoai.hermes.HString;
import com.gengoai.hermes.extraction.Extractor;
import com.gengoai.hermes.extraction.NGramExtractor;
import com.gengoai.hermes.extraction.caduceus.CaduceusProgram;
import com.gengoai.hermes.extraction.regex.TokenMatch;
import com.gengoai.hermes.extraction.regex.TokenMatcher;
import com.gengoai.hermes.extraction.regex.TokenRegex;
import com.gengoai.hermes.format.DocFormatService;
import com.gengoai.hermes.lexicon.Lexicon;
import com.gengoai.hermes.ml.HStringDataSetGenerator;
import com.gengoai.hermes.morphology.StandardTokenizer;
import com.gengoai.io.Resources;
import com.gengoai.parsing.ParseException;
import com.gengoai.specification.Specification;
import com.gengoai.stream.MCounterAccumulator;
import com.gengoai.stream.MStream;
import com.gengoai.stream.StreamingContext;
import com.gengoai.tuple.Tuple;
import java.io.IOException;
import java.lang.invoke.SerializedLambda;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import lombok.NonNull;

/* loaded from: input_file:com/gengoai/hermes/corpus/DocumentCollection.class */
public interface DocumentCollection extends Iterable<Document>, AutoCloseable {
    // String key "Corpus.reportInterval". NOTE(review): presumably a configuration property
    // consumed by progress reporting -- it is not read anywhere in this interface; confirm at the usage site.
    public static final String REPORT_INTERVAL = "Corpus.reportInterval";
    // String key "Corpus.reportLevel". NOTE(review): presumably the log level used for progress
    // reports -- it is not read anywhere in this interface; confirm at the usage site.
    public static final String REPORT_LEVEL = "Corpus.reportLevel";

    /**
     * Wraps the given documents in an {@code MStreamDocumentCollection} backed by a stream created
     * from the local streaming context.
     *
     * @param documentArr the documents making up the collection
     * @return a new collection over the given documents
     * @throws NullPointerException if {@code documentArr} is null
     */
    static DocumentCollection create(@NonNull Document... documentArr) {
        Objects.requireNonNull(documentArr, "documents is marked non-null but is null");
        final StreamingContext local = StreamingContext.local();
        return new MStreamDocumentCollection(local.stream(documentArr));
    }

    /**
     * Wraps the given iterable of documents in an {@code MStreamDocumentCollection} backed by a
     * stream created from the local streaming context.
     *
     * @param iterable the documents making up the collection
     * @return a new collection over the given documents
     * @throws NullPointerException if {@code iterable} is null
     */
    static DocumentCollection create(@NonNull Iterable<Document> iterable) {
        Objects.requireNonNull(iterable, "documents is marked non-null but is null");
        final StreamingContext local = StreamingContext.local();
        return new MStreamDocumentCollection(local.stream(iterable));
    }

    /**
     * Wraps the given Java stream of documents in an {@code MStreamDocumentCollection} backed by a
     * stream created from the local streaming context.
     *
     * @param stream the documents making up the collection
     * @return a new collection over the given documents
     * @throws NullPointerException if {@code stream} is null
     */
    static DocumentCollection create(@NonNull Stream<Document> stream) {
        Objects.requireNonNull(stream, "documents is marked non-null but is null");
        final StreamingContext local = StreamingContext.local();
        return new MStreamDocumentCollection(local.stream(stream));
    }

    /**
     * Wraps an existing {@code MStream} of documents in an {@code MStreamDocumentCollection}.
     * The stream is handed to the collection as-is.
     *
     * @param mStream the stream of documents backing the collection
     * @return a new collection over the given stream
     * @throws NullPointerException if {@code mStream} is null
     */
    static DocumentCollection create(@NonNull MStream<Document> mStream) {
        final MStream<Document> documents =
            Objects.requireNonNull(mStream, "documents is marked non-null but is null");
        return new MStreamDocumentCollection(documents);
    }

    /**
     * Creates a collection from a textual specification. The string is first parsed as a
     * {@code Specification} and handed to {@link #create(Specification)}; if either step throws an
     * {@link IllegalArgumentException}, the string is instead passed to {@code Corpus.open}.
     *
     * @param str the specification string, or a value accepted by {@code Corpus.open}
     * @return the resulting document collection
     * @throws NullPointerException if {@code str} is null
     */
    static DocumentCollection create(@NonNull String str) {
        Objects.requireNonNull(str, "specification is marked non-null but is null");
        DocumentCollection result;
        try {
            // Both parsing and creation stay inside the try so the fallback covers either failing.
            result = create(Specification.parse(str));
        } catch (IllegalArgumentException notASpecification) {
            result = Corpus.open(str);
        }
        return result;
    }

    /**
     * Creates a collection from a parsed specification. A schema equal to {@code "corpus"}
     * (ignoring case) opens the specification path via {@code Corpus.open}; any other schema is
     * resolved to a document format through {@code DocFormatService}, which reads the resource at
     * the specification path.
     *
     * @param specification the specification describing where and how to read the documents
     * @return the resulting document collection
     * @throws NullPointerException if {@code specification} is null
     */
    static DocumentCollection create(@NonNull Specification specification) {
        Objects.requireNonNull(specification, "specification is marked non-null but is null");
        if (specification.getSchema().equalsIgnoreCase("corpus")) {
            return Corpus.open(specification.getPath());
        }
        return create(DocFormatService.create(specification).read(Resources.from(specification.getPath())));
    }

    /**
     * Annotates the documents of this collection with the given types. An
     * {@code AnnotationPipeline} is built for the types; when the pipeline reports that no update
     * is required this collection is returned untouched, otherwise the pipeline is applied through
     * {@link #update(String, SerializableConsumer)} under the name {@code "Annotate"}.
     *
     * @param annotatableTypeArr the annotatable types to add
     * @return this collection, or the result of the update
     * @throws NullPointerException if {@code annotatableTypeArr} is null
     */
    default DocumentCollection annotate(@NonNull AnnotatableType... annotatableTypeArr) {
        Objects.requireNonNull(annotatableTypeArr, "annotatableTypes is marked non-null but is null");
        final AnnotationPipeline pipeline = new AnnotationPipeline(annotatableTypeArr);
        if (pipeline.requiresUpdate()) {
            return update("Annotate", pipeline::annotate);
        }
        return this;
    }

    /**
     * Runs the lexicon over every document and passes each extracted {@code HString} to the given
     * consumer. The work is performed through {@link #update(String, SerializableConsumer)} under
     * the name {@code "ApplyLexicon"}.
     *
     * @param lexicon              the lexicon used to extract matches from each document
     * @param serializableConsumer the action invoked for every match
     * @return the result of the update
     * @throws NullPointerException if either argument is null
     */
    default DocumentCollection apply(@NonNull Lexicon lexicon, @NonNull SerializableConsumer<HString> serializableConsumer) {
        Objects.requireNonNull(lexicon, "lexicon is marked non-null but is null");
        Objects.requireNonNull(serializableConsumer, "onMatch is marked non-null but is null");
        return update("ApplyLexicon", doc -> lexicon.extract(doc).forEach(serializableConsumer));
    }

    /**
     * Runs the token pattern over every document and passes each match to the given consumer.
     * The work is performed through {@link #update(String, SerializableConsumer)} under the name
     * {@code "ApplyTokenRegex"}.
     *
     * @param tokenRegex           the pattern to match against each document
     * @param serializableConsumer the action invoked for every {@code TokenMatch}
     * @return the result of the update
     * @throws NullPointerException if either argument is null
     */
    default DocumentCollection apply(@NonNull TokenRegex tokenRegex, @NonNull SerializableConsumer<TokenMatch> serializableConsumer) {
        Objects.requireNonNull(tokenRegex, "pattern is marked non-null but is null");
        Objects.requireNonNull(serializableConsumer, "onMatch is marked non-null but is null");
        return update("ApplyTokenRegex", doc -> {
            // One matcher per document; advance it until no further match is found.
            for (TokenMatcher cursor = tokenRegex.matcher(doc); cursor.find(); ) {
                serializableConsumer.accept(cursor.asTokenMatch());
            }
        });
    }

    /**
     * Converts this collection into a {@code DataSet} by handing {@link #stream()} to the given
     * generator.
     *
     * @param hStringDataSetGenerator the generator that builds the data set
     * @return the generated data set
     * @throws NullPointerException if {@code hStringDataSetGenerator} is null
     */
    default DataSet asDataSet(@NonNull HStringDataSetGenerator hStringDataSetGenerator) {
        Objects.requireNonNull(hStringDataSetGenerator, "HStringDataSetGenerator is marked non-null but is null");
        final MStream<Document> documents = stream();
        return hStringDataSetGenerator.generate(documents);
    }

    /**
     * Caching hook. The default implementation performs no work and returns this collection.
     *
     * @return this collection
     */
    default DocumentCollection cache() {
        return this;
    }

    /**
     * Counts, for every key produced by the extractor, the number of documents it occurs in.
     * Each document's extraction is collapsed to its per-key counts and every key present adds
     * {@code 1.0} to a shared accumulator, regardless of how often it occurs in that document.
     * Documents are processed via {@link #parallelStream()} and progress is tracked with a
     * {@code ProgressLogger} named {@code "documentCount"}.
     *
     * @param extractor the extractor producing the keys to count
     * @return the accumulated document counts
     * @throws NullPointerException if {@code extractor} is null
     */
    default Counter<String> documentCount(@NonNull Extractor extractor) {
        Objects.requireNonNull(extractor, "extractor is marked non-null but is null");
        final ProgressLogger progress = ProgressLogger.create(this, "documentCount");
        final MCounterAccumulator documentFrequencies = getStreamingContext().counterAccumulator();
        parallelStream().forEach(doc -> {
            progress.start();
            // The per-document count value is ignored on purpose: presence counts once.
            extractor.extract(doc).count().forEach((key, ignoredCount) -> documentFrequencies.increment(key, 1.0d));
            progress.stop(doc.tokenLength());
        });
        progress.report();
        return (Counter) documentFrequencies.value();
    }

    /**
     * Writes this collection to the location described by the given specification string. The
     * string is parsed into a {@code Specification}, which selects the document format via
     * {@code DocFormatService} and supplies the path of the target resource.
     *
     * @param str the specification string describing the output format and path
     * @throws IOException declared by this method; raised by the underlying write
     */
    default void export(String str) throws IOException {
        final Specification target = Specification.parse(str);
        DocFormatService.create(target).write(this, Resources.from(target.getPath()));
    }

    /**
     * Returns a new {@code MStreamDocumentCollection} over the documents of {@link #stream()}
     * that satisfy the given predicate.
     *
     * @param serializablePredicate the test a document must pass to be retained
     * @return a collection over the filtered stream
     * @throws NullPointerException if {@code serializablePredicate} is null
     */
    default DocumentCollection filter(@NonNull SerializablePredicate<Document> serializablePredicate) {
        Objects.requireNonNull(serializablePredicate, "predicate is marked non-null but is null");
        final MStream<Document> retained = stream().filter(serializablePredicate);
        return new MStreamDocumentCollection(retained);
    }

    /**
     * Returns the streaming context associated with this collection. Within this interface it is
     * used by {@code documentCount} and {@code termCount} to obtain counter accumulators.
     *
     * @return the streaming context
     */
    StreamingContext getStreamingContext();

    /**
     * Groups the documents of this collection by the key computed for each of them. Documents are
     * visited with {@code forEach} and added to an {@code ArrayListMultimap} under their key.
     *
     * @param serializableFunction computes the grouping key of a document
     * @param <K>                  the key type
     * @return a multimap from key to the documents that produced it
     * @throws NullPointerException if {@code serializableFunction} is null
     */
    default <K> Multimap<K, Document> groupBy(@NonNull SerializableFunction<? super Document, K> serializableFunction) {
        Objects.requireNonNull(serializableFunction, "keyFunction is marked non-null but is null");
        final Multimap<K, Document> groups = new ArrayListMultimap<>();
        forEach(doc -> groups.put(serializableFunction.apply(doc), doc));
        return groups;
    }

    /**
     * Reports whether this collection has no documents, as determined by {@link #stream()}.
     *
     * @return {@code true} if the underlying stream is empty
     */
    default boolean isEmpty() {
        final MStream<Document> documents = stream();
        return documents.isEmpty();
    }

    /**
     * Iterates over the documents of this collection using the iterator of {@link #stream()}.
     *
     * @return an iterator over the documents
     */
    @Override
    default Iterator<Document> iterator() {
        final MStream<Document> documents = stream();
        return documents.iterator();
    }

    /**
     * Counts the n-grams of this collection. Every document from {@link #parallelStream()} is
     * expanded into the string tuples produced by the extractor, and the tuples are counted by
     * value. Progress is tracked with a {@code ProgressLogger} named {@code "nGramCount"}.
     *
     * @param nGramExtractor the extractor producing n-gram tuples for a document
     * @return a counter over the extracted tuples
     * @throws NullPointerException if {@code nGramExtractor} is null
     */
    default Counter<Tuple> nGramCount(@NonNull NGramExtractor nGramExtractor) {
        Objects.requireNonNull(nGramExtractor, "nGramExtractor is marked non-null but is null");
        final ProgressLogger progress = ProgressLogger.create(this, "nGramCount");
        final Counter<Tuple> counts = Counters.newCounter(
            parallelStream()
                .flatMap(doc -> {
                    progress.start();
                    Stream<Tuple> tuples = nGramExtractor.extractStringTuples(doc).stream();
                    progress.stop(doc.tokenLength());
                    return tuples;
                })
                .countByValue());
        progress.report();
        return counts;
    }

    /**
     * Returns a stream over the documents of this collection. Within this interface it backs
     * {@code documentCount}, {@code nGramCount}, {@code size} and {@code termCount}.
     * NOTE(review): the name implies the stream is parallel; that is up to the implementation
     * and is not established here.
     *
     * @return the stream of documents
     */
    MStream<Document> parallelStream();

    /**
     * Parses the given query string with {@code QueryParser} and executes the parsed query via
     * {@link #query(Query)}.
     *
     * @param str the query in textual form
     * @return the results of running the parsed query
     * @throws ParseException       declared by this method; raised when the string cannot be parsed
     * @throws NullPointerException if {@code str} is null
     */
    default SearchResults query(@NonNull String str) throws ParseException {
        Objects.requireNonNull(str, "query is marked non-null but is null");
        final Query parsed = QueryParser.parse(str);
        return query(parsed);
    }

    /**
     * Executes the given query against this collection. {@code query(String)} delegates here
     * after parsing its argument.
     *
     * @param query the query to execute; annotated as non-null
     * @return the search results
     */
    SearchResults query(@NonNull Query query);

    /**
     * Repartitioning hook. The default implementation ignores the argument and returns this
     * collection.
     *
     * @param i NOTE(review): presumably the requested number of partitions -- unused here; confirm
     *          against implementations that override this method
     * @return this collection
     */
    default DocumentCollection repartition(int i) {
        return this;
    }

    /**
     * Returns a new {@code MStreamDocumentCollection} over {@code stream().sample(false, i)}.
     * NOTE(review): the {@code false} flag presumably means sampling without replacement; confirm
     * against {@code MStream.sample}.
     *
     * @param i the sample size passed to the stream
     * @return a collection over the sampled stream
     */
    default DocumentCollection sample(int i) {
        final MStream<Document> sampled = stream().sample(false, i);
        return new MStreamDocumentCollection(sampled);
    }

    /**
     * Draws up to {@code i} documents using the supplied random source. The first {@code i}
     * documents seed a reservoir; each later document, the k-th overall, draws an index in
     * {@code [0, k)} and replaces the reservoir entry at that index when the index is below
     * {@code i}. A non-positive {@code i} yields an empty collection.
     *
     * @param i      the maximum number of documents to return
     * @param random the source of randomness
     * @return a collection over the sampled documents, streamed from the local streaming context
     * @throws NullPointerException if {@code random} is null
     */
    default DocumentCollection sample(int i, @NonNull Random random) {
        Objects.requireNonNull(random, "random is marked non-null but is null");
        if (i <= 0) {
            return new MStreamDocumentCollection(StreamingContext.local().empty());
        }
        final List<Document> reservoir = stream().limit(i).collect();
        // Number of documents seen so far including the current one; starts at i + 1 for the
        // first document after the reservoir was filled.
        final AtomicInteger seen = new AtomicInteger(i + 1);
        stream().skip(i).forEach(doc -> {
            final int slot = random.nextInt(seen.getAndIncrement());
            if (slot < i) {
                reservoir.set(slot, doc);
            }
        });
        return new MStreamDocumentCollection(StreamingContext.local().stream(reservoir).parallel());
    }

    /**
     * Convenience overload of
     * {@link #significantBigrams(NGramExtractor, int, double, ContingencyTableCalculator)} that
     * scores bigrams with {@code Association.Mikolov}.
     *
     * @param nGramExtractor the extractor used to produce unigrams and bigrams
     * @param i              the minimum count an n-gram must have to be considered
     * @param d              the minimum score a bigram must reach to be returned
     * @return the bigrams meeting the score threshold, mapped to their score
     * @throws NullPointerException if {@code nGramExtractor} is null
     */
    default Counter<Tuple> significantBigrams(@NonNull NGramExtractor nGramExtractor, int i, double d) {
        Objects.requireNonNull(nGramExtractor, "nGramExtractor is marked non-null but is null");
        final ContingencyTableCalculator defaultMeasure = Association.Mikolov;
        return significantBigrams(nGramExtractor, i, d, defaultMeasure);
    }

    /**
     * Finds bigrams whose association score meets a threshold. Unigrams and bigrams are counted
     * with a copy of the extractor restricted to orders 1 through 2, n-grams occurring fewer than
     * {@code i} times are dropped, and each remaining bigram is scored by the calculator on a 2x2
     * contingency table built from the bigram count, the counts of its two component unigrams and
     * the total unigram count.
     *
     * @param nGramExtractor             the extractor used to produce unigrams and bigrams
     * @param i                          the minimum count an n-gram must have to be considered
     * @param d                          the minimum score a bigram must reach to be returned
     * @param contingencyTableCalculator the measure used to score each bigram
     * @return the bigrams meeting the score threshold, mapped to their score
     * @throws NullPointerException if {@code nGramExtractor} or {@code contingencyTableCalculator} is null
     */
    default Counter<Tuple> significantBigrams(@NonNull NGramExtractor nGramExtractor, int i, double d, @NonNull ContingencyTableCalculator contingencyTableCalculator) {
        Objects.requireNonNull(nGramExtractor, "nGramExtractor is marked non-null but is null");
        Objects.requireNonNull(contingencyTableCalculator, "calculator is marked non-null but is null");
        final Counter<Tuple> frequent = nGramCount(nGramExtractor.toBuilder().minOrder(1).maxOrder(2).build())
            .filterByValue(count -> count >= ((double) i));
        final Counter<Tuple> unigrams = frequent.filterByKey(tuple -> tuple.degree() == 1);
        final Counter<Tuple> bigrams = frequent.filterByKey(tuple -> tuple.degree() == 2);
        // The combined counter is no longer needed once it has been split by degree.
        frequent.clear();
        // The unigram total does not change while scoring, so compute it once rather than per bigram.
        final double totalUnigrams = unigrams.sum();
        final Counter<Tuple> significant = Counters.newCounter(new Tuple[0]);
        bigrams.items().forEach(bigram -> {
            final double score = contingencyTableCalculator.calculate(
                ContingencyTable.create2X2(
                    bigrams.get(bigram),
                    unigrams.get(bigram.slice(0, 1)),
                    unigrams.get(bigram.slice(1, 2)),
                    totalUnigrams));
            if (score >= d) {
                significant.set(bigram, score);
            }
        });
        return significant;
    }

    /**
     * Returns the number of documents in this collection, obtained by counting
     * {@link #parallelStream()}.
     *
     * @return the document count
     */
    default long size() {
        final MStream<Document> documents = parallelStream();
        return documents.count();
    }

    /**
     * Returns a stream over the documents of this collection. Within this interface it backs
     * {@code asDataSet}, {@code filter}, {@code isEmpty}, {@code iterator} and both {@code sample}
     * overloads.
     *
     * @return the stream of documents
     */
    MStream<Document> stream();

    /**
     * Counts how often each key produced by the extractor occurs across the collection. The
     * per-document counts of every extraction are merged into a shared accumulator obtained from
     * {@link #getStreamingContext()}. Progress is tracked with a {@code ProgressLogger} named
     * {@code "termCount"}.
     *
     * @param extractor the extractor producing the keys to count
     * @return the accumulated counts
     * @throws NullPointerException if {@code extractor} is null
     */
    default Counter<String> termCount(@NonNull Extractor extractor) {
        Objects.requireNonNull(extractor, "extractor is marked non-null but is null");
        final ProgressLogger progress = ProgressLogger.create(this, "termCount");
        final MCounterAccumulator totals = getStreamingContext().counterAccumulator();
        parallelStream()
            .parallel()
            .forEach(doc -> {
                progress.start();
                totals.merge(extractor.extract(doc).count());
                progress.stop(doc.tokenLength());
            });
        progress.report();
        return (Counter) totals.value();
    }

    /**
     * Applies the given consumer to the documents of this collection. Within this interface,
     * {@code annotate}, both {@code apply} overloads and {@code update(CaduceusProgram)} delegate
     * here, passing the names {@code "Annotate"}, {@code "ApplyLexicon"}, {@code "ApplyTokenRegex"}
     * and {@code "ExecuteCaduceusProgram"} respectively.
     *
     * @param str                  a name for the operation. NOTE(review): presumably used for
     *                             logging or progress reporting; confirm in implementations
     * @param serializableConsumer the action to run on each document; annotated as non-null
     * @return the resulting collection
     */
    DocumentCollection update(String str, @NonNull SerializableConsumer<Document> serializableConsumer);

    /**
     * Executes the given Caduceus program on every document via
     * {@link #update(String, SerializableConsumer)} under the name
     * {@code "ExecuteCaduceusProgram"}.
     *
     * @param caduceusProgram the program to execute against each document
     * @return the result of the update
     * @throws NullPointerException if {@code caduceusProgram} is null
     */
    default DocumentCollection update(@NonNull CaduceusProgram caduceusProgram) {
        final CaduceusProgram program =
            Objects.requireNonNull(caduceusProgram, "program is marked non-null but is null");
        return update("ExecuteCaduceusProgram", program::execute);
    }

    /**
     * Rebuilds the serializable lambdas and method references declared in this interface from
     * their serialized form. The implementation method name selects the candidate; the
     * implementation kind, functional interface, and signatures must then match exactly before
     * the captured arguments are read back and an equivalent lambda is returned.
     *
     * <p>NOTE(review): this is the compiler-generated hook recovered by a decompiler. javac
     * presumably emits its own copy when the serializable lambdas in this interface are compiled,
     * so confirm whether a hand-written copy should be kept before rebuilding from this source.
     *
     * @param serializedLambda the serialized form of the lambda to restore
     * @return the restored lambda or method reference
     * @throws IllegalArgumentException if the serialized form matches none of the known lambdas
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    private static /* synthetic */ Object $deserializeLambda$(SerializedLambda serializedLambda) {
        final int invokeStatic = java.lang.invoke.MethodHandleInfo.REF_invokeStatic;   // 6
        final int invokeVirtual = java.lang.invoke.MethodHandleInfo.REF_invokeVirtual; // 5
        final String consumerClass = "com/gengoai/function/SerializableConsumer";
        final String acceptSignature = "(Ljava/lang/Object;)V";
        final String owner = "com/gengoai/hermes/corpus/DocumentCollection";
        switch (serializedLambda.getImplMethodName()) {
            case "lambda$documentCount$9bda9dc8$1":
                if (matchesSerializedLambda(serializedLambda, invokeStatic, consumerClass, "accept", acceptSignature, owner, "(Lcom/gengoai/hermes/corpus/ProgressLogger;Lcom/gengoai/hermes/extraction/Extractor;Lcom/gengoai/stream/MCounterAccumulator;Lcom/gengoai/hermes/Document;)V")) {
                    ProgressLogger progress = (ProgressLogger) serializedLambda.getCapturedArg(0);
                    Extractor extractor = (Extractor) serializedLambda.getCapturedArg(1);
                    MCounterAccumulator accumulator = (MCounterAccumulator) serializedLambda.getCapturedArg(2);
                    return (SerializableConsumer<Document>) document -> {
                        progress.start();
                        extractor.extract(document).count().forEach((str, d) -> {
                            accumulator.increment(str, 1.0d);
                        });
                        progress.stop(document.tokenLength());
                    };
                }
                break;
            case "lambda$sample$a5d4758c$1":
                if (matchesSerializedLambda(serializedLambda, invokeStatic, consumerClass, "accept", acceptSignature, owner, "(Ljava/util/Random;Ljava/util/concurrent/atomic/AtomicInteger;ILjava/util/List;Lcom/gengoai/hermes/Document;)V")) {
                    Random random = (Random) serializedLambda.getCapturedArg(0);
                    AtomicInteger seen = (AtomicInteger) serializedLambda.getCapturedArg(1);
                    int sampleSize = ((Integer) serializedLambda.getCapturedArg(2)).intValue();
                    List<Document> reservoir = (List<Document>) serializedLambda.getCapturedArg(3);
                    return (SerializableConsumer<Document>) document -> {
                        int slot = random.nextInt(seen.getAndIncrement());
                        if (slot < sampleSize) {
                            reservoir.set(slot, document);
                        }
                    };
                }
                break;
            case "lambda$apply$9f6ce476$1":
                if (matchesSerializedLambda(serializedLambda, invokeStatic, consumerClass, "accept", acceptSignature, owner, "(Lcom/gengoai/hermes/extraction/regex/TokenRegex;Lcom/gengoai/function/SerializableConsumer;Lcom/gengoai/hermes/Document;)V")) {
                    TokenRegex tokenRegex = (TokenRegex) serializedLambda.getCapturedArg(0);
                    SerializableConsumer<TokenMatch> onMatch = (SerializableConsumer<TokenMatch>) serializedLambda.getCapturedArg(1);
                    return (SerializableConsumer<Document>) document -> {
                        TokenMatcher matcher = tokenRegex.matcher(document);
                        while (matcher.find()) {
                            onMatch.accept(matcher.asTokenMatch());
                        }
                    };
                }
                break;
            case "annotate":
                if (matchesSerializedLambda(serializedLambda, invokeVirtual, consumerClass, "accept", acceptSignature, "com/gengoai/hermes/AnnotationPipeline", "(Lcom/gengoai/hermes/Document;)Z")) {
                    AnnotationPipeline pipeline = (AnnotationPipeline) serializedLambda.getCapturedArg(0);
                    return (SerializableConsumer<Document>) pipeline::annotate;
                }
                break;
            case "lambda$termCount$464e58d7$1":
                if (matchesSerializedLambda(serializedLambda, invokeStatic, consumerClass, "accept", acceptSignature, owner, "(Lcom/gengoai/hermes/corpus/ProgressLogger;Lcom/gengoai/stream/MCounterAccumulator;Lcom/gengoai/hermes/extraction/Extractor;Lcom/gengoai/hermes/Document;)V")) {
                    ProgressLogger progress = (ProgressLogger) serializedLambda.getCapturedArg(0);
                    MCounterAccumulator accumulator = (MCounterAccumulator) serializedLambda.getCapturedArg(1);
                    Extractor extractor = (Extractor) serializedLambda.getCapturedArg(2);
                    return (SerializableConsumer<Document>) document -> {
                        progress.start();
                        accumulator.merge(extractor.extract(document).count());
                        progress.stop(document.tokenLength());
                    };
                }
                break;
            case "lambda$apply$cd973a48$1":
                if (matchesSerializedLambda(serializedLambda, invokeStatic, consumerClass, "accept", acceptSignature, owner, "(Lcom/gengoai/hermes/lexicon/Lexicon;Lcom/gengoai/function/SerializableConsumer;Lcom/gengoai/hermes/Document;)V")) {
                    Lexicon lexicon = (Lexicon) serializedLambda.getCapturedArg(0);
                    SerializableConsumer<HString> onMatch = (SerializableConsumer<HString>) serializedLambda.getCapturedArg(1);
                    return (SerializableConsumer<Document>) document -> {
                        lexicon.extract(document).forEach(onMatch);
                    };
                }
                break;
            case "lambda$nGramCount$55ee1fbc$1":
                if (matchesSerializedLambda(serializedLambda, invokeStatic, "com/gengoai/function/SerializableFunction", "apply", "(Ljava/lang/Object;)Ljava/lang/Object;", owner, "(Lcom/gengoai/hermes/corpus/ProgressLogger;Lcom/gengoai/hermes/extraction/NGramExtractor;Lcom/gengoai/hermes/Document;)Ljava/util/stream/Stream;")) {
                    ProgressLogger progress = (ProgressLogger) serializedLambda.getCapturedArg(0);
                    NGramExtractor nGramExtractor = (NGramExtractor) serializedLambda.getCapturedArg(1);
                    return (SerializableFunction<Document, Stream<Tuple>>) document -> {
                        progress.start();
                        Stream<Tuple> tuples = nGramExtractor.extractStringTuples(document).stream();
                        progress.stop(document.tokenLength());
                        return tuples;
                    };
                }
                break;
            case "execute":
                if (matchesSerializedLambda(serializedLambda, invokeVirtual, consumerClass, "accept", acceptSignature, "com/gengoai/hermes/extraction/caduceus/CaduceusProgram", "(Lcom/gengoai/hermes/Document;)V")) {
                    CaduceusProgram program = (CaduceusProgram) serializedLambda.getCapturedArg(0);
                    return (SerializableConsumer<Document>) program::execute;
                }
                break;
            default:
                break;
        }
        throw new IllegalArgumentException("Invalid lambda deserialization");
    }

    /** Checks that a serialized lambda has exactly the expected implementation kind, functional interface and signatures. */
    private static boolean matchesSerializedLambda(SerializedLambda lambda, int implMethodKind, String functionalInterfaceClass, String functionalInterfaceMethod, String functionalInterfaceSignature, String implClass, String implSignature) {
        return lambda.getImplMethodKind() == implMethodKind
            && lambda.getFunctionalInterfaceClass().equals(functionalInterfaceClass)
            && lambda.getFunctionalInterfaceMethodName().equals(functionalInterfaceMethod)
            && lambda.getFunctionalInterfaceMethodSignature().equals(functionalInterfaceSignature)
            && lambda.getImplClass().equals(implClass)
            && lambda.getImplMethodSignature().equals(implSignature);
    }
}
