package com.gengoai.hermes.ml.trainer;

import com.gengoai.LogUtils;
import com.gengoai.Stopwatch;
import com.gengoai.Tag;
import com.gengoai.apollo.ml.DataSet;
import com.gengoai.apollo.ml.feature.Featurizer;
import com.gengoai.apollo.ml.model.FitParameters;
import com.gengoai.apollo.ml.model.Model;
import com.gengoai.apollo.ml.model.sequence.Crf;
import com.gengoai.conversion.Cast;
import com.gengoai.hermes.Types;
import com.gengoai.hermes.corpus.DocumentCollection;
import com.gengoai.hermes.ml.HStringDataSetGenerator;
import com.gengoai.hermes.ml.feature.Features;
import com.gengoai.hermes.ml.feature.PredefinedFeatures;
import com.gengoai.hermes.morphology.PartOfSpeech;
import com.gengoai.hermes.morphology.PennTreeBank;
import com.gengoai.hermes.morphology.StandardTokenizer;
import java.lang.invoke.SerializedLambda;
import java.util.Set;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import lombok.NonNull;

/* loaded from: input_file:com/gengoai/hermes/ml/trainer/PhraseChunkTrainer.class */
/**
 * Trainer for an IOB tagger producing {@link Types#PHRASE_CHUNK} annotations, backed by a {@link Crf}
 * sequence labeler.
 *
 * <p>The token features are the lower-cased word, the part-of-speech, and the word class, combined with
 * the context templates declared in {@link #addInputs(HStringDataSetGenerator.Builder)}.</p>
 */
public class PhraseChunkTrainer extends IOBTaggerTrainer {
    private static final Logger log = Logger.getLogger(PhraseChunkTrainer.class.getName());

    /**
     * Creates a trainer whose target annotation type is {@link Types#PHRASE_CHUNK}.
     */
    public PhraseChunkTrainer() {
        super(Types.PHRASE_CHUNK);
    }

    /**
     * Registers the token-sequence input named {@code "input"} on the given builder.
     *
     * @param builder the data set generator builder to add the input to
     */
    @Override // com.gengoai.hermes.ml.trainer.IOBTaggerTrainer
    protected void addInputs(HStringDataSetGenerator.Builder builder) {
        builder.tokenSequence(
                "input",
                Featurizer.chain(new Featurizer[]{
                        Features.LowerCaseWord,
                        Features.PartOfSpeech,
                        Features.WordClass
                }).withContext(new String[]{
                        // Word-based context templates.
                        PredefinedFeatures.lenientContext(Features.LowerCaseWord, -1),
                        PredefinedFeatures.strictContext(Features.LowerCaseWord, -1, Features.LowerCaseWord, 0),
                        PredefinedFeatures.lenientContext(Features.LowerCaseWord, -2),
                        PredefinedFeatures.lenientContext(Features.LowerCaseWord, 1),
                        PredefinedFeatures.strictContext(Features.LowerCaseWord, 0, Features.LowerCaseWord, 1),
                        PredefinedFeatures.lenientContext(Features.LowerCaseWord, 2),
                        // Part-of-speech-based context templates (some paired with the word).
                        PredefinedFeatures.lenientContext(Features.PartOfSpeech, -1),
                        PredefinedFeatures.strictContext(Features.PartOfSpeech, -1, Features.LowerCaseWord, 0),
                        PredefinedFeatures.strictContext(Features.PartOfSpeech, -1, Features.PartOfSpeech, 0),
                        PredefinedFeatures.lenientContext(Features.PartOfSpeech, -2),
                        PredefinedFeatures.lenientContext(Features.PartOfSpeech, 1),
                        PredefinedFeatures.lenientContext(Features.PartOfSpeech, 2),
                        PredefinedFeatures.strictContext(Features.PartOfSpeech, 0, Features.LowerCaseWord, 1),
                        PredefinedFeatures.strictContext(Features.PartOfSpeech, 0, Features.PartOfSpeech, 1)
                }));
    }

    /**
     * Builds the training data set from the given documents.
     *
     * <p>Every document is first marked as not having {@link Types#PART_OF_SPEECH} completed, then the
     * collection is annotated with {@link Types#PART_OF_SPEECH} and converted using this trainer's example
     * generator. NOTE(review): presumably this forces part-of-speech tags to be regenerated rather than
     * reusing tags already present on the documents - confirm against {@code setUncompleted}.</p>
     *
     * @param data the documents to generate the data set from
     * @return the generated data set
     * @throws NullPointerException if {@code data} is null
     */
    @Override // com.gengoai.hermes.ml.trainer.SequenceTaggerTrainer
    public DataSet createDataset(@NonNull DocumentCollection data) {
        if (data == null) {
            throw new NullPointerException("data is marked non-null but is null");
        }
        Stopwatch timer = Stopwatch.createStarted();
        // The lambda targets a serializable functional interface; the compiler generates the
        // matching $deserializeLambda$ support method, so none is declared by hand in this class.
        DataSet dataSet = data.update("RemovePartOfSpeech", document -> {
                                  document.setUncompleted(Types.PART_OF_SPEECH);
                              })
                              .annotate(Types.PART_OF_SPEECH)
                              .asDataSet(getExampleGenerator());
        timer.stop();
        LogUtils.logFine(log, "Took {0} to create dataset.", new Object[]{timer});
        return dataSet;
    }

    /**
     * Creates the {@link Crf} model that is fit on the generated data set.
     *
     * @param fitParameters the fit parameters, cast to {@link Crf.Parameters}
     * @return a new {@link Crf} constructed with the given parameters
     */
    @Override // com.gengoai.hermes.ml.trainer.SequenceTaggerTrainer
    protected Model createSequenceLabeler(FitParameters<?> fitParameters) {
        return new Crf((Crf.Parameters) Cast.as(fitParameters));
    }

    /**
     * Supplies the default fit parameters: {@link Crf.Parameters} with {@code minFeatureFreq} set to 5.
     *
     * @return the default fit parameters for this trainer
     */
    @Override // com.gengoai.hermes.ml.trainer.SequenceTaggerTrainer
    public FitParameters<?> getFitParameters() {
        return new Crf.Parameters().update(parameters -> {
            parameters.minFeatureFreq.set(5);
        });
    }

    /**
     * Collects the tags this trainer treats as valid: the {@code tag()} of every {@link PartOfSpeech}
     * value that is a phrase tag and is not an instance of {@link PennTreeBank#INTJ},
     * {@link PennTreeBank#LST}, or {@link PennTreeBank#UCP}.
     *
     * @return the set of valid tag names
     */
    @Override // com.gengoai.hermes.ml.trainer.IOBTaggerTrainer
    protected Set<String> getValidTags() {
        return PartOfSpeech.values()
                           .stream()
                           .filter(PartOfSpeech::isPhraseTag)
                           .filter(pos -> !pos.isInstance(new Tag[]{PennTreeBank.INTJ, PennTreeBank.LST, PennTreeBank.UCP}))
                           .map(PartOfSpeech::tag)
                           .collect(Collectors.toSet());
    }
}
