package cc.mallet.classify.tui;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.FeatureSequence2AugmentableFeatureVector;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.PrintInputAndTarget;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequence2FeatureSequenceWithBigrams;
import cc.mallet.pipe.TokenSequenceLowercase;
import cc.mallet.pipe.TokenSequenceRemoveNonAlpha;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.CsvIterator;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/* loaded from: input_file:cc/mallet/classify/tui/Csv2Vectors.class */
/**
 * Command-line tool that reads a text file containing one instance per line, splits each
 * line into name / label / data groups with a regular expression, runs the data through a
 * pipe, and serializes the resulting {@link InstanceList}.
 *
 * <p>The pipe is either built from the command-line options or, when {@code --use-pipe-from}
 * is given, taken from a previously saved instance list.
 */
public class Csv2Vectors {
    private static final Logger logger = MalletLogger.getLogger(Csv2Vectors.class.getName());
    static CommandOption.File inputFile = new CommandOption.File(Csv2Vectors.class, "input", "FILE", true, null, "The file containing data to be classified, one instance per line", null);
    static CommandOption.File outputFile = new CommandOption.File(Csv2Vectors.class, "output", "FILE", true, new File("text.vectors"), "Write the instance list to this file; Using - indicates stdout.", null);
    static CommandOption.String lineRegex = new CommandOption.String(Csv2Vectors.class, "line-regex", "REGEX", true, "^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$", "Regular expression containing regex-groups for label, name and data.", null);
    static CommandOption.Integer labelOption = new CommandOption.Integer(Csv2Vectors.class, "label", "INTEGER", true, 2, "The index of the group containing the label string.\n   Use 0 to indicate that the label field is not used.", null);
    static CommandOption.Integer nameOption = new CommandOption.Integer(Csv2Vectors.class, "name", "INTEGER", true, 1, "The index of the group containing the instance name.\n   Use 0 to indicate that the name field is not used.", null);
    static CommandOption.Integer dataOption = new CommandOption.Integer(Csv2Vectors.class, "data", "INTEGER", true, 3, "The index of the group containing the data.", null);
    static CommandOption.File usePipeFromVectorsFile = new CommandOption.File(Csv2Vectors.class, "use-pipe-from", "FILE", true, new File("text.vectors"), "Use the pipe and alphabets from a previously created vectors file.\n   Allows the creation, for example, of a test set of vectors that are\n   compatible with a previously created set of training vectors", null);
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean(Csv2Vectors.class, "keep-sequence", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);
    static CommandOption.Boolean keepSequenceBigrams = new CommandOption.Boolean(Csv2Vectors.class, "keep-sequence-bigrams", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequenceWithBigrams rather than a FeatureVector.", null);
    static CommandOption.Boolean removeStopWords = new CommandOption.Boolean(Csv2Vectors.class, "remove-stopwords", "[TRUE|FALSE]", false, false, "If true, remove a default list of common English \"stop words\" from the text.", null);
    static CommandOption.File stoplistFile = new CommandOption.File(Csv2Vectors.class, "stoplist-file", "FILE", true, null, "Instead of the default list, read stop words from a file, one per line. Implies --remove-stopwords", null);
    static CommandOption.File extraStopwordsFile = new CommandOption.File(Csv2Vectors.class, "extra-stopwords", "FILE", true, null, "Read whitespace-separated words from this file, and add them to either \n   the default English stoplist or the list specified by --stoplist-file.", null);
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean(Csv2Vectors.class, "preserve-case", "[TRUE|FALSE]", false, false, "If true, do not force all strings to lowercase.", null);
    static CommandOption.String encoding = new CommandOption.String(Csv2Vectors.class, "encoding", "STRING", true, Charset.defaultCharset().displayName(), "Character encoding for input file", null);
    static CommandOption.String tokenRegex = new CommandOption.String(Csv2Vectors.class, "token-regex", "REGEX", true, CharSequenceLexer.LEX_ALPHA.toString(), "Regular expression used for tokenization.\n   Example: \"[\\p{L}\\p{N}_]+|[\\p{P}]+\" (unicode letters, numbers and underscore OR all punctuation) ", null);
    static CommandOption.Boolean printOutput = new CommandOption.Boolean(Csv2Vectors.class, "print-output", "[TRUE|FALSE]", false, false, "If true, print a representation of the processed data\n   to standard output. This option is intended for debugging.", null);

    /**
     * Entry point: parses the options, builds or loads the pipe, imports the input and
     * writes the serialized instance list.
     *
     * @param strArr command-line arguments; when empty, usage is printed and the JVM exits
     *               with status -1
     * @throws IllegalArgumentException if no input file was given, or if the token or line
     *                                  regular expression is invalid
     * @throws FileNotFoundException    if the input file cannot be opened for reading or an
     *                                  output file cannot be opened for writing
     * @throws IOException              on any other I/O failure, including an unsupported
     *                                  {@code --encoding}
     */
    public static void main(String[] strArr) throws FileNotFoundException, IOException {
        CommandOption.setSummary(Csv2Vectors.class, "A tool for creating instance lists of feature vectors from comma-separated-values");
        CommandOption.process(Csv2Vectors.class, strArr);
        if (strArr.length == 0) {
            CommandOption.getList(Csv2Vectors.class).printUsage(false);
            System.exit(-1);
        }
        // The option object itself is a static field and never null; what can be
        // missing is its value (the declared default is null).
        if (inputFile.value == null) {
            throw new IllegalArgumentException("You must include `--input FILE ...' in order to specify a file containing the instances, one per line.");
        }

        InstanceList previousInstanceList = null;
        Pipe instancePipe;
        if (usePipeFromVectorsFile.wasInvoked()) {
            previousInstanceList = InstanceList.load(usePipeFromVectorsFile.value);
            instancePipe = previousInstanceList.getPipe();
        } else {
            instancePipe = buildPipe();
        }

        InstanceList instances = new InstanceList(instancePipe);
        InputStreamReader reader = openInput();
        try {
            instances.addThruPipe(new CsvIterator(reader, Pattern.compile(lineRegex.value), dataOption.value, labelOption.value, nameOption.value));
        } finally {
            reader.close();
        }

        ObjectOutputStream instancesOut = outputFile.value.toString().equals("-")
                ? new ObjectOutputStream(System.out)
                : new ObjectOutputStream(new FileOutputStream(outputFile.value));
        writeAndClose(instancesOut, instances);

        if (usePipeFromVectorsFile.wasInvoked()) {
            // The previously saved list is written back to its own file after the new data
            // has gone through its pipe (presumably because the shared pipe's alphabets may
            // have grown during import -- confirm against Pipe/Alphabet behavior).
            System.out.println(" Rewriting extended pipe from " + usePipeFromVectorsFile.value);
            System.out.println("  Instance ID = " + previousInstanceList.getPipe().getInstanceId());
            writeAndClose(new ObjectOutputStream(new FileOutputStream(usePipeFromVectorsFile.value)), previousInstanceList);
        }
    }

    /**
     * Assembles the processing pipe from the command-line options: optional label
     * conversion, tokenization, optional lowercasing, optional stop-word removal, and the
     * final feature sequence / feature vector conversion.
     *
     * @return the serial pipe described by the current option values
     * @throws IllegalArgumentException if {@code --token-regex} is not a valid pattern
     */
    private static Pipe buildPipe() {
        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        if (labelOption.value > 0) {
            pipes.add(new Target2Label());
        }

        // With bigrams the tokenizer is fixed and --token-regex is not consulted.
        Pattern tokenPattern;
        if (keepSequenceBigrams.value) {
            tokenPattern = CharSequenceLexer.LEX_NONWHITESPACE_CLASSES;
        } else {
            try {
                tokenPattern = Pattern.compile(tokenRegex.value);
            } catch (PatternSyntaxException e) {
                // Keep the original exception as the cause so the stack trace is not lost.
                throw new IllegalArgumentException("The token regular expression (" + tokenRegex.value + ") was invalid: " + e.getMessage(), e);
            }
        }
        pipes.add(new CharSequence2TokenSequence(tokenPattern));

        if (!preserveCase.value()) {
            pipes.add(new TokenSequenceLowercase());
        }
        if (keepSequenceBigrams.value) {
            pipes.add(new TokenSequenceRemoveNonAlpha(true));
        }

        // An explicit stop list takes precedence over the default list; either one can be
        // extended with --extra-stopwords.
        TokenSequenceRemoveStopwords stopwordFilter = null;
        if (stoplistFile.wasInvoked()) {
            stopwordFilter = new TokenSequenceRemoveStopwords(stoplistFile.value, encoding.value, false, false, keepSequenceBigrams.value);
        } else if (removeStopWords.value) {
            stopwordFilter = new TokenSequenceRemoveStopwords(false, keepSequenceBigrams.value);
        }
        if (stopwordFilter != null) {
            if (extraStopwordsFile.wasInvoked()) {
                stopwordFilter.addStopWords(extraStopwordsFile.value);
            }
            pipes.add(stopwordFilter);
        }

        if (keepSequenceBigrams.value) {
            pipes.add(new TokenSequence2FeatureSequenceWithBigrams());
        } else {
            pipes.add(new TokenSequence2FeatureSequence());
            if (!keepSequence.value) {
                pipes.add(new FeatureSequence2AugmentableFeatureVector());
            }
        }
        if (printOutput.value) {
            pipes.add(new PrintInputAndTarget());
        }
        return new SerialPipes(pipes);
    }

    /**
     * Opens the input selected by {@code --input}, decoding it with {@code --encoding}.
     * An input of {@code -} selects standard input.
     *
     * @return a reader over the input; the caller is responsible for closing it
     * @throws IOException if the file cannot be opened or the encoding is not supported
     */
    private static InputStreamReader openInput() throws IOException {
        if (inputFile.value.toString().equals("-")) {
            // Honor --encoding for standard input as well as for files.
            return new InputStreamReader(System.in, encoding.value);
        }
        return new InputStreamReader(new FileInputStream(inputFile.value), encoding.value);
    }

    /**
     * Serializes {@code instances} to {@code out} and closes the stream, even when the
     * write fails.
     *
     * @param out       destination stream; always closed on return
     * @param instances the instance list to serialize
     * @throws IOException if writing or closing fails
     */
    private static void writeAndClose(ObjectOutputStream out, InstanceList instances) throws IOException {
        try {
            out.writeObject(instances);
        } finally {
            out.close();
        }
    }
}
