package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;

import com.wcohen.secondstring.AbstractStatisticalTokenDistance;
import com.wcohen.secondstring.Jaccard;
import com.wcohen.secondstring.StringDistance;
import com.wcohen.secondstring.TFIDF;
import com.wcohen.secondstring.tokens.NGramTokenizer;
import com.wcohen.secondstring.tokens.SimpleTokenizer;
import edu.umass.cs.mallet.base.fst.CRF;
import edu.umass.cs.mallet.base.pipe.Pipe;
import edu.umass.cs.mallet.base.pipe.SerialPipes;
import edu.umass.cs.mallet.base.pipe.Target2Label;
import edu.umass.cs.mallet.base.pipe.iterator.FileIterator;
import edu.umass.cs.mallet.base.types.InstanceList;
import edu.umass.cs.mallet.base.util.CommandOption;
import edu.umass.cs.mallet.base.util.MalletLogger;
import edu.umass.cs.mallet.base.util.RegexFileFilter;
import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/umass/cs/mallet/projects/seg_plus_coref/coreference/ClusterPapersAndVenues.class */
public class ClusterPapersAndVenues {
    // ---- Shared static state -------------------------------------------------
    // NOTE(review): `pipe` is not referenced by any method visible in this class.
    private static Pipe pipe;
    // Single extraction interface, loaded by loadCRFs() when use-crf is on and
    // use-multiple-crfs is off; passed to CitationUtils.computeNodes in that mode.
    private static IEInterface ieInterface;
    // Per-segment extraction interfaces used when use-multiple-crfs is on:
    // 1-3 are passed for training dirs 1-3, 4 is passed for the testing dir.
    private static IEInterface ieInterface1;
    private static IEInterface ieInterface2;
    private static IEInterface ieInterface3;
    private static IEInterface ieInterface4;
    // NOTE(review): the next four distance fields are never assigned or read in
    // the methods visible in this class.
    private static StringDistance softtfidfPaper;
    private static StringDistance tfidfPaper;
    private static StringDistance tfidfVenue;
    private static Jaccard distanceMetricEditDistPaper;
    // Set in main() from getDistanceMetric() over the paper / venue training nodes
    // and handed to the corresponding feature pipe.
    private static StringDistance triGramDistanceMetricPaper;
    private static StringDistance triGramDistanceMetricVenue;
    // NOTE(review): SEPERATOR and crf are not referenced by the methods visible here.
    private static String[] SEPERATOR = {"<NEW_HEADER>", "<NEWREFERENCE>"};
    private static CRF crf = null;

    // ---- Command-line options --------------------------------------------------
    // Every option below is registered in `commandOptions` further down.
    // NOTE(review): the two boolean flags below use "FILENAME" as their argument
    // label, unlike the other boolean flags which use "BOOL" - looks copy-pasted.
    static CommandOption.Boolean fullPartition = new CommandOption.Boolean(ClusterPapersAndVenues.class, "full-partition", "FILENAME", false, false, "Use full partitioninig", null);
    static CommandOption.Boolean useWeightedAvg = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-weighted-avg", "FILENAME", false, false, "Use weighted average", null);
    // A non-null value makes getTrainingList() return an empty list and makes
    // main() throw UnsupportedOperationException ("Loading MaxEnt not implemented yet").
    static CommandOption.String loadMEFile = new CommandOption.String(ClusterPapersAndVenues.class, "load-me-file", "FILENAME", true, null, "The name of the MaxEnt model file.", null);
    // When non-null, main() writes each joint clustering to "<value>_<index>".
    static CommandOption.String outputFile = new CommandOption.String(ClusterPapersAndVenues.class, "output-file", "FILENAME", true, null, "The name of the file where output clusters will be printed to.", null);
    // CRF model paths read by loadCRFs(): the unnumbered one in single-CRF mode,
    // the numbered ones (1-4) in multiple-CRF mode.
    static CommandOption.String crfInputFile = new CommandOption.String(ClusterPapersAndVenues.class, "crf-input-file", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null);
    static CommandOption.String crfInputFile1 = new CommandOption.String(ClusterPapersAndVenues.class, "crf-input-file-1", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null);
    static CommandOption.String crfInputFile2 = new CommandOption.String(ClusterPapersAndVenues.class, "crf-input-file-2", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null);
    static CommandOption.String crfInputFile3 = new CommandOption.String(ClusterPapersAndVenues.class, "crf-input-file-3", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null);
    static CommandOption.String crfInputFile4 = new CommandOption.String(ClusterPapersAndVenues.class, "crf-input-file-4", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null);
    // Gates loadCRFs(): nothing is loaded unless this flag is set.
    static CommandOption.Boolean useCRF = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-crf", "BOOL", false, false, "Use CRF or not.", null);
    static CommandOption.Boolean useMultipleCRFs = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-multiple-crfs", "BOOL", false, false, "Use a separate crf for each data segment or not.", null);
    // Setting this flag currently always ends in an UnsupportedOperationException.
    static CommandOption.Boolean useTreeModel = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-tree-model", "BOOL", false, false, "Use and train tree model.", null);
    // Forwarded as the last argument of MultipleCorefClusterer.clusterMentions().
    static CommandOption.Boolean useCorrelational = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-correlational", "BOOL", false, false, "Use Correlational Clustering or not, if not uses Greedy.", null);
    // NOTE(review): registered but not read by any method visible in this class.
    static CommandOption.Boolean useFeatureInduction = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-feature-induction", "BOOL", false, false, "Use Feature Induction or Not.", null);
    static CommandOption.Boolean useNBest = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-n-best", "BOOL", false, false, "Use NBest or not.", null);
    // NOTE(review): the help text of the next two flags repeats "Use NBest or not."
    // although they feed setTrueNumStop() and setOptimality() - presumably copy-pasted.
    static CommandOption.Boolean useTrueNumClusters = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-true-num-clusters", "BOOL", false, false, "Use NBest or not.", null);
    static CommandOption.Boolean useOptimal = new CommandOption.Boolean(ClusterPapersAndVenues.class, "use-optimal", "BOOL", false, false, "Use NBest or not.", null);
    // NOTE(review): optimalNBest is registered but not read in the visible methods;
    // rBeamSize reuses its help text even though it feeds setRBeamSize().
    static CommandOption.Integer optimalNBest = new CommandOption.Integer(ClusterPapersAndVenues.class, "optimal-n-best", "INTEGER", true, -1, "Size of n, for searching for optimal n-best configuration.", null);
    static CommandOption.Integer rBeamSize = new CommandOption.Integer(ClusterPapersAndVenues.class, "r-beam-size", "INTEGER", true, 10, "Size of n, for searching for optimal n-best configuration.", null);
    // Training directories: dir 1 is always read, dirs 2 and 3 only when non-null.
    static CommandOption.String trainingDir1 = new CommandOption.String(ClusterPapersAndVenues.class, "training-dir-1", "FILENAME", true, null, "Directory containing training files.", null);
    static CommandOption.String trainingDir2 = new CommandOption.String(ClusterPapersAndVenues.class, "training-dir-2", "FILENAME", true, null, "Directory containing training files.", null);
    static CommandOption.String trainingDir3 = new CommandOption.String(ClusterPapersAndVenues.class, "training-dir-3", "FILENAME", true, null, "Directory containing training files.", null);
    static CommandOption.String testingDir = new CommandOption.String(ClusterPapersAndVenues.class, "testing-dir", "FILENAME", true, null, "Directory containing testing files.", null);
    // Both values are passed together to MultipleCorefClusterer.setSearchParams().
    static CommandOption.Integer searchIters = new CommandOption.Integer(ClusterPapersAndVenues.class, "search-iters", "INTEGER", true, 3, "Number of search iterations.", null);
    static CommandOption.Integer searchReductions = new CommandOption.Integer(ClusterPapersAndVenues.class, "search-reductions", "INTEGER", true, 5, "Number of search reductions.", null);
    // Both values are forwarded unchanged to CitationUtils.computeNodes().
    static CommandOption.Integer numNBest = new CommandOption.Integer(ClusterPapersAndVenues.class, "num-n-best", "INTEGER", true, 3, "Number of n-best candidates to store.", null);
    static CommandOption.Integer nthViterbi = new CommandOption.Integer(ClusterPapersAndVenues.class, "nth-viterbi", "INTEGER", true, 0, "Number of n-best candidates to use .", null);
    // Its negation is the boolean passed to computeNodes() for training data.
    static CommandOption.Boolean trainUsingLabeled = new CommandOption.Boolean(ClusterPapersAndVenues.class, "train-using-labeled", "BOOL", true, false, "Train just using the labeled data, but test on CRF output", null);
    // The option set that main() processes and logs.
    static final CommandOption.List commandOptions = new CommandOption.List("Training, testing and running information extraction on paper header or reference.", new CommandOption[]{useWeightedAvg, trainUsingLabeled, rBeamSize, loadMEFile, useTreeModel, fullPartition, outputFile, useOptimal, crfInputFile, crfInputFile1, crfInputFile2, crfInputFile3, crfInputFile4, useCRF, useMultipleCRFs, useFeatureInduction, useCorrelational, useNBest, optimalNBest, useTrueNumClusters, trainingDir1, trainingDir2, trainingDir3, testingDir, searchIters, searchReductions, numNBest, nthViterbi});
    // Class-wide logger obtained from MalletLogger.
    private static Logger logger = MalletLogger.getLogger(ClusterPapersAndVenues.class.getName());

    /**
     * Command-line entry point.
     *
     * <p>Processes the options, loads the CRF(s), builds training and testing
     * nodes for papers and then for venues, builds one feature pipe per type,
     * trains three clusterers (joint paper+venue, paper only, venue only),
     * clusters the testing mentions with each and evaluates the results against
     * the key collections built from the testing nodes. Joint clusterings are
     * also written to disk when {@code output-file} is set.
     *
     * @param strArr command-line arguments handed to {@code commandOptions}
     * @throws FileNotFoundException declared for callers; not thrown directly here
     * @throws UnsupportedOperationException if {@code use-tree-model} is set or
     *         {@code load-me-file} is non-null
     */
    public static void main(String[] strArr) throws FileNotFoundException {
        commandOptions.process(strArr);
        commandOptions.logOptions(logger);
        loadCRFs();

        // ---- paper mentions ----
        logger.info("Creating Paper Nodes");
        ArrayList[] paperTrainSets = createNodesFromTraining(CitationUtils.PAPER);
        ArrayList paperTestNodes = createNodesFromTesting(CitationUtils.PAPER);
        ArrayList paperTrainNodes = new ArrayList();
        for (int k = 0; k < paperTrainSets.length; k++) {
            paperTrainNodes.addAll(paperTrainSets[k]);
        }
        System.out.println("finished computing nodes for PAPER, about to compute distanceMetric params ");
        triGramDistanceMetricPaper = getDistanceMetric(paperTrainNodes);
        Pipe paperPipe = getPaperPipe(CitationUtils.computeDistanceMetric(paperTrainNodes), triGramDistanceMetricPaper);
        InstanceList paperTrainPairs = getTrainingList(paperTrainSets, paperPipe);
        InstanceList paperTestPairs = CitationUtils.makePairs(paperPipe, paperTestNodes);
        Collection paperKey = CitationUtils.makeCollections(paperTestNodes);

        // ---- venue mentions ----
        logger.info("Creating Venue Nodes");
        ArrayList[] venueTrainSets = createNodesFromTraining(CitationUtils.VENUE);
        ArrayList venueTestNodes = createNodesFromTesting(CitationUtils.VENUE);
        ArrayList venueTrainNodes = new ArrayList();
        for (int k = 0; k < venueTrainSets.length; k++) {
            venueTrainNodes.addAll(venueTrainSets[k]);
        }
        System.out.println("finished computing nodes for VENUE, about to compute distanceMetric params ");
        triGramDistanceMetricVenue = getDistanceMetric(venueTrainNodes);
        Pipe venuePipe = getVenuePipe(CitationUtils.computeDistanceMetric(venueTrainNodes), triGramDistanceMetricVenue);
        InstanceList venueTrainPairs = getTrainingList(venueTrainSets, venuePipe);
        InstanceList venueTestPairs = CitationUtils.makePairs(venuePipe, venueTestNodes);
        Collection venueKey = CitationUtils.makeCollections(venueTestNodes);

        if (useTreeModel.value()) {
            throw new UnsupportedOperationException("Tree model not supported yet.");
        }

        // ---- build and configure the three clusterers ----
        MultipleCorefClusterer jointClusterer = new MultipleCorefClusterer(new Pipe[]{paperPipe, venuePipe});
        MultipleCorefClusterer paperClusterer = new MultipleCorefClusterer(new Pipe[]{paperPipe});
        MultipleCorefClusterer venueClusterer = new MultipleCorefClusterer(new Pipe[]{venuePipe});
        initializeClusterer(jointClusterer);
        initializeClusterer(paperClusterer);
        initializeClusterer(venueClusterer);
        if (loadMEFile.value() != null) {
            throw new UnsupportedOperationException("Loading MaxEnt not implemented yet");
        }

        // ---- train, then score the pairwise classifiers on the testing pairs ----
        jointClusterer.train(new InstanceList[]{paperTrainPairs, venueTrainPairs});
        paperClusterer.train(new InstanceList[]{paperTrainPairs});
        venueClusterer.train(new InstanceList[]{venueTrainPairs});
        jointClusterer.testClassifiers(new InstanceList[]{paperTestPairs, venueTestPairs});
        paperClusterer.testClassifiers(new InstanceList[]{paperTestPairs});
        venueClusterer.testClassifiers(new InstanceList[]{venueTestPairs});

        // Only the joint clusterer receives the paper key partitioning.
        Collection[] jointKeys = {paperKey, venueKey};
        jointClusterer.setKeyPartitioning(paperKey);

        // ---- joint clustering: evaluate (and optionally dump) every returned clustering ----
        Collection[] jointClusters = jointClusterer.clusterMentions(
                new InstanceList[]{paperTestPairs, venueTestPairs},
                new List[]{paperTestNodes, venueTestNodes},
                -1,
                useCorrelational.value());
        logger.info("Evaluating " + jointClusters.length + " type(s) of clusterings");
        for (int type = 0; type < jointClusters.length; type++) {
            CitationUtils.evaluateClustering(jointKeys[type], jointClusters[type], type + " JOINT COREFERENCE RESULTS");
            if (outputFile.value() != null) {
                printClustersToFile(jointClusters[type], outputFile.value() + "_" + type);
            }
        }

        // ---- solo clusterings, for comparison with the joint result ----
        Collection[] paperClusters = paperClusterer.clusterMentions(
                new InstanceList[]{paperTestPairs},
                new List[]{paperTestNodes},
                -1,
                useCorrelational.value());
        Collection[] venueClusters = venueClusterer.clusterMentions(
                new InstanceList[]{venueTestPairs},
                new List[]{venueTestNodes},
                -1,
                useCorrelational.value());
        CitationUtils.evaluateClustering(paperKey, paperClusters[0], "SOLO PAPER COREFERENCE RESULTS");
        CitationUtils.evaluateClustering(venueKey, venueClusters[0], "SOLO VENUE COREFERENCE RESULTS");
    }

    /**
     * Copies the clustering-related command-line option values onto the given
     * clusterer through its setters.
     *
     * @param multipleCorefClusterer the clusterer to configure
     */
    private static void initializeClusterer(MultipleCorefClusterer multipleCorefClusterer) {
        // Boolean / beam switches, applied in the established order.
        multipleCorefClusterer.setTrueNumStop(useTrueNumClusters.value());
        multipleCorefClusterer.setConfWeightedScores(useWeightedAvg.value());
        multipleCorefClusterer.setOptimality(useOptimal.value());
        multipleCorefClusterer.setRBeamSize(rBeamSize.value());
        multipleCorefClusterer.setNBestInference(useNBest.value());
        multipleCorefClusterer.setFullPartition(fullPartition.value());

        // Search parameters travel together in a single call.
        final int iterations = searchIters.value();
        final int reductions = searchReductions.value();
        multipleCorefClusterer.setSearchParams(iterations, reductions);
    }

    /**
     * Loads the trained CRF(s) named on the command line into the static
     * {@code ieInterface*} fields.
     *
     * <p>Does nothing unless {@code use-crf} is set. In single-CRF mode only
     * {@code ieInterface} is populated, from {@code crf-input-file}. In
     * multiple-CRF mode {@code ieInterface1}..{@code ieInterface4} are populated
     * from {@code crf-input-file-1}..{@code crf-input-file-4}.
     *
     * @throws IllegalArgumentException if a CRF file option needed by the
     *         selected mode was not supplied
     */
    private static void loadCRFs() {
        if (!useCRF.value()) {
            return;
        }
        if (!useMultipleCRFs.value()) {
            ieInterface = loadIEInterface(crfInputFile.value(), "crf-input-file");
            return;
        }
        System.out.println("Initializing CRF");
        ieInterface1 = loadIEInterface(crfInputFile1.value(), "crf-input-file-1");
        ieInterface2 = loadIEInterface(crfInputFile2.value(), "crf-input-file-2");
        ieInterface3 = loadIEInterface(crfInputFile3.value(), "crf-input-file-3");
        ieInterface4 = loadIEInterface(crfInputFile4.value(), "crf-input-file-4");
    }

    /**
     * Creates an {@link IEInterface} for the CRF file at {@code path} and loads
     * the CRF into it.
     *
     * @param path       value of the CRF file option; may be {@code null} when
     *                   the option was not given
     * @param optionName option name used in the error message
     * @return the interface with its CRF loaded
     * @throws IllegalArgumentException if {@code path} is {@code null}
     */
    private static IEInterface loadIEInterface(String path, String optionName) {
        // Without this check an unset option surfaces as a bare
        // NullPointerException from new File(null), with no hint which option is missing.
        if (path == null) {
            throw new IllegalArgumentException(
                    "Option '" + optionName + "' must be set when 'use-crf' is enabled in this mode");
        }
        File crfFile = new File(path);
        IEInterface loaded = new IEInterface(crfFile);
        loaded.loadCRF(crfFile);
        return loaded;
    }

    /**
     * Computes the training nodes of the given type for every configured
     * training directory.
     *
     * <p>{@code training-dir-1} is always read; {@code training-dir-2} and
     * {@code training-dir-3} are read only when set. In multiple-CRF mode
     * directory <i>k</i> is processed with {@code ieInterface<k>}; otherwise
     * all directories use {@code ieInterface}.
     *
     * <p>The returned array holds one node list per directory actually read,
     * in directory order, and never contains {@code null}. (Previously, setting
     * directory 3 without directory 2 returned {@code {dir1Nodes, null}} and
     * dropped the directory-3 nodes.)
     *
     * @param str node type passed through to {@code CitationUtils.computeNodes}
     *            ({@code main} passes {@code CitationUtils.PAPER} or
     *            {@code CitationUtils.VENUE})
     * @return one list of nodes per training directory that was read
     * @throws UnsupportedOperationException if {@code use-tree-model} is set
     */
    private static ArrayList[] createNodesFromTraining(String str) {
        // All iterators are created up front, before any nodes are computed.
        FileIterator files1 = newTrainingFileIterator(trainingDir1.value());
        FileIterator files2 = trainingDir2.value() != null ? newTrainingFileIterator(trainingDir2.value()) : null;
        FileIterator files3 = trainingDir3.value() != null ? newTrainingFileIterator(trainingDir3.value()) : null;

        boolean perSegmentCrf = useMultipleCRFs.value();
        List<ArrayList> nodeLists = new ArrayList<ArrayList>(3);

        nodeLists.add(computeTrainingNodes(files1, 1, perSegmentCrf ? ieInterface1 : ieInterface, str));
        if (files2 != null) {
            nodeLists.add(computeTrainingNodes(files2, 2, perSegmentCrf ? ieInterface2 : ieInterface, str));
        }
        if (files3 != null) {
            ArrayList thirdNodes = computeTrainingNodes(files3, 3, perSegmentCrf ? ieInterface3 : ieInterface, str);
            System.out.println(" There are " + thirdNodes.size() + " training nodes");
            nodeLists.add(thirdNodes);
        }
        // Only lists that were really computed are returned, so callers can
        // iterate the array without meeting a null slot.
        return nodeLists.toArray(new ArrayList[nodeLists.size()]);
    }

    /** Creates an iterator over every file under {@code directory} (filter regex ".*"). */
    private static FileIterator newTrainingFileIterator(String directory) {
        return new FileIterator(directory, new RegexFileFilter(Pattern.compile(".*")));
    }

    /**
     * Reports the file count of one training directory and computes its nodes.
     *
     * @param files       iterator over the directory's files
     * @param ordinal     1-based directory number, used only in the progress message
     * @param extractor   extraction interface to pass on; may be {@code null}
     *                    when no CRF was loaded
     * @param nodeType    node type forwarded to {@code CitationUtils.computeNodes}
     * @return the nodes computed for the directory
     * @throws UnsupportedOperationException if {@code use-tree-model} is set
     */
    private static ArrayList computeTrainingNodes(FileIterator files, int ordinal, IEInterface extractor, String nodeType) {
        ArrayList fileArray = files.getFileArray();
        System.out.println("Number of files " + ordinal + ": " + fileArray.size());
        if (useTreeModel.value()) {
            throw new UnsupportedOperationException("tree model unsupported");
        }
        return CitationUtils.computeNodes(fileArray, extractor, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), nodeType);
    }

    /**
     * Computes the testing nodes of the given type from every file under
     * {@code testing-dir}.
     *
     * <p>In multiple-CRF mode {@code ieInterface4} is used and the boolean
     * argument of {@code computeNodes} is fixed to {@code false}; otherwise
     * {@code ieInterface} is used and the value of {@code use-crf} is passed.
     * NOTE(review): the hard-coded {@code false} in the multiple-CRF branch
     * differs from the single-CRF branch - confirm this is intended.
     *
     * @param str node type forwarded to {@code CitationUtils.computeNodes}
     * @return the nodes computed for the testing files
     * @throws UnsupportedOperationException if {@code use-tree-model} is set
     *         while not in multiple-CRF mode
     */
    private static ArrayList createNodesFromTesting(String str) {
        FileIterator testFiles = new FileIterator(testingDir.value(), new RegexFileFilter(Pattern.compile(".*")));
        ArrayList fileArray = testFiles.getFileArray();

        if (useMultipleCRFs.value()) {
            return CitationUtils.computeNodes(fileArray, ieInterface4, false, numNBest.value(), nthViterbi.value(), str);
        }
        if (useTreeModel.value()) {
            throw new UnsupportedOperationException("tree model unsupported");
        }
        return CitationUtils.computeNodes(fileArray, ieInterface, useCRF.value(), numNBest.value(), nthViterbi.value(), str);
    }

    /**
     * Builds the training instance list by pairing up the nodes of each
     * training set through the given pipe.
     *
     * <p>When {@code load-me-file} is set, no pairs are built and an empty
     * list is returned. Otherwise the first set is always used and at most the
     * first three sets contribute.
     *
     * @param arrayListArr node lists, one per training directory; must contain
     *                     at least one element when {@code load-me-file} is unset
     * @param pipe2        feature pipe applied by {@code CitationUtils.makePairs}
     * @return the combined pair instances, or an empty list
     */
    private static InstanceList getTrainingList(ArrayList[] arrayListArr, Pipe pipe2) {
        InstanceList trainingPairs = new InstanceList();
        if (loadMEFile.value() != null) {
            return trainingPairs;
        }
        // The first set is mandatory; sets two and three are optional.
        trainingPairs.add(CitationUtils.makePairs(pipe2, arrayListArr[0]));
        int limit = Math.min(arrayListArr.length, 3);
        for (int set = 1; set < limit; set++) {
            trainingPairs.add(CitationUtils.makePairs(pipe2, arrayListArr[set]));
        }
        return trainingPairs;
    }

    /**
     * Assembles the feature pipeline used for paper node pairs.
     *
     * <p>The stages run in the order listed and end with
     * {@code NodePair2FeatureVector} followed by {@code Target2Label}.
     *
     * @param abstractStatisticalTokenDistance distance handed to the global and author stages
     * @param stringDistance                   distance handed to the field string-distance stage
     * @return a {@code SerialPipes} wrapping the ten stages
     */
    private static Pipe getPaperPipe(AbstractStatisticalTokenDistance abstractStatisticalTokenDistance, StringDistance stringDistance) {
        Pipe[] stages = {
            new ExactFieldMatchPipe(Citation.corefFields),
            new PageMatchPipe(),
            new YearsWithinFivePipe(),
            new FieldStringDistancePipe(stringDistance, Citation.corefFields, "trigramTFIDF"),
            new GlobalPipe(abstractStatisticalTokenDistance),
            new AuthorPipe(abstractStatisticalTokenDistance),
            new HeuristicPipe(Citation.corefFields),
            new InterFieldPipe(),
            new NodePair2FeatureVector(),
            new Target2Label(),
        };
        return new SerialPipes(stages);
    }

    /**
     * Assembles the feature pipeline used for venue node pairs.
     *
     * <p>Contains the same stages as the paper pipeline plus {@code VenuePipe}
     * and {@code VenueAcronymPipe}, inserted after the author stage. The stages
     * run in the order listed.
     *
     * @param abstractStatisticalTokenDistance distance handed to the global, author and
     *                                         venue stages (cast to {@code StringDistance}
     *                                         for the latter)
     * @param stringDistance                   distance handed to the field string-distance stage
     * @return a {@code SerialPipes} wrapping the twelve stages
     */
    private static Pipe getVenuePipe(AbstractStatisticalTokenDistance abstractStatisticalTokenDistance, StringDistance stringDistance) {
        Pipe[] stages = {
            new ExactFieldMatchPipe(Citation.corefFields),
            new PageMatchPipe(),
            new YearsWithinFivePipe(),
            new FieldStringDistancePipe(stringDistance, Citation.corefFields, "trigramTFIDF"),
            new GlobalPipe(abstractStatisticalTokenDistance),
            new AuthorPipe(abstractStatisticalTokenDistance),
            new VenuePipe((StringDistance) abstractStatisticalTokenDistance),
            new VenueAcronymPipe(),
            new HeuristicPipe(Citation.corefFields),
            new InterFieldPipe(),
            new NodePair2FeatureVector(),
            new Target2Label(),
        };
        return new SerialPipes(stages);
    }

    /**
     * Creates a default TFIDF and a TFIDF over an {@code NGramTokenizer(3, 3, ...)},
     * passes both with the given nodes to {@code CitationUtils.makeDistMetric},
     * and returns the n-gram one.
     *
     * @param arrayList nodes handed to {@code CitationUtils.makeDistMetric}
     * @return the TFIDF built on the n-gram tokenizer
     */
    private static TFIDF getDistanceMetric(ArrayList arrayList) {
        TFIDF defaultTfidf = new TFIDF();

        SimpleTokenizer baseTokenizer = new SimpleTokenizer(true, true);
        NGramTokenizer ngramTokenizer = new NGramTokenizer(3, 3, false, baseTokenizer);
        TFIDF ngramTfidf = new TFIDF(ngramTokenizer);

        CitationUtils.makeDistMetric(arrayList, defaultTfidf, ngramTfidf);
        return ngramTfidf;
    }

    /**
     * Unimplemented placeholder: the body is empty, so the given file is ignored.
     * No caller is visible in this class.
     *
     * @param file currently unused
     */
    private static void readCluster(File file) {
    }

    /**
     * Writes the given clusters to the named file using
     * {@link #printClustersAsReceived(Collection, BufferedWriter)}.
     *
     * <p>This is best-effort: any failure is reported with a stack trace and
     * not rethrown. The writer is always closed, including when writing fails.
     *
     * @param collection clusters to write; each element is itself a collection
     * @param str        path of the output file
     */
    protected static void printClustersToFile(Collection collection, String str) {
        BufferedWriter bufferedWriter = null;
        try {
            bufferedWriter = new BufferedWriter(new FileWriter(str));
            printClustersAsReceived(collection, bufferedWriter);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close in finally so the file handle is released (and buffered
            // output flushed) even if writing the clusters threw.
            if (bufferedWriter != null) {
                try {
                    bufferedWriter.close();
                } catch (Exception closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }

    /**
     * Writes every citation of every cluster to the writer, in iteration order.
     *
     * <p>For each citation it emits a {@code <NEWREFERENCE>} line, a
     * {@code <meta>} element carrying a running reference number, the cluster
     * number and the citation's label, and then the citation's original
     * string. Both counters start at 1.
     *
     * <p>A failure while writing one citation is logged as a warning and the
     * remaining citations are still attempted; the counters advance either way.
     *
     * @param collection     clusters to print; each element must be a
     *                       {@code Collection} of {@code Citation}
     * @param bufferedWriter destination; neither flushed nor closed here
     */
    protected static void printClustersAsReceived(Collection collection, BufferedWriter bufferedWriter) {
        int referenceNo = 1;
        int clusterNo = 1;
        for (Object cluster : collection) {
            // Elements of a raw Collection are Objects, so cast explicitly.
            for (Object member : (Collection) cluster) {
                Citation citation = (Citation) member;
                String trueId = (String) citation.getLabel();
                try {
                    bufferedWriter.write("<NEWREFERENCE>\n");
                    bufferedWriter.write("<meta reference_no=\"" + referenceNo + "\" cluster_no=\"" + clusterNo + "\" true_id=\"" + trueId + "\"></meta>");
                    bufferedWriter.write(citation.getOrigString());
                } catch (Exception e) {
                    // Keep going, but make the incomplete output visible
                    // instead of silently dropping the failure.
                    logger.warning("Failed to write reference " + referenceNo + " of cluster " + clusterNo + ": " + e);
                }
                referenceNo++;
            }
            clusterNo++;
        }
    }

    /**
     * Prints every member of every group in the given collection to standard
     * output. Members that are {@code Node}s are printed with their label and
     * index; anything else is printed via its string form only.
     *
     * @param collection groups to print; each element must itself be a {@code Collection}
     */
    protected static void printCollectionReferences(Collection collection) {
        for (Object group : collection) {
            for (Object member : (Collection) group) {
                if (!(member instanceof Node)) {
                    System.out.println("Node: " + member);
                    continue;
                }
                Node asNode = (Node) member;
                System.out.println("Node: " + asNode);
                System.out.println("Node label: " + asNode.getLabel());
                System.out.println("Node index: " + asNode.getIndex());
            }
        }
    }

    /**
     * Returns the arithmetic mean of {@code getScore()} over the citations in
     * the list.
     *
     * <p>For an empty list the result is {@code 0.0 / 0}, i.e. {@code NaN}.
     *
     * @param list citations to average; every element must be a {@code Citation}
     * @return the sum of the scores divided by {@code list.size()}
     */
    public static double scoreCitations(List list) {
        double total = 0.0d;
        for (Object entry : list) {
            total += ((Citation) entry).getScore();
        }
        return total / list.size();
    }
}
