package uk.ac.man.documentparser;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.logging.Logger;
import martin.common.ArgParser;
import martin.common.Loggers;
import martin.common.Misc;
import martin.common.MyConnection;
import martin.common.SQL;
import uk.ac.man.documentparser.dataholders.Document;
import uk.ac.man.documentparser.input.BMCFactory;
import uk.ac.man.documentparser.input.BMCXMLFactory;
import uk.ac.man.documentparser.input.DatabaseIterator;
import uk.ac.man.documentparser.input.DatabaseListIterator;
import uk.ac.man.documentparser.input.Directory;
import uk.ac.man.documentparser.input.DocumentIterator;
import uk.ac.man.documentparser.input.ElsevierFactory;
import uk.ac.man.documentparser.input.IDIterator;
import uk.ac.man.documentparser.input.MedlineIndexFactory;
import uk.ac.man.documentparser.input.MedlinePMCIndexFactory;
import uk.ac.man.documentparser.input.OTMI;
import uk.ac.man.documentparser.input.OTMIFactory;
import uk.ac.man.documentparser.input.PMCAbstract;
import uk.ac.man.documentparser.input.PMCFactory;
import uk.ac.man.documentparser.input.PMCIndexFactory;
import uk.ac.man.documentparser.input.TextFile;
import uk.ac.man.documentparser.input.TextFileFactory;
import uk.ac.man.documentparser.input.util.CleanUnicode;
import uk.ac.man.documentparser.input.util.DocumentBuffer;
import uk.ac.man.documentparser.input.util.Skipper;
import uk.ac.man.documentparser.input.util.Splitter;

/* loaded from: input_file:uk/ac/man/documentparser/DocumentParser.class */
/**
 * Command-line entry point and helper methods for reading documents from one of several
 * input sources (selected through command-line options) and exporting them to text files,
 * standard output, a database, or a tab-separated description file.
 */
public class DocumentParser {
    /**
     * Decides whether a progress message should be emitted.
     *
     * @param count    number of documents handled so far
     * @param interval report interval; any non-positive value (including the default -1)
     *                 disables reporting, which also avoids a division by zero for 0
     * @return true if {@code interval} is positive and {@code count} is a multiple of it
     */
    private static boolean isReportDue(int count, int interval) {
        return interval > 0 && count % interval == 0;
    }

    /**
     * Creates {@code dir} (including missing parents) if it does not exist yet, and logs a
     * warning when the directory could not be created.
     *
     * @param dir    the directory that should exist afterwards
     * @param logger logger for the warning; may be null, in which case nothing is logged
     */
    private static void ensureDirectory(File dir, Logger logger) {
        // The trailing isDirectory() check tolerates another process creating it concurrently.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory() && logger != null) {
            logger.warning("%t: Could not create directory " + dir + "\n");
        }
    }

    /**
     * Saves every document to {@code <file>/<last two ID chars>/[<preceding two ID chars>/]<id>.txt}.
     * The ID is left-padded with "0000" before the characters are taken, so short IDs still
     * yield two-character directory names. IDs starting with "PMC" only use the first
     * directory level. Null documents are skipped.
     *
     * @param documentIterator source of documents
     * @param file             output base directory
     * @param i                report interval (non-positive disables progress messages)
     * @param logger           logger used for progress messages
     * @throws IllegalStateException if {@code file} is null or a document has no ID
     */
    private static void runSeparated(DocumentIterator documentIterator, File file, int i, Logger logger) {
        if (file == null) {
            throw new IllegalStateException("Need to specify an output base directory after the --outSeparated option");
        }
        int stored = 0;
        while (documentIterator.hasNext()) {
            Document doc = documentIterator.next();
            if (doc == null) {
                continue;
            }
            String id = doc.getID();
            if (id == null) {
                throw new IllegalStateException("ID not set");
            }

            String padded = "0000" + id;
            int end = padded.length();
            String lastTwo = padded.substring(end - 2);
            String precedingTwo = padded.substring(end - 4, end - 2);

            File targetDir = new File(file, lastTwo);
            ensureDirectory(targetDir, logger);
            if (!id.startsWith("PMC")) {
                targetDir = new File(targetDir, precedingTwo);
                ensureDirectory(targetDir, logger);
            }

            // Path separators inside the ID would otherwise create unintended sub-paths.
            doc.saveToTextFile(new File(targetDir, id.replace(File.separatorChar, '_') + ".txt"), false);

            stored++;
            if (logger != null && isReportDue(stored, i)) {
                logger.info("%t: Stored " + stored + " documents.\n");
            }
        }
    }

    /**
     * Either saves every document to {@code <file>/<id>.txt} or, when {@code file} is null,
     * prints each document's ID followed by the document itself to standard output.
     * Null documents are skipped.
     *
     * @param documentIterator source of documents
     * @param file             output directory, or null to print instead of saving
     * @param z                forwarded as the second argument of {@code Document.saveToTextFile}
     *                         (set from the {@code simplify} option by {@link #main})
     * @param i                report interval (non-positive disables progress messages)
     * @throws IllegalStateException if a document has no ID
     */
    private static void run(DocumentIterator documentIterator, File file, boolean z, int i) {
        int stored = 0;
        for (Document document : documentIterator) {
            if (document == null) {
                continue;
            }
            String id = document.getID();
            if (id == null) {
                throw new IllegalStateException("ID not set");
            }
            if (file == null) {
                System.out.println(id);
                document.print();
                continue;
            }
            document.saveToTextFile(new File(file, id.replace(File.separatorChar, '_') + ".txt"), z);
            stored++;
            if (isReportDue(stored, i)) {
                System.out.println("Stored " + stored + " documents.");
            }
        }
    }

    /**
     * Command-line entry point. Prints usage and exits when {@code help} is given or there are
     * no arguments; otherwise runs each requested action ({@code outDir}, {@code outSeparated},
     * {@code print}, {@code saveToDB}, {@code buildDescriptions}) in that order. Each action
     * obtains its own document iterator through {@link #getDocuments(ArgParser, Logger)}.
     *
     * @param strArr the command-line arguments
     */
    public static void main(String[] strArr) {
        ArgParser argParser = new ArgParser(strArr);
        Logger defaultLogger = Loggers.getDefaultLogger(argParser);
        int reportInterval = argParser.getInt("report", -1);

        if (argParser.containsKey("help") || strArr.length == 0) {
            System.out.println("documentparser.jar [--properties <conf file>]");
            System.out.println(getDocumentHelpMessage());
            System.out.println("[--outDir <export directory> [--simplify]]");
            System.out.println("[--getPubYears <output file> [--report <report interval>]]");
            System.exit(0);
        }

        if (argParser.containsKey("outDir")) {
            run(getDocuments(argParser, defaultLogger), argParser.getFile("outDir"), argParser.containsKey("simplify"), reportInterval);
        }

        if (argParser.containsKey("outSeparated")) {
            runSeparated(getDocuments(argParser, defaultLogger), argParser.getFile("outSeparated"), reportInterval, defaultLogger);
        }

        if (argParser.containsKey("print")) {
            for (Document document : getDocuments(argParser, defaultLogger)) {
                if (document != null) {
                    document.print();
                }
            }
        }

        if (argParser.containsKey("saveToDB")) {
            String table = argParser.get("saveToDB");
            DocumentIterator documents = getDocuments(argParser, defaultLogger);
            Connection connection = SQL.connectMySQL(argParser, defaultLogger, "articles");
            defaultLogger.info("%t: Processing...\n");
            saveToDB(documents, connection, defaultLogger, table, reportInterval, argParser.containsKey("clear"));
            defaultLogger.info("%t: Completed.\n");
        }

        if (argParser.containsKey("buildDescriptions")) {
            buildDescriptions(getDocuments(argParser, defaultLogger), defaultLogger, argParser.getFile("buildDescriptions"), reportInterval);
        }
    }

    /**
     * Writes a tab-separated file with one line per document: ID, description and year.
     * The year column is "0" unless the document's year is a non-null four-character string.
     * Null documents are skipped. On any exception the error is printed to standard error
     * and the JVM exits with status -1; the writer is closed in every case.
     *
     * @param documentIterator source of documents
     * @param logger           logger used for progress and completion messages
     * @param file             output file
     * @param i                report interval (non-positive disables progress messages)
     */
    private static void buildDescriptions(DocumentIterator documentIterator, Logger logger, File file, int i) {
        // try-with-resources closes the writer before the catch block calls System.exit.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
            int processed = 0;
            writer.write("#ID\tdescription\tyear\n");
            for (Document document : documentIterator) {
                if (document == null) {
                    continue;
                }
                String year = document.getYear();
                String yearColumn = (year == null || year.length() != 4) ? "0" : year;
                writer.write(document.getID() + "\t" + document.getDescription() + "\t" + yearColumn + "\n");
                processed++;
                if (isReportDue(processed, i)) {
                    logger.info("%t: Processed " + processed + " documents.\n");
                }
            }
            logger.info("%t: Completed.\n");
        } catch (Exception e) {
            System.err.println(e);
            e.printStackTrace();
            System.exit(-1);
        }
    }

    /**
     * Saves every non-null document through a statement obtained from
     * {@code Document.prepareInsertStatements}.
     *
     * @param documentIterator source of documents
     * @param connection       database connection the insert statement is prepared on
     * @param logger           logger used for progress messages
     * @param str              forwarded to {@code Document.prepareInsertStatements}
     *                         (the value of the {@code saveToDB} option)
     * @param i                report interval (non-positive disables progress messages)
     * @param z                forwarded to {@code Document.prepareInsertStatements}
     *                         (set from the {@code clear} option by {@link #main})
     */
    private static void saveToDB(DocumentIterator documentIterator, Connection connection, Logger logger, String str, int i, boolean z) {
        // NOTE(review): the statement is never closed here; confirm whether Document or the
        // connection owner is expected to release it.
        PreparedStatement insertStatement = Document.prepareInsertStatements(connection, str, z);
        int saved = 0;
        for (Document document : documentIterator) {
            if (document == null) {
                continue;
            }
            document.saveToDB(insertStatement);
            saved++;
            if (isReportDue(saved, i)) {
                logger.info("%t: Saved " + saved + " documents to DB.\n");
            }
        }
    }

    /**
     * Same as {@link #getDocuments(ArgParser, Logger)} with a null logger.
     *
     * @param argParser parsed command-line options
     * @return the configured document iterator, or null if no input option was given
     */
    public static DocumentIterator getDocuments(ArgParser argParser) {
        return getDocuments(argParser, null);
    }

    /**
     * Builds a document iterator from the first matching input option, checked in this order:
     * pmcAbs, medlineIndex, medlinePMCIndex, pmcIndex, pmcDir, pmc, OTMI, OTMIDir, text,
     * textDir, bmcxml, bmcxmlDir, bmcDir, databaseDocs, databaseList, elsevierDir, idsOnly.
     * The result is then optionally wrapped according to the buffer, skip, skipEvery,
     * cleanUnicode and split options (in that order).
     *
     * @param argParser parsed command-line options
     * @param logger    logger for informational messages; may be null
     * @return the configured document iterator, or null if no input option was given
     * @throws IllegalStateException if {@code databaseList} does not have exactly two values
     */
    public static DocumentIterator getDocuments(ArgParser argParser, Logger logger) {
        String[] dtds = argParser.gets("dtd");
        DocumentIterator documentIterator = null;

        if (argParser.containsKey("pmcAbs")) {
            File medlineBaseDir = argParser.getFile("medlineBaseDir");
            File medlineIndex = argParser.getFile("medlineIndex");
            DocumentIterator medlineDocs = medlineIndex != null ? new MedlineIndexFactory(medlineBaseDir, null).parse(medlineIndex) : null;
            File pmcBaseDir = argParser.getFile("pmcBaseDir");
            File pmcIndex = argParser.getFile("pmcIndex");
            DocumentIterator pmcDocs = pmcIndex != null ? new PMCIndexFactory(pmcBaseDir, dtds).parse(pmcIndex) : null;
            documentIterator = new PMCAbstract(pmcDocs, medlineDocs);
        } else if (argParser.containsKey("medlineIndex")) {
            documentIterator = new MedlineIndexFactory(argParser.getFile("medlineBaseDir"), null).parse(argParser.getFile("medlineIndex"));
        } else if (argParser.containsKey("medlinePMCIndex")) {
            documentIterator = new MedlinePMCIndexFactory(argParser.getFile("medlineBaseDir"), argParser.getFile("pmcBaseDir"), dtds, null).parse(argParser.getFile("medlinePMCIndex"));
        } else if (argParser.containsKey("pmcIndex")) {
            documentIterator = new PMCIndexFactory(argParser.getFile("pmcBaseDir"), dtds).parse(argParser.getFile("pmcIndex"));
        } else if (argParser.containsKey("pmcDir")) {
            documentIterator = new Directory(argParser.getFile("pmcDir"), new PMCFactory(dtds), "xml", argParser.containsKey("recursive"));
        } else if (argParser.containsKey("pmc")) {
            documentIterator = new PMCFactory(dtds).parse(argParser.getFile("pmc"));
        } else if (argParser.containsKey("OTMI")) {
            documentIterator = new OTMI(argParser.getFile("OTMI"));
        } else if (argParser.containsKey("OTMIDir")) {
            documentIterator = new Directory(argParser.getFile("OTMIDir"), new OTMIFactory(), ".otmi", argParser.containsKey("recursive"));
        } else if (argParser.containsKey("text")) {
            // NOTE(review): this returns immediately, so buffer/skip/skipEvery/cleanUnicode/split
            // are not applied to --text input. Confirm whether that is intended.
            return new TextFile(argParser.getFiles("text"));
        } else if (argParser.containsKey("textDir")) {
            documentIterator = new Directory(argParser.getFile("textDir"), new TextFileFactory(), ".txt", argParser.containsKey("recursive"));
        } else if (argParser.containsKey("bmcxml")) {
            documentIterator = new BMCXMLFactory().parse(argParser.getFile("bmcxml"));
        } else if (argParser.containsKey("bmcxmlDir")) {
            documentIterator = new Directory(argParser.getFile("bmcxmlDir"), new BMCXMLFactory(), ".xml", argParser.containsKey("recursive"));
        } else if (argParser.containsKey("bmcDir")) {
            documentIterator = new Directory(argParser.getFile("bmcDir"), new BMCFactory(dtds), ".xml", argParser.containsKey("recursive"));
        } else if (argParser.containsKey("databaseDocs")) {
            documentIterator = new DatabaseIterator(SQL.connectMySQL2(argParser, logger, "articles"), argParser.get("databaseDocs"), argParser.containsKey("full"), argParser.get("skipDocIdsQuery"));
        } else if (argParser.containsKey("databaseList")) {
            if (argParser.gets("databaseList").length != 2) {
                throw new IllegalStateException("Usage: --databaseList <table> <file with docids>");
            }
            MyConnection connectMySQL2 = SQL.connectMySQL2(argParser, logger, "articles");
            // Second option value: file with document ids; first option value: table name.
            ArrayList arrayList = new ArrayList();
            arrayList.addAll(Misc.loadStringSetFromFile(argParser.getFiles("databaseList")[1]));
            documentIterator = new DatabaseListIterator(connectMySQL2, argParser.gets("databaseList")[0], arrayList, argParser.containsKey("full"));
        } else if (argParser.containsKey("elsevierDir")) {
            documentIterator = new Directory(argParser.getFile("elsevierDir"), new ElsevierFactory(dtds), "xml", argParser.containsKey("recursive"));
        } else if (argParser.containsKey("idsOnly")) {
            documentIterator = new IDIterator(argParser.getFile("idsOnly"));
        }

        // Every wrapper below is only applied when an input source was actually selected.
        if (documentIterator != null && argParser.containsKey("buffer")) {
            documentIterator = new DocumentBuffer(documentIterator, argParser.getInt("buffer", 250), logger);
        }

        if (documentIterator != null && argParser.containsKey("skip")) {
            Integer skipArg = argParser.getInt("skip");
            if (logger != null) {
                logger.info("%t: Skipping " + skipArg + " documents...\n");
            }
            // Read the count once instead of re-querying the parser on every iteration.
            int skipCount = skipArg.intValue();
            for (int skipped = 0; skipped < skipCount; skipped++) {
                documentIterator.skip();
            }
            if (logger != null) {
                logger.info("%t: Skip complete.\n");
            }
        }

        if (documentIterator != null && argParser.containsKey("skipEvery")) {
            if (logger != null) {
                logger.info("%t: Will be skipping " + argParser.getInt("skipEvery") + " documents for each processed document.\n");
            }
            documentIterator = new Skipper(documentIterator, argParser.getInt("skipEvery").intValue());
        }

        if (documentIterator != null && argParser.containsKey("cleanUnicode")) {
            if (logger != null) {
                logger.info("%t: Removing high unicode characters from documents.");
            }
            documentIterator = new CleanUnicode(documentIterator);
        }

        if (documentIterator != null && argParser.getInt("split", 0) > 0) {
            if (logger != null) {
                logger.info("%t: Splitting all documents at " + argParser.getInt("split") + " sentencens.");
            }
            documentIterator = new Splitter(documentIterator, argParser.getInt("split").intValue());
        }

        return documentIterator;
    }

    /**
     * Returns the usage text describing some of the input options accepted by
     * {@link #getDocuments(ArgParser, Logger)}.
     *
     * @return a multi-line usage string ending with a newline
     */
    public static String getDocumentHelpMessage() {
        return "[--medlineIndex <file> --medlineBaseDir <dir>]\n[--medlinePMCIndex <file> --medlineBaseDir <dir> --pmcBaseDir <dir> --dtd <files>]\n[--pmcIndex <file> --pmcBaseDir<dir> --dtd <files>]\n[--textDir <dir> [--recursive]]\n[--OTMIDir <dir> [--recursive]]\n";
    }

    /**
     * Loads all documents selected by {@code argParser} into a map keyed by document ID.
     * Null documents are skipped; if several documents share an ID, the last one wins.
     *
     * @param argParser parsed command-line options
     * @return a mutable map from document ID to document
     */
    public static Map<String, Document> getDocumentsToHash(ArgParser argParser) {
        Map<String, Document> documentsById = new HashMap<String, Document>();
        for (Document document : getDocuments(argParser)) {
            if (document != null) {
                documentsById.put(document.getID(), document);
            }
        }
        return documentsById;
    }
}
