package nlp4j.webcrawler.nhtsa;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.invoke.MethodHandles;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import nlp4j.Document;
import nlp4j.crawler.Crawler;
import nlp4j.impl.DefaultDocument;
import nlp4j.webcrawler.AbstractWebCrawler;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/* loaded from: input_file:nlp4j/webcrawler/nhtsa/NhtsaCrawler.class */
/**
 * Crawler that reads tab-separated NHTSA records from a local file and converts
 * each record into a {@link Document}.
 *
 * <p>The file is located through the {@code input} property. A {@code .txt} file
 * is read directly; a {@code .zip} file is searched for an entry named
 * {@code FLAT_CMPL.txt}. Every column of a record is stored as a document
 * attribute named after the matching entry in {@link #headers}.
 *
 * <p>Two optional properties narrow the result:
 * <ul>
 * <li>{@code minDate} - records whose {@code DATEA} value is numerically smaller
 * are dropped;</li>
 * <li>{@code MAKETXT} - comma-separated list of makes; records whose
 * {@code MAKETXT} column is not in the list are dropped.</li>
 * </ul>
 *
 * <p>NOTE(review): the make filter is kept in a static field, so setting
 * {@code MAKETXT} on one instance affects every instance in the JVM.
 */
public class NhtsaCrawler extends AbstractWebCrawler implements Crawler {
    /** Name of the ZIP entry that holds the records. */
    private static final String ZIP_ENTRY_FILE_NAME = "FLAT_CMPL.txt";

    /** Position of the {@code MAKETXT} column within {@link #headers}. */
    private static final int MAKE_INDEX = 3;

    /** A progress message is logged each time this many documents were collected. */
    private static final int PROGRESS_INTERVAL = 1000;

    private static final Logger logger = LogManager.getLogger(MethodHandles.lookup().lookupClass());

    /** Column names, in file order; used as attribute names of the created documents. */
    static String[] headers = "CMPLID,ODINO,MFR_NAME,MAKETXT,MODELTXT,YEARTXT,CRASH,FAILDATE,FIRE,INJURED,DEATHS,COMPDESC,CITY,STATE,VIN,DATEA,LDATE,MILES,OCCURENCES,CDESCR,CMPL_TYPE,POLICE_RPT_YN,PURCH_DT,ORIG_OWNER_YN,ANTI_BRAKES_YN,CRUISE_CONT_YN,NUM_CYLS,DRIVE_TRAIN,FUEL_SYS,FUEL_TYPE,TRANS_TYPE,VEH_SPEED,DOT,TIRE_SIZE,LOC_OF_TIRE,TIRE_FAIL_TYPE,ORIG_EQUIP_YN,MANUF_DT,SEAT_TYPE,RESTRAINT_TYPE,DEALER_NAME,DEALER_TEL,DEALER_CITY,DEALER_STATE,DEALER_ZIP,PROD_TYPE,REPAIRED_YN,MEDICAL_ATTN,VEHICLES_TOWED_YN".split(",");

    /** Makes to keep, or {@code null} to keep every make. Shared by all instances. */
    static List<String> makesFilterList;

    /** Charset used to decode the entry names of a ZIP archive. */
    private String charset = "UTF-8";

    /** Charset used to decode the record text. */
    private String encoding = "UTF-8";

    /** Smallest accepted {@code DATEA} value; the default accepts every parsable value. */
    private int minDate = Integer.MIN_VALUE;

    /** Not consulted by the code in this class: column values are always trimmed. */
    boolean trim = true;

    /**
     * Reads the file named by the {@code input} property and returns its records
     * as documents.
     *
     * @return the documents that passed the configured filters; an empty list if
     *         the property is unset, the file does not exist, its extension is
     *         neither {@code zip} nor {@code txt}, or reading failed (the failure
     *         is logged, never thrown)
     */
    public List<Document> crawlDocuments() {
        String input = this.prop.getProperty("input");
        if (input == null) {
            logger.warn("Property not set: input");
            return new ArrayList<>();
        }
        File file = new File(input);
        try {
            if (!file.exists()) {
                logger.warn("File Not Found: " + file.getAbsolutePath());
                return new ArrayList<>();
            }
            String extension = FilenameUtils.getExtension(file.getAbsolutePath());
            if ("zip".equalsIgnoreCase(extension)) {
                logger.info("Read ZIP: " + file.getAbsolutePath());
                return readAsZip(file);
            }
            if ("txt".equalsIgnoreCase(extension)) {
                logger.info("Read TXT: " + file.getAbsolutePath());
                return readAsTxt(file);
            }
            logger.warn("Unsupported file extension: " + file.getAbsolutePath());
            return new ArrayList<>();
        } catch (Exception e) {
            // Boundary of the crawler: report the problem and hand back an empty result.
            logger.error(e.getMessage(), e);
            return new ArrayList<>();
        }
    }

    /**
     * Parses tab-separated records from a stream, one record per line.
     *
     * <p>Lines with too few columns to contain {@code MAKETXT}, and records whose
     * {@code DATEA} value is not an integer, are skipped and counted instead of
     * aborting the whole read. The stream is not closed; that is left to the caller.
     *
     * @param inputStream stream positioned at the first record
     * @param str         name of the charset used to decode the stream; {@code null}
     *                    selects the platform default
     * @return the documents that passed the make and minimum-date filters
     * @throws IOException if reading from the stream fails
     */
    private List<Document> read(InputStream inputStream, String str) throws IOException {
        List<Document> documents = new ArrayList<>();
        // Sorted set of every make seen, reported at debug level once reading is done.
        Set<String> makes = new TreeSet<>();
        int skipped = 0;

        Charset cs = (str == null) ? Charset.defaultCharset() : Charset.forName(str);
        // Stream line by line so the complete file never has to fit in memory.
        // Deliberately not closed here: closing would also close the caller's stream.
        BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, cs));

        String line;
        while ((line = reader.readLine()) != null) {
            String[] fields = line.split("\t");
            for (int i = 0; i < fields.length; i++) {
                fields[i] = fields[i].trim();
            }
            if (fields.length <= MAKE_INDEX) {
                // Blank or truncated line: there is no make column to inspect.
                skipped++;
                continue;
            }

            String make = fields[MAKE_INDEX];
            makes.add(make);
            if (makesFilterList != null && !makesFilterList.contains(make)) {
                continue;
            }

            DefaultDocument document = new DefaultDocument();
            int columns = Math.min(headers.length, fields.length);
            for (int i = 0; i < columns; i++) {
                document.putAttribute(headers[i], fields[i]);
            }

            int dateValue;
            try {
                dateValue = Integer.parseInt(document.getAttributeAsString("DATEA"));
            } catch (NumberFormatException e) {
                // Missing or non-numeric DATEA: the record cannot be compared with minDate.
                logger.debug("Skipping record with invalid DATEA: " + e.getMessage());
                skipped++;
                continue;
            }
            if (dateValue < this.minDate) {
                continue;
            }

            documents.add(document);
            if (documents.size() % PROGRESS_INTERVAL == 0) {
                logger.info("Reading docs: " + String.format("%,d", Integer.valueOf(documents.size())));
            }
        }

        logger.info("Reading docs done: " + String.format("%,d", Integer.valueOf(documents.size())));
        if (skipped > 0) {
            logger.warn("Skipped malformed records: " + String.format("%,d", Integer.valueOf(skipped)));
        }
        logger.debug("Data MAKES: " + makes);
        return documents;
    }

    /**
     * Reads records from a plain text file.
     *
     * @param file the text file to read
     * @return the documents read from the file
     * @throws IOException if the file cannot be opened or read
     */
    private List<Document> readAsTxt(File file) throws IOException {
        try (FileInputStream fileIn = new FileInputStream(file)) {
            return read(fileIn, this.encoding);
        }
    }

    /**
     * Reads records from the {@code FLAT_CMPL.txt} entry of a ZIP archive.
     *
     * @param file the ZIP archive to read
     * @return the documents read from the entry, or an empty list if the archive
     *         has no entry with that name
     * @throws IOException if the archive cannot be opened or read
     */
    private List<Document> readAsZip(File file) throws IOException {
        try (FileInputStream fileIn = new FileInputStream(file);
                BufferedInputStream bufferedIn = new BufferedInputStream(fileIn);
                ZipInputStream zipIn = new ZipInputStream(bufferedIn, Charset.forName(this.charset))) {
            ZipEntry entry;
            while ((entry = zipIn.getNextEntry()) != null) {
                logger.info("zip entry: " + entry.getName());
                if (ZIP_ENTRY_FILE_NAME.equals(entry.getName())) {
                    // The ZIP stream reports end-of-stream at the end of the current entry.
                    return read(zipIn, this.encoding);
                }
            }
        }
        logger.warn("ZIP entry not found: " + ZIP_ENTRY_FILE_NAME + " in " + file.getAbsolutePath());
        return new ArrayList<>();
    }

    /**
     * Stores a property and additionally interprets the crawler-specific keys
     * {@code minDate} and {@code MAKETXT}.
     *
     * <p>A {@code minDate} value that is not an integer is logged and ignored, so
     * the previous minimum stays in effect. {@code MAKETXT} is split on commas and
     * each entry is trimmed, because the column values it is compared with are
     * trimmed as well.
     *
     * @param str  property key; the call is ignored when {@code null}
     * @param str2 property value; the call is ignored when {@code null}
     */
    public void setProperty(String str, String str2) {
        if (str == null || str2 == null) {
            return;
        }
        super.setProperty(str, str2);
        if ("minDate".equals(str)) {
            try {
                this.minDate = Integer.parseInt(str2.trim());
            } catch (NumberFormatException e) {
                logger.warn("Invalid minDate ignored: " + str2, e);
            }
        } else if ("MAKETXT".equals(str)) {
            List<String> makes = new ArrayList<>();
            for (String make : str2.split(",")) {
                makes.add(make.trim());
            }
            makesFilterList = makes;
        }
    }
}
