package nlp4j.webcrawler.caa;

import com.google.gson.Gson;
import com.google.gson.JsonObject;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import nlp4j.Document;
import nlp4j.impl.DefaultDocument;
import nlp4j.util.DocumentUtil;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;

/* loaded from: input_file:nlp4j/webcrawler/caa/CaaParser.class */
/**
 * Converts the HTML of a recall detail page into an NLP4J {@link Document}.
 *
 * <p>The parser reads three regions of the page:
 * <ul>
 *   <li>{@code .detail_title}: its first {@code h3} becomes the {@code title} attribute and
 *       its first {@code p} becomes the {@code description} attribute;</li>
 *   <li>{@code .detail_box_left} and {@code .detail_box_bottom}: every {@code li} contributes
 *       one attribute whose key is the {@code .detail_cap} text and whose value is the
 *       {@code .detail_text} text, extended by text embedded in an inline script.</li>
 * </ul>
 *
 * <p>Regions or elements that are missing from the page are skipped instead of aborting the
 * whole parse.
 */
public class CaaParser {
    private static final Logger logger = LogManager.getLogger(MethodHandles.lookup().lookupClass());

    /** Directory scanned (recursively) by {@link #main(String[])} for {@code *.html} files. */
    private static final String HTML_DIR = "C:/usr/local/nlp4j/collections/caa/data/html";

    /** Output file name pattern; {@code %s} is replaced by the current date ({@code yyyy-MM-dd}). */
    private static final String JSON_FILE_PATTERN =
            "C:/usr/local/nlp4j/collections/caa/data/json/caa_recall_%s_json.txt";

    /** Prefix of the script lines that carry additional text for a list item. */
    private static final String SCRIPT_TEXT_PREFIX = "contentsText";

    /**
     * Parses one HTML page.
     *
     * @param str the HTML source of the page
     * @return the populated document, or {@code null} when {@code str} is {@code null} or blank
     */
    public Document parse(String str) {
        if (str == null || str.trim().isEmpty()) {
            return null;
        }
        DefaultDocument doc = new DefaultDocument();
        org.jsoup.nodes.Document html = Jsoup.parse(str);

        // When several title blocks exist, the last one wins because the keys are fixed.
        for (Element header : html.select(".detail_title")) {
            Element heading = header.select("h3").first();
            if (heading != null) {
                doc.putAttribute("title", heading.text().trim());
            }
            Element summary = header.select("p").first();
            if (summary != null) {
                doc.putAttribute("description", summary.text().trim());
            }
        }

        putListAttributes(doc, html, ".detail_box_left");
        putListAttributes(doc, html, ".detail_box_bottom");
        return doc;
    }

    /**
     * Copies every caption/text pair found in the {@code li} items of the first element matching
     * {@code boxSelector} into {@code target}. Does nothing when no element matches.
     *
     * @param target      document receiving the attributes
     * @param html        parsed page
     * @param boxSelector CSS selector of the container holding the {@code li} items
     */
    private void putListAttributes(DefaultDocument target, org.jsoup.nodes.Document html, String boxSelector) {
        Element box = html.select(boxSelector).first();
        if (box == null) {
            logger.warn("No element matches '{}'; skipping this section", boxSelector);
            return;
        }
        for (Element item : box.select("li")) {
            Element caption = item.select(".detail_cap").first();
            if (caption == null) {
                // Without a caption there is no attribute key to store the value under.
                continue;
            }
            Element text = item.select(".detail_text").first();
            StringBuilder value = new StringBuilder(text == null ? "" : text.text().trim());

            // Some items keep (part of) their text in an inline script instead of the markup.
            Element script = item.select("script").first();
            if (script != null) {
                for (String line : script.data().split("\n")) {
                    if (line.trim().startsWith(SCRIPT_TEXT_PREFIX)) {
                        value.append(extractText(line));
                    }
                }
            }
            target.putAttribute(caption.text().trim(), value.toString().trim());
        }
    }

    /**
     * Extracts the text carried by a {@code contentsText} script line.
     *
     * <p>The line is expected to contain a single-quoted JSON object literal whose
     * {@code ops[0].insert} member holds the text. The literal may itself be JSON-escaped
     * (starting with <code>{\"</code>), in which case it is unescaped before parsing.
     *
     * @param str one line of script source
     * @return the extracted text, or an empty string when the line is {@code null}, has no
     *         quoted literal, or the literal is not JSON of the expected shape
     */
    private String extractText(String str) {
        if (str == null) {
            return "";
        }
        int open = str.indexOf('\'');
        int close = str.lastIndexOf('\'');
        if (open < 0 || close <= open) {
            logger.warn("No single-quoted literal in script line: {}", str);
            return "";
        }
        String json = str.substring(open + 1, close);
        if (json.startsWith("{\\\"")) {
            json = StringEscapeUtils.unescapeJson(json);
        }
        try {
            String insert = ((JsonObject) new Gson().fromJson(json, JsonObject.class))
                    .get("ops").getAsJsonArray()
                    .get(0).getAsJsonObject()
                    .get("insert").getAsString();
            return StringEscapeUtils.unescapeJson(insert);
        } catch (RuntimeException e) {
            // Gson reports malformed or unexpectedly shaped JSON through several unchecked
            // exception types; any of them means this line carries no usable text.
            logger.warn("Cannot extract text from script line: {}", str, e);
            return "";
        }
    }

    /**
     * Parses every non-empty {@code *.html} file below {@link #HTML_DIR} and hands each resulting
     * document, enriched with {@code url}, {@code md5} and {@code id} attributes, to
     * {@code DocumentUtil.writeAsLineSeparatedJson} together with the dated output file.
     *
     * @param strArr command-line arguments (unused)
     * @throws IOException if reading an input file or writing the output fails
     */
    public static void main(String[] strArr) throws IOException {
        Collection<File> htmlFiles = FileUtils.listFiles(new File(HTML_DIR), new String[]{"html"}, true);
        String today = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
        File outFile = new File(String.format(JSON_FILE_PATTERN, today));
        CaaParser parser = new CaaParser();

        for (File htmlFile : htmlFiles) {
            if (htmlFile.length() == 0) {
                continue;
            }
            String html = FileUtils.readFileToString(htmlFile, "UTF-8").trim();
            if (html.isEmpty()) {
                continue;
            }
            String name = htmlFile.getName();
            System.err.println(name);
            int idNum = CaaFileUtil.getIdNum(name);
            String url = CaaRecallDownloader.getUrl(idNum);

            // html is non-blank here, so parse() does not return null.
            Document doc = parser.parse(html);
            doc.putAttribute("url", url);
            doc.putAttribute("md5", DigestUtils.md5Hex(html));
            doc.putAttribute("id", Integer.valueOf(idNum));
            System.err.println(DocumentUtil.toPrettyJsonStringShort(doc));
            DocumentUtil.writeAsLineSeparatedJson(doc, outFile);
        }
    }
}
