package in.codehub.paperparser;

import in.codehub.document.Alignment;
import in.codehub.document.Document;
import in.codehub.document.DocumentIterator;
import in.codehub.document.Line;
import in.codehub.document.Page;
import in.codehub.document.Paragraph;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;

/* loaded from: input_file:in/codehub/paperparser/PaperParser.class */
/**
 * Heuristic extractor that derives a {@link Paper} (title, authors, abstract
 * and keywords) from a laid-out {@link Document}.
 *
 * <p>The extraction steps share one {@link DocumentIterator} and must run in
 * the order used by {@link #parse(Document)}: each step starts where the
 * previous one left the iterator.
 *
 * <p>{@link #parse(Document)} writes the instance field {@code absConf}, and
 * {@link #getInstance()} hands out one shared instance, so concurrent calls to
 * {@code parse} on that instance are not isolated from each other.
 */
public class PaperParser {
    /** Eagerly created so that every caller of {@link #getInstance()} sees the same object. */
    private static final PaperParser INSTANCE = new PaperParser();
    private static final String SPACE = " ";
    /** Heading text that introduces the abstract. */
    private static final String ABSTRACT = PaperTags.ABSTRACT;
    /** Lower-case headings that introduce a keyword paragraph. */
    private static final String[] PREFIXES = {"keywords", "index terms", "general terms"};
    /** Fragments that disqualify a line from being taken as an author name. */
    private static final String[] NO_NAMES = {"Computer", "Department", "Science", "University", "School", "Academy", "College", "Abstract", "Email", " of ", ","};
    /** Case-insensitive e-mail matcher; not referenced by any method of this class. */
    private static final Pattern EMAIL_REGEX = Pattern.compile("^[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,6}$", Pattern.CASE_INSENSITIVE);
    /** Paragraphs longer than this are taken as the abstract even without a heading. */
    private static final int LONG_PARAGRAPH_LENGTH = 350;
    /** Upper bound on the accumulated abstract length. */
    private static final int MAX_ABSTRACT_LENGTH = 1200;
    /** Author collection stops once this many names were gathered. */
    private static final int MAX_AUTHORS = 4;

    /**
     * Confidence in the abstract detection. Reset to 1.0 by {@link #parse(Document)}
     * and scaled by 0.9 whenever the abstract is recognised only by its length.
     * It is written but never read inside this class.
     */
    private float absConf = 1.0f;

    private PaperParser() {
    }

    /**
     * Returns the shared parser instance.
     *
     * @return the single {@code PaperParser}
     */
    public static PaperParser getInstance() {
        return INSTANCE;
    }

    /**
     * Extracts title, authors, abstract and keywords from {@code document}.
     * Lines of the document are tagged as a side effect.
     *
     * @param document the document to analyse
     * @return a new {@link Paper} carrying the document id and the extracted fields
     */
    public Paper parse(Document document) {
        this.absConf = 1.0f;
        Paper paper = new Paper(document.getId());
        markUseless(document);
        DocumentIterator documentIterator = new DocumentIterator(document);
        // Order matters: every step continues from the iterator position of the previous one.
        paper.setTitle(extractTitle(documentIterator));
        paper.getAuthors().addAll(extractAuthors(documentIterator));
        paper.setAbstract(extractAbstract(documentIterator));
        paper.getKeywords().addAll(extractKeywords(documentIterator));
        return paper;
    }

    /**
     * Concatenates the current paragraph with the following paragraphs for as
     * long as {@link #isAbstractEx} accepts them, then strips a leading
     * abstract heading and any leading non-letter characters.
     *
     * @param documentIterator iterator positioned on the first abstract paragraph
     * @return the abstract text, or an empty string if there is no current paragraph
     */
    private String extractAbstract(DocumentIterator documentIterator) {
        Paragraph first = documentIterator.currParagraph();
        if (first == null) {
            return "";
        }
        StringBuilder text = new StringBuilder(first.text());
        int fontSize = first.fontSize();
        while (documentIterator.hasNextParagraph()) {
            Paragraph nextParagraph = documentIterator.nextParagraph();
            if (!isAbstractEx(nextParagraph, fontSize, text.length())) {
                break;
            }
            text.append(SPACE).append(nextParagraph.text());
        }
        return StringUtils.removeStartIgnoreCase(text.toString(), ABSTRACT).replaceAll("^[^A-Za-z]*", "");
    }

    /**
     * Finds the title: starting at the first untagged line, consecutive lines
     * with the same font size are joined; whenever the run is followed by a
     * line with a larger font, the title is restarted from that line.
     * Every line that was part of a candidate run is tagged as title.
     *
     * @param documentIterator iterator positioned before the first line
     * @return the trimmed title, or an empty string if the iterator has no lines
     */
    private String extractTitle(DocumentIterator documentIterator) {
        Line line = null;
        while (documentIterator.hasNextLine()) {
            line = documentIterator.nextLine();
            if (line.tag() == null) {
                break;
            }
        }
        StringBuilder title = new StringBuilder();
        if (line == null) {
            return "";
        }
        int titleFontSize = 0;
        while (line.getFontSize() > titleFontSize) {
            title = new StringBuilder();
            title.append(line.getText());
            line.setTag(PaperTags.TITLE);
            titleFontSize = line.getFontSize();
            // Stops on the first line with a different font size or when the
            // lines run out; in the latter case `line` keeps the title font
            // size, which also ends the outer loop.
            while (documentIterator.hasNextLine()) {
                Line nextLine = documentIterator.nextLine();
                boolean sameFont = line.getFontSize() == nextLine.getFontSize();
                line = nextLine;
                if (!sameFont) {
                    break;
                }
                title.append(SPACE).append(nextLine.getText());
                nextLine.setTag(PaperTags.TITLE);
            }
        }
        return title.toString().trim();
    }

    /**
     * Collects keywords from the current paragraph and the paragraphs after it
     * for as long as they are keyword sections. Known headings are removed,
     * the remainder is split on commas, punctuation is replaced by blanks,
     * whitespace is normalised and a leading "and " is dropped.
     *
     * @param documentIterator iterator positioned on the candidate keyword paragraph
     * @return the distinct, non-empty keywords in no particular order
     */
    private List<String> extractKeywords(DocumentIterator documentIterator) {
        HashSet<String> keywords = new HashSet<String>();
        Paragraph paragraph = documentIterator.currParagraph();
        while (paragraph != null && isKeyWordSection(paragraph)) {
            String body = paragraph.text().trim();
            for (String prefix : PREFIXES) {
                body = StringUtils.removeStartIgnoreCase(body, prefix);
            }
            for (String piece : body.split(",")) {
                // Replace punctuation first so the blanks it leaves behind are normalised away.
                String cleaned = StringUtils.normalizeSpace(piece.replaceAll("[^A-Za-z0-9 ]", SPACE));
                String keyword = StringUtils.removeStart(cleaned, "and ");
                if (keyword.length() > 0) {
                    keywords.add(keyword);
                }
            }
            paragraph = documentIterator.hasNextParagraph() ? documentIterator.nextParagraph() : null;
        }
        return new ArrayList<String>(keywords);
    }

    /**
     * Collects author names, starting with the iterator's current line and
     * continuing through the following paragraphs until {@link #isAbstract}
     * recognises the abstract. Collection stops early once the first line
     * yielded more than one name or {@value #MAX_AUTHORS} names were gathered,
     * but the iterator is still advanced to the abstract.
     *
     * @param documentIterator iterator positioned on the first line after the title
     * @return the author names found; empty if there is no current line
     */
    private List<String> extractAuthors(DocumentIterator documentIterator) {
        List<String> authors = new ArrayList<String>();
        Line currLine = documentIterator.currLine();
        if (currLine == null) {
            return authors;
        }
        int fontSize = currLine.getFontSize();
        addToAuthors(authors, currLine, true);
        boolean complete = authors.size() > 1;
        if (!complete) {
            for (Line line : documentIterator.currParagraph().getLines()) {
                if (line.tag() == null && !complete && line.getFontSize() == fontSize) {
                    addToAuthors(authors, line, false);
                }
                complete = complete || authors.size() >= MAX_AUTHORS;
            }
        }
        Paragraph previous = documentIterator.currParagraph();
        while (documentIterator.hasNextParagraph()) {
            Paragraph next = documentIterator.nextParagraph();
            if (isAbstract(next, previous)) {
                break;
            }
            if (!documentIterator.hasNextLine()) {
                break;
            }
            Line nextLine = documentIterator.nextLine();
            if (!complete && nextLine.getFontSize() == fontSize) {
                addToAuthors(authors, nextLine, false);
            }
            complete = complete || authors.size() >= MAX_AUTHORS;
            previous = next;
        }
        return authors;
    }

    /**
     * Tags {@code line} as an author line and appends the names found in it.
     *
     * @param list      receives the extracted names
     * @param line      the line to inspect; always tagged as author
     * @param z         {@code true} to split the line on commas and " and " and accept
     *                  every part; {@code false} to accept the whole line as one name
     *                  only if it is shorter than 25 characters and contains none of
     *                  the {@code NO_NAMES} fragments
     */
    private void addToAuthors(List<String> list, Line line, boolean z) {
        line.setTag(PaperTags.AUTHOR);
        String text = line.getText();
        if (z) {
            String lettersOnly = text.replaceAll("[^A-Za-z. ,]", "").trim();
            for (String part : lettersOnly.split(",")) {
                String withoutLeadingAnd = StringUtils.removeStartIgnoreCase(StringUtils.normalizeSpace(part), "and ");
                for (String name : withoutLeadingAnd.split(" and ")) {
                    String author = boilAuthor(name);
                    if (author.length() > 0) {
                        list.add(author);
                    }
                }
            }
            return;
        }
        if (text.length() >= 25) {
            return;
        }
        String candidate = text.replaceAll("[^A-Za-z. ,]", "").trim();
        if (candidate.length() == 0) {
            return;
        }
        for (String fragment : NO_NAMES) {
            if (StringUtils.containsIgnoreCase(candidate, fragment)) {
                return;
            }
        }
        String author = boilAuthor(candidate);
        if (author.length() > 0) {
            list.add(author);
        }
    }

    /**
     * Normalises a single author candidate.
     *
     * @param str the candidate name
     * @return an empty string if {@code str} is shorter than 5 characters; otherwise
     *         {@code str} with a trailing blank-plus-one-character suffix removed
     */
    private String boilAuthor(String str) {
        if (str.length() < 5) {
            return "";
        }
        if (str.lastIndexOf(' ') == str.length() - 2) {
            str = str.substring(0, str.length() - 2);
        }
        return str;
    }

    /**
     * Decides whether {@code paragraph} continues the abstract.
     *
     * @param paragraph the paragraph following the text collected so far
     * @param i         font size of the first abstract paragraph
     * @param i2        length of the abstract text collected so far
     * @return {@code true} if the paragraph and its first line use font size {@code i},
     *         the combined length stays below {@value #MAX_ABSTRACT_LENGTH} and the
     *         paragraph is not a keyword section
     */
    private boolean isAbstractEx(Paragraph paragraph, int i, int i2) {
        return i == paragraph.fontSize()
                && i == paragraph.getLines().get(0).getFontSize()
                && i2 + paragraph.textLength() < MAX_ABSTRACT_LENGTH
                && !isKeyWordSection(paragraph);
    }

    /**
     * @param paragraph the paragraph to test
     * @return {@code true} if the trimmed, lower-cased text is shorter than 150
     *         characters and starts with one of the keyword headings
     */
    private boolean isKeyWordSection(Paragraph paragraph) {
        String lowerCase = paragraph.text().trim().toLowerCase();
        return lowerCase.length() < 150 && StringUtils.startsWithAny(lowerCase, PREFIXES);
    }

    /**
     * Decides whether {@code paragraph} is the start of the abstract.
     *
     * @param paragraph  the candidate paragraph
     * @param paragraph2 the paragraph preceding the candidate
     * @return {@code true} if the preceding paragraph is a short abstract heading,
     *         if the candidate starts with the abstract heading, or if the candidate
     *         is longer than {@value #LONG_PARAGRAPH_LENGTH} characters; the last
     *         case also lowers {@code absConf}
     */
    private boolean isAbstract(Paragraph paragraph, Paragraph paragraph2) {
        String previousText = paragraph2.text();
        if (StringUtils.containsIgnoreCase(previousText, ABSTRACT) && previousText.length() < 12) {
            return true;
        }
        String candidateText = paragraph.text();
        if (candidateText.length() > 20 && StringUtils.startsWithIgnoreCase(candidateText, ABSTRACT)) {
            return true;
        }
        if (candidateText.length() <= LONG_PARAGRAPH_LENGTH) {
            return false;
        }
        // Recognised by length alone: accept, but with reduced confidence.
        this.absConf = (float) (this.absConf * 0.9d);
        return true;
    }

    /**
     * Tags lines that cannot belong to the paper header: every line outside the
     * content area, and every line visited before the first potential title line.
     *
     * @param document the document whose lines are tagged
     */
    private void markUseless(Document document) {
        boolean titleSeen = false;
        DocumentIterator it = document.iterator();
        while (it.hasNextLine()) {
            Line nextLine = it.nextLine();
            Page currPage = it.currPage();
            if (!isInsideContentArea(nextLine, currPage, document)) {
                nextLine.setTag(PaperTags.USELESS);
            }
            if (!titleSeen) {
                titleSeen = isPotentialTitle(nextLine, currPage, document);
                if (!titleSeen) {
                    nextLine.setTag(PaperTags.USELESS);
                }
            }
        }
    }

    /**
     * @return {@code true} if the line's y coordinate is less than 75% of the
     *         page height and the line is not right-aligned
     */
    private boolean isPotentialTitle(Line line, Page page, Document document) {
        return ((double) line.getY()) < ((double) page.getHeight()) * 0.75d
                && getAlignment(line, page, document) != Alignment.RIGHT;
    }

    /**
     * Classifies a line by comparing its distance to the left margin with its
     * distance to the right margin. The tolerance is 20, or 40 for lines wider
     * than 300.
     *
     * @return {@link Alignment#RIGHT} if the left gap exceeds the right gap by more
     *         than the tolerance, {@link Alignment#LEFT} for the opposite case,
     *         otherwise {@link Alignment#CENTER}
     */
    private Alignment getAlignment(Line line, Page page, Document document) {
        int leftGap = line.getFirstX() - document.getLeftMargin();
        int rightGap = (page.getWidth() - document.getRightMargin()) - line.getLastX();
        int tolerance = 20;
        if (line.getLastX() - line.getFirstX() > 300) {
            tolerance = 20 + 20;
        }
        if (leftGap - rightGap > tolerance) {
            return Alignment.RIGHT;
        }
        if (rightGap - leftGap > tolerance) {
            return Alignment.LEFT;
        }
        return Alignment.CENTER;
    }

    /**
     * @return {@code true} if the line lies within the document margins, allowing
     *         a slack of 20 on the left and 30 on the right
     */
    private boolean isInsideContentArea(Line line, Page page, Document document) {
        return line.getFirstX() > document.getLeftMargin() - 20
                && line.getLastX() < (page.getWidth() - document.getRightMargin()) + 30
                && line.getY() > document.getTopMargin()
                && line.getY() < page.getHeight() - document.getBottomMargin();
    }
}
