package org.galagosearch.core.parse;

import ivory.core.util.CLIRUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

/* loaded from: input_file:org/galagosearch/core/parse/TagTokenizer.class */
public class TagTokenizer {
    // Lookup table, indexed by character code, marking the characters that end a token.
    // tokenize() only consults it for characters below 256.
    protected static final boolean[] splits = buildSplits();
    // Lower-case names of tags whose content is not tokenized (see buildIgnoredTags()).
    protected static HashSet<String> ignoredTags = buildIgnoredTags();
    // While non-null, the name of the ignored tag whose end tag must be seen before
    // tokenizing resumes.
    protected String ignoreUntil;
    // Applied to the finished document at the end of tokenize(Document).
    protected StringPooler pooler = new StringPooler();
    // Text of the document currently being tokenized; null between documents.
    protected String text = null;
    // Index into text of the character currently being examined.
    protected int position = 0;
    // Index of the most recent split point; the next token starts at lastSplit + 1.
    protected int lastSplit = -1;
    // Tokens emitted so far for the current document.
    ArrayList<String> tokens = new ArrayList<>();
    // Start tags seen but not yet closed, grouped by tag name; the last list entry is
    // the most recently opened one.
    HashMap<String, ArrayList<BeginTag>> openTags = new HashMap<>();
    // Tags for which both the start and the end have been seen.
    ArrayList<ClosedTag> closedTags = new ArrayList<>();
    // For each entry in tokens, the start/end offsets passed to addToken().
    ArrayList<Pair> tokenPositions = new ArrayList<>();

    /* JADX INFO: Access modifiers changed from: package-private */
    /* renamed from: org.galagosearch.core.parse.TagTokenizer$1, reason: invalid class name */
    /* loaded from: input_file:org/galagosearch/core/parse/TagTokenizer$1.class */
    /**
     * Lookup table from {@link StringStatus} ordinal to a small integer case label
     * (1 = NeedsSimpleFix, 2 = NeedsComplexFix, 3 = NeedsAcronymProcessing, 4 = Clean),
     * in the shape a compiler emits for a switch over the enum.
     */
    public static /* synthetic */ class AnonymousClass1 {
        static final /* synthetic */ int[] $SwitchMap$org$galagosearch$core$parse$TagTokenizer$StringStatus;

        static {
            // Fill a local table first and publish it once every constant has been tried.
            int[] caseLabels = new int[StringStatus.values().length];
            try {
                caseLabels[StringStatus.NeedsSimpleFix.ordinal()] = 1;
            } catch (NoSuchFieldError ignored) {
                // Constant missing at link time: its slot keeps the default label 0.
            }
            try {
                caseLabels[StringStatus.NeedsComplexFix.ordinal()] = 2;
            } catch (NoSuchFieldError ignored) {
                // Constant missing at link time: its slot keeps the default label 0.
            }
            try {
                caseLabels[StringStatus.NeedsAcronymProcessing.ordinal()] = 3;
            } catch (NoSuchFieldError ignored) {
                // Constant missing at link time: its slot keeps the default label 0.
            }
            try {
                caseLabels[StringStatus.Clean.ordinal()] = 4;
            } catch (NoSuchFieldError ignored) {
                // Constant missing at link time: its slot keeps the default label 0.
            }
            $SwitchMap$org$galagosearch$core$parse$TagTokenizer$StringStatus = caseLabels;
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/galagosearch/core/parse/TagTokenizer$BeginTag.class */
    /**
     * A start tag that has been seen, stamped with where the tokenizer was at that moment.
     */
    public class BeginTag {
        String name;
        Map<String, String> attributes;
        // Value of the tokenizer's position field (an index into text) at construction.
        int bytePosition;
        // Number of tokens already emitted at construction.
        int termPosition;

        /**
         * @param tagName       tag name as supplied by the caller
         * @param tagAttributes attribute map as supplied by the caller (stored, not copied)
         */
        public BeginTag(String tagName, Map<String, String> tagAttributes) {
            name = tagName;
            attributes = tagAttributes;
            // Both values come from the enclosing tokenizer instance.
            bytePosition = position;
            termPosition = tokens.size();
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/galagosearch/core/parse/TagTokenizer$ClosedTag.class */
    /**
     * A tag whose start and end are both known: the start data is copied from a
     * {@link BeginTag}, the end data is read from the tokenizer at construction time.
     */
    public class ClosedTag {
        String name;
        Map<String, String> attributes;
        int byteStart;
        int termStart;
        int byteEnd;
        int termEnd;

        /**
         * @param opening the start tag being closed
         */
        public ClosedTag(BeginTag opening) {
            // Start side: taken over from the opening tag.
            name = opening.name;
            attributes = opening.attributes;
            byteStart = opening.bytePosition;
            termStart = opening.termPosition;
            // End side: the enclosing tokenizer's current position and token count.
            byteEnd = position;
            termEnd = tokens.size();
        }
    }

    /* loaded from: input_file:org/galagosearch/core/parse/TagTokenizer$Pair.class */
    /**
     * A mutable (start, end) pair of integer offsets.
     */
    public static class Pair {
        public int start;
        public int end;

        /**
         * @param i  value stored in {@link #start}
         * @param i2 value stored in {@link #end}
         */
        public Pair(int i, int i2) {
            this.start = i;
            this.end = i2;
        }

        /**
         * Renders the pair as {@code start,end}.
         *
         * <p>Plain concatenation is used instead of {@code String.format} so the output
         * does not depend on the JVM's default locale and no boxing is needed.
         */
        @Override
        public String toString() {
            return this.start + "," + this.end;
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:org/galagosearch/core/parse/TagTokenizer$StringStatus.class */
    /**
     * Result of checkTokenStatus(): how much clean-up a raw token needs before it is stored.
     */
    public enum StringStatus {
        // Every character was an ASCII lower-case letter or digit.
        Clean,
        // The only irregular character seen was a single ASCII upper-case letter or apostrophe.
        NeedsSimpleFix,
        // Some other character was seen (or a second upper-case letter/apostrophe).
        NeedsComplexFix,
        // A period was seen; scanning stops as soon as this is detected.
        NeedsAcronymProcessing
    }

    /**
     * Builds the table of token-splitting characters.
     *
     * @return a 257-element array in which index {@code c} is {@code true} when the
     *         character with code {@code c} separates tokens: every code from 0 to 32
     *         plus a fixed set of ASCII punctuation
     */
    protected static boolean[] buildSplits() {
        // A fresh boolean array is all false, so only the true entries need writing.
        boolean[] table = new boolean[257];

        String separators = " \t\n\r;\"&/:!#?$%()@^*+-,=><[]{}|`~_";
        for (int k = 0; k < separators.length(); k++) {
            table[separators.charAt(k)] = true;
        }

        // Control characters and the space character.
        for (int code = 0; code <= 32; code++) {
            table[code] = true;
        }
        return table;
    }

    /**
     * Builds the set of tag names whose content is skipped.
     *
     * @return a new mutable set containing {@code "style"} and {@code "script"}
     */
    protected static HashSet<String> buildIgnoredTags() {
        HashSet<String> names = new HashSet<>();
        Collections.addAll(names, "style", "script");
        return names;
    }

    /**
     * Returns the tokenizer to its initial state so another document can be processed.
     */
    public void reset() {
        // Scanner state.
        text = null;
        ignoreUntil = null;
        position = 0;
        lastSplit = -1;

        // Output accumulated for the previous document.
        tokens.clear();
        closedTags.clear();
        openTags.clear();
        if (tokenPositions == null) {
            return;
        }
        tokenPositions.clear();
    }

    /**
     * Skips a construct that starts with {@code <!} at the current position.
     *
     * <p>For a comment ({@code <!--}) the position is moved to the {@code >} that ends the
     * closing {@code -->}; for anything else it is moved to the next {@code >}. If no
     * terminator exists the position is moved to the end of the text.
     */
    protected void skipComment() {
        int close;
        // startsWith(prefix, offset) tests in place; the previous substring() call copied
        // the whole remainder of the document for every "<!" encountered.
        if (this.text.startsWith("<!--", this.position)) {
            close = this.text.indexOf("-->", this.position + 1);
            if (close >= 0) {
                // Land on the '>' of "-->".
                close += 2;
            }
        } else {
            close = this.text.indexOf(">", this.position + 1);
        }
        this.position = (close < 0) ? this.text.length() : close;
    }

    /**
     * Skips a {@code <? ... ?>} construct starting at the current position.
     *
     * <p>The position is left on the {@code ?} of the closing {@code ?>}, or at the end of
     * the text when no terminator is found.
     */
    protected void skipProcessingInstruction() {
        int terminator = text.indexOf("?>", position + 1);
        position = (terminator >= 0) ? terminator : text.length();
    }

    /**
     * Tells whether {@code c} separates parts of a tag.
     *
     * <p>{@link Character#isSpaceChar(char)} alone misses tab, newline and carriage return
     * (they are control characters, not Unicode space separators), so tags broken across
     * lines were mis-parsed. Both predicates are combined so everything that used to count
     * as a space still does.
     */
    private static boolean isTagSpace(char c) {
        return Character.isSpaceChar(c) || Character.isWhitespace(c);
    }

    /** Tells whether {@code c} is a single or double quote. */
    private static boolean isQuote(char c) {
        return c == '"' || c == '\'';
    }

    /**
     * Handles an end tag; {@code position} must be on the {@code <} of {@code </name ...>}.
     *
     * <p>The lower-cased tag name ends an ignored region if it matches {@code ignoreUntil};
     * when no region is being ignored the matching open tag is closed. On return
     * {@code position} is on the tag's {@code >}, or at the end of the text if there is none.
     */
    protected void parseEndTag() {
        int nameStart = this.position + 2;
        int i = nameStart;
        while (i < this.text.length()) {
            char c = this.text.charAt(i);
            if (isTagSpace(c) || c == '>') {
                break;
            }
            i++;
        }
        // Locale.ROOT keeps tag names stable regardless of the JVM default locale
        // (e.g. "SCRIPT" must become "script" under a Turkish locale too).
        String tagName = this.text.substring(nameStart, i).toLowerCase(Locale.ROOT);

        if (this.ignoreUntil != null && this.ignoreUntil.equals(tagName)) {
            this.ignoreUntil = null;
        }
        if (this.ignoreUntil == null) {
            closeTag(tagName);
        }

        while (i < this.text.length() && this.text.charAt(i) != '>') {
            i++;
        }
        this.position = i;
    }

    /**
     * Closes the most recently opened tag called {@code str}, if there is one, by moving it
     * from {@code openTags} to {@code closedTags}.
     *
     * @param str tag name to close
     */
    protected void closeTag(String str) {
        ArrayList<BeginTag> stack = this.openTags.get(str);
        if (stack == null || stack.isEmpty()) {
            return;
        }
        BeginTag opening = stack.remove(stack.size() - 1);
        this.closedTags.add(new ClosedTag(opening));
    }

    /**
     * Finds the first non-space character at or after {@code i}.
     *
     * @param i index to start from
     * @return the index found, or {@code Integer.MIN_VALUE} if {@code i} is negative or
     *         only space characters remain
     */
    protected int indexOfNonSpace(int i) {
        if (i < 0) {
            return Integer.MIN_VALUE;
        }
        for (int k = i; k < this.text.length(); k++) {
            if (!isTagSpace(this.text.charAt(k))) {
                return k;
            }
        }
        return Integer.MIN_VALUE;
    }

    /**
     * Finds where the attribute starting at {@code i} ends.
     *
     * <p>For a quoted value the result is the index of the closing quote; otherwise it is
     * the index of the first space or {@code >}. A quoted value is only closed by the same
     * quote character that opened it, so {@code alt="Bob's"} is read as one value. A quote
     * preceded by an unescaped backslash is treated as ordinary content.
     *
     * @param i  index of the first character of the attribute
     * @param i2 last index (inclusive) that may be examined
     * @return the end index, or {@code Integer.MIN_VALUE} if {@code i} is negative or no
     *         end is found up to {@code i2}
     */
    protected int indexOfEndAttribute(int i, int i2) {
        if (i < 0) {
            return Integer.MIN_VALUE;
        }
        char openQuote = 0;
        boolean escaped = false;
        for (int k = i; k <= i2 && k < this.text.length(); k++) {
            char c = this.text.charAt(k);
            if (isQuote(c) && !escaped) {
                if (openQuote == 0) {
                    openQuote = c;
                } else if (c == openQuote) {
                    return k;
                }
                // The other quote character inside a quoted value is plain content.
            } else {
                if (openQuote == 0 && (isTagSpace(c) || c == '>')) {
                    return k;
                }
                escaped = c == '\\' && !escaped;
            }
        }
        return Integer.MIN_VALUE;
    }

    /**
     * Finds the first space character at or after {@code i}.
     *
     * @param i index to start from
     * @return the index found, or {@code Integer.MIN_VALUE} if {@code i} is negative or no
     *         space character remains
     */
    protected int indexOfSpace(int i) {
        if (i < 0) {
            return Integer.MIN_VALUE;
        }
        for (int k = i; k < this.text.length(); k++) {
            if (isTagSpace(this.text.charAt(k))) {
                return k;
            }
        }
        return Integer.MIN_VALUE;
    }

    /**
     * Finds the first {@code =} in the half-open range {@code [i, i2)}.
     *
     * @param i  first index to examine
     * @param i2 index at which to stop (exclusive)
     * @return the index found, or {@code Integer.MIN_VALUE} if {@code i} is negative or the
     *         range holds no {@code =}
     */
    protected int indexOfEquals(int i, int i2) {
        if (i < 0) {
            return Integer.MIN_VALUE;
        }
        for (int k = i; k < i2; k++) {
            if (this.text.charAt(k) == '=') {
                return k;
            }
        }
        return Integer.MIN_VALUE;
    }

    /**
     * Handles a start tag; {@code position} must be on the {@code <}.
     *
     * <p>Reads the lower-cased tag name and its {@code name=value} attributes (attribute
     * names are lower-cased, values kept verbatim without surrounding quotes). Unless the
     * tag is in {@code ignoredTags} it is recorded as open, and immediately closed when it
     * ends in {@code />}. A non-self-closing ignored tag sets {@code ignoreUntil}. On return
     * {@code position} is where scanning stopped, normally the tag's {@code >}.
     *
     * <p>NOTE(review): a tag written as {@code <br/>} (no space, no attributes) still gets
     * the name {@code "br/"}, as before; confirm whether that should be normalised.
     */
    protected void parseBeginTag() {
        int nameStart = this.position + 1;
        int nameEnd = nameStart;
        while (nameEnd < this.text.length()) {
            char c = this.text.charAt(nameEnd);
            if (isTagSpace(c) || c == '>') {
                break;
            }
            nameEnd++;
        }
        String tagName = this.text.substring(nameStart, nameEnd).toLowerCase(Locale.ROOT);

        int cursor = indexOfNonSpace(nameEnd);
        int tagEnd = this.text.indexOf(">", cursor + 1);
        boolean selfClosing = false;
        Map<String, String> attributes = new HashMap<>();

        while (cursor >= 0 && tagEnd >= 0 && cursor < tagEnd) {
            int attrStart = indexOfNonSpace(cursor);
            if (attrStart > 0) {
                char lead = this.text.charAt(attrStart);
                if (lead == '>') {
                    cursor = attrStart;
                    break;
                }
                if (lead == '/' && attrStart + 1 < this.text.length()
                        && this.text.charAt(attrStart + 1) == '>') {
                    cursor = attrStart + 1;
                    selfClosing = true;
                    break;
                }
            }

            int attrEnd = indexOfEndAttribute(attrStart, tagEnd);
            if (attrEnd < 0) {
                // No usable attribute boundary before the '>': give up on the rest.
                cursor = tagEnd;
                break;
            }

            // An '=' at attrStart would mean an empty attribute name; skip such input.
            int equals = indexOfEquals(attrStart, attrEnd);
            if (equals > attrStart) {
                int valueStart = equals + 1;
                if (isQuote(this.text.charAt(valueStart))) {
                    valueStart++;
                }
                if (valueStart < attrEnd) {
                    attributes.put(
                            this.text.substring(attrStart, equals).toLowerCase(Locale.ROOT),
                            this.text.substring(valueStart, attrEnd));
                }
            }

            if (attrEnd >= this.text.length()) {
                endParsing();
                cursor = this.position;
                break;
            }
            // Always step past a closing quote. Previously this was skipped for empty
            // values such as alt="", which made the closing quote look like the opening
            // quote of the next attribute and lost every attribute after it.
            cursor = isQuote(this.text.charAt(attrEnd)) ? attrEnd + 1 : attrEnd;
        }

        if (!ignoredTags.contains(tagName)) {
            // BeginTag reads this.position, which is still on the tag's '<' here.
            BeginTag opening = new BeginTag(tagName, attributes);
            ArrayList<BeginTag> stack = this.openTags.get(tagName);
            if (stack == null) {
                stack = new ArrayList<>();
                this.openTags.put(tagName, stack);
            }
            stack.add(opening);
            if (selfClosing) {
                closeTag(tagName);
            }
        } else if (!selfClosing) {
            this.ignoreUntil = tagName;
        }
        this.position = cursor;
    }

    /**
     * Moves the position to the end of the text so that scanning stops.
     */
    protected void endParsing() {
        final int endOfText = text.length();
        position = endOfText;
    }

    /**
     * Called at a token boundary: emits the text between the previous split and the
     * current position as a token (after whatever normalisation its status calls for),
     * then records the current position as the new split point.
     *
     * <p>Nothing is emitted when the two split points are adjacent.
     */
    protected void onSplit() {
        if (this.position - this.lastSplit > 1) {
            int start = this.lastSplit + 1;
            int end = this.position;
            String raw = this.text.substring(start, end);

            // Switch on the enum itself rather than through a hand-maintained ordinal
            // table and an unrelated integer constant used as a case label.
            switch (checkTokenStatus(raw)) {
                case NeedsSimpleFix:
                    addToken(tokenSimpleFix(raw), start, end);
                    break;
                case NeedsComplexFix:
                    addToken(tokenComplexFix(raw), start, end);
                    break;
                case NeedsAcronymProcessing:
                    // Adds its own token(s); nothing further to add here.
                    tokenAcronymProcessing(raw, start, end);
                    break;
                case Clean:
                default:
                    addToken(raw, start, end);
                    break;
            }
        }
        this.lastSplit = this.position;
    }

    /**
     * Records a token and its offsets, unless it is empty or too long.
     *
     * <p>Tokens of up to 16 characters are always kept; longer ones are kept only when
     * {@code Utility.makeBytes} yields fewer than 100 bytes for them.
     *
     * @param str the token text
     * @param i   start offset stored with the token
     * @param i2  end offset stored with the token
     */
    protected void addToken(String str, int i, int i2) {
        if (str.isEmpty()) {
            return;
        }
        // The byte conversion is only evaluated for tokens longer than 16 characters.
        boolean acceptable = str.length() <= 16 || Utility.makeBytes(str).length < 100;
        if (!acceptable) {
            return;
        }
        tokens.add(str);
        tokenPositions.add(new Pair(i, i2));
    }

    /**
     * Normalises a token that contains characters outside ASCII letters and digits:
     * applies {@link #tokenSimpleFix(String)} and then lower-cases the result.
     *
     * <p>{@code Locale.ROOT} is used so the produced terms do not depend on the JVM's
     * default locale (under a Turkish locale, for instance, {@code "I"} would otherwise
     * become a dotless {@code "ı"}).
     *
     * @param str raw token
     * @return the normalised token
     */
    protected String tokenComplexFix(String str) {
        return tokenSimpleFix(str).toLowerCase(Locale.ROOT);
    }

    /**
     * Emits the token(s) for a raw token that contains at least one period.
     *
     * <p>After normalisation and removal of leading/trailing periods:
     * <ul>
     *   <li>no period left: the token is added as is;</li>
     *   <li>a period at every odd index (e.g. {@code u.s.a}): the periods are removed and
     *       a single token is added;</li>
     *   <li>otherwise: the token is split at the periods and every piece longer than one
     *       character is added.</li>
     * </ul>
     *
     * @param str raw token
     * @param i   start offset of the raw token
     * @param i2  end offset of the raw token
     */
    protected void tokenAcronymProcessing(String str, int i, int i2) {
        String cleaned = tokenComplexFix(str);
        int begin = i;
        int finish = i2;

        // Trim periods from both ends, moving the offsets along with them.
        while (cleaned.startsWith(".")) {
            cleaned = cleaned.substring(1);
            begin++;
        }
        while (cleaned.endsWith(".")) {
            cleaned = cleaned.substring(0, cleaned.length() - 1);
            finish--;
        }

        if (cleaned.indexOf('.') < 0) {
            addToken(cleaned, begin, finish);
            return;
        }

        // Acronym test: every odd index must hold a period.
        boolean acronym = !cleaned.isEmpty();
        int odd = 1;
        while (acronym && odd < cleaned.length()) {
            acronym = cleaned.charAt(odd) == '.';
            odd += 2;
        }
        if (acronym) {
            addToken(cleaned.replace(".", ""), begin, finish);
            return;
        }

        // Ordinary dotted text: emit each piece longer than one character.
        int pieceStart = 0;
        int dot = cleaned.indexOf('.');
        while (dot >= 0) {
            if (dot - pieceStart > 1) {
                addToken(cleaned.substring(pieceStart, dot), begin + pieceStart, begin + dot);
            }
            pieceStart = dot + 1;
            dot = cleaned.indexOf('.', pieceStart);
        }
        if (cleaned.length() - pieceStart > 1) {
            addToken(cleaned.substring(pieceStart), begin + pieceStart, finish);
        }
    }

    /**
     * Lower-cases ASCII upper-case letters and drops apostrophes; every other character
     * is copied unchanged.
     *
     * @param str raw token
     * @return the cleaned token
     */
    protected String tokenSimpleFix(String str) {
        StringBuilder cleaned = new StringBuilder(str.length());
        for (int k = 0; k < str.length(); k++) {
            char c = str.charAt(k);
            if (c == '\'') {
                // Apostrophes are removed outright.
                continue;
            }
            if (c >= 'A' && c <= 'Z') {
                cleaned.append((char) (c - 'A' + 'a'));
            } else {
                cleaned.append(c);
            }
        }
        return cleaned.toString();
    }

    /**
     * Classifies a raw token by the kind of clean-up it needs.
     *
     * <p>ASCII lower-case letters and digits never change the status. The first ASCII
     * upper-case letter or apostrophe seen while the status is still {@code Clean} makes
     * it {@code NeedsSimpleFix}. A period yields {@code NeedsAcronymProcessing}
     * immediately. Any other character, including a further upper-case letter or
     * apostrophe once the status is no longer {@code Clean}, makes it
     * {@code NeedsComplexFix}.
     *
     * @param str raw token
     * @return the status for {@code str}
     */
    protected StringStatus checkTokenStatus(String str) {
        StringStatus status = StringStatus.Clean;
        for (int k = 0; k < str.length(); k++) {
            char c = str.charAt(k);
            boolean lowerLetter = 'a' <= c && c <= 'z';
            boolean digit = '0' <= c && c <= '9';
            if (lowerLetter || digit) {
                continue;
            }

            boolean upperOrApostrophe = ('A' <= c && c <= 'Z') || c == '\'';
            if (upperOrApostrophe && status == StringStatus.Clean) {
                status = StringStatus.NeedsSimpleFix;
            } else if (c == '.') {
                // A period settles the matter; the rest of the token is not inspected.
                return StringStatus.NeedsAcronymProcessing;
            } else {
                status = StringStatus.NeedsComplexFix;
            }
        }
        return status;
    }

    /**
     * Dispatches on the character following a {@code <} at the current position, then
     * marks wherever the handler stopped as the latest split point.
     *
     * <p>A {@code <} that is the last character of the text ends parsing.
     */
    protected void onStartBracket() {
        int next = position + 1;
        if (next >= text.length()) {
            endParsing();
        } else {
            switch (text.charAt(next)) {
                case '/':
                    parseEndTag();
                    break;
                case '!':
                    skipComment();
                    break;
                case '?':
                    skipProcessingInstruction();
                    break;
                default:
                    parseBeginTag();
                    break;
            }
        }
        lastSplit = position;
    }

    /**
     * Collects every tag seen in the document into one sorted list.
     *
     * <p>Tags that were never closed are given an end equal to their start term position;
     * closed tags keep their recorded start and end.
     *
     * @return the tags, ordered by {@code Collections.sort}
     */
    protected ArrayList<Tag> coalesceTags() {
        ArrayList<Tag> result = new ArrayList<>();

        for (ArrayList<BeginTag> unclosed : openTags.values()) {
            for (BeginTag open : unclosed) {
                result.add(new Tag(open.name, open.attributes, open.termPosition, open.termPosition));
            }
        }
        for (ClosedTag closed : closedTags) {
            result.add(new Tag(closed.name, closed.attributes, closed.termStart, closed.termEnd));
        }

        Collections.sort(result);
        return result;
    }

    /**
     * Handles an {@code &} at the current position: ends the current token and, if the
     * ampersand starts a character reference terminated by {@code ;}, skips the reference
     * so that its name is not emitted as a token.
     *
     * <p>Reference characters are ASCII letters, digits and {@code #}. Upper-case letters
     * are accepted because named references such as {@code &Eacute;} and hexadecimal ones
     * such as {@code &#x1F;} contain them; previously those were not skipped and leaked
     * into the token stream. If the run of reference characters is not followed by
     * {@code ;} nothing is skipped.
     */
    public void onAmpersand() {
        onSplit();
        for (int i = this.position + 1; i < this.text.length(); i++) {
            char c = this.text.charAt(i);
            boolean referenceChar = (c >= 'a' && c <= 'z')
                    || (c >= 'A' && c <= 'Z')
                    || (c >= '0' && c <= '9')
                    || c == '#';
            if (referenceChar) {
                continue;
            }
            if (c == ';') {
                // Jump to the terminator and treat it as the latest split point.
                this.position = i;
                this.lastSplit = i;
            }
            return;
        }
    }

    /**
     * Tokenizes {@code document.text} and stores the results on the document: the tokens
     * in {@code document.terms} and the tags in {@code document.tags}. The document is
     * finally passed through the string pooler.
     *
     * <p>If scanning fails with an exception, a warning is logged, scanning stops, and
     * whatever was collected up to that point is still stored.
     *
     * @param document the document to tokenize; its {@code text} is read and its
     *                 {@code terms} and {@code tags} are overwritten
     */
    public void tokenize(Document document) {
        reset();
        this.text = document.text;
        while (this.position >= 0 && this.position < this.text.length()) {
            try {
                char c = this.text.charAt(this.position);
                if (c == '<') {
                    if (this.ignoreUntil == null) {
                        onSplit();
                    }
                    onStartBracket();
                } else if (this.ignoreUntil == null) {
                    if (c == '&') {
                        onAmpersand();
                    } else if (c < 256 && splits[c]) {
                        onSplit();
                    }
                }
                this.position++;
            } catch (Exception e) {
                Logger.getLogger(getClass().toString())
                        .log(Level.WARNING, "Parse failure: " + document.identifier, e);
                // Stop scanning: position was not advanced, so retrying the same
                // character would throw again and never terminate.
                break;
            }
        }
        if (this.ignoreUntil == null) {
            onSplit();
        }
        document.terms = new ArrayList<>(this.tokens);
        document.tags = coalesceTags();
        this.pooler.transform(document);
    }

    /**
     * Convenience overload: wraps {@code str} in a new {@code Document} and tokenizes it.
     *
     * @param str the text to tokenize
     * @return the new document, populated by {@code tokenize(Document)}
     * @throws IOException declared by the signature
     */
    public Document tokenize(String str) throws IOException {
        Document wrapped = new Document();
        wrapped.text = str;
        tokenize(wrapped);
        return wrapped;
    }

    /**
     * Returns the live list of token offsets (not a copy); reset() clears it.
     *
     * @return the start/end pairs recorded by addToken(), one per emitted token
     */
    public ArrayList<Pair> getTokenPositions() {
        return this.tokenPositions;
    }

    /**
     * @return {@code Document.class}
     */
    public Class<Document> getInputClass() {
        return Document.class;
    }

    /**
     * @return {@code Document.class}
     */
    public Class<Document> getOutputClass() {
        return Document.class;
    }
}
