package org.galagosearch.core.parse;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;

/* loaded from: input_file:org/galagosearch/core/parse/TrecWebParser.class */
/**
 * Reads documents one at a time from a tagged text stream.
 *
 * <p>The layout this parser expects, as established by the tags it scans for:
 * <pre>
 * &lt;DOC&gt;
 * &lt;DOCNO&gt;identifier&lt;/DOCNO&gt;
 * &lt;DOCHDR&gt;
 * url [anything after the first space is ignored]
 * ...remaining header lines (skipped)...
 * &lt;/DOCHDR&gt;
 * ...body lines...
 * &lt;/DOC&gt;
 * </pre>
 * All tags are matched with {@link String#startsWith(String)}, so they must
 * begin at the first column of a line.
 */
public class TrecWebParser implements DocumentStreamParser {
    private static final String DOC_OPEN = "<DOC>";
    private static final String DOC_CLOSE = "</DOC>";
    private static final String DOCNO_OPEN = "<DOCNO>";
    private static final String DOCNO_CLOSE = "</DOCNO>";
    private static final String DOCHDR_OPEN = "<DOCHDR>";
    private static final String DOCHDR_CLOSE = "</DOCHDR>";
    private static final String DEFAULT_PORT = ":80";

    /** Source of the text being parsed; {@code null} once {@link #close()} has run. */
    BufferedReader reader;

    /**
     * Creates a parser over an already opened reader.
     *
     * @param bufferedReader the stream to parse; closed by {@link #close()}
     * @throws FileNotFoundException declared for interface compatibility; not thrown here
     * @throws IOException declared for interface compatibility; not thrown here
     */
    public TrecWebParser(BufferedReader bufferedReader) throws FileNotFoundException, IOException {
        this.reader = bufferedReader;
    }

    /**
     * Consumes lines until one starts with the given prefix.
     *
     * @param str the prefix to look for at the start of a line
     * @return the first matching line, or {@code null} if the stream ended
     *         (or the parser was already closed) before a match was found
     * @throws IOException if reading from the underlying stream fails
     */
    public String waitFor(String str) throws IOException {
        if (this.reader == null) {
            return null;
        }
        String line;
        while ((line = this.reader.readLine()) != null) {
            if (line.startsWith(str)) {
                return line;
            }
        }
        return null;
    }

    /**
     * Closes the underlying reader. Safe to call more than once.
     *
     * @throws IOException if closing the underlying stream fails
     */
    public void close() throws IOException {
        if (this.reader != null) {
            try {
                this.reader.close();
            } finally {
                // Drop the reference even if close() failed so later calls are no-ops.
                this.reader = null;
            }
        }
    }

    /**
     * Normalizes a URL: drops one trailing {@code '#'}, lowercases it, removes
     * an explicit port 80 (either as {@code ":80/"} or as a trailing
     * {@code ":80"}), and strips all trailing slashes.
     *
     * @param str the raw URL; must not be {@code null}
     * @return the normalized URL, possibly empty
     */
    public String scrubUrl(String str) {
        String url = str;
        if (url.length() > 0 && url.charAt(url.length() - 1) == '#') {
            url = url.substring(0, url.length() - 1);
        }
        // Locale.ROOT keeps lowercasing independent of the JVM's default locale
        // (e.g. the Turkish dotless i would otherwise corrupt URLs).
        url = url.toLowerCase(java.util.Locale.ROOT).replace(DEFAULT_PORT + "/", "/");
        if (url.endsWith(DEFAULT_PORT)) {
            // Remove only the trailing port, not every ":80" substring (":8080" must survive).
            url = url.substring(0, url.length() - DEFAULT_PORT.length());
        }
        int end = url.length();
        while (end > 0 && url.charAt(end - 1) == '/') {
            end--;
        }
        return url.substring(0, end);
    }

    /**
     * Reads the next line and returns its first space-delimited token as a
     * scrubbed URL.
     *
     * @return the normalized URL, or {@code null} if the stream has ended or
     *         the parser is closed
     * @throws IOException if reading from the underlying stream fails
     */
    public String readUrl() throws IOException {
        if (this.reader == null) {
            return null;
        }
        String line = this.reader.readLine();
        if (line == null) {
            return null;
        }
        int space = line.indexOf(' ');
        String rawUrl = (space < 0) ? line : line.substring(0, space);
        return scrubUrl(rawUrl);
    }

    /**
     * Parses the next document from the stream.
     *
     * <p>The reader is closed automatically when no further complete document
     * header can be read. A body that ends at end-of-stream without a closing
     * {@code </DOC>} is still returned with whatever text was read.
     *
     * @return the next document with {@code "url"} and {@code "identifier"}
     *         metadata entries, or {@code null} when the stream is exhausted
     *         or ends in the middle of a document header
     * @throws IOException if reading from the underlying stream fails
     */
    @Override
    public Document nextDocument() throws IOException {
        if (waitFor(DOC_OPEN) == null) {
            close();
            return null;
        }

        String docnoLine = waitFor(DOCNO_OPEN);
        if (docnoLine == null) {
            close();
            return null;
        }
        String identifier = extractIdentifier(docnoLine);

        if (waitFor(DOCHDR_OPEN) == null) {
            close();
            return null;
        }
        String url = readUrl();
        if (url == null) {
            close();
            return null;
        }
        // Skip the rest of the header; if it is never closed the body is simply empty.
        waitFor(DOCHDR_CLOSE);

        StringBuilder text = new StringBuilder(20480);
        String line;
        while (this.reader != null
                && (line = this.reader.readLine()) != null
                && !line.startsWith(DOC_CLOSE)) {
            text.append(line).append('\n');
        }

        Document document = new Document(identifier, text.toString());
        document.metadata.put("url", url);
        document.metadata.put("identifier", document.identifier);
        return document;
    }

    /** Extracts the trimmed text between the DOCNO tags of a line that starts with the opening tag. */
    private static String extractIdentifier(String docnoLine) {
        String rest = docnoLine.substring(DOCNO_OPEN.length());
        int closeAt = rest.indexOf(DOCNO_CLOSE);
        if (closeAt >= 0) {
            rest = rest.substring(0, closeAt);
        }
        return rest.trim();
    }
}
