package com.digitalpebble.stormcrawler.bolt;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.TestUtil;
import com.digitalpebble.stormcrawler.parse.ParsingTester;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import org.apache.storm.task.OutputCollector;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

/* loaded from: input_file:com/digitalpebble/stormcrawler/bolt/JSoupParserBoltTest.class */
/**
 * Tests for {@link JSoupParserBolt}: text extraction, outlink emission on the
 * {@code "status"} stream, and handling of robots directives coming either from
 * an {@code X-Robots-Tag} metadata entry or from an HTML {@code <meta name="robots">} tag.
 */
public class JSoupParserBoltTest extends ParsingTester {
    /**
     * Inline HTML documents exercising different spellings and values of the
     * robots meta tag. Entry {@code i} is checked against {@code answers[i]}.
     */
    public static String[] tests = {
        "<html><head><title>test page</title><META NAME=\"ROBOTS\" CONTENT=\"NONE\"> </head><body> some text</body></html>",
        "<html><head><title>test page</title><meta name=\"robots\" content=\"all\"> </head><body> some text</body></html>",
        "<html><head><title>test page</title><MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> </head><body> some text</body></html>",
        "<html><head><title>test page</title><meta name=\"robots\" content=\"none\"> </head><body> some text</body></html>",
        "<html><head><title>test page</title><meta name=\"robots\" content=\"noindex,nofollow\"> </head><body> some text</body></html>",
        "<html><head><title>test page</title><meta name=\"robots\" content=\"noindex,follow\"> </head><body> some text</body></html>",
        "<html><head><title>test page</title><meta name=\"robots\" content=\"index,nofollow\"> </head><body> some text</body></html>",
        "<html><head><title>test page</title><meta name=\"robots\" content=\"index,follow\"> <base href=\"http://www.nutch.org/\"></head><body> some text</body></html>",
        "<html><head><title>test page</title><meta name=\"robots\"> <base href=\"http://www.nutch.org/base/\"></head><body> some text</body></html>"
    };

    /**
     * Expected robots flags for each entry of {@link #tests}, in the order
     * {@code {noIndex, noFollow, noCache}}.
     */
    public static final boolean[][] answers = {
        {true, true, true},
        {false, false, false},
        {true, true, true},
        {true, true, true},
        {true, true, false},
        {true, false, false},
        {false, true, false},
        {false, false, false},
        {false, false, false}
    };

    /** URL under which most test documents are submitted to the bolt. */
    private static final String TEST_URL = "http://www.digitalpebble.com";

    /** Name of the HTML fixture shared by several tests. */
    private static final String TEST_PAGE = "digitalpebble.com.html";

    /** Name of the stream on which the tests expect outlink tuples. */
    private static final String STATUS_STREAM = "status";

    /** Position of the {@link Metadata} value in a tuple returned by {@code getEmitted()}. */
    private static final int METADATA_FIELD = 2;

    /** Position of the extracted text in a tuple returned by {@code getEmitted()}. */
    private static final int TEXT_FIELD = 3;

    /** Creates a fresh {@link JSoupParserBolt} and registers it with the parent tester before each test. */
    @Before
    public void setupParserBolt() {
        this.bolt = new JSoupParserBolt();
        setupParserBolt(this.bolt);
    }

    /**
     * Checks that the text extracted from the sample page does not contain
     * {@code "urchinTracker"}, i.e. that script content is excluded from the text.
     */
    @Test
    public void testNoScriptInText() throws IOException {
        prepareBolt();
        parse(TEST_URL, TEST_PAGE);
        String text = (String) this.output.getEmitted().remove(0).get(TEXT_FIELD);
        Assert.assertFalse("Text should not contain the content of script tags", text.contains("urchinTracker"));
    }

    /**
     * Checks that parsing the sample page emits exactly 10 tuples on the
     * {@code "status"} stream.
     * NOTE(review): presumably the fixture also contains nofollow links that must
     * not be counted -- confirm against the fixture file.
     */
    @Test
    public void testNoFollowOutlinks() throws IOException {
        prepareBolt();
        parse(TEST_URL, TEST_PAGE);
        Assert.assertEquals(10, this.output.getEmitted(STATUS_STREAM).size());
    }

    /**
     * Checks that {@code X-Robots-Tag} values {@code noindex} and {@code nofollow}
     * supplied through the input metadata suppress all {@code "status"} emissions and
     * are reflected as {@code robots.noIndex}/{@code robots.noFollow} in the output metadata.
     */
    @Test
    public void testHTTPRobots() throws IOException {
        prepareBolt();
        Metadata headers = new Metadata();
        headers.setValues("X-Robots-Tag", new String[] {"noindex", "nofollow"});
        parse(TEST_URL, TEST_PAGE, headers);

        Assert.assertEquals(0, this.output.getEmitted(STATUS_STREAM).size());
        Assert.assertEquals(1, this.output.getEmitted().size());
        assertRobotsFlags(popEmittedMetadata(), "", true, true, false);
    }

    /**
     * Runs every document of {@link #tests} through the bolt and compares the
     * resulting robots flags with the matching row of {@link #answers}.
     */
    @Test
    public void testRobotsMetaProcessor() throws IOException {
        prepareBolt();
        for (int i = 0; i < tests.length; i++) {
            // Explicit charset so the bytes handed to the parser do not depend on the platform default.
            parse(TEST_URL, tests[i].getBytes(StandardCharsets.UTF_8), new Metadata());
            Assert.assertEquals(1, this.output.getEmitted().size());
            // popEmittedMetadata() removes the tuple, so the size check above holds on the next iteration.
            boolean[] expected = answers[i];
            assertRobotsFlags(popEmittedMetadata(), " value on doc " + i, expected[0], expected[1], expected[2]);
        }
    }

    /**
     * Checks that parsing {@code redir.html} emits exactly 2 tuples on the
     * {@code "status"} stream.
     */
    @Test
    public void testHTMLRedir() throws IOException {
        prepareBolt();
        parse("http://www.somesite.com", "redir.html");
        Assert.assertEquals(2, this.output.getEmitted(STATUS_STREAM).size());
    }

    /** Prepares the bolt with an empty configuration, a mocked topology context and the test collector. */
    private void prepareBolt() {
        this.bolt.prepare(new HashMap<>(), TestUtil.getMockedTopologyContext(), new OutputCollector(this.output));
    }

    /**
     * Removes the first emitted tuple from the test collector and returns its metadata field.
     *
     * @return the non-null metadata of the removed tuple
     */
    private Metadata popEmittedMetadata() {
        Metadata emitted = (Metadata) this.output.getEmitted().remove(0).get(METADATA_FIELD);
        Assert.assertNotNull(emitted);
        return emitted;
    }

    /**
     * Asserts the three robots flags stored in the given metadata.
     *
     * @param metadata metadata emitted by the bolt
     * @param messageSuffix text appended to each failure message (may be empty)
     * @param noIndex expected value of {@code robots.noIndex}
     * @param noFollow expected value of {@code robots.noFollow}
     * @param noCache expected value of {@code robots.noCache}
     */
    private static void assertRobotsFlags(
            Metadata metadata, String messageSuffix, boolean noIndex, boolean noFollow, boolean noCache) {
        // Boolean.parseBoolean treats a missing (null) value as false.
        boolean actualNoIndex = Boolean.parseBoolean(metadata.getFirstValue("robots.noIndex"));
        boolean actualNoFollow = Boolean.parseBoolean(metadata.getFirstValue("robots.noFollow"));
        boolean actualNoCache = Boolean.parseBoolean(metadata.getFirstValue("robots.noCache"));
        Assert.assertEquals("incorrect noIndex" + messageSuffix, noIndex, actualNoIndex);
        Assert.assertEquals("incorrect noFollow" + messageSuffix, noFollow, actualNoFollow);
        Assert.assertEquals("incorrect noCache" + messageSuffix, noCache, actualNoCache);
    }
}
