package com.aliasi.test.unit.lm;

import com.aliasi.lm.CompiledTokenizedLM;
import com.aliasi.lm.NGramBoundaryLM;
import com.aliasi.lm.TokenizedLM;
import com.aliasi.lm.TrieIntSeqCounter;
import com.aliasi.lm.UniformBoundaryLM;
import com.aliasi.symbol.SymbolTable;
import com.aliasi.test.unit.Asserts;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.util.Math;
import com.aliasi.util.ScoredObject;
import com.aliasi.util.Strings;
import com.aliasi.xml.XHtmlWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Random;
import java.util.SortedSet;
import junit.framework.Assert;
import org.junit.Test;

/* loaded from: input_file:com/aliasi/test/unit/lm/TokenizedLMTest.class */
/**
 * Unit tests for {@link TokenizedLM}.
 *
 * <p>The tests exercise sequence-count training, constructor argument
 * validation, chi-squared independence scores, log2 estimates (both directly
 * and after a compile/deserialize round trip), collocation and term
 * extraction, and the equivalence of repeated training versus training with
 * an explicit count.
 */
public class TokenizedLMTest {
    private static final int MAX_NGRAM = 3;
    private static final double LAMBDA_FACTOR = 4.0d;

    /** Fixed seed so that a failure in the randomized tests can be reproduced. */
    private static final long RANDOM_SEED = 42L;

    /**
     * Source of randomness for the randomized tests. JUnit 4 creates a new
     * test-class instance per test method, so every test starts from the
     * same seed.
     */
    private final Random random = new Random(RANDOM_SEED);

    /**
     * Prints the three token-probability variants of {@code lm} for the
     * whole of {@code tokens} to standard output.
     *
     * @param tokens tokens whose probabilities are printed
     * @param lm language model queried for the probabilities
     */
    void dumpProbs(String[] tokens, TokenizedLM lm) {
        int end = tokens.length;
        System.out.println("TOKENS: " + Arrays.asList(tokens));
        System.out.println("lm.tokenProbability(): " + lm.tokenProbability(tokens, 0, end));
        System.out.println("lm.tokenProbCharSmooth(): " + lm.tokenProbCharSmooth(tokens, 0, end));
        System.out.println("lm.tokenProbCharSmoothNoBound(): " + lm.tokenProbCharSmoothNoBounds(tokens, 0, end));
        System.out.println();
    }

    /**
     * Smoke test: trains the same sentence 100 times and prints token
     * probabilities for three single tokens. There are no assertions; the
     * test passes as long as nothing throws.
     */
    @Test
    public void testPabloBug() {
        TokenizedLM lm = new TokenizedLM(
                IndoEuropeanTokenizerFactory.INSTANCE,
                MAX_NGRAM,
                new NGramBoundaryLM(MAX_NGRAM),
                new NGramBoundaryLM(MAX_NGRAM),
                1.0d);
        for (int i = 0; i < 100; i++) {
            lm.handle("ba ba be lo and behold and and lo some more");
        }
        dumpProbs(new String[]{"ba"}, lm);
        dumpProbs(new String[]{"bo"}, lm);
        dumpProbs(new String[]{"%%"}, lm);
    }

    /**
     * Checks that {@code trainSequence} adds the given count to the
     * underlying sequence counter, including for the empty sequence.
     */
    @Test
    public void testTrainSequence() {
        TokenizedLM lm = new TokenizedLM(IndoEuropeanTokenizerFactory.INSTANCE, MAX_NGRAM);
        SymbolTable symbolTable = lm.symbolTable();
        TrieIntSeqCounter counter = lm.sequenceCounter();
        int[] emptySeq = new int[0];

        // A fresh model already reports a count of 1 for the empty sequence.
        Assert.assertEquals(1, counter.count(emptySeq, 0, 0));

        lm.trainSequence("a b", 2);
        lm.trainSequence("a c", 3);
        lm.trainSequence("a b c", 4);

        // Symbol identifiers are looked up only after the tokens have been trained.
        int idA = symbolTable.symbolToID("a");
        int idB = symbolTable.symbolToID("b");
        int idC = symbolTable.symbolToID("c");

        int[] seqAB = {idA, idB};
        int[] seqAC = {idA, idC};
        int[] seqABC = {idA, idB, idC};
        Assert.assertEquals(2, counter.count(seqAB, 0, seqAB.length));
        Assert.assertEquals(3, counter.count(seqAC, 0, seqAC.length));
        Assert.assertEquals(4, counter.count(seqABC, 0, seqABC.length));
        Assert.assertEquals(5L, counter.extensionCount(new int[]{idA}, 0, 1));

        lm.trainSequence("a a a c c c", 111);
        int[] seqCCC = {idC, idC, idC};
        int[] seqCC = {idC, idC};
        Assert.assertEquals(111, counter.count(seqCCC, 0, seqCCC.length));
        Assert.assertEquals(111L, counter.extensionCount(seqCC, 0, seqCC.length));

        lm.trainSequence(Strings.EMPTY_STRING, 999);
        Assert.assertEquals(1000, counter.count(emptySeq, 0, 0));
    }

    /** An n-gram order of zero must be rejected with {@link IllegalArgumentException}. */
    @Test
    public void testZeroGram() {
        try {
            new TokenizedLM(
                    IndoEuropeanTokenizerFactory.INSTANCE,
                    0,
                    new UniformBoundaryLM(16),
                    new UniformBoundaryLM(16),
                    LAMBDA_FACTOR);
            Assert.fail();
        } catch (IllegalArgumentException expected) {
            Asserts.succeed();
        }
    }

    /** Smoke test: a unigram model can be constructed and trained without throwing. */
    @Test
    public void testUnigram() {
        TokenizedLM lm = new TokenizedLM(
                IndoEuropeanTokenizerFactory.INSTANCE,
                1,
                new UniformBoundaryLM(16),
                new UniformBoundaryLM(16),
                LAMBDA_FACTOR);
        lm.train("John Smith");
    }

    /** Smoke test: a 4-gram model can be constructed and trained without throwing. */
    @Test
    public void testBiggerGram() {
        TokenizedLM lm = new TokenizedLM(
                IndoEuropeanTokenizerFactory.INSTANCE,
                4,
                new UniformBoundaryLM(16),
                new UniformBoundaryLM(16),
                LAMBDA_FACTOR);
        lm.train("John Smith");
    }

    /**
     * Checks the relative ordering of chi-squared independence scores for a
     * few bigrams after training on a short sequence.
     */
    @Test
    public void testChiSquaredIndependence() {
        TokenizedLM lm = new TokenizedLM(
                IndoEuropeanTokenizerFactory.INSTANCE,
                MAX_NGRAM,
                new UniformBoundaryLM(16),
                new UniformBoundaryLM(16),
                LAMBDA_FACTOR);
        lm.train("a b c a b d a b e a b");

        SymbolTable symbolTable = lm.symbolTable();
        Assert.assertEquals(5, symbolTable.numSymbols());

        int idA = symbolTable.symbolToID("a");
        int idB = symbolTable.symbolToID("b");
        int idC = symbolTable.symbolToID("c");
        int idD = symbolTable.symbolToID("d");
        int idE = symbolTable.symbolToID("e");
        Assert.assertTrue(idA >= 0);
        Assert.assertTrue(idB >= 0);
        Assert.assertTrue(idC >= 0);
        Assert.assertTrue(idD >= 0);
        Assert.assertTrue(idE >= 0);

        double scoreAB = lm.chiSquaredIndependence(new int[]{idA, idB});
        double scoreBC = lm.chiSquaredIndependence(new int[]{idB, idC});
        Assert.assertTrue(scoreAB > scoreBC);

        double scoreCA = lm.chiSquaredIndependence(new int[]{idC, idA});
        double scoreCE = lm.chiSquaredIndependence(new int[]{idC, idE});
        Assert.assertTrue(scoreCA > scoreCE);
    }

    /**
     * Compares log2 estimates against hand-computed products of constant
     * probabilities, before and after training on a single token.
     *
     * <p>NOTE(review): the expected values are reproduced verbatim from the
     * original expressions. The factor 0.2 looks like
     * {@code 1 / (1 + LAMBDA_FACTOR)}, and 1/16 and 2^-14 look like
     * probabilities contributed by the two uniform boundary models; confirm
     * against the {@code TokenizedLM} and {@code UniformBoundaryLM}
     * documentation before relying on that reading.
     */
    @Test
    public void testConstantSubModels() throws ClassNotFoundException, IOException {
        TokenizedLM lm = new TokenizedLM(
                IndoEuropeanTokenizerFactory.INSTANCE,
                MAX_NGRAM,
                new UniformBoundaryLM(127),
                new UniformBoundaryLM(15),
                LAMBDA_FACTOR);

        final double oneSixteenth = 0.0625d;           // 2^-4
        final double twoPowMinus14 = 6.103515625E-5d;  // 2^-14
        final double fifth = 0.2d * 1.0d;
        final double rest = 1.0d - 0.2d;
        final double oneOver256 = (0.0625d * 1.0d) / 16.0d;

        assertEstimate(Math.log2(fifth * oneSixteenth), lm, Strings.EMPTY_STRING);
        assertEstimate(
                Math.log2(rest * fifth * twoPowMinus14 * oneSixteenth * oneSixteenth),
                lm,
                "a");
        assertEstimate(
                Math.log2(rest * rest * fifth
                        * oneSixteenth * oneSixteenth
                        * oneOver256
                        * twoPowMinus14 * twoPowMinus14),
                lm,
                "a b");
        assertEstimate(
                Math.log2(rest * rest * rest * fifth
                        * oneSixteenth * oneSixteenth
                        * oneOver256 * oneOver256
                        * twoPowMinus14 * twoPowMinus14 * twoPowMinus14),
                lm,
                "a b c");
        assertEstimate(
                Math.log2(rest * rest * rest * rest * fifth
                        * oneSixteenth * oneSixteenth
                        * oneOver256 * oneOver256 * oneOver256
                        * twoPowMinus14 * twoPowMinus14 * twoPowMinus14 * twoPowMinus14),
                lm,
                "a b c d");

        lm.train("a");

        final double threeElevenths = 0.2727272727272727d;  // approximately 3/11
        final double twoThirds = 0.6666666666666666d;
        final double oneThird = 0.3333333333333333d;

        assertEstimate(
                Math.log2((1.0d - 0.2d) * threeElevenths * twoThirds * oneSixteenth),
                lm,
                Strings.EMPTY_STRING);
        assertEstimate(
                Math.log2(((0.2d * 1.0d) + ((1.0d - 0.2d) * threeElevenths * oneThird))
                        * ((0.2d * 1.0d)
                                + ((1.0d - 0.2d)
                                        * ((0.2d * 1.0d) + ((1.0d - 0.2d) * threeElevenths * twoThirds))))
                        * oneSixteenth * oneSixteenth),
                lm,
                "a");
    }

    /**
     * Checks that the compiled form of a model yields the same log2 estimates
     * as the dynamic model, at several stages of training.
     */
    @Test
    public void testTwo() throws ClassNotFoundException, IOException {
        TokenizedLM lm = new TokenizedLM(
                IndoEuropeanTokenizerFactory.INSTANCE,
                MAX_NGRAM,
                new UniformBoundaryLM(127),
                new UniformBoundaryLM(15),
                LAMBDA_FACTOR);
        assertEqEstimate(lm, "a");
        assertEqEstimate(lm, "a b");
        assertEqEstimate(lm, "a a b");
        assertEqEstimate(lm, "a b a");

        lm.train("a");
        assertEqEstimate(lm, "a");
        assertEqEstimate(lm, "a b");
        assertEqEstimate(lm, "a a b");
        assertEqEstimate(lm, "a b a");

        lm.train("a b c");
        assertEqEstimate(lm, "a");
        assertEqEstimate(lm, "a b");
        assertEqEstimate(lm, "a b e");

        lm.train("x y");
        assertEqEstimate(lm, "x y a b e x y");
        assertEqEstimate(lm, Strings.EMPTY_STRING);
        assertEqEstimate(lm, "x");
    }

    /**
     * Checks the size and leading entries of {@code collocationSet} results
     * for bigrams and trigrams, and that an n-gram order of 1 is rejected.
     */
    @Test
    public void testCollocs() {
        TokenizedLM bigramSource = new TokenizedLM(IndoEuropeanTokenizerFactory.INSTANCE, 4);
        bigramSource.train("a b c d");
        bigramSource.train("a b e f");
        bigramSource.train("c f e");
        SortedSet<ScoredObject<String[]>> bigrams = bigramSource.collocationSet(2, 1, 2);
        Assert.assertEquals(2, bigrams.size());
        Iterator<ScoredObject<String[]>> bigramIt = bigrams.iterator();
        org.junit.Assert.assertArrayEquals(new String[]{"a", "b"}, bigramIt.next().getObject());
        org.junit.Assert.assertArrayEquals(new String[]{"c", "d"}, bigramIt.next().getObject());

        TokenizedLM trigramSource = new TokenizedLM(IndoEuropeanTokenizerFactory.INSTANCE, 4);
        trigramSource.train("a b c d");
        trigramSource.train("a b c e");
        trigramSource.train("d e f");
        trigramSource.train("f d e");
        trigramSource.train("e f d");
        SortedSet<ScoredObject<String[]>> trigrams = trigramSource.collocationSet(3, 1, 2);
        Assert.assertEquals(2, trigrams.size());
        org.junit.Assert.assertArrayEquals(
                new String[]{"a", "b", "c"},
                trigrams.iterator().next().getObject());

        try {
            trigramSource.collocationSet(1, 1, 3);
            Assert.fail();
        } catch (IllegalArgumentException expected) {
            Asserts.succeed();
        }
    }

    /**
     * Copies a scored-term set into an array, preserving the set's iteration order.
     *
     * @param terms set to copy
     * @return a new array holding the elements of {@code terms} in iteration order
     */
    private static ScoredObject<String[]>[] toArray(SortedSet<ScoredObject<String[]>> terms) {
        // Generic arrays cannot be created directly. Every element comes from a
        // SortedSet<ScoredObject<String[]>>, so the unchecked conversion is safe.
        @SuppressWarnings("unchecked")
        ScoredObject<String[]>[] result = terms.toArray(new ScoredObject[terms.size()]);
        return result;
    }

    /**
     * Returns {@code lm.newTermSet(i, i2, i3, backgroundLM)} as an array in
     * the set's iteration order; the arguments are passed through unchanged.
     */
    static ScoredObject<String[]>[] newTerms(TokenizedLM lm, int i, int i2, int i3, TokenizedLM backgroundLM) {
        return toArray(lm.newTermSet(i, i2, i3, backgroundLM));
    }

    /**
     * Returns {@code lm.oldTermSet(i, i2, i3, backgroundLM)} as an array in
     * the set's iteration order; the arguments are passed through unchanged.
     */
    static ScoredObject<String[]>[] oldTerms(TokenizedLM lm, int i, int i2, int i3, TokenizedLM backgroundLM) {
        return toArray(lm.oldTermSet(i, i2, i3, backgroundLM));
    }

    /**
     * Returns {@code lm.frequentTermSet(i, i2)} as an array in the set's
     * iteration order; the arguments are passed through unchanged.
     */
    static ScoredObject<String[]>[] frequentTerms(TokenizedLM lm, int i, int i2) {
        return toArray(lm.frequentTermSet(i, i2));
    }

    /**
     * Returns {@code lm.infrequentTermSet(i, i2)} as an array in the set's
     * iteration order; the arguments are passed through unchanged.
     */
    static ScoredObject<String[]>[] infrequentTerms(TokenizedLM lm, int i, int i2) {
        return toArray(lm.infrequentTermSet(i, i2));
    }

    /**
     * Trains two models on different data and checks the leading entries of
     * the new, old, frequent and infrequent bigram term lists.
     */
    @Test
    public void testNewAndOldTerms() {
        TokenizedLM lmD = new TokenizedLM(IndoEuropeanTokenizerFactory.INSTANCE, MAX_NGRAM);
        TokenizedLM lmX = new TokenizedLM(IndoEuropeanTokenizerFactory.INSTANCE, MAX_NGRAM);
        lmD.train("b c d");
        lmD.train("b c d");
        lmD.train("b c d");
        lmD.train("b c f");
        lmX.train("b c x");
        lmX.train("b c x");
        lmX.train("b c x");
        lmX.train("b c y");

        org.junit.Assert.assertArrayEquals(new String[]{"c", "d"}, newTerms(lmD, 2, 1, 3, lmX)[0].getObject());
        org.junit.Assert.assertArrayEquals(new String[]{"c", "x"}, newTerms(lmX, 2, 1, 2, lmD)[0].getObject());
        org.junit.Assert.assertArrayEquals(new String[]{"c", "f"}, oldTerms(lmD, 2, 1, 3, lmX)[0].getObject());
        org.junit.Assert.assertArrayEquals(new String[]{"c", "y"}, oldTerms(lmX, 2, 1, 3, lmD)[0].getObject());

        ScoredObject<String[]>[] frequent = frequentTerms(lmD, 2, 10);
        org.junit.Assert.assertArrayEquals(new String[]{"b", "c"}, frequent[0].getObject());
        org.junit.Assert.assertArrayEquals(new String[]{"c", "d"}, frequent[1].getObject());
        org.junit.Assert.assertArrayEquals(new String[]{"c", "f"}, frequent[2].getObject());

        // The infrequent list is expected in the reverse order of the frequent list.
        ScoredObject<String[]>[] infrequent = infrequentTerms(lmD, 2, 10);
        org.junit.Assert.assertArrayEquals(new String[]{"b", "c"}, infrequent[2].getObject());
        org.junit.Assert.assertArrayEquals(new String[]{"c", "d"}, infrequent[1].getObject());
        org.junit.Assert.assertArrayEquals(new String[]{"c", "f"}, infrequent[0].getObject());
    }

    /**
     * Asserts that {@code lm} estimates {@code text} at {@code expected}
     * (within 0.005), and that the compiled model agrees with the dynamic one.
     *
     * @param expected expected log2 estimate
     * @param lm model under test
     * @param text character sequence to estimate
     */
    private void assertEstimate(double expected, TokenizedLM lm, CharSequence text)
            throws ClassNotFoundException, IOException {
        Assert.assertEquals(expected, lm.log2Estimate(text), 0.005d);
        assertEqEstimate(lm, text.toString());
    }

    /**
     * Asserts that {@code lm} and its compiled, deserialized form give the
     * same log2 estimate for {@code text} (within 0.005).
     *
     * @param lm model under test
     * @param text character sequence to estimate
     * @throws IOException if compiling or reading back the model fails
     * @throws ClassNotFoundException if the compiled model cannot be deserialized
     */
    public void assertEqEstimate(TokenizedLM lm, CharSequence text) throws ClassNotFoundException, IOException {
        double dynamicEstimate = lm.log2Estimate(text);
        double compiledEstimate = writeRead(lm).log2Estimate(text);
        Assert.assertEquals(dynamicEstimate, compiledEstimate, 0.005d);
    }

    /**
     * Compiles {@code lm} to an in-memory byte array and reads the result
     * back as a {@link CompiledTokenizedLM}.
     *
     * <p>I/O and deserialization exceptions are propagated rather than being
     * converted to a message-only failure, so the test report keeps the full
     * stack trace.
     *
     * @param lm model to compile
     * @return the compiled model read back from the serialized bytes
     * @throws IOException if writing or reading the serialized form fails
     * @throws ClassNotFoundException if the serialized class cannot be resolved
     */
    private static CompiledTokenizedLM writeRead(TokenizedLM lm) throws ClassNotFoundException, IOException {
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        ObjectOutputStream objOut = new ObjectOutputStream(bytesOut);
        try {
            lm.compileTo(objOut);
        } finally {
            // Closing also flushes any buffered bytes before they are read back.
            objOut.close();
        }
        ObjectInputStream objIn = new ObjectInputStream(new ByteArrayInputStream(bytesOut.toByteArray()));
        try {
            return (CompiledTokenizedLM) objIn.readObject();
        } finally {
            objIn.close();
        }
    }

    /**
     * Builds {@code numTokens} single-character tokens, each followed by a
     * space. Every character is drawn from code points 0 (inclusive) to 16
     * (exclusive) using the seeded {@link #random}.
     *
     * @param numTokens number of characters to draw
     * @return the generated character sequence
     */
    private CharSequence randomTokens(int numTokens) {
        StringBuilder sb = new StringBuilder();
        for (int k = 0; k < numTokens; k++) {
            sb.append((char) random.nextInt(16));
            sb.append(' ');
        }
        return sb;
    }

    /**
     * Trains two models on 100 random sequences, one by repeated calls and
     * one with an explicit count, and checks that they stay in agreement.
     */
    @Test
    public void testMultipleIncrements() {
        TokenizedLM repeatedLm = new TokenizedLM(IndoEuropeanTokenizerFactory.INSTANCE, MAX_NGRAM);
        TokenizedLM countedLm = new TokenizedLM(IndoEuropeanTokenizerFactory.INSTANCE, MAX_NGRAM);
        for (int i = 0; i < 100; i++) {
            CharSequence text = randomTokens(5);
            int trainingCount = random.nextInt(10);
            incrementAssertSynched(repeatedLm, countedLm, text, trainingCount);
        }
    }

    /**
     * Trains {@code lm1} on {@code text} {@code count} times, trains
     * {@code lm2} once with {@code count} as the count argument, then
     * asserts that both models agree.
     *
     * @param lm1 model trained by repeated calls
     * @param lm2 model trained with an explicit count
     * @param text training text
     * @param count number of training instances
     */
    void incrementAssertSynched(TokenizedLM lm1, TokenizedLM lm2, CharSequence text, int count) {
        for (int k = 0; k < count; k++) {
            lm1.train(text);
        }
        lm2.train(text, count);
        assertSynched(lm1, lm2);
    }

    /**
     * Asserts that both models agree on 100 rounds of random sequences of
     * 0 to 4 tokens.
     *
     * @param lm1 first model
     * @param lm2 second model
     */
    void assertSynched(TokenizedLM lm1, TokenizedLM lm2) {
        for (int round = 0; round < 100; round++) {
            for (int numTokens = 0; numTokens < 5; numTokens++) {
                assertSynched(lm1, lm2, numTokens);
            }
        }
    }

    /**
     * Asserts that both models give the same log2 estimate (within 1.0E-4)
     * for one random sequence of {@code numTokens} tokens.
     *
     * @param lm1 first model
     * @param lm2 second model
     * @param numTokens number of random tokens in the probe sequence
     */
    void assertSynched(TokenizedLM lm1, TokenizedLM lm2, int numTokens) {
        CharSequence probe = randomTokens(numTokens);
        Assert.assertEquals(lm1.log2Estimate(probe), lm2.log2Estimate(probe), 1.0E-4d);
    }
}
