package ivory.sqe.querygenerator;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonPrimitive;
import edu.umd.cloud9.io.map.HMapSFW;
import edu.umd.cloud9.io.pair.PairOfFloatInt;
import edu.umd.cloud9.io.pair.PairOfStrings;
import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;
import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.sqe.retrieval.Constants;
import ivory.sqe.retrieval.StructuredQuery;
import java.io.IOException;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

/**
 * {@link QueryGenerator} that turns a query-language string into a {@link StructuredQuery}
 * whose JSON form is a {@code #combine} of per-token clauses.
 *
 * <p>Each non-stopword query token is looked up in a translation table ({@code f2eProbs}) and
 * emitted either as a {@code #weight} clause over its translations (with probabilities
 * normalized to sum to one), or, when exactly one translation per token is requested, as the
 * single highest-ranked translation.
 *
 * <p>{@link #init(FileSystem, Configuration)} must be called before any other method.
 */
public class ProbabilisticStructuredQueryGenerator implements QueryGenerator {
    private static final Logger LOG = Logger.getLogger(ProbabilisticStructuredQueryGenerator.class);

    /** Query-language tokenizer created with the stemming flag off. */
    private Tokenizer queryLangTokenizer;
    /** Query-language tokenizer created with the stemming flag on and the stemmed stopword list. */
    private Tokenizer queryLangTokenizerWithStemming;
    /** Document-language tokenizer created with the stemming flag on and the stemmed stopword list. */
    private Tokenizer docLangTokenizer;
    /** Vocabulary used to map query tokens to ids in {@link #f2eProbs}. */
    private VocabularyWritable fVocab_f2e;
    /** Vocabulary used to map translation ids from {@link #f2eProbs} back to terms. */
    private VocabularyWritable eVocab_f2e;
    /** Translation probability table loaded from {@code Constants.f2eProbsPath}. */
    private TTable_monolithic_IFAs f2eProbs;
    /** Maximum number of translations kept per query token. */
    private int numTransPerToken;
    /** Threshold passed to the translation table when fetching candidate translations. */
    private float lexProbThreshold;
    /** Accumulated probability mass above which no further translations are collected. */
    private float cumProbThreshold;
    /** When true, translations are additionally filtered by the pairs read from the grammar file. */
    private boolean H6;
    /**
     * When true, each translation is re-tokenized and its probability is split across the
     * resulting tokens. NOTE(review): this flag is never assigned in this class, so it is
     * always false here - confirm whether a configuration hook was lost.
     */
    private boolean bigramSegment;
    /** Index environment used to check whether a translation has a postings list. */
    private RetrievalEnvironment env;
    private String queryLang;
    private String docLang;

    /**
     * Loads vocabularies, the translation table, tokenizers, thresholds and the retrieval
     * environment from {@code configuration}.
     *
     * @param fileSystem file system from which all resources are read
     * @param configuration source of paths and parameters (see {@link Constants})
     * @throws IOException if a resource cannot be read, or if the retrieval environment cannot
     *     be initialized (the underlying {@link ConfigurationException} is attached as cause)
     */
    @Override // ivory.sqe.querygenerator.QueryGenerator
    public void init(FileSystem fileSystem, Configuration configuration) throws IOException {
        if (configuration.getBoolean(Constants.Quiet, false)) {
            LOG.setLevel(Level.OFF);
        }

        this.queryLang = configuration.get(Constants.QueryLanguage);
        this.docLang = configuration.get(Constants.DocLanguage);

        this.fVocab_f2e = HadoopAlign.loadVocab(new Path(configuration.get(Constants.QueryVocab)), fileSystem);
        this.eVocab_f2e = HadoopAlign.loadVocab(new Path(configuration.get(Constants.DocVocab)), fileSystem);
        this.f2eProbs = new TTable_monolithic_IFAs(fileSystem, new Path(configuration.get(Constants.f2eProbsPath)), true);

        String queryStopwords = configuration.get(Constants.StemmedStopwordListQ);
        String docStopwords = configuration.get(Constants.StemmedStopwordListD);
        LOG.info("Stemmed stopword list file in query-language:" + queryStopwords);
        LOG.info("Stemmed stopword list file in doc-language:" + docStopwords);

        String queryTokenizerData = configuration.get(Constants.QueryTokenizerData);
        this.queryLangTokenizer = TokenizerFactory.createTokenizer(
                fileSystem, configuration, this.queryLang, queryTokenizerData, false, null, null, null);
        this.queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(
                fileSystem, configuration, this.queryLang, queryTokenizerData, true, null, queryStopwords, null);
        this.docLangTokenizer = TokenizerFactory.createTokenizer(
                fileSystem, configuration, this.docLang, configuration.get(Constants.DocTokenizerData), true, null,
                docStopwords, null);

        this.lexProbThreshold = configuration.getFloat(Constants.LexicalProbThreshold, 0.0f);
        this.cumProbThreshold = configuration.getFloat(Constants.CumulativeProbThreshold, 1.0f);
        this.numTransPerToken = configuration.getInt(Constants.NumTransPerToken, Integer.MAX_VALUE);

        // The heuristic is enabled by any configured value other than "off".
        String heuristic6 = configuration.get(Constants.Heuristic6);
        this.H6 = heuristic6 != null && !heuristic6.equals("off");
        LOG.info("H6 = " + this.H6);

        String indexPath = configuration.get("index");
        try {
            this.env = new RetrievalEnvironment(indexPath, fileSystem);
            this.env.initialize(true);
        } catch (ConfigurationException e) {
            // Fail here instead of leaving a half-initialized environment that would only
            // surface later as an unrelated error while translating a query.
            throw new IOException("Failed to initialize retrieval environment for index " + indexPath, e);
        }
    }

    /**
     * Builds the structured query for one raw query string.
     *
     * <p>Only the text before the first {@code ||||} separator is used. The text is tokenized
     * with the stemming query tokenizer; stopwords are dropped and every remaining token
     * contributes at most one clause to the {@code #combine} array.
     *
     * @param str raw query string
     * @param fileSystem file system used to read the grammar file
     * @param configuration configuration providing {@code Constants.GrammarPath}
     * @return the structured query together with the number of tokens produced by the
     *     tokenizer (stopwords included)
     */
    @Override // ivory.sqe.querygenerator.QueryGenerator
    public StructuredQuery parseQuery(String str, FileSystem fileSystem, Configuration configuration) {
        String query = str.trim().split("\\|\\|\\|\\|")[0].trim();
        LOG.info("Original query: " + query);

        Set<PairOfStrings> grammarPairs =
                processGrammar(fileSystem, configuration, configuration.get(Constants.GrammarPath));
        Map<String, String> stemMapping = Utils.getStemMapping(
                query, this.queryLangTokenizer, this.queryLangTokenizerWithStemming, this.docLangTokenizer);

        String[] tokens = this.queryLangTokenizerWithStemming.processContent(query);
        // Kept in a local (not an instance field) so that overlapping calls cannot observe
        // each other's query length.
        int queryLength = tokens.length;

        JsonArray clauses = new JsonArray();
        for (String token : tokens) {
            LOG.info("Processing token " + token);
            if (this.queryLangTokenizerWithStemming.isStopWord(token)) {
                continue;
            }
            LOG.info("not stopword");

            boolean singleBest = this.numTransPerToken == 1 && !this.bigramSegment;
            if (singleBest) {
                String best = getBestTranslation(token);
                if (best != null) {
                    clauses.add(new JsonPrimitive(best));
                }
                continue;
            }

            HMapSFW translations = getTranslations(query, token, grammarPairs, stemMapping);
            if (translations == null) {
                continue;
            }
            JsonArray weights = Utils.createJsonArrayFromProbabilities(translations);
            if (weights != null) {
                JsonObject weightClause = new JsonObject();
                weightClause.add("#weight", weights);
                clauses.add(weightClause);
            }
        }

        JsonObject root = new JsonObject();
        root.add("#combine", clauses);
        return new StructuredQuery(root, queryLength);
    }

    /**
     * Reads the (query term, translation) pairs from the grammar file when heuristic H6 is on.
     *
     * @param fileSystem file system used to read the grammar
     * @param configuration unused by this implementation
     * @param str path of the grammar file
     * @return the pairs returned by {@code Utils.getPairsInSCFG}, or {@code null} when H6 is
     *     disabled or no pairs could be extracted; {@code null} disables pair filtering in
     *     {@link #getTranslations}
     */
    public Set<PairOfStrings> processGrammar(FileSystem fileSystem, Configuration configuration, String str) {
        if (!this.H6) {
            return null;
        }
        Set<PairOfStrings> pairs = Utils.getPairsInSCFG(fileSystem, str);
        if (pairs == null) {
            LOG.info("No probabilities extracted from " + str);
        }
        return pairs;
    }

    /**
     * Returns the first translation polled from the translation table for {@code str}.
     *
     * @param str query token
     * @return {@code null} if the token is not in the query vocabulary; the token itself if the
     *     table yields no translation at the lexical threshold; otherwise the first translation
     */
    protected String getBestTranslation(String str) {
        int sourceId = this.fVocab_f2e.get(str);
        if (sourceId <= 0) {
            // NOTE(review): getTranslations falls back to the token itself for the same case,
            // while this method drops the token - confirm the asymmetry is intended.
            return null;
        }
        PriorityQueue<PairOfFloatInt> candidates =
                this.f2eProbs.get(sourceId).getTranslationsWithProbs(this.lexProbThreshold);
        if (candidates.isEmpty()) {
            return str;
        }
        return this.eVocab_f2e.get(candidates.poll().getRightElement());
    }

    /**
     * Collects a normalized translation distribution for one query token.
     *
     * <p>Tokens missing from the query vocabulary map to themselves (or to their entry in
     * {@code map}, when present) with weight 1. Otherwise candidates are polled from the
     * translation table until {@code numTransPerToken} translations have been accepted, the
     * accumulated probability exceeds {@code cumProbThreshold}, or candidates run out. A
     * candidate is skipped when its probability or id is not positive, when it is a
     * document-language stopword, or when {@code set} is non-null and does not contain the
     * (token, translation) pair. Accepted candidates are stored only if the index has a
     * postings list for them.
     *
     * @param str the full query text (not used by this implementation)
     * @param str2 the query token to translate
     * @param set allowed (token, translation) pairs, or {@code null} for no filtering
     * @param map fallback mapping applied to out-of-vocabulary tokens; may be {@code null}
     * @return translation terms mapped to weights divided by the accumulated probability;
     *     possibly empty, never {@code null}
     */
    public HMapSFW getTranslations(String str, String str2, Set<PairOfStrings> set, Map<String, String> map) {
        HMapSFW distribution = new HMapSFW();

        int sourceId = this.fVocab_f2e.get(str2);
        if (sourceId <= 0) {
            String mapped = (map == null) ? null : map.get(str2);
            distribution.put(mapped == null ? str2 : mapped, 1.0f);
            return distribution;
        }

        PriorityQueue<PairOfFloatInt> candidates =
                this.f2eProbs.get(sourceId).getTranslationsWithProbs(this.lexProbThreshold);
        float totalProb = 0.0f;
        int accepted = 0;
        // The loop condition also enforces the per-token translation limit after each
        // accepted candidate, so only the cumulative threshold needs an explicit break.
        while (accepted < this.numTransPerToken && !candidates.isEmpty()) {
            PairOfFloatInt candidate = candidates.poll();
            float prob = candidate.getLeftElement();
            int targetId = candidate.getRightElement();
            String target = this.eVocab_f2e.get(targetId);

            boolean skip = prob <= 0.0f
                    || targetId <= 0
                    || this.docLangTokenizer.isStopWord(target)
                    || !(set == null || set.contains(new PairOfStrings(str2, target)));
            if (skip) {
                LOG.info("Skipped target stopword/OOV " + target);
            } else {
                if (this.bigramSegment) {
                    // Split the translation's probability evenly across its re-tokenized parts.
                    // NOTE(review): put() overwrites a part already stored by an earlier
                    // translation, and the full probability is added to the total even when
                    // no part has postings - confirm this is intended.
                    String[] parts = this.docLangTokenizer.processContent(target);
                    float share = prob / parts.length;
                    for (String part : parts) {
                        if (this.env.getPostingsList(part) != null) {
                            distribution.put(part, share);
                        }
                    }
                    totalProb += prob;
                } else if (this.env.getPostingsList(target) != null) {
                    distribution.increment(target, prob);
                    totalProb += prob;
                }
                // Counted as accepted even when the index has no postings for it.
                accepted++;
            }

            if (totalProb > this.cumProbThreshold) {
                break;
            }
        }

        // Normalize. Entries are only stored together with a positive addition to totalProb,
        // so the divisor is non-zero whenever this loop has anything to visit.
        for (String term : distribution.keySet()) {
            distribution.put(term, distribution.get(term) / totalProb);
        }
        return distribution;
    }
}
