package org.wikibrain.phrases;

import com.typesafe.config.Config;
import gnu.trove.map.TLongFloatMap;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongFloatHashMap;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.TLongSet;
import gnu.trove.set.hash.TLongHashSet;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.conf.Configuration;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.conf.Configurator;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.DaoFilter;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.lang.StringNormalizer;
import org.wikibrain.core.model.NameSpace;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.core.nlp.StringTokenizer;
import org.wikibrain.utils.ObjectDb;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;
import org.wikibrain.utils.WpIOUtils;
import org.wikibrain.utils.WpStringUtils;
import org.wikibrain.utils.WpThreadUtils;

/* loaded from: input_file:org/wikibrain/phrases/LinkProbabilityDao.class */
public class LinkProbabilityDao {
    /** Class-wide SLF4J logger. */
    private static final Logger LOG = LoggerFactory.getLogger(LinkProbabilityDao.class);
    /** Location of the on-disk ObjectDb; also the prefix of the "-phrase-cache.bin" / "-subgram-cache.bin" files. */
    private final File path;
    /** Supplies the article pages that build() scans. */
    private final RawPageDao pageDao;
    /** Supplies the phrases and phrase counts used by build(); also the origin of the normalizer. */
    private final PhraseAnalyzerDao phraseDao;
    /** Languages iterated over by build(). */
    private final LanguageSet langs;
    /** Obtained from phraseDao in the constructor; applied by cleanString only when normalization is requested. */
    private final StringNormalizer normalizer;
    /** Backing store; remains null when path did not exist at construction time, until build() creates it. */
    private ObjectDb<Double> db;
    /** Optional in-memory map from phrase hash to probability (stored as float); null while caching is off. */
    private TLongFloatMap cache = null;
    /** Hashes of the word-prefixes of known phrases; assigned by build(Language) and by useCache(true). */
    private TLongSet subGrams = null;

    /* loaded from: input_file:org/wikibrain/phrases/LinkProbabilityDao$Provider.class */
    /**
     * Configuration hook that creates {@link LinkProbabilityDao} instances from the
     * {@code phrases.linkProbability} section of the configuration.
     */
    public static class Provider extends org.wikibrain.conf.Provider<LinkProbabilityDao> {
        public Provider(Configurator configurator, Configuration configuration) throws ConfigurationException {
            super(configurator, configuration);
        }

        /** @return the class of object this provider creates */
        public Class<LinkProbabilityDao> getType() {
            return LinkProbabilityDao.class;
        }

        /** @return the configuration path this provider is registered under */
        public String getPath() {
            return "phrases.linkProbability";
        }

        /**
         * Creates a dao from the given configuration block.
         *
         * Reads the required {@code path} entry and the optional {@code rawPageDao} and
         * {@code phraseAnalyzer} entries (a missing entry is passed to the configurator as null).
         *
         * @throws ConfigurationException if the configured phrase analyzer is not an
         *         {@link AnchorTextPhraseAnalyzer}, or if constructing the dao fails
         */
        public LinkProbabilityDao get(String str, Config config, Map<String, String> map) throws ConfigurationException {
            LanguageSet languages = (LanguageSet) getConfigurator().get(LanguageSet.class);
            File dbPath = new File(config.getString("path"));

            String pageDaoName = null;
            if (config.hasPath("rawPageDao")) {
                pageDaoName = config.getString("rawPageDao");
            }
            String analyzerName = null;
            if (config.hasPath("phraseAnalyzer")) {
                analyzerName = config.getString("phraseAnalyzer");
            }

            RawPageDao pages = (RawPageDao) getConfigurator().get(RawPageDao.class, pageDaoName);
            PhraseAnalyzer analyzer = (PhraseAnalyzer) getConfigurator().get(PhraseAnalyzer.class, analyzerName);

            if (analyzer instanceof AnchorTextPhraseAnalyzer) {
                try {
                    return new LinkProbabilityDao(dbPath, languages, pages, ((AnchorTextPhraseAnalyzer) analyzer).getDao());
                } catch (DaoException e) {
                    throw new ConfigurationException(e);
                }
            }
            throw new ConfigurationException("LinkProbabilityDao's phraseAnalyzer must be an AnchorTextPhraseAnalyzer");
        }

        /* renamed from: get, reason: collision with other method in class */
        // Synthetic bridge emitted by the compiler for the generic get(); kept as the decompiler produced it.
        public /* bridge */ /* synthetic */ Object m6get(String str, Config config, Map map) throws ConfigurationException {
            return get(str, config, (Map<String, String>) map);
        }
    }

    /**
     * Creates a dao backed by the database at {@code file}.
     *
     * If {@code file} does not exist, a warning is logged and the database is left
     * unopened; in that state lookups fail until {@link #build()} has been run.
     *
     * @param file              location of the on-disk database
     * @param languageSet       languages handled by {@link #build()}
     * @param rawPageDao        source of pages scanned while building
     * @param phraseAnalyzerDao source of phrases, counts and the string normalizer
     * @throws DaoException if an existing database cannot be opened
     */
    public LinkProbabilityDao(File file, LanguageSet languageSet, RawPageDao rawPageDao, PhraseAnalyzerDao phraseAnalyzerDao) throws DaoException {
        this.path = file;
        this.langs = languageSet;
        this.pageDao = rawPageDao;
        this.phraseDao = phraseAnalyzerDao;
        this.normalizer = phraseAnalyzerDao.getStringNormalizer();

        if (file.exists()) {
            try {
                // second argument false: open the existing database (build() passes true)
                this.db = new ObjectDb<>(file, false);
            } catch (IOException e) {
                throw new DaoException(e);
            }
        } else {
            LOG.warn("path " + file + " does not exist... LinkProbabilityDao will not work until build() is called.");
        }
    }

    /**
     * @return true when the database has been opened and reports itself non-empty
     */
    public boolean isBuilt() {
        return this.db != null && !this.db.isEmpty();
    }

    /**
     * Tests whether the cleaned phrase is either a cached phrase or a recorded subgram.
     *
     * @param language language of the phrase
     * @param str      phrase text
     * @param z        when true, the phrase is normalized before it is tokenized and hashed
     * @return true if the phrase hash is a key of the cache or a member of the subgram set
     * @throws IllegalArgumentException if the cache or the subgram set has not been initialised
     */
    public boolean isSubgram(Language language, String str, boolean z) {
        if (this.cache != null && this.subGrams != null) {
            long key = hashCode(language, cleanString(language, str, z));
            if (this.cache.containsKey(key)) {
                return true;
            }
            return this.subGrams.contains(key);
        }
        throw new IllegalArgumentException("Subgrams require a cache!");
    }

    /**
     * Cleans a phrase without normalizing it; shorthand for the three-argument
     * overload with normalization switched off.
     */
    private String cleanString(Language language, String str) {
        final boolean normalize = false;
        return cleanString(language, str, normalize);
    }

    /**
     * Produces the canonical form of a phrase: optionally normalized, then split
     * into words and re-joined with single spaces.
     *
     * @param language language used for normalization and tokenization
     * @param str      raw phrase
     * @param z        when true, run the string normalizer before tokenizing
     * @return the phrase's words joined by a single space
     */
    private String cleanString(Language language, String str, boolean z) {
        String text = z ? this.normalizer.normalize(language, str) : str;
        List<?> words = new StringTokenizer().getWords(language, text);
        return StringUtils.join(words, " ");
    }

    /**
     * Looks up the link probability of a phrase, normalizing the phrase first.
     * Equivalent to the three-argument overload with normalization switched on.
     *
     * @throws DaoException if the underlying lookup fails
     */
    public double getLinkProbability(Language language, String str) throws DaoException {
        final boolean normalize = true;
        return getLinkProbability(language, str, normalize);
    }

    /**
     * Looks up the stored link probability of a phrase.
     *
     * When a non-empty in-memory cache is active it is the only source consulted;
     * otherwise the value is read from the database under the key
     * {@code "<langCode>:<cleaned phrase>"}.
     *
     * @param language language of the phrase
     * @param str      phrase text
     * @param z        when true, the phrase is normalized before lookup
     * @return the stored probability, or 0.0 when the phrase is unknown
     * @throws IllegalStateException if the database has not been opened or built
     * @throws DaoException          if reading from the database fails
     */
    public double getLinkProbability(Language language, String str, boolean z) throws DaoException {
        if (this.db == null) {
            throw new IllegalStateException("Dao has not yet been built. Call build()");
        }
        String cleaned = cleanString(language, str, z);

        // Read the field once so the null check, size check and lookup all see the same map.
        TLongFloatMap activeCache = this.cache;
        if (activeCache != null && activeCache.size() > 0) {
            // Compute the hash once and reuse it for both the membership test and the read
            // (the lookup previously referenced an undeclared variable).
            long key = hashCode(language, cleaned);
            if (activeCache.containsKey(key)) {
                return activeCache.get(key);
            }
            return 0.0d;
        }

        try {
            Double stored = (Double) this.db.get(language.getLangCode() + ":" + cleaned);
            if (stored == null) {
                return 0.0d;
            }
            return stored.doubleValue();
        } catch (IOException e) {
            throw new DaoException(e);
        } catch (ClassNotFoundException e) {
            throw new DaoException(e);
        }
    }

    /**
     * Switches the in-memory probability cache on or off.
     *
     * Turning the cache off simply drops the probability map. Turning it on:
     * <ul>
     *   <li>with no database open, installs an empty map;</li>
     *   <li>otherwise loads the two cache files next to the database if both exist and
     *       are newer than the database's {@code "tstamp"} entry (the entry is created
     *       with the current time if missing);</li>
     *   <li>otherwise rebuilds the map and the subgram set by scanning every database
     *       entry, then writes both cache files.</li>
     * </ul>
     *
     * @param z true to enable the cache, false to disable it
     * @throws RuntimeException wrapping any I/O or deserialization failure, except a
     *         failure to read the cache files, which is logged and triggers a rebuild
     */
    public synchronized void useCache(boolean z) {
        if (!z) {
            this.cache = null;
            return;
        }
        if (this.db == null) {
            this.cache = new TLongFloatHashMap();
            return;
        }
        File phraseCacheFile = new File(this.path + "-phrase-cache.bin");
        File subgramCacheFile = new File(this.path + "-subgram-cache.bin");
        try {
            // Fetch the timestamp entry once; it is stored as a Double holding epoch millis.
            long dbTimestamp;
            Double storedTimestamp = (Double) this.db.get("tstamp");
            if (storedTimestamp == null) {
                dbTimestamp = System.currentTimeMillis();
                this.db.put("tstamp", Double.valueOf(1.0d * dbTimestamp));
                this.db.flush();
            } else {
                dbTimestamp = storedTimestamp.longValue();
            }

            boolean cacheFilesFresh = phraseCacheFile.isFile()
                    && phraseCacheFile.lastModified() > dbTimestamp
                    && subgramCacheFile.isFile()
                    && subgramCacheFile.lastModified() > dbTimestamp;
            if (cacheFilesFresh) {
                try {
                    // Load both files before touching the fields so a failure on the second
                    // file cannot leave a freshly loaded map paired with a stale subgram set.
                    TLongFloatMap loadedCache = (TLongFloatMap) WpIOUtils.readObjectFromFile(phraseCacheFile);
                    TLongSet loadedSubGrams = (TLongSet) WpIOUtils.readObjectFromFile(subgramCacheFile);
                    this.cache = loadedCache;
                    this.subGrams = loadedSubGrams;
                    LOG.info("Using up-to-date link probability cache files {} and {}", phraseCacheFile, subgramCacheFile);
                    return;
                } catch (IOException e) {
                    // Fall through and rebuild from the database.
                    LOG.warn("Using link probability dao cache failed: ", e);
                }
            }

            LOG.info("building cache...");
            TLongFloatHashMap phraseProbabilities = new TLongFloatHashMap();
            Iterator<?> entries = this.db.iterator();
            TLongHashSet subGramHashes = new TLongHashSet();
            while (entries.hasNext()) {
                Pair<?, ?> entry = (Pair<?, ?>) entries.next();
                String key = (String) entry.getKey();
                if (key.equalsIgnoreCase("tstamp")) {
                    continue;
                }
                if (key.startsWith(":s:")) {
                    // Subgram marker entries: the hash is the decimal text after the ":s:" prefix.
                    subGramHashes.add(Long.parseLong(key.substring(3)));
                } else {
                    // Probability entries are keyed "<langCode>:<phrase>"; split on the first colon only.
                    String[] parts = key.split(":", 2);
                    long hash = hashCode(Language.getByLangCode(parts[0]), parts[1]);
                    phraseProbabilities.put(hash, ((Double) entry.getRight()).floatValue());
                }
            }
            this.cache = phraseProbabilities;
            this.subGrams = subGramHashes;
            LOG.info("created cache with {} entries and {} subgrams", phraseProbabilities.size(), subGramHashes.size());

            // A write failure is reported through the catch below, as a RuntimeException.
            WpIOUtils.writeObjectToFile(phraseCacheFile, phraseProbabilities);
            WpIOUtils.writeObjectToFile(subgramCacheFile, subGramHashes);
        } catch (IOException | ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Rebuilds the database from scratch for every configured language.
     *
     * Closes any open database, removes whatever exists at the database path,
     * recreates the directory, opens a new database and then runs the per-language
     * build for each language in turn.
     *
     * @throws DaoException if the database cannot be created or a language build fails
     */
    public synchronized void build() throws DaoException {
        if (this.db != null) {
            this.db.close();
        }
        if (this.path.exists()) {
            FileUtils.deleteQuietly(this.path);
        }
        this.path.mkdirs();

        try {
            // second argument true, in contrast to the constructor which opens with false
            this.db = new ObjectDb<>(this.path, true);
            for (Iterator<?> languages = this.langs.iterator(); languages.hasNext(); ) {
                build((Language) languages.next());
            }
        } catch (IOException e) {
            throw new DaoException(e);
        }
    }

    /**
     * Computes and stores link probabilities for one language.
     *
     * Steps:
     * <ol>
     *   <li>Hash every word-prefix of every known phrase into the subgram set, and
     *       seed a counter at zero for each full phrase.</li>
     *   <li>Hand every non-redirect, non-disambiguation article page to
     *       {@code processPage} in parallel, together with the counter map.</li>
     *   <li>For each distinct cleaned phrase, store
     *       {@code linkCount / (counter + 3)} in the database (and in the cache if one
     *       is active).</li>
     *   <li>Store one {@code ":s:<hash>"} marker per subgram, refresh the
     *       {@code "tstamp"} entry and flush.</li>
     * </ol>
     *
     * NOTE(review): the subgram set is replaced at the start of every call, so after a
     * multi-language build the in-memory set holds only the last language's subgrams
     * (the database keeps the markers of all languages) — confirm this is intended.
     *
     * @param language the language to process
     * @throws DaoException if a page query or database write fails
     */
    private void build(Language language) throws DaoException {
        this.subGrams = new TLongHashSet();
        LOG.info("building link probabilities for language " + language);

        // Step 1: collect subgram hashes and seed the per-phrase counters.
        // Presumably processPage increments these counters for occurrences in page text — verify.
        final TLongIntHashMap occurrenceCounts = new TLongIntHashMap();
        Iterator<String> phrases = this.phraseDao.getAllPhrases(language);
        StringTokenizer tokenizer = new StringTokenizer();
        while (phrases.hasNext()) {
            List<?> words = tokenizer.getWords(language, phrases.next());
            StringBuilder prefix = new StringBuilder("");
            // Stays -1 when the phrase yields no words; that key is still seeded below.
            long prefixHash = -1;
            boolean firstWord = true;
            for (Object word : words) {
                if (!firstWord) {
                    prefix.append(' ');
                }
                firstWord = false;
                prefix.append((String) word);
                prefixHash = hashCode(language, prefix.toString());
                this.subGrams.add(prefixHash);
            }
            // After the loop the hash is that of the complete phrase.
            occurrenceCounts.put(prefixHash, 0);
        }
        LOG.info("found " + occurrenceCounts.size() + " unique anchortexts and " + this.subGrams.size() + " subgrams");

        // Step 2: scan article pages in parallel.
        DaoFilter articleFilter = new DaoFilter()
                .setRedirect(false)
                .setLanguages(language)
                .setDisambig(false)
                .setNameSpaces(NameSpace.ARTICLE);
        ParallelForEach.iterate(this.pageDao.get(articleFilter).iterator(), WpThreadUtils.getMaxThreads(), 100, new Procedure<RawPage>() { // from class: org.wikibrain.phrases.LinkProbabilityDao.1
            public void call(RawPage rawPage) throws Exception {
                LinkProbabilityDao.this.processPage(occurrenceCounts, rawPage);
            }
        }, 10000);

        // Step 3: write one probability per distinct cleaned phrase.
        int inserted = 0;
        int misses = 0;
        double probabilitySum = 0.0d;
        TLongHashSet seen = new TLongHashSet();
        TLongIntMap linkCounts = getPhraseLinkCounts(language);
        Iterator<Pair<String, PrunedCounts<Integer>>> countedPhrases = this.phraseDao.getAllPhraseCounts(language);
        while (countedPhrases.hasNext()) {
            String phrase = cleanString(language, countedPhrases.next().getLeft());
            long hash = hashCode(language, phrase);
            if (!seen.add(hash)) {
                // Different raw phrases can clean to the same text; write each only once.
                continue;
            }
            int timesLinked = linkCounts.get(hash);
            int timesCounted = occurrenceCounts.get(hash);
            if (timesCounted == 0) {
                misses++;
            }
            inserted++;
            // The constant 3 in the denominator keeps the ratio finite when the counter is zero.
            double probability = (1.0d * timesLinked) / (timesCounted + 3.0d);
            probabilitySum += probability;
            try {
                this.db.put(language.getLangCode() + ":" + phrase, Double.valueOf(probability));
            } catch (IOException e) {
                throw new DaoException(e);
            }
            if (this.cache != null) {
                this.cache.put(hash, (float) probability);
            }
        }

        // Step 4: persist subgram markers and the build timestamp.
        try {
            for (long subGramHash : this.subGrams.toArray()) {
                this.db.put(":s:" + subGramHash, Double.valueOf(-1.0d));
            }
            this.db.put("tstamp", Double.valueOf(1.0d * System.currentTimeMillis()));
            if (inserted != 0) {
                LOG.info(String.format("Inserted link probabilities for %d anchors with mean probability %.4f and %d mises", inserted, probabilitySum / inserted, misses));
            }
            this.db.flush();
        } catch (IOException e) {
            throw new DaoException(e);
        }
    }

    /**
     * Sums the phrase totals reported by the phrase dao, keyed by the hash of the
     * cleaned (un-normalized) phrase. Phrases that clean to the same text are
     * accumulated into one entry.
     *
     * @param language language whose phrase counts are read
     * @return map from phrase hash to summed total
     */
    private TLongIntMap getPhraseLinkCounts(Language language) {
        TLongIntHashMap totals = new TLongIntHashMap();
        for (Iterator<Pair<String, PrunedCounts<Integer>>> it = this.phraseDao.getAllPhraseCounts(language); it.hasNext(); ) {
            Pair<String, PrunedCounts<Integer>> entry = it.next();
            long key = hashCode(language, cleanString(language, entry.getLeft()));
            int phraseTotal = entry.getRight().getTotal();
            // Adds phraseTotal to an existing entry, or inserts phraseTotal for a new key.
            totals.adjustOrPutValue(key, phraseTotal, phraseTotal);
        }
        return totals;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* JADX WARN: Code restructure failed: missing block: B:32:0x00e3, code lost:
    
        continue;
     */
    /*
        Code decompiled incorrectly, please refer to instructions dump.
        To view partially-correct add '--show-bad-code' argument
    */
    /**
     * Per-page step of the build: called from build(Language) for every page returned
     * by the page query, with the counter map shared across the parallel workers.
     *
     * WARNING: the decompiler could not recover this method's body. The stub below
     * unconditionally throws UnsupportedOperationException, so the original logic
     * must be restored from the real source or the bytecode before build() can work.
     *
     * @param r6 counter map keyed by phrase hash, seeded by build(Language)
     * @param r7 the page to process
     */
    public void processPage(gnu.trove.map.TLongIntMap r6, org.wikibrain.core.model.RawPage r7) {
        /*
            Method dump skipped, instructions count: 237
            To view this dump add '--comments-level debug' option
        */
        // Placeholder emitted by the decompiler; not the original behaviour.
        throw new UnsupportedOperationException("Method not decompiled: org.wikibrain.phrases.LinkProbabilityDao.processPage(gnu.trove.map.TLongIntMap, org.wikibrain.core.model.RawPage):void");
    }

    /**
     * Hashes a phrase together with its language: the hashed text is
     * {@code "<langCode>:<str>"}, the same layout used for database keys.
     *
     * @param language language whose code prefixes the hashed text
     * @param str      phrase text (callers pass the cleaned form)
     * @return 64-bit hash of the language-qualified phrase
     */
    private long hashCode(Language language, String str) {
        String qualified = language.getLangCode() + ":" + str;
        return WpStringUtils.longHashCode2(qualified);
    }
}
