package org.wikibrain.phrases;

import com.google.code.externalsorting.ExternalSort;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.logging.Logger;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageSet;
import org.wikibrain.core.lang.LocalId;
import org.wikibrain.core.lang.StringNormalizer;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.core.model.Title;
import org.wikibrain.phrases.PrunedCounts;
import org.wikibrain.utils.WpIOUtils;

/* loaded from: input_file:org/wikibrain/phrases/BasePhraseAnalyzer.class */
public abstract class BasePhraseAnalyzer implements PhraseAnalyzer {
    /** Shared logger; note that it is registered under the {@link PhraseAnalyzer} interface name. */
    private static final Logger LOG = Logger.getLogger(PhraseAnalyzer.class.getName());
    /** Pruner handed to {@code loadFromFile} for PAGES records (per-page phrase counts). */
    private final PrunedCounts.Pruner<String> phrasePruner;
    /** Pruner handed to {@code loadFromFile} for PHRASES records (per-phrase page counts). */
    private final PrunedCounts.Pruner<Integer> pagePruner;
    /** Normalizer obtained from the phrase DAO at construction time; used to build phrase keys. */
    private final StringNormalizer normalizer;
    /** Destination (and source) of pruned page/phrase counts. */
    protected final PhraseAnalyzerDao phraseDao;
    /** Used to translate entry titles into local page ids while loading a corpus. */
    protected final LocalPageDao pageDao;

    /* loaded from: input_file:org/wikibrain/phrases/BasePhraseAnalyzer$Entry.class */
    /**
     * One observation from a corpus: a phrase that refers to a page a given number of times.
     * The page is identified either by its local id or by its title; whichever is not
     * supplied keeps its "unknown" default ({@code -1} for the id, {@code null} for the title).
     */
    public static class Entry {
        Language language;
        /** Local page id, or {@code -1} when the entry was created from a title. */
        int localId = -1;
        /** Page title, or {@code null} when the entry was created from a local id. */
        String title = null;
        String phrase;
        int count;

        /**
         * Creates an entry whose page is identified by local id.
         *
         * @param language language of the page and phrase
         * @param i        local page id
         * @param str      the phrase text
         * @param i2       number of occurrences
         */
        public Entry(Language language, int i, String str, int i2) {
            this.language = language;
            this.localId = i;
            this.phrase = str;
            this.count = i2;
        }

        /**
         * Creates an entry whose page is identified by title.
         *
         * @param language language of the page and phrase
         * @param str      page title
         * @param str2     the phrase text
         * @param i        number of occurrences
         */
        public Entry(Language language, String str, String str2, int i) {
            this.language = language;
            this.title = str;
            this.phrase = str2;
            this.count = i;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/wikibrain/phrases/BasePhraseAnalyzer$RecordType.class */
    /** Selects how {@code loadFromFile} interprets and persists each group of lines. */
    public enum RecordType {
        /** Groups are handed to {@code writePage}. */
        PAGES,
        /** Groups are handed to {@code writePhrase}. */
        PHRASES
    }

    /**
     * Creates an analyzer that stores its results in the given phrase DAO.
     *
     * @param phraseAnalyzerDao DAO used to save and look up counts; also supplies the string normalizer
     * @param localPageDao      DAO used to resolve page titles to local ids
     * @param pruner            pruner applied to the phrase counts of each page
     * @param pruner2           pruner applied to the page counts of each phrase
     */
    public BasePhraseAnalyzer(PhraseAnalyzerDao phraseAnalyzerDao, LocalPageDao localPageDao, PrunedCounts.Pruner<String> pruner, PrunedCounts.Pruner<Integer> pruner2) {
        this.phraseDao = phraseAnalyzerDao;
        this.pageDao = localPageDao;
        this.normalizer = phraseAnalyzerDao.getStringNormalizer();
        this.phrasePruner = pruner;
        this.pagePruner = pruner2;
    }

    /**
     * Supplies the raw entries consumed by {@link #loadCorpus(LanguageSet)}.
     * Entries may identify their page by local id or by title; {@code loadCorpus}
     * looks up the id for title-only entries and skips entries whose language is not
     * in the requested set or whose phrase is blank.
     *
     * @param languageSet the languages requested by the caller of {@code loadCorpus}
     * @return the entries to load
     * @throws IOException  declared for implementations that read the corpus from storage
     * @throws DaoException declared for implementations that query a DAO
     */
    protected abstract Iterable<Entry> getCorpus(LanguageSet languageSet) throws IOException, DaoException;

    /**
     * Loads the corpus returned by {@link #getCorpus(LanguageSet)} into the phrase DAO.
     *
     * <p>Every retained entry is written to two temporary files, one keyed by
     * {@code lang:localId} and one keyed by {@code lang:normalizedPhrase}. Each line has
     * the form {@code key \t lang \t localId \t count \t phrase}. Both files are then sorted
     * so that lines sharing a key are adjacent, grouped by {@code loadFromFile}, and the
     * phrase DAO is closed at the end.
     *
     * <p>An entry is retained when its language is in {@code languageSet}, its phrase is
     * non-blank, and it has (or its title resolves to) a non-negative local id.
     *
     * @param languageSet languages to load
     * @throws DaoException if a page lookup or a DAO write fails
     * @throws IOException  if a temporary file cannot be created, written, sorted or read
     */
    @Override // org.wikibrain.phrases.PhraseAnalyzer
    public void loadCorpus(LanguageSet languageSet) throws DaoException, IOException {
        File byIdFile = File.createTempFile("wp_phrases_by_id", "txt");
        byIdFile.deleteOnExit();
        File byPhraseFile = File.createTempFile("wp_phrases_by_phrase", "txt");
        byPhraseFile.deleteOnExit();

        DecimalFormat percentFormat = new DecimalFormat("#.#");
        long numEntries = 0;
        long numRetained = 0;

        // try-with-resources: the writers must be closed even if the corpus iterator,
        // the title lookup or a write throws part-way through.
        try (BufferedWriter byIdWriter = WpIOUtils.openWriter(byIdFile);
             BufferedWriter byPhraseWriter = WpIOUtils.openWriter(byPhraseFile)) {
            for (Entry entry : getCorpus(languageSet)) {
                numEntries++;
                if (numEntries % 1000000 == 0) {
                    LOG.info("processing entry: " + numEntries + ", retained " + numRetained + "(" + percentFormat.format((100.0d * numRetained) / numEntries) + "%)");
                }
                if (!languageSet.containsLanguage(entry.language) || entry.phrase == null || entry.phrase.trim().isEmpty()) {
                    continue;
                }
                // Title-only entries: resolve the title; a non-positive id means "not found".
                if (entry.title != null && entry.localId < 0) {
                    int idByTitle = this.pageDao.getIdByTitle(new Title(entry.title, entry.language));
                    entry.localId = idByTitle <= 0 ? -1 : idByTitle;
                }
                if (entry.localId < 0) {
                    continue;
                }
                numRetained++;
                // Tabs and newlines are the field and record separators of the temp files.
                entry.phrase = entry.phrase.replace("\n", " ").replace("\t", " ");
                String langCode = entry.language.getLangCode();
                String record = langCode + "\t" + entry.localId + "\t" + entry.count + "\t" + entry.phrase + "\n";
                byPhraseWriter.write(langCode + ":" + normalize(entry.language, entry.phrase) + "\t" + record);
                byIdWriter.write(langCode + ":" + entry.localId + "\t" + record);
            }
        }

        sortInPlace(byIdFile);
        loadFromFile(RecordType.PAGES, byIdFile, this.phrasePruner);
        sortInPlace(byPhraseFile);
        loadFromFile(RecordType.PHRASES, byPhraseFile, this.pagePruner);
        this.phraseDao.close();
    }

    /**
     * Normalizes a phrase with the DAO-supplied normalizer and then collapses every run of
     * whitespace into a single space.
     *
     * @param language language of the phrase
     * @param str      raw phrase
     * @return the normalized phrase
     */
    private String normalize(Language language, String str) {
        String normalized = this.normalizer.normalize(language, str);
        return normalized.replaceAll("\\s+", " ");
    }

    /**
     * Reads a sorted temporary file and persists one pruned count record per key.
     *
     * <p>Each line must have five tab-separated fields:
     * {@code key, langCode, localId, count, phrase}. Consecutive lines that share the same
     * key are buffered and handed to {@link #writePage} or {@link #writePhrase} as one
     * group. Lines without exactly five fields are logged and skipped.
     *
     * @param recordType whether groups are pages or phrases
     * @param file       sorted file to read
     * @param pruner     pruner forwarded to the write method
     * @throws IOException  if the file cannot be read
     * @throws DaoException if persisting a group fails
     */
    protected void loadFromFile(RecordType recordType, File file, PrunedCounts.Pruner pruner) throws IOException, DaoException {
        String previousKey = null;
        int warnThreshold = 1000;
        List<Entry> group = new ArrayList<Entry>();
        // try-with-resources so the reader is released even when parsing or a DAO write fails.
        try (BufferedReader reader = WpIOUtils.openBufferedReader(file)) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\t", 5);
                if (fields.length != 5) {
                    LOG.warning("invalid line in file " + file + ": " + line);
                    continue;
                }
                // A change of key closes the current group.
                if (previousKey != null && !fields[0].equals(previousKey)) {
                    writeGroup(recordType, group, pruner);
                    group.clear();
                }
                group.add(new Entry(Language.getByLangCode(fields[1]), Integer.parseInt(fields[2]), fields[4], Integer.parseInt(fields[3])));
                // Warn each time the buffer grows 50% beyond the last reported size.
                if (group.size() > (warnThreshold * 3) / 2) {
                    LOG.warning("large buffer observed: " + group.size() + " for string " + previousKey);
                    warnThreshold = group.size();
                }
                previousKey = fields[0];
            }
        }
        // Flush the final group (the write methods ignore an empty list).
        writeGroup(recordType, group, pruner);
    }

    /** Dispatches one buffered group to the write method that matches the record type. */
    private void writeGroup(RecordType recordType, List<Entry> group, PrunedCounts.Pruner pruner) throws DaoException {
        if (recordType == RecordType.PAGES) {
            writePage(group, pruner);
        } else {
            writePhrase(group, pruner);
        }
    }

    /**
     * Aggregates the phrase counts of a single page and saves the pruned result.
     * Does nothing for an empty list; nothing is saved when the pruner returns {@code null}.
     *
     * @param list   entries that must all share the language and local id of the first entry
     * @param pruner pruner applied to the phrase-to-count map
     * @throws DaoException          if saving fails
     * @throws IllegalStateException if an entry has a different local id or language
     */
    protected void writePage(List<Entry> list, PrunedCounts.Pruner pruner) throws DaoException {
        if (list.isEmpty()) {
            return;
        }
        Entry first = list.get(0);
        Language pageLanguage = first.language;
        int pageId = first.localId;

        HashMap<String, Integer> phraseCounts = new HashMap<String, Integer>();
        for (Entry entry : list) {
            if (entry.localId != pageId || entry.language != pageLanguage) {
                throw new IllegalStateException();
            }
            Integer soFar = phraseCounts.get(entry.phrase);
            phraseCounts.put(entry.phrase, soFar == null ? entry.count : soFar + entry.count);
        }

        PrunedCounts<String> pruned = pruner.prune(phraseCounts);
        if (pruned != null) {
            this.phraseDao.savePageCounts(pageLanguage, pageId, pruned);
        }
    }

    /**
     * Aggregates the page counts of a single normalized phrase and saves the pruned result.
     * Does nothing for an empty list; nothing is saved when the pruner returns {@code null}.
     * Entries whose normalized phrase or language differs from the first entry's are
     * logged but still counted.
     *
     * @param list   entries expected to share one language and one normalized phrase
     * @param pruner pruner applied to the page-id-to-count map
     * @throws DaoException if saving fails
     */
    protected void writePhrase(List<Entry> list, PrunedCounts.Pruner pruner) throws DaoException {
        if (list.isEmpty()) {
            return;
        }
        Entry first = list.get(0);
        Language phraseLanguage = first.language;
        String phraseKey = normalize(phraseLanguage, first.phrase);

        HashMap<Integer, Integer> pageCounts = new HashMap<Integer, Integer>();
        for (Entry entry : list) {
            String entryKey = normalize(phraseLanguage, entry.phrase);
            if (!entryKey.equals(phraseKey)) {
                LOG.warning("disagreement between phrases " + phraseKey + " and " + entry.phrase);
            }
            if (entry.language != phraseLanguage) {
                LOG.warning("disagreement between languages " + phraseLanguage + " and " + entry.language);
            }
            Integer pageId = Integer.valueOf(entry.localId);
            Integer soFar = pageCounts.get(pageId);
            pageCounts.put(pageId, soFar == null ? entry.count : soFar + entry.count);
        }

        PrunedCounts<Integer> pruned = pruner.prune(pageCounts);
        if (pruned != null) {
            this.phraseDao.savePhraseCounts(phraseLanguage, phraseKey, pruned);
        }
    }

    /**
     * Sorts the lines of a file by natural string order, overwriting the file with the
     * sorted result. The sort is done by {@code ExternalSort}: the file is first split
     * into sorted batches, which are then merged back into the original file.
     *
     * @param file UTF-8 text file to sort
     * @throws IOException if the external sort fails
     */
    private void sortInPlace(File file) throws IOException {
        // Limit handed to ExternalSort.sortInBatch: at least 100, or more when the file
        // is large relative to 1/20th of the JVM's maximum heap.
        long heapSlice = Runtime.getRuntime().maxMemory() / 20;
        int max = Math.max(100, (int) (file.length() / heapSlice));
        LOG.info("sorting " + file + " using max of " + max);

        Charset utf8 = Charset.forName("utf-8");
        Comparator<String> lineOrder = new Comparator<String>() {
            @Override
            public int compare(String left, String right) {
                return left.compareTo(right);
            }
        };

        List<File> batches = ExternalSort.sortInBatch(file, lineOrder, max, utf8, (File) null, false);
        LOG.info("merging " + file);
        ExternalSort.mergeSortedFiles(batches, file, lineOrder, utf8);
        LOG.info("finished sorting" + file);
    }

    /**
     * Returns the phrases stored for a page, each with its share of the page's total count.
     *
     * @param language  language of the page
     * @param localPage page to describe
     * @param i         maximum number of phrases requested from the DAO and returned
     * @return phrases mapped to {@code count / total}, in the DAO's iteration order, or
     *         {@code null} when the DAO has no counts for the page
     * @throws DaoException if the lookup fails
     */
    @Override // org.wikibrain.phrases.PhraseAnalyzer
    public LinkedHashMap<String, Float> describe(Language language, LocalPage localPage, int i) throws DaoException {
        PrunedCounts<String> pageCounts = this.phraseDao.getPageCounts(language, localPage.getLocalId(), i);
        if (pageCounts == null) {
            return null;
        }
        LinkedHashMap<String, Float> result = new LinkedHashMap<>();
        for (String phrase : pageCounts.keySet()) {
            result.put(phrase, Float.valueOf((1.0f * pageCounts.get(phrase).intValue()) / pageCounts.getTotal()));
            if (result.size() >= i) {
                break;
            }
        }
        return result;
    }

    /**
     * Returns the pages stored for a phrase, each with its share of the phrase's total count.
     *
     * @param language language of the phrase
     * @param str      phrase to resolve
     * @param i        maximum number of pages requested from the DAO and returned
     * @return page ids mapped to {@code count / total}, in the DAO's iteration order, or
     *         {@code null} when the DAO has no counts for the phrase
     * @throws DaoException if the lookup fails
     */
    @Override // org.wikibrain.phrases.PhraseAnalyzer
    public LinkedHashMap<LocalId, Float> resolve(Language language, String str, int i) throws DaoException {
        PrunedCounts<Integer> phraseCounts = this.phraseDao.getPhraseCounts(language, str, i);
        if (phraseCounts == null) {
            return null;
        }
        LinkedHashMap<LocalId, Float> result = new LinkedHashMap<>();
        for (Integer pageId : phraseCounts.keySet()) {
            result.put(new LocalId(language, pageId.intValue()), Float.valueOf((1.0f * phraseCounts.get(pageId).intValue()) / phraseCounts.getTotal()));
            // Limit on the number of results collected, not on the size of the source map:
            // testing the source map would stop after the first entry whenever the DAO
            // returned at least i candidates.
            if (result.size() >= i) {
                break;
            }
        }
        return result;
    }

    /**
     * Exposes the phrase DAO this analyzer reads from and writes to.
     *
     * @return the DAO supplied at construction time
     */
    public PhraseAnalyzerDao getDao() {
        return phraseDao;
    }
}
