| Index: third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
|
| diff --git a/third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..8dc3c819b5e20202d5f0e51dc76eebe8b57a474e
|
| --- /dev/null
|
| +++ b/third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
|
| @@ -0,0 +1,294 @@
|
| +/*
|
| + * Copyright (C) 2013, The Android Open Source Project
|
| + *
|
| + * Licensed under the Apache License, Version 2.0 (the "License");
|
| + * you may not use this file except in compliance with the License.
|
| + * You may obtain a copy of the License at
|
| + *
|
| + * http://www.apache.org/licenses/LICENSE-2.0
|
| + *
|
| + * Unless required by applicable law or agreed to in writing, software
|
| + * distributed under the License is distributed on an "AS IS" BASIS,
|
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| + * See the License for the specific language governing permissions and
|
| + * limitations under the License.
|
| + */
|
| +
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
|
| +
|
| +#include <cstring>
|
| +#include <queue>
|
| +
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/header/header_policy.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/utils/file_utils.h"
|
| +#include "third_party/android_prediction/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
|
| +
|
| +namespace latinime {
|
| +
|
| +bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
|
| + const int unigramCount, const int bigramCount) const {
|
| + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
|
| + BufferWithExtendableBuffer headerBuffer(
|
| + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
|
| + const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
|
| + + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
|
| + if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
|
| + unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) {
|
| + AKLOGE("Cannot write header structure to buffer. "
|
| + "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
|
| + "extendedRegionSize: %d", false, unigramCount, bigramCount,
|
| + extendedRegionSize);
|
| + return false;
|
| + }
|
| + return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
|
| +}
|
| +
|
| +bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
|
| + const char *const dictDirPath) {
|
| + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
|
| + Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
|
| + Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
|
| + Ver4DictConstants::MAX_DICTIONARY_SIZE));
|
| + int unigramCount = 0;
|
| + int bigramCount = 0;
|
| + if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
|
| + return false;
|
| + }
|
| + BufferWithExtendableBuffer headerBuffer(
|
| + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
|
| + if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
|
| + unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) {
|
| + return false;
|
| + }
|
| + return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
|
| +}
|
| +
|
| +bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
|
| + const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
|
| + int *const outUnigramCount, int *const outBigramCount) {
|
| + Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
|
| + mBuffers->getLanguageModelDictContent(), headerPolicy);
|
| + Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
|
| + Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
|
| + mBuffers->getTerminalPositionLookupTable(), headerPolicy);
|
| + Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
|
| + mBuffers->getTerminalPositionLookupTable());
|
| + Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
|
| + mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
|
| + &shortcutPolicy);
|
| +
|
| + DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
|
| + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
| + DynamicPtGcEventListeners
|
| + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
| + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
|
| + &ptNodeWriter);
|
| + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
| + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
|
| + return false;
|
| + }
|
| + const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
|
| + .getValidUnigramCount();
|
| + const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
|
| + if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
|
| + if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
|
| + AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
|
| + maxUnigramCount);
|
| + return false;
|
| + }
|
| + }
|
| +
|
| + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
| + DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
|
| + traversePolicyToUpdateBigramProbability(&ptNodeWriter);
|
| + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
| + &traversePolicyToUpdateBigramProbability)) {
|
| + return false;
|
| + }
|
| + const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
|
| + const int maxBigramCount = headerPolicy->getMaxBigramCount();
|
| + if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
|
| + if (!truncateBigrams(maxBigramCount)) {
|
| + AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
|
| + return false;
|
| + }
|
| + }
|
| +
|
| + // Mapping from positions in mBuffer to positions in bufferToWrite.
|
| + PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
|
| + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
| + Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
|
| + buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
|
| + &shortcutPolicy);
|
| + DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
|
| + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
|
| + buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
|
| + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
|
| + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
|
| + return false;
|
| + }
|
| +
|
| + // Create policy instances for the GCed dictionary.
|
| + Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
|
| + buffersToWrite->getLanguageModelDictContent(), headerPolicy);
|
| + Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
|
| + Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
|
| + buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
|
| + Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
|
| + buffersToWrite->getTerminalPositionLookupTable());
|
| + Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
|
| + buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
|
| + &newShortcutPolicy);
|
| + // Re-assign terminal IDs for valid terminal PtNodes.
|
| + TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
|
| + if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
|
| + &terminalIdMap)) {
|
| + return false;
|
| + }
|
| + // Run GC for probability dict content.
|
| + if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap,
|
| + mBuffers->getLanguageModelDictContent(), nullptr /* outNgramCount */)) {
|
| + return false;
|
| + }
|
| + // Run GC for bigram dict content.
|
| + if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
|
| + mBuffers->getBigramDictContent(), outBigramCount)) {
|
| + return false;
|
| + }
|
| + // Run GC for shortcut dict content.
|
| + if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
|
| + mBuffers->getShortcutDictContent())) {
|
| + return false;
|
| + }
|
| + DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
|
| + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
| + DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
|
| + traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
|
| + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
|
| + &traversePolicyToUpdateAllPositionFields)) {
|
| + return false;
|
| + }
|
| + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
|
| + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
|
| + traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
|
| + if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
|
| + &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
|
| + return false;
|
| + }
|
| + *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
|
| + return true;
|
| +}
|
| +
|
| +bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
|
| + const Ver4PatriciaTrieNodeReader *const ptNodeReader,
|
| + Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
|
| + const TerminalPositionLookupTable *const terminalPosLookupTable =
|
| + mBuffers->getTerminalPositionLookupTable();
|
| + const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
|
| + std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
|
| + priorityQueue;
|
| + for (int i = 0; i < nextTerminalId; ++i) {
|
| + const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
|
| + if (terminalPos == NOT_A_DICT_POS) {
|
| + continue;
|
| + }
|
| + const ProbabilityEntry probabilityEntry =
|
| + mBuffers->getLanguageModelDictContent()->getProbabilityEntry(i);
|
| + const int probability = probabilityEntry.hasHistoricalInfo() ?
|
| + ForgettingCurveUtils::decodeProbability(
|
| + probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
|
| + probabilityEntry.getProbability();
|
| + priorityQueue.push(DictProbability(terminalPos, probability,
|
| + probabilityEntry.getHistoricalInfo()->getTimeStamp()));
|
| + }
|
| +
|
| + // Delete unigrams.
|
| + while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
|
| + const int ptNodePos = priorityQueue.top().getDictPos();
|
| + priorityQueue.pop();
|
| + const PtNodeParams ptNodeParams =
|
| + ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
|
| + if (ptNodeParams.representsNonWordInfo()) {
|
| + continue;
|
| + }
|
| + if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
|
| + AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
|
| + return false;
|
| + }
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
|
| + const TerminalPositionLookupTable *const terminalPosLookupTable =
|
| + mBuffers->getTerminalPositionLookupTable();
|
| + const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
|
| + std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
|
| + priorityQueue;
|
| + BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
|
| + for (int i = 0; i < nextTerminalId; ++i) {
|
| + const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
|
| + if (bigramListPos == NOT_A_DICT_POS) {
|
| + continue;
|
| + }
|
| + bool hasNext = true;
|
| + int readingPos = bigramListPos;
|
| + while (hasNext) {
|
| + const BigramEntry bigramEntry =
|
| + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
|
| + const int entryPos = readingPos - bigramDictContent->getBigramEntrySize();
|
| + hasNext = bigramEntry.hasNext();
|
| + if (!bigramEntry.isValid()) {
|
| + continue;
|
| + }
|
| + const int probability = bigramEntry.hasHistoricalInfo() ?
|
| + ForgettingCurveUtils::decodeProbability(
|
| + bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
|
| + bigramEntry.getProbability();
|
| + priorityQueue.push(DictProbability(entryPos, probability,
|
| + bigramEntry.getHistoricalInfo()->getTimeStamp()));
|
| + }
|
| + }
|
| +
|
| + // Delete bigrams.
|
| + while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
|
| + const int entryPos = priorityQueue.top().getDictPos();
|
| + const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
|
| + const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
|
| + if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
|
| + AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
|
| + return false;
|
| + }
|
| + priorityQueue.pop();
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
|
| + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
|
| + if (!ptNodeParams->isTerminal()) {
|
| + return true;
|
| + }
|
| + TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
|
| + mTerminalIdMap->find(ptNodeParams->getTerminalId());
|
| + if (it == mTerminalIdMap->end()) {
|
| + AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
|
| + ptNodeParams->getTerminalId(), mTerminalIdMap->size());
|
| + return false;
|
| + }
|
| + if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
|
| + AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
|
| + return false;
|
| + }
|
| + return true;
|
| +}
|
| +
|
| +} // namespace latinime
|
|
|