Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(24)

Unified Diff: third_party/prediction/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp

Issue 1247903003: Add spellcheck and word suggestion to the prediction service (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/prediction/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
diff --git a/third_party/prediction/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/third_party/prediction/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0a1ef46d880ceaf1a77a767b735be26d435dd060
--- /dev/null
+++ b/third_party/prediction/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
@@ -0,0 +1,625 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
+ * Do not edit this file other than updating policy's interface.
+ *
+ * This file was generated from
+ * third_party/prediction/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+ */
+
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
+
+#include <vector>
+
+#include "third_party/prediction/suggest/core/dicnode/dic_node.h"
+#include "third_party/prediction/suggest/core/dicnode/dic_node_vector.h"
+#include "third_party/prediction/suggest/core/dictionary/ngram_listener.h"
+#include "third_party/prediction/suggest/core/dictionary/property/bigram_property.h"
+#include "third_party/prediction/suggest/core/dictionary/property/unigram_property.h"
+#include "third_party/prediction/suggest/core/dictionary/property/word_property.h"
+#include "third_party/prediction/suggest/core/session/prev_words_info.h"
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "third_party/prediction/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "third_party/prediction/suggest/policyimpl/dictionary/utils/probability_utils.h"
+
+namespace latinime {
+namespace backward {
+namespace v402 {
+
+// Note that there are corresponding definitions in Java side in
+// BinaryDictionaryTests and
+// BinaryDictionaryDecayingTests.
+const char* const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
+const char* const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+const char* const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY =
+ "MAX_UNIGRAM_COUNT";
+const char* const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY =
+ "MAX_BIGRAM_COUNT";
+const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
+const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
+ Ver4DictConstants::MAX_DICTIONARY_SIZE -
+ MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+
+void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(
+ const DicNode* const dicNode,
+ DicNodeVector* const childDicNodes) const {
+ if (!dicNode->hasChildren()) {
+ return;
+ }
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
+ while (!readingHelper.isEnd()) {
+ const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
+ if (!ptNodeParams.isValid()) {
+ break;
+ }
+ bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
+ if (isTerminal && mHeaderPolicy->isDecayingDict()) {
+ // A DecayingDict may have a terminal PtNode that has a terminal DicNode
+ // whose
+ // probability is NOT_A_PROBABILITY. In such case, we don't want to treat
+ // it as a
+ // valid terminal DicNode.
+ isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
+ }
+ readingHelper.readNextSiblingNode(ptNodeParams);
+ if (ptNodeParams.representsNonWordInfo()) {
+ // Skip PtNodes that represent non-word information.
+ continue;
+ }
+ childDicNodes->pushLeavingChild(
+ dicNode, ptNodeParams.getHeadPos(), ptNodeParams.getChildrenPos(),
+ ptNodeParams.getProbability(), isTerminal, ptNodeParams.hasChildren(),
+ ptNodeParams.isBlacklisted() ||
+ ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
+ ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
+ }
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
+ }
+}
+
+int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
+ const int ptNodePos,
+ const int maxCodePointCount,
+ int* const outCodePoints,
+ int* const outUnigramProbability) const {
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodePos(ptNodePos);
+ const int codePointCount =
+ readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
+ maxCodePointCount, outCodePoints, outUnigramProbability);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE(
+ "Dictionary reading error in "
+ "getCodePointsAndProbabilityAndReturnCodePointCount().");
+ }
+ return codePointCount;
+}
+
+int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(
+ const int* const inWord,
+ const int length,
+ const bool forceLowerCaseSearch) const {
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(
+ inWord, length, forceLowerCaseSearch);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
+ }
+ return ptNodePos;
+}
+
+int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
+ const int bigramProbability) const {
+ if (mHeaderPolicy->isDecayingDict()) {
+ // Both probabilities are encoded. Decode them and get probability.
+ return ForgettingCurveUtils::getProbability(unigramProbability,
+ bigramProbability);
+ } else {
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
+ } else if (bigramProbability == NOT_A_PROBABILITY) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ } else {
+ return bigramProbability;
+ }
+ }
+}
+
+int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(
+ const int* const prevWordsPtNodePos,
+ const int ptNodePos) const {
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_PROBABILITY;
+ }
+ const PtNodeParams ptNodeParams(
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() ||
+ ptNodeParams.isNotAWord()) {
+ return NOT_A_PROBABILITY;
+ }
+ if (prevWordsPtNodePos) {
+ const int bigramsPosition =
+ getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
+ while (bigramsIt.hasNext()) {
+ bigramsIt.next();
+ if (bigramsIt.getBigramPos() == ptNodePos &&
+ bigramsIt.getProbability() != NOT_A_PROBABILITY) {
+ return getProbability(ptNodeParams.getProbability(),
+ bigramsIt.getProbability());
+ }
+ }
+ return NOT_A_PROBABILITY;
+ }
+ return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+}
+
+void Ver4PatriciaTriePolicy::iterateNgramEntries(
+ const int* const prevWordsPtNodePos,
+ NgramListener* const listener) const {
+ if (!prevWordsPtNodePos) {
+ return;
+ }
+ const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
+ while (bigramsIt.hasNext()) {
+ bigramsIt.next();
+ listener->onVisitEntry(bigramsIt.getProbability(),
+ bigramsIt.getBigramPos());
+ }
+}
+
+int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(
+ const int ptNodePos) const {
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_DICT_POS;
+ }
+ const PtNodeParams ptNodeParams(
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (ptNodeParams.isDeleted()) {
+ return NOT_A_DICT_POS;
+ }
+ return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
+ ptNodeParams.getTerminalId());
+}
+
+int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(
+ const int ptNodePos) const {
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_DICT_POS;
+ }
+ const PtNodeParams ptNodeParams(
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (ptNodeParams.isDeleted()) {
+ return NOT_A_DICT_POS;
+ }
+ return mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ ptNodeParams.getTerminalId());
+}
+
+bool Ver4PatriciaTriePolicy::addUnigramEntry(
+ const int* const word,
+ const int length,
+ const UnigramProperty* const unigramProperty) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI(
+ "Warning: addUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >=
+ MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE(
+ "The dictionary is too large to dynamically update. Dictionary size: "
+ "%d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (length > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert to the dictionary, length: %d",
+ length);
+ return false;
+ }
+ for (const auto& shortcut : unigramProperty->getShortcuts()) {
+ if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE(
+ "One of shortcut targets is too long to insert to the dictionary, "
+ "length: %d",
+ shortcut.getTargetCodePoints()->size());
+ return false;
+ }
+ }
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ bool addedNewUnigram = false;
+ int codePointsToAdd[MAX_WORD_LENGTH];
+ int codePointCountToAdd = length;
+ memmove(codePointsToAdd, word, sizeof(int) * length);
+ if (unigramProperty->representsBeginningOfSentence()) {
+ codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(
+ codePointsToAdd, codePointCountToAdd, MAX_WORD_LENGTH);
+ }
+ if (codePointCountToAdd <= 0) {
+ return false;
+ }
+ if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd,
+ codePointCountToAdd, unigramProperty,
+ &addedNewUnigram)) {
+ if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
+ mUnigramCount++;
+ }
+ if (unigramProperty->getShortcuts().size() > 0) {
+ // Add shortcut target.
+ const int wordPos = getTerminalPtNodePositionOfWord(
+ word, length, false /* forceLowerCaseSearch */);
+ if (wordPos == NOT_A_DICT_POS) {
+ AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
+ return false;
+ }
+ for (const auto& shortcut : unigramProperty->getShortcuts()) {
+ if (!mUpdatingHelper.addShortcutTarget(
+ wordPos, shortcut.getTargetCodePoints()->data(),
+ shortcut.getTargetCodePoints()->size(),
+ shortcut.getProbability())) {
+ AKLOGE(
+ "Cannot add new shortcut target. PtNodePos: %d, length: %d, "
+ "probability: %d",
+ wordPos, shortcut.getTargetCodePoints()->size(),
+ shortcut.getProbability());
+ return false;
+ }
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool Ver4PatriciaTriePolicy::addNgramEntry(
+ const PrevWordsInfo* const prevWordsInfo,
+ const BigramProperty* const bigramProperty) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >=
+ MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE(
+ "The dictionary is too large to dynamically update. Dictionary size: "
+ "%d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (!prevWordsInfo->isValid()) {
+ AKLOGE(
+ "prev words info is not valid for adding n-gram entry to the "
+ "dictionary.");
+ return false;
+ }
+ if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE(
+ "The word is too long to insert the ngram to the dictionary. "
+ "length: %d",
+ bigramProperty->getTargetCodePoints()->size());
+ return false;
+ }
+ int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
+ false /* tryLowerCaseSearch */);
+ // TODO: Support N-gram.
+ if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
+ if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) {
+ const std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+ const UnigramProperty beginningOfSentenceUnigramProperty(
+ true /* representsBeginningOfSentence */, true /* isNotAWord */,
+ false /* isBlacklisted */, MAX_PROBABILITY /* probability */,
+ NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */,
+ &shortcuts);
+ if (!addUnigramEntry(
+ prevWordsInfo->getNthPrevWordCodePoints(1 /* n */),
+ prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */),
+ &beginningOfSentenceUnigramProperty)) {
+ AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
+ return false;
+ }
+ // Refresh Terminal PtNode positions.
+ prevWordsInfo->getPrevWordsTerminalPtNodePos(
+ this, prevWordsPtNodePos, false /* tryLowerCaseSearch */);
+ } else {
+ return false;
+ }
+ }
+ const int word1Pos = getTerminalPtNodePositionOfWord(
+ bigramProperty->getTargetCodePoints()->data(),
+ bigramProperty->getTargetCodePoints()->size(),
+ false /* forceLowerCaseSearch */);
+ if (word1Pos == NOT_A_DICT_POS) {
+ return false;
+ }
+ bool addedNewBigram = false;
+ if (mUpdatingHelper.addNgramEntry(
+ PtNodePosArrayView::fromObject(prevWordsPtNodePos), word1Pos,
+ bigramProperty, &addedNewBigram)) {
+ if (addedNewBigram) {
+ mBigramCount++;
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool Ver4PatriciaTriePolicy::removeNgramEntry(
+ const PrevWordsInfo* const prevWordsInfo,
+ const int* const word,
+ const int length) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI(
+ "Warning: removeNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >=
+ MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE(
+ "The dictionary is too large to dynamically update. Dictionary size: "
+ "%d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (!prevWordsInfo->isValid()) {
+ AKLOGE(
+ "prev words info is not valid for removing n-gram entry form the "
+ "dictionary.");
+ return false;
+ }
+ if (length > MAX_WORD_LENGTH) {
+ AKLOGE(
+ "word is too long to remove n-gram entry form the dictionary. length: "
+ "%d",
+ length);
+ }
+ int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
+ false /* tryLowerCaseSerch */);
+ // TODO: Support N-gram.
+ if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
+ return false;
+ }
+ const int wordPos = getTerminalPtNodePositionOfWord(
+ word, length, false /* forceLowerCaseSearch */);
+ if (wordPos == NOT_A_DICT_POS) {
+ return false;
+ }
+ if (mUpdatingHelper.removeNgramEntry(
+ PtNodePosArrayView::fromObject(prevWordsPtNodePos), wordPos)) {
+ mBigramCount--;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+bool Ver4PatriciaTriePolicy::flush(const char* const filePath) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI(
+ "Warning: flush() is called for non-updatable dictionary. filePath: %s",
+ filePath);
+ return false;
+ }
+ if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) {
+ AKLOGE("Cannot flush the dictionary to file.");
+ mIsCorrupted = true;
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::flushWithGC(const char* const filePath) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
+ AKLOGE("Cannot flush the dictionary to file with GC.");
+ mIsCorrupted = true;
+ return false;
+ }
+ return true;
+}
+
+bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mBuffers->isNearSizeLimit()) {
+ // Additional buffer size is near the limit.
+ return true;
+ } else if (mHeaderPolicy->getExtendedRegionSize() +
+ mDictBuffer->getUsedAdditionalBufferSize() >
+ Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
+ // Total extended region size of the trie exceeds the limit.
+ return true;
+ } else if (mDictBuffer->getTailPosition() >=
+ MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS &&
+ mDictBuffer->getUsedAdditionalBufferSize() > 0) {
+ // Needs to reduce dictionary size.
+ return true;
+ } else if (mHeaderPolicy->isDecayingDict()) {
+ return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount,
+ mBigramCount, mHeaderPolicy);
+ }
+ return false;
+}
+
+void Ver4PatriciaTriePolicy::getProperty(const char* const query,
+ const int queryLength,
+ char* const outResult,
+ const int maxResultLength) {
+ const int compareLength = queryLength + 1 /* terminator */;
+ if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d", mUnigramCount);
+ } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d", mBigramCount);
+ } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy->isDecayingDict()
+ ? ForgettingCurveUtils::getUnigramCountHardLimit(
+ mHeaderPolicy->getMaxUnigramCount())
+ : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy->isDecayingDict()
+ ? ForgettingCurveUtils::getBigramCountHardLimit(
+ mHeaderPolicy->getMaxBigramCount())
+ : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ }
+}
+
+const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
+ const int* const codePoints,
+ const int codePointCount) const {
+ const int ptNodePos = getTerminalPtNodePositionOfWord(
+ codePoints, codePointCount, false /* forceLowerCaseSearch */);
+ if (ptNodePos == NOT_A_DICT_POS) {
+ AKLOGE("getWordProperty is called for invalid word.");
+ return WordProperty();
+ }
+ const PtNodeParams ptNodeParams =
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ std::vector<int> codePointVector(
+ ptNodeParams.getCodePoints(),
+ ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount());
+ const ProbabilityEntry probabilityEntry =
+ mBuffers->getProbabilityDictContent()->getProbabilityEntry(
+ ptNodeParams.getTerminalId());
+ const HistoricalInfo* const historicalInfo =
+ probabilityEntry.getHistoricalInfo();
+ // Fetch bigram information.
+ std::vector<BigramProperty> bigrams;
+ const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
+ if (bigramListPos != NOT_A_DICT_POS) {
+ int bigramWord1CodePoints[MAX_WORD_LENGTH];
+ const BigramDictContent* const bigramDictContent =
+ mBuffers->getBigramDictContent();
+ const TerminalPositionLookupTable* const terminalPositionLookupTable =
+ mBuffers->getTerminalPositionLookupTable();
+ bool hasNext = true;
+ int readingPos = bigramListPos;
+ while (hasNext) {
+ const BigramEntry bigramEntry =
+ bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
+ hasNext = bigramEntry.hasNext();
+ const int word1TerminalId = bigramEntry.getTargetTerminalId();
+ const int word1TerminalPtNodePos =
+ terminalPositionLookupTable->getTerminalPtNodePosition(
+ word1TerminalId);
+ if (word1TerminalPtNodePos == NOT_A_DICT_POS) {
+ continue;
+ }
+ // Word (unigram) probability
+ int word1Probability = NOT_A_PROBABILITY;
+ const int codePointCount =
+ getCodePointsAndProbabilityAndReturnCodePointCount(
+ word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints,
+ &word1Probability);
+ const std::vector<int> word1(bigramWord1CodePoints,
+ bigramWord1CodePoints + codePointCount);
+ const HistoricalInfo* const historicalInfo =
+ bigramEntry.getHistoricalInfo();
+ const int probability =
+ bigramEntry.hasHistoricalInfo()
+ ? ForgettingCurveUtils::decodeProbability(
+ bigramEntry.getHistoricalInfo(), mHeaderPolicy)
+ : bigramEntry.getProbability();
+ bigrams.emplace_back(&word1, probability, historicalInfo->getTimeStamp(),
+ historicalInfo->getLevel(),
+ historicalInfo->getCount());
+ }
+ }
+ // Fetch shortcut information.
+ std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+ int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
+ if (shortcutPos != NOT_A_DICT_POS) {
+ int shortcutTarget[MAX_WORD_LENGTH];
+ const ShortcutDictContent* const shortcutDictContent =
+ mBuffers->getShortcutDictContent();
+ bool hasNext = true;
+ while (hasNext) {
+ int shortcutTargetLength = 0;
+ int shortcutProbability = NOT_A_PROBABILITY;
+ shortcutDictContent->getShortcutEntryAndAdvancePosition(
+ MAX_WORD_LENGTH, shortcutTarget, &shortcutTargetLength,
+ &shortcutProbability, &hasNext, &shortcutPos);
+ const std::vector<int> target(shortcutTarget,
+ shortcutTarget + shortcutTargetLength);
+ shortcuts.emplace_back(&target, shortcutProbability);
+ }
+ }
+ const UnigramProperty unigramProperty(
+ ptNodeParams.representsBeginningOfSentence(), ptNodeParams.isNotAWord(),
+ ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
+ historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
+ historicalInfo->getCount(), &shortcuts);
+ return WordProperty(&codePointVector, &unigramProperty, &bigrams);
+}
+
+int Ver4PatriciaTriePolicy::getNextWordAndNextToken(
+ const int token,
+ int* const outCodePoints,
+ int* const outCodePointCount) {
+ *outCodePointCount = 0;
+ if (token == 0) {
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions
+ traversePolicy(&mTerminalPtNodePositionsForIteratingWords);
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+ &traversePolicy);
+ }
+ const int terminalPtNodePositionsVectorSize =
+ static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
+ if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
+ AKLOGE("Given token %d is invalid.", token);
+ return 0;
+ }
+ const int terminalPtNodePos =
+ mTerminalPtNodePositionsForIteratingWords[token];
+ int unigramProbability = NOT_A_PROBABILITY;
+ *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
+ terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
+ const int nextToken = token + 1;
+ if (nextToken >= terminalPtNodePositionsVectorSize) {
+ // All words have been iterated.
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ return 0;
+ }
+ return nextToken;
+}
+
+} // namespace v402
+} // namespace backward
+} // namespace latinime

Powered by Google App Engine
This is Rietveld 408576698