third_party/prediction/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp - Issue 1247903003: Add spellcheck and word suggestion to the prediction service

Side by Side Diff: third_party/prediction/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp

Issue 1247903003: Add spellcheck and word suggestion to the prediction service (Closed) Base URL: https://github.com/domokit/mojo.git@master

Patch Set: Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« services/prediction/key_set.h ('K') | « third_party/prediction/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h ('k') | third_party/prediction/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (C) 2013, The Android Open Source Project

	3 *

	4 * Licensed under the Apache License, Version 2.0 (the "License");

	5 * you may not use this file except in compliance with the License.

	6 * You may obtain a copy of the License at

	7 *

	8 * http://www.apache.org/licenses/LICENSE-2.0

	9 *

	10 * Unless required by applicable law or agreed to in writing, software

	11 * distributed under the License is distributed on an "AS IS" BASIS,

	12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

	13 * See the License for the specific language governing permissions and

	14 * limitations under the License.

	15 */

	16

	17 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/v2/patr icia_trie_policy.h"

	18

	19 #include "third_party/prediction/defines.h"

	20 #include "third_party/prediction/suggest/core/dicnode/dic_node.h"

	21 #include "third_party/prediction/suggest/core/dicnode/dic_node_vector.h"

	22 #include "third_party/prediction/suggest/core/dictionary/binary_dictionary_bigra ms_iterator.h"

	23 #include "third_party/prediction/suggest/core/dictionary/ngram_listener.h"

	24 #include "third_party/prediction/suggest/core/session/prev_words_info.h"

	25 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_comm on/dynamic_pt_reading_helper.h"

	26 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_comm on/patricia_trie_reading_utils.h"

	27 #include "third_party/prediction/suggest/policyimpl/dictionary/utils/probability _utils.h"

	28 #include "third_party/prediction/utils/char_utils.h"

	29

	30 namespace latinime {

	31

	32 void PatriciaTriePolicy::createAndGetAllChildDicNodes(

	33 const DicNode* const dicNode,

	34 DicNodeVector* const childDicNodes) const {

	35 if (!dicNode->hasChildren()) {

	36 return;

	37 }

	38 int nextPos = dicNode->getChildrenPtNodeArrayPos();

	39 if (nextPos < 0 \|\| nextPos >= mDictBufferSize) {

	40 AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d",

	41 nextPos, mDictBufferSize);

	42 mIsCorrupted = true;

	43 ASSERT(false);

	44 return;

	45 }

	46 const int childCount =

	47 PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot,

	48 &nextPos);

	49 for (int i = 0; i < childCount; i++) {

	50 if (nextPos < 0 \|\| nextPos >= mDictBufferSize) {

	51 AKLOGE(

	52 "Child PtNode position is invalid. pos: %d, dict size: %d, "

	53 "childCount: %d / %d",

	54 nextPos, mDictBufferSize, i, childCount);

	55 mIsCorrupted = true;

	56 ASSERT(false);

	57 return;

	58 }

	59 nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);

	60 }

	61 }

	62

	63 // This retrieves code points and the probability of the word by its terminal

	64 // position.

	65 // Due to the fact that words are ordered in the dictionary in a strict

	66 // breadth-first order,

	67 // it is possible to check for this with advantageous complexity. For each

	68 // PtNode array, we search

	69 // for PtNodes with children and compare the children position with the position

	70 // we look for.

	71 // When we shoot the position we look for, it means the word we look for is in

	72 // the children

	73 // of the previous PtNode. The only tricky part is the fact that if we arrive at

	74 // the end of a

	75 // PtNode array with the last PtNode's children position still less than what we

	76 // are searching for,

	77 // we must descend the last PtNode's children (for example, if the word we are

	78 // searching for starts

	79 // with a z, it's the last PtNode of the root array, so all children addresses

	80 // will be smaller

	81 // than the position we look for, and we have to descend the z PtNode).

	82 /* Parameters :

	83 * ptNodePos: the byte position of the terminal PtNode of the word we are

	84 * searching for (this is

	85 * what is stored as the "bigram position" in each bigram)

	86 * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.

	87 * outUnigramProbability: a pointer to an int to write the probability into.

	88 * Return value : the code point count, of 0 if the word was not found.

	89 */

	90 // TODO: Split this function to be more readable

	91 int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(

	92 const int ptNodePos,

	93 const int maxCodePointCount,

	94 int* const outCodePoints,

	95 int* const outUnigramProbability) const {

	96 int pos = getRootPosition();

	97 int wordPos = 0;

	98 // One iteration of the outer loop iterates through PtNode arrays. As stated

	99 // above, we will

	100 // only traverse PtNodes that are actually a part of the terminal we are

	101 // searching, so each

	102 // time we enter this loop we are one depth level further than last time.

	103 // The only reason we count PtNodes is because we want to reduce the

	104 // probability of infinite

	105 // looping in case there is a bug. Since we know there is an upper bound to

	106 // the depth we are

	107 // supposed to traverse, it does not hurt to count iterations.

	108 for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {

	109 int lastCandidatePtNodePos = 0;

	110 // Let's loop through PtNodes in this PtNode array searching for either the

	111 // terminal

	112 // or one of its ascendants.

	113 if (pos < 0 \|\| pos >= mDictBufferSize) {

	114 AKLOGE("PtNode array position is invalid. pos: %d, dict size: %d", pos,

	115 mDictBufferSize);

	116 mIsCorrupted = true;

	117 ASSERT(false);

	118 *outUnigramProbability = NOT_A_PROBABILITY;

	119 return 0;

	120 }

	121 for (int ptNodeCount =

	122 PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(

	123 mDictRoot, &pos);

	124 ptNodeCount > 0; --ptNodeCount) {

	125 const int startPos = pos;

	126 if (pos < 0 \|\| pos >= mDictBufferSize) {

	127 AKLOGE("PtNode position is invalid. pos: %d, dict size: %d", pos,

	128 mDictBufferSize);

	129 mIsCorrupted = true;

	130 ASSERT(false);

	131 *outUnigramProbability = NOT_A_PROBABILITY;

	132 return 0;

	133 }

	134 const PatriciaTrieReadingUtils::NodeFlags flags =

	135 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);

	136 const int character =

	137 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot,

	138 &pos);

	139 if (ptNodePos == startPos) {

	140 // We found the position. Copy the rest of the code points in the buffer

	141 // and return

	142 // the length.

	143 outCodePoints[wordPos] = character;

	144 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {

	145 int nextChar =

	146 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(

	147 mDictRoot, &pos);

	148 // We count code points in order to avoid infinite loops if the file

	149 // is broken

	150 // or if there is some other bug

	151 int charCount = maxCodePointCount;

	152 while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {

	153 outCodePoints[++wordPos] = nextChar;

	154 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(

	155 mDictRoot, &pos);

	156 }

	157 }

	158 *outUnigramProbability =

	159 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(

	160 mDictRoot, &pos);

	161 return ++wordPos;

	162 }

	163 // We need to skip past this PtNode, so skip any remaining code points

	164 // after the

	165 // first and possibly the probability.

	166 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {

	167 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags,

	168 MAX_WORD_LENGTH, &pos);

	169 }

	170 if (PatriciaTrieReadingUtils::isTerminal(flags)) {

	171 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot,

	172 &pos);

	173 }

	174 // The fact that this PtNode has children is very important. Since we

	175 // already know

	176 // that this PtNode does not match, if it has no children we know it is

	177 // irrelevant

	178 // to what we are searching for.

	179 const bool hasChildren =

	180 PatriciaTrieReadingUtils::hasChildrenInFlags(flags);

	181 // We will write in `found' whether we have passed the children position

	182 // we are

	183 // searching for. For example if we search for "beer", the children of b

	184 // are less

	185 // than the address we are searching for and the children of c are

	186 // greater. When we

	187 // come here for c, we realize this is too big, and that we should descend

	188 // b.

	189 bool found;

	190 if (hasChildren) {

	191 int currentPos = pos;

	192 // Here comes the tricky part. First, read the children position.

	193 const int childrenPos =

	194 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(

	195 mDictRoot, flags, &currentPos);

	196 if (childrenPos > ptNodePos) {

	197 // If the children pos is greater than the position, it means the

	198 // previous

	199 // PtNode, which position is stored in lastCandidatePtNodePos, was the

	200 // right

	201 // one.

	202 found = true;

	203 } else if (1 >= ptNodeCount) {

	204 // However if we are on the LAST PtNode of this array, and we have NOT

	205 // shot the

	206 // position we should descend THIS PtNode. So we trick the

	207 // lastCandidatePtNodePos so that we will descend this PtNode, not the

	208 // previous

	209 // one.

	210 lastCandidatePtNodePos = startPos;

	211 found = true;

	212 } else {

	213 // Else, we should continue looking.

	214 found = false;

	215 }

	216 } else {

	217 // Even if we don't have children here, we could still be on the last

	218 // PtNode of

	219 // this array. If this is the case, we should descend the last PtNode

	220 // that had

	221 // children, and their position is already in lastCandidatePtNodePos.

	222 found = (1 >= ptNodeCount);

	223 }

	224

	225 if (found) {

	226 // Okay, we found the PtNode we should descend. Its position is in

	227 // the lastCandidatePtNodePos variable, so we just re-read it.

	228 if (0 != lastCandidatePtNodePos) {

	229 const PatriciaTrieReadingUtils::NodeFlags lastFlags =

	230 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(

	231 mDictRoot, &lastCandidatePtNodePos);

	232 const int lastChar =

	233 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(

	234 mDictRoot, &lastCandidatePtNodePos);

	235 // We copy all the characters in this PtNode to the buffer

	236 outCodePoints[wordPos] = lastChar;

	237 if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {

	238 int nextChar =

	239 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(

	240 mDictRoot, &lastCandidatePtNodePos);

	241 int charCount = maxCodePointCount;

	242 while (-1 != nextChar && --charCount > 0) {

	243 outCodePoints[++wordPos] = nextChar;

	244 nextChar =

	245 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(

	246 mDictRoot, &lastCandidatePtNodePos);

	247 }

	248 }

	249 ++wordPos;

	250 // Now we only need to branch to the children address. Skip the

	251 // probability if

	252 // it's there, read pos, and break to resume the search at pos.

	253 if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {

	254 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(

	255 mDictRoot, &lastCandidatePtNodePos);

	256 }

	257 pos =

	258 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(

	259 mDictRoot, lastFlags, &lastCandidatePtNodePos);

	260 break;

	261 } else {

	262 // Here is a little tricky part: we come here if we found out that all

	263 // children

	264 // addresses in this PtNode are bigger than the address we are

	265 // searching for.

	266 // Should we conclude the word is not in the dictionary? No! It could

	267 // still be

	268 // one of the remaining PtNodes in this array, so we have to keep

	269 // looking in

	270 // this array until we find it (or we realize it's not there either,

	271 // in which

	272 // case it's actually not in the dictionary). Pass the end of this

	273 // PtNode,

	274 // ready to start the next one.

	275 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {

	276 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(

	277 mDictRoot, flags, &pos);

	278 }

	279 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {

	280 mShortcutListPolicy.skipAllShortcuts(&pos);

	281 }

	282 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {

	283 if (!mBigramListPolicy.skipAllBigrams(&pos)) {

	284 AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.",

	285 mDictBufferSize, pos);

	286 mIsCorrupted = true;

	287 ASSERT(false);

	288 *outUnigramProbability = NOT_A_PROBABILITY;

	289 return 0;

	290 }

	291 }

	292 }

	293 } else {

	294 // If we did not find it, we should record the last children address for

	295 // the next

	296 // iteration.

	297 if (hasChildren)

	298 lastCandidatePtNodePos = startPos;

	299 // Now skip the end of this PtNode (children pos and the attributes if

	300 // any) so that

	301 // our pos is after the end of this PtNode, at the start of the next

	302 // one.

	303 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {

	304 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(

	305 mDictRoot, flags, &pos);

	306 }

	307 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {

	308 mShortcutListPolicy.skipAllShortcuts(&pos);

	309 }

	310 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {

	311 if (!mBigramListPolicy.skipAllBigrams(&pos)) {

	312 AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.",

	313 mDictBufferSize, pos);

	314 mIsCorrupted = true;

	315 ASSERT(false);

	316 *outUnigramProbability = NOT_A_PROBABILITY;

	317 return 0;

	318 }

	319 }

	320 }

	321 }

	322 }

	323 // If we have looked through all the PtNodes and found no match, the ptNodePos

	324 // is

	325 // not the position of a terminal in this dictionary.

	326 return 0;

	327 }

	328

	329 // This function gets the position of the terminal PtNode of the exact matching

	330 // word in the

	331 // dictionary. If no match is found, it returns NOT_A_DICT_POS.

	332 int PatriciaTriePolicy::getTerminalPtNodePositionOfWord(

	333 const int* const inWord,

	334 const int length,

	335 const bool forceLowerCaseSearch) const {

	336 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);

	337 readingHelper.initWithPtNodeArrayPos(getRootPosition());

	338 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(

	339 inWord, length, forceLowerCaseSearch);

	340 if (readingHelper.isError()) {

	341 mIsCorrupted = true;

	342 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");

	343 }

	344 return ptNodePos;

	345 }

	346

	347 int PatriciaTriePolicy::getProbability(const int unigramProbability,

	348 const int bigramProbability) const {

	349 // Due to space constraints, the probability for bigrams is approximate - the

	350 // lower the unigram

	351 // probability, the worse the precision. The theoritical maximum error in

	352 // resulting probability

	353 // is 8 - although in the practice it's never bigger than 3 or 4 in very bad

	354 // cases. This means

	355 // that sometimes, we'll see some bigrams interverted here, but it can't get

	356 // too bad.

	357 if (unigramProbability == NOT_A_PROBABILITY) {

	358 return NOT_A_PROBABILITY;

	359 } else if (bigramProbability == NOT_A_PROBABILITY) {

	360 return ProbabilityUtils::backoff(unigramProbability);

	361 } else {

	362 return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,

	363 bigramProbability);

	364 }

	365 }

	366

	367 int PatriciaTriePolicy::getProbabilityOfPtNode(

	368 const int* const prevWordsPtNodePos,

	369 const int ptNodePos) const {

	370 if (ptNodePos == NOT_A_DICT_POS) {

	371 return NOT_A_PROBABILITY;

	372 }

	373 const PtNodeParams ptNodeParams =

	374 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);

	375 if (ptNodeParams.isNotAWord() \|\| ptNodeParams.isBlacklisted()) {

	376 // If this is not a word, or if it's a blacklisted entry, it should behave

	377 // as

	378 // having no probability outside of the suggestion process (where it should

	379 // be used

	380 // for shortcuts).

	381 return NOT_A_PROBABILITY;

	382 }

	383 if (prevWordsPtNodePos) {

	384 const int bigramsPosition =

	385 getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);

	386 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy,

	387 bigramsPosition);

	388 while (bigramsIt.hasNext()) {

	389 bigramsIt.next();

	390 if (bigramsIt.getBigramPos() == ptNodePos &&

	391 bigramsIt.getProbability() != NOT_A_PROBABILITY) {

	392 return getProbability(ptNodeParams.getProbability(),

	393 bigramsIt.getProbability());

	394 }

	395 }

	396 return NOT_A_PROBABILITY;

	397 }

	398 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);

	399 }

	400

	401 void PatriciaTriePolicy::iterateNgramEntries(

	402 const int* const prevWordsPtNodePos,

	403 NgramListener* const listener) const {

	404 if (!prevWordsPtNodePos) {

	405 return;

	406 }

	407 const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);

	408 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy,

	409 bigramsPosition);

	410 while (bigramsIt.hasNext()) {

	411 bigramsIt.next();

	412 listener->onVisitEntry(bigramsIt.getProbability(),

	413 bigramsIt.getBigramPos());

	414 }

	415 }

	416

	417 int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {

	418 if (ptNodePos == NOT_A_DICT_POS) {

	419 return NOT_A_DICT_POS;

	420 }

	421 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)

	422 .getShortcutPos();

	423 }

	424

	425 int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {

	426 if (ptNodePos == NOT_A_DICT_POS) {

	427 return NOT_A_DICT_POS;

	428 }

	429 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)

	430 .getBigramsPos();

	431 }

	432

	433 int PatriciaTriePolicy::createAndGetLeavingChildNode(

	434 const DicNode* const dicNode,

	435 const int ptNodePos,

	436 DicNodeVector* childDicNodes) const {

	437 PatriciaTrieReadingUtils::NodeFlags flags;

	438 int mergedNodeCodePointCount = 0;

	439 int mergedNodeCodePoints[MAX_WORD_LENGTH];

	440 int probability = NOT_A_PROBABILITY;

	441 int childrenPos = NOT_A_DICT_POS;

	442 int shortcutPos = NOT_A_DICT_POS;

	443 int bigramPos = NOT_A_DICT_POS;

	444 int siblingPos = NOT_A_DICT_POS;

	445 PatriciaTrieReadingUtils::readPtNodeInfo(

	446 mDictRoot, ptNodePos, getShortcutsStructurePolicy(), &mBigramListPolicy,

	447 &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability,

	448 &childrenPos, &shortcutPos, &bigramPos, &siblingPos);

	449 // Skip PtNodes don't start with Unicode code point because they represent

	450 // non-word information.

	451 if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {

	452 childDicNodes->pushLeavingChild(

	453 dicNode, ptNodePos, childrenPos, probability,

	454 PatriciaTrieReadingUtils::isTerminal(flags),

	455 PatriciaTrieReadingUtils::hasChildrenInFlags(flags),

	456 PatriciaTrieReadingUtils::isBlacklisted(flags) \|\|

	457 PatriciaTrieReadingUtils::isNotAWord(flags),

	458 mergedNodeCodePointCount, mergedNodeCodePoints);

	459 }

	460 return siblingPos;

	461 }

	462

	463 const WordProperty PatriciaTriePolicy::getWordProperty(

	464 const int* const codePoints,

	465 const int codePointCount) const {

	466 const int ptNodePos = getTerminalPtNodePositionOfWord(

	467 codePoints, codePointCount, false /* forceLowerCaseSearch */);

	468 if (ptNodePos == NOT_A_DICT_POS) {

	469 AKLOGE("getWordProperty was called for invalid word.");

	470 return WordProperty();

	471 }

	472 const PtNodeParams ptNodeParams =

	473 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);

	474 std::vector<int> codePointVector(

	475 ptNodeParams.getCodePoints(),

	476 ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount());

	477 // Fetch bigram information.

	478 std::vector<BigramProperty> bigrams;

	479 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);

	480 int bigramWord1CodePoints[MAX_WORD_LENGTH];

	481 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);

	482 while (bigramsIt.hasNext()) {

	483 // Fetch the next bigram information and forward the iterator.

	484 bigramsIt.next();

	485 // Skip the entry if the entry has been deleted. This never happens for ver2

	486 // dicts.

	487 if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {

	488 int word1Probability = NOT_A_PROBABILITY;

	489 const int word1CodePointCount =

	490 getCodePointsAndProbabilityAndReturnCodePointCount(

	491 bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints,

	492 &word1Probability);

	493 const std::vector<int> word1(bigramWord1CodePoints,

	494 bigramWord1CodePoints + word1CodePointCount);

	495 const int probability =

	496 getProbability(word1Probability, bigramsIt.getProbability());

	497 bigrams.emplace_back(&word1, probability, NOT_A_TIMESTAMP /* timestamp */,

	498 0 /* level /, 0 / count */);

	499 }

	500 }

	501 // Fetch shortcut information.

	502 std::vector<UnigramProperty::ShortcutProperty> shortcuts;

	503 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);

	504 if (shortcutPos != NOT_A_DICT_POS) {

	505 int shortcutTargetCodePoints[MAX_WORD_LENGTH];

	506 ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(

	507 mDictRoot, &shortcutPos);

	508 bool hasNext = true;

	509 while (hasNext) {

	510 const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =

	511 ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot,

	512 &shortcutPos);

	513 hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);

	514 const int shortcutTargetLength =

	515 ShortcutListReadingUtils::readShortcutTarget(

	516 mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints,

	517 &shortcutPos);

	518 const std::vector<int> shortcutTarget(

	519 shortcutTargetCodePoints,

	520 shortcutTargetCodePoints + shortcutTargetLength);

	521 const int shortcutProbability =

	522 ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);

	523 shortcuts.emplace_back(&shortcutTarget, shortcutProbability);

	524 }

	525 }

	526 const UnigramProperty unigramProperty(

	527 ptNodeParams.representsBeginningOfSentence(), ptNodeParams.isNotAWord(),

	528 ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),

	529 NOT_A_TIMESTAMP /* timestamp /, 0 / level /, 0 / count */,

	530 &shortcuts);

	531 return WordProperty(&codePointVector, &unigramProperty, &bigrams);

	532 }

	533

	534 int PatriciaTriePolicy::getNextWordAndNextToken(const int token,

	535 int* const outCodePoints,

	536 int* const outCodePointCount) {

	537 *outCodePointCount = 0;

	538 if (token == 0) {

	539 // Start iterating the dictionary.

	540 mTerminalPtNodePositionsForIteratingWords.clear();

	541 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions

	542 traversePolicy(&mTerminalPtNodePositionsForIteratingWords);

	543 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);

	544 readingHelper.initWithPtNodeArrayPos(getRootPosition());

	545 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(

	546 &traversePolicy);

	547 }

	548 const int terminalPtNodePositionsVectorSize =

	549 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());

	550 if (token < 0 \|\| token >= terminalPtNodePositionsVectorSize) {

	551 AKLOGE("Given token %d is invalid.", token);

	552 return 0;

	553 }

	554 const int terminalPtNodePos =

	555 mTerminalPtNodePositionsForIteratingWords[token];

	556 int unigramProbability = NOT_A_PROBABILITY;

	557 *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(

	558 terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);

	559 const int nextToken = token + 1;

	560 if (nextToken >= terminalPtNodePositionsVectorSize) {

	561 // All words have been iterated.

	562 mTerminalPtNodePositionsForIteratingWords.clear();

	563 return 0;

	564 }

	565 return nextToken;

	566 }

	567

	568 } // namespace latinime

OLD	NEW