Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(14)

Side by Side Diff: third_party/prediction/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp

Issue 1247903003: Add spellcheck and word suggestion to the prediction service (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/v2/patr icia_trie_policy.h"
18
19 #include "third_party/prediction/defines.h"
20 #include "third_party/prediction/suggest/core/dicnode/dic_node.h"
21 #include "third_party/prediction/suggest/core/dicnode/dic_node_vector.h"
22 #include "third_party/prediction/suggest/core/dictionary/binary_dictionary_bigra ms_iterator.h"
23 #include "third_party/prediction/suggest/core/dictionary/ngram_listener.h"
24 #include "third_party/prediction/suggest/core/session/prev_words_info.h"
25 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_comm on/dynamic_pt_reading_helper.h"
26 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_comm on/patricia_trie_reading_utils.h"
27 #include "third_party/prediction/suggest/policyimpl/dictionary/utils/probability _utils.h"
28 #include "third_party/prediction/utils/char_utils.h"
29
30 namespace latinime {
31
32 void PatriciaTriePolicy::createAndGetAllChildDicNodes(
33 const DicNode* const dicNode,
34 DicNodeVector* const childDicNodes) const {
35 if (!dicNode->hasChildren()) {
36 return;
37 }
38 int nextPos = dicNode->getChildrenPtNodeArrayPos();
39 if (nextPos < 0 || nextPos >= mDictBufferSize) {
40 AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d",
41 nextPos, mDictBufferSize);
42 mIsCorrupted = true;
43 ASSERT(false);
44 return;
45 }
46 const int childCount =
47 PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot,
48 &nextPos);
49 for (int i = 0; i < childCount; i++) {
50 if (nextPos < 0 || nextPos >= mDictBufferSize) {
51 AKLOGE(
52 "Child PtNode position is invalid. pos: %d, dict size: %d, "
53 "childCount: %d / %d",
54 nextPos, mDictBufferSize, i, childCount);
55 mIsCorrupted = true;
56 ASSERT(false);
57 return;
58 }
59 nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);
60 }
61 }
62
63 // This retrieves code points and the probability of the word by its terminal
64 // position.
65 // Due to the fact that words are ordered in the dictionary in a strict
66 // breadth-first order,
67 // it is possible to check for this with advantageous complexity. For each
68 // PtNode array, we search
69 // for PtNodes with children and compare the children position with the position
70 // we look for.
71 // When we shoot the position we look for, it means the word we look for is in
72 // the children
73 // of the previous PtNode. The only tricky part is the fact that if we arrive at
74 // the end of a
75 // PtNode array with the last PtNode's children position still less than what we
76 // are searching for,
77 // we must descend the last PtNode's children (for example, if the word we are
78 // searching for starts
79 // with a z, it's the last PtNode of the root array, so all children addresses
80 // will be smaller
81 // than the position we look for, and we have to descend the z PtNode).
82 /* Parameters :
83 * ptNodePos: the byte position of the terminal PtNode of the word we are
84 * searching for (this is
85 * what is stored as the "bigram position" in each bigram)
86 * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
87 * outUnigramProbability: a pointer to an int to write the probability into.
88 * Return value : the code point count, of 0 if the word was not found.
89 */
90 // TODO: Split this function to be more readable
91 int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
92 const int ptNodePos,
93 const int maxCodePointCount,
94 int* const outCodePoints,
95 int* const outUnigramProbability) const {
96 int pos = getRootPosition();
97 int wordPos = 0;
98 // One iteration of the outer loop iterates through PtNode arrays. As stated
99 // above, we will
100 // only traverse PtNodes that are actually a part of the terminal we are
101 // searching, so each
102 // time we enter this loop we are one depth level further than last time.
103 // The only reason we count PtNodes is because we want to reduce the
104 // probability of infinite
105 // looping in case there is a bug. Since we know there is an upper bound to
106 // the depth we are
107 // supposed to traverse, it does not hurt to count iterations.
108 for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {
109 int lastCandidatePtNodePos = 0;
110 // Let's loop through PtNodes in this PtNode array searching for either the
111 // terminal
112 // or one of its ascendants.
113 if (pos < 0 || pos >= mDictBufferSize) {
114 AKLOGE("PtNode array position is invalid. pos: %d, dict size: %d", pos,
115 mDictBufferSize);
116 mIsCorrupted = true;
117 ASSERT(false);
118 *outUnigramProbability = NOT_A_PROBABILITY;
119 return 0;
120 }
121 for (int ptNodeCount =
122 PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
123 mDictRoot, &pos);
124 ptNodeCount > 0; --ptNodeCount) {
125 const int startPos = pos;
126 if (pos < 0 || pos >= mDictBufferSize) {
127 AKLOGE("PtNode position is invalid. pos: %d, dict size: %d", pos,
128 mDictBufferSize);
129 mIsCorrupted = true;
130 ASSERT(false);
131 *outUnigramProbability = NOT_A_PROBABILITY;
132 return 0;
133 }
134 const PatriciaTrieReadingUtils::NodeFlags flags =
135 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
136 const int character =
137 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot,
138 &pos);
139 if (ptNodePos == startPos) {
140 // We found the position. Copy the rest of the code points in the buffer
141 // and return
142 // the length.
143 outCodePoints[wordPos] = character;
144 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
145 int nextChar =
146 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
147 mDictRoot, &pos);
148 // We count code points in order to avoid infinite loops if the file
149 // is broken
150 // or if there is some other bug
151 int charCount = maxCodePointCount;
152 while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
153 outCodePoints[++wordPos] = nextChar;
154 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
155 mDictRoot, &pos);
156 }
157 }
158 *outUnigramProbability =
159 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
160 mDictRoot, &pos);
161 return ++wordPos;
162 }
163 // We need to skip past this PtNode, so skip any remaining code points
164 // after the
165 // first and possibly the probability.
166 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
167 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags,
168 MAX_WORD_LENGTH, &pos);
169 }
170 if (PatriciaTrieReadingUtils::isTerminal(flags)) {
171 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot,
172 &pos);
173 }
174 // The fact that this PtNode has children is very important. Since we
175 // already know
176 // that this PtNode does not match, if it has no children we know it is
177 // irrelevant
178 // to what we are searching for.
179 const bool hasChildren =
180 PatriciaTrieReadingUtils::hasChildrenInFlags(flags);
181 // We will write in `found' whether we have passed the children position
182 // we are
183 // searching for. For example if we search for "beer", the children of b
184 // are less
185 // than the address we are searching for and the children of c are
186 // greater. When we
187 // come here for c, we realize this is too big, and that we should descend
188 // b.
189 bool found;
190 if (hasChildren) {
191 int currentPos = pos;
192 // Here comes the tricky part. First, read the children position.
193 const int childrenPos =
194 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
195 mDictRoot, flags, &currentPos);
196 if (childrenPos > ptNodePos) {
197 // If the children pos is greater than the position, it means the
198 // previous
199 // PtNode, which position is stored in lastCandidatePtNodePos, was the
200 // right
201 // one.
202 found = true;
203 } else if (1 >= ptNodeCount) {
204 // However if we are on the LAST PtNode of this array, and we have NOT
205 // shot the
206 // position we should descend THIS PtNode. So we trick the
207 // lastCandidatePtNodePos so that we will descend this PtNode, not the
208 // previous
209 // one.
210 lastCandidatePtNodePos = startPos;
211 found = true;
212 } else {
213 // Else, we should continue looking.
214 found = false;
215 }
216 } else {
217 // Even if we don't have children here, we could still be on the last
218 // PtNode of
219 // this array. If this is the case, we should descend the last PtNode
220 // that had
221 // children, and their position is already in lastCandidatePtNodePos.
222 found = (1 >= ptNodeCount);
223 }
224
225 if (found) {
226 // Okay, we found the PtNode we should descend. Its position is in
227 // the lastCandidatePtNodePos variable, so we just re-read it.
228 if (0 != lastCandidatePtNodePos) {
229 const PatriciaTrieReadingUtils::NodeFlags lastFlags =
230 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
231 mDictRoot, &lastCandidatePtNodePos);
232 const int lastChar =
233 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
234 mDictRoot, &lastCandidatePtNodePos);
235 // We copy all the characters in this PtNode to the buffer
236 outCodePoints[wordPos] = lastChar;
237 if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
238 int nextChar =
239 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
240 mDictRoot, &lastCandidatePtNodePos);
241 int charCount = maxCodePointCount;
242 while (-1 != nextChar && --charCount > 0) {
243 outCodePoints[++wordPos] = nextChar;
244 nextChar =
245 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
246 mDictRoot, &lastCandidatePtNodePos);
247 }
248 }
249 ++wordPos;
250 // Now we only need to branch to the children address. Skip the
251 // probability if
252 // it's there, read pos, and break to resume the search at pos.
253 if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
254 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
255 mDictRoot, &lastCandidatePtNodePos);
256 }
257 pos =
258 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
259 mDictRoot, lastFlags, &lastCandidatePtNodePos);
260 break;
261 } else {
262 // Here is a little tricky part: we come here if we found out that all
263 // children
264 // addresses in this PtNode are bigger than the address we are
265 // searching for.
266 // Should we conclude the word is not in the dictionary? No! It could
267 // still be
268 // one of the remaining PtNodes in this array, so we have to keep
269 // looking in
270 // this array until we find it (or we realize it's not there either,
271 // in which
272 // case it's actually not in the dictionary). Pass the end of this
273 // PtNode,
274 // ready to start the next one.
275 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
276 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
277 mDictRoot, flags, &pos);
278 }
279 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
280 mShortcutListPolicy.skipAllShortcuts(&pos);
281 }
282 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
283 if (!mBigramListPolicy.skipAllBigrams(&pos)) {
284 AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.",
285 mDictBufferSize, pos);
286 mIsCorrupted = true;
287 ASSERT(false);
288 *outUnigramProbability = NOT_A_PROBABILITY;
289 return 0;
290 }
291 }
292 }
293 } else {
294 // If we did not find it, we should record the last children address for
295 // the next
296 // iteration.
297 if (hasChildren)
298 lastCandidatePtNodePos = startPos;
299 // Now skip the end of this PtNode (children pos and the attributes if
300 // any) so that
301 // our pos is after the end of this PtNode, at the start of the next
302 // one.
303 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
304 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
305 mDictRoot, flags, &pos);
306 }
307 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
308 mShortcutListPolicy.skipAllShortcuts(&pos);
309 }
310 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
311 if (!mBigramListPolicy.skipAllBigrams(&pos)) {
312 AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.",
313 mDictBufferSize, pos);
314 mIsCorrupted = true;
315 ASSERT(false);
316 *outUnigramProbability = NOT_A_PROBABILITY;
317 return 0;
318 }
319 }
320 }
321 }
322 }
323 // If we have looked through all the PtNodes and found no match, the ptNodePos
324 // is
325 // not the position of a terminal in this dictionary.
326 return 0;
327 }
328
329 // This function gets the position of the terminal PtNode of the exact matching
330 // word in the
331 // dictionary. If no match is found, it returns NOT_A_DICT_POS.
332 int PatriciaTriePolicy::getTerminalPtNodePositionOfWord(
333 const int* const inWord,
334 const int length,
335 const bool forceLowerCaseSearch) const {
336 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
337 readingHelper.initWithPtNodeArrayPos(getRootPosition());
338 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(
339 inWord, length, forceLowerCaseSearch);
340 if (readingHelper.isError()) {
341 mIsCorrupted = true;
342 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
343 }
344 return ptNodePos;
345 }
346
347 int PatriciaTriePolicy::getProbability(const int unigramProbability,
348 const int bigramProbability) const {
349 // Due to space constraints, the probability for bigrams is approximate - the
350 // lower the unigram
351 // probability, the worse the precision. The theoritical maximum error in
352 // resulting probability
353 // is 8 - although in the practice it's never bigger than 3 or 4 in very bad
354 // cases. This means
355 // that sometimes, we'll see some bigrams interverted here, but it can't get
356 // too bad.
357 if (unigramProbability == NOT_A_PROBABILITY) {
358 return NOT_A_PROBABILITY;
359 } else if (bigramProbability == NOT_A_PROBABILITY) {
360 return ProbabilityUtils::backoff(unigramProbability);
361 } else {
362 return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
363 bigramProbability);
364 }
365 }
366
367 int PatriciaTriePolicy::getProbabilityOfPtNode(
368 const int* const prevWordsPtNodePos,
369 const int ptNodePos) const {
370 if (ptNodePos == NOT_A_DICT_POS) {
371 return NOT_A_PROBABILITY;
372 }
373 const PtNodeParams ptNodeParams =
374 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
375 if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) {
376 // If this is not a word, or if it's a blacklisted entry, it should behave
377 // as
378 // having no probability outside of the suggestion process (where it should
379 // be used
380 // for shortcuts).
381 return NOT_A_PROBABILITY;
382 }
383 if (prevWordsPtNodePos) {
384 const int bigramsPosition =
385 getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
386 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy,
387 bigramsPosition);
388 while (bigramsIt.hasNext()) {
389 bigramsIt.next();
390 if (bigramsIt.getBigramPos() == ptNodePos &&
391 bigramsIt.getProbability() != NOT_A_PROBABILITY) {
392 return getProbability(ptNodeParams.getProbability(),
393 bigramsIt.getProbability());
394 }
395 }
396 return NOT_A_PROBABILITY;
397 }
398 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
399 }
400
401 void PatriciaTriePolicy::iterateNgramEntries(
402 const int* const prevWordsPtNodePos,
403 NgramListener* const listener) const {
404 if (!prevWordsPtNodePos) {
405 return;
406 }
407 const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
408 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy,
409 bigramsPosition);
410 while (bigramsIt.hasNext()) {
411 bigramsIt.next();
412 listener->onVisitEntry(bigramsIt.getProbability(),
413 bigramsIt.getBigramPos());
414 }
415 }
416
417 int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
418 if (ptNodePos == NOT_A_DICT_POS) {
419 return NOT_A_DICT_POS;
420 }
421 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)
422 .getShortcutPos();
423 }
424
425 int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
426 if (ptNodePos == NOT_A_DICT_POS) {
427 return NOT_A_DICT_POS;
428 }
429 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)
430 .getBigramsPos();
431 }
432
433 int PatriciaTriePolicy::createAndGetLeavingChildNode(
434 const DicNode* const dicNode,
435 const int ptNodePos,
436 DicNodeVector* childDicNodes) const {
437 PatriciaTrieReadingUtils::NodeFlags flags;
438 int mergedNodeCodePointCount = 0;
439 int mergedNodeCodePoints[MAX_WORD_LENGTH];
440 int probability = NOT_A_PROBABILITY;
441 int childrenPos = NOT_A_DICT_POS;
442 int shortcutPos = NOT_A_DICT_POS;
443 int bigramPos = NOT_A_DICT_POS;
444 int siblingPos = NOT_A_DICT_POS;
445 PatriciaTrieReadingUtils::readPtNodeInfo(
446 mDictRoot, ptNodePos, getShortcutsStructurePolicy(), &mBigramListPolicy,
447 &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability,
448 &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
449 // Skip PtNodes don't start with Unicode code point because they represent
450 // non-word information.
451 if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
452 childDicNodes->pushLeavingChild(
453 dicNode, ptNodePos, childrenPos, probability,
454 PatriciaTrieReadingUtils::isTerminal(flags),
455 PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
456 PatriciaTrieReadingUtils::isBlacklisted(flags) ||
457 PatriciaTrieReadingUtils::isNotAWord(flags),
458 mergedNodeCodePointCount, mergedNodeCodePoints);
459 }
460 return siblingPos;
461 }
462
463 const WordProperty PatriciaTriePolicy::getWordProperty(
464 const int* const codePoints,
465 const int codePointCount) const {
466 const int ptNodePos = getTerminalPtNodePositionOfWord(
467 codePoints, codePointCount, false /* forceLowerCaseSearch */);
468 if (ptNodePos == NOT_A_DICT_POS) {
469 AKLOGE("getWordProperty was called for invalid word.");
470 return WordProperty();
471 }
472 const PtNodeParams ptNodeParams =
473 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
474 std::vector<int> codePointVector(
475 ptNodeParams.getCodePoints(),
476 ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount());
477 // Fetch bigram information.
478 std::vector<BigramProperty> bigrams;
479 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
480 int bigramWord1CodePoints[MAX_WORD_LENGTH];
481 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
482 while (bigramsIt.hasNext()) {
483 // Fetch the next bigram information and forward the iterator.
484 bigramsIt.next();
485 // Skip the entry if the entry has been deleted. This never happens for ver2
486 // dicts.
487 if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
488 int word1Probability = NOT_A_PROBABILITY;
489 const int word1CodePointCount =
490 getCodePointsAndProbabilityAndReturnCodePointCount(
491 bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints,
492 &word1Probability);
493 const std::vector<int> word1(bigramWord1CodePoints,
494 bigramWord1CodePoints + word1CodePointCount);
495 const int probability =
496 getProbability(word1Probability, bigramsIt.getProbability());
497 bigrams.emplace_back(&word1, probability, NOT_A_TIMESTAMP /* timestamp */,
498 0 /* level */, 0 /* count */);
499 }
500 }
501 // Fetch shortcut information.
502 std::vector<UnigramProperty::ShortcutProperty> shortcuts;
503 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
504 if (shortcutPos != NOT_A_DICT_POS) {
505 int shortcutTargetCodePoints[MAX_WORD_LENGTH];
506 ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(
507 mDictRoot, &shortcutPos);
508 bool hasNext = true;
509 while (hasNext) {
510 const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
511 ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot,
512 &shortcutPos);
513 hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
514 const int shortcutTargetLength =
515 ShortcutListReadingUtils::readShortcutTarget(
516 mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints,
517 &shortcutPos);
518 const std::vector<int> shortcutTarget(
519 shortcutTargetCodePoints,
520 shortcutTargetCodePoints + shortcutTargetLength);
521 const int shortcutProbability =
522 ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
523 shortcuts.emplace_back(&shortcutTarget, shortcutProbability);
524 }
525 }
526 const UnigramProperty unigramProperty(
527 ptNodeParams.representsBeginningOfSentence(), ptNodeParams.isNotAWord(),
528 ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
529 NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */,
530 &shortcuts);
531 return WordProperty(&codePointVector, &unigramProperty, &bigrams);
532 }
533
534 int PatriciaTriePolicy::getNextWordAndNextToken(const int token,
535 int* const outCodePoints,
536 int* const outCodePointCount) {
537 *outCodePointCount = 0;
538 if (token == 0) {
539 // Start iterating the dictionary.
540 mTerminalPtNodePositionsForIteratingWords.clear();
541 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions
542 traversePolicy(&mTerminalPtNodePositionsForIteratingWords);
543 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
544 readingHelper.initWithPtNodeArrayPos(getRootPosition());
545 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
546 &traversePolicy);
547 }
548 const int terminalPtNodePositionsVectorSize =
549 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
550 if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
551 AKLOGE("Given token %d is invalid.", token);
552 return 0;
553 }
554 const int terminalPtNodePos =
555 mTerminalPtNodePositionsForIteratingWords[token];
556 int unigramProbability = NOT_A_PROBABILITY;
557 *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
558 terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
559 const int nextToken = token + 1;
560 if (nextToken >= terminalPtNodePositionsVectorSize) {
561 // All words have been iterated.
562 mTerminalPtNodePositionsForIteratingWords.clear();
563 return 0;
564 }
565 return nextToken;
566 }
567
568 } // namespace latinime
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698