| OLD | NEW |
| (Empty) | |
| 1 /* |
| 2 * Copyright (C) 2013, The Android Open Source Project |
| 3 * |
| 4 * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 * you may not use this file except in compliance with the License. |
| 6 * You may obtain a copy of the License at |
| 7 * |
| 8 * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 * |
| 10 * Unless required by applicable law or agreed to in writing, software |
| 11 * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 * See the License for the specific language governing permissions and |
| 14 * limitations under the License. |
| 15 */ |
| 16 |
| 17 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/v2/patr
icia_trie_policy.h" |
| 18 |
| 19 #include "third_party/prediction/defines.h" |
| 20 #include "third_party/prediction/suggest/core/dicnode/dic_node.h" |
| 21 #include "third_party/prediction/suggest/core/dicnode/dic_node_vector.h" |
| 22 #include "third_party/prediction/suggest/core/dictionary/binary_dictionary_bigra
ms_iterator.h" |
| 23 #include "third_party/prediction/suggest/core/dictionary/ngram_listener.h" |
| 24 #include "third_party/prediction/suggest/core/session/prev_words_info.h" |
| 25 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_comm
on/dynamic_pt_reading_helper.h" |
| 26 #include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_comm
on/patricia_trie_reading_utils.h" |
| 27 #include "third_party/prediction/suggest/policyimpl/dictionary/utils/probability
_utils.h" |
| 28 #include "third_party/prediction/utils/char_utils.h" |
| 29 |
| 30 namespace latinime { |
| 31 |
| 32 void PatriciaTriePolicy::createAndGetAllChildDicNodes( |
| 33 const DicNode* const dicNode, |
| 34 DicNodeVector* const childDicNodes) const { |
| 35 if (!dicNode->hasChildren()) { |
| 36 return; |
| 37 } |
| 38 int nextPos = dicNode->getChildrenPtNodeArrayPos(); |
| 39 if (nextPos < 0 || nextPos >= mDictBufferSize) { |
| 40 AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d", |
| 41 nextPos, mDictBufferSize); |
| 42 mIsCorrupted = true; |
| 43 ASSERT(false); |
| 44 return; |
| 45 } |
| 46 const int childCount = |
| 47 PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot, |
| 48 &nextPos); |
| 49 for (int i = 0; i < childCount; i++) { |
| 50 if (nextPos < 0 || nextPos >= mDictBufferSize) { |
| 51 AKLOGE( |
| 52 "Child PtNode position is invalid. pos: %d, dict size: %d, " |
| 53 "childCount: %d / %d", |
| 54 nextPos, mDictBufferSize, i, childCount); |
| 55 mIsCorrupted = true; |
| 56 ASSERT(false); |
| 57 return; |
| 58 } |
| 59 nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); |
| 60 } |
| 61 } |
| 62 |
| 63 // This retrieves code points and the probability of the word by its terminal |
| 64 // position. |
| 65 // Due to the fact that words are ordered in the dictionary in a strict |
| 66 // breadth-first order, |
| 67 // it is possible to check for this with advantageous complexity. For each |
| 68 // PtNode array, we search |
| 69 // for PtNodes with children and compare the children position with the position |
| 70 // we look for. |
| 71 // When we shoot the position we look for, it means the word we look for is in |
| 72 // the children |
| 73 // of the previous PtNode. The only tricky part is the fact that if we arrive at |
| 74 // the end of a |
| 75 // PtNode array with the last PtNode's children position still less than what we |
| 76 // are searching for, |
| 77 // we must descend the last PtNode's children (for example, if the word we are |
| 78 // searching for starts |
| 79 // with a z, it's the last PtNode of the root array, so all children addresses |
| 80 // will be smaller |
| 81 // than the position we look for, and we have to descend the z PtNode). |
| 82 /* Parameters : |
| 83 * ptNodePos: the byte position of the terminal PtNode of the word we are |
| 84 * searching for (this is |
| 85 * what is stored as the "bigram position" in each bigram) |
| 86 * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. |
| 87 * outUnigramProbability: a pointer to an int to write the probability into. |
| 88 * Return value : the code point count, of 0 if the word was not found. |
| 89 */ |
| 90 // TODO: Split this function to be more readable |
| 91 int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( |
| 92 const int ptNodePos, |
| 93 const int maxCodePointCount, |
| 94 int* const outCodePoints, |
| 95 int* const outUnigramProbability) const { |
| 96 int pos = getRootPosition(); |
| 97 int wordPos = 0; |
| 98 // One iteration of the outer loop iterates through PtNode arrays. As stated |
| 99 // above, we will |
| 100 // only traverse PtNodes that are actually a part of the terminal we are |
| 101 // searching, so each |
| 102 // time we enter this loop we are one depth level further than last time. |
| 103 // The only reason we count PtNodes is because we want to reduce the |
| 104 // probability of infinite |
| 105 // looping in case there is a bug. Since we know there is an upper bound to |
| 106 // the depth we are |
| 107 // supposed to traverse, it does not hurt to count iterations. |
| 108 for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { |
| 109 int lastCandidatePtNodePos = 0; |
| 110 // Let's loop through PtNodes in this PtNode array searching for either the |
| 111 // terminal |
| 112 // or one of its ascendants. |
| 113 if (pos < 0 || pos >= mDictBufferSize) { |
| 114 AKLOGE("PtNode array position is invalid. pos: %d, dict size: %d", pos, |
| 115 mDictBufferSize); |
| 116 mIsCorrupted = true; |
| 117 ASSERT(false); |
| 118 *outUnigramProbability = NOT_A_PROBABILITY; |
| 119 return 0; |
| 120 } |
| 121 for (int ptNodeCount = |
| 122 PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( |
| 123 mDictRoot, &pos); |
| 124 ptNodeCount > 0; --ptNodeCount) { |
| 125 const int startPos = pos; |
| 126 if (pos < 0 || pos >= mDictBufferSize) { |
| 127 AKLOGE("PtNode position is invalid. pos: %d, dict size: %d", pos, |
| 128 mDictBufferSize); |
| 129 mIsCorrupted = true; |
| 130 ASSERT(false); |
| 131 *outUnigramProbability = NOT_A_PROBABILITY; |
| 132 return 0; |
| 133 } |
| 134 const PatriciaTrieReadingUtils::NodeFlags flags = |
| 135 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); |
| 136 const int character = |
| 137 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, |
| 138 &pos); |
| 139 if (ptNodePos == startPos) { |
| 140 // We found the position. Copy the rest of the code points in the buffer |
| 141 // and return |
| 142 // the length. |
| 143 outCodePoints[wordPos] = character; |
| 144 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { |
| 145 int nextChar = |
| 146 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( |
| 147 mDictRoot, &pos); |
| 148 // We count code points in order to avoid infinite loops if the file |
| 149 // is broken |
| 150 // or if there is some other bug |
| 151 int charCount = maxCodePointCount; |
| 152 while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { |
| 153 outCodePoints[++wordPos] = nextChar; |
| 154 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( |
| 155 mDictRoot, &pos); |
| 156 } |
| 157 } |
| 158 *outUnigramProbability = |
| 159 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( |
| 160 mDictRoot, &pos); |
| 161 return ++wordPos; |
| 162 } |
| 163 // We need to skip past this PtNode, so skip any remaining code points |
| 164 // after the |
| 165 // first and possibly the probability. |
| 166 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { |
| 167 PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, |
| 168 MAX_WORD_LENGTH, &pos); |
| 169 } |
| 170 if (PatriciaTrieReadingUtils::isTerminal(flags)) { |
| 171 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, |
| 172 &pos); |
| 173 } |
| 174 // The fact that this PtNode has children is very important. Since we |
| 175 // already know |
| 176 // that this PtNode does not match, if it has no children we know it is |
| 177 // irrelevant |
| 178 // to what we are searching for. |
| 179 const bool hasChildren = |
| 180 PatriciaTrieReadingUtils::hasChildrenInFlags(flags); |
| 181 // We will write in `found' whether we have passed the children position |
| 182 // we are |
| 183 // searching for. For example if we search for "beer", the children of b |
| 184 // are less |
| 185 // than the address we are searching for and the children of c are |
| 186 // greater. When we |
| 187 // come here for c, we realize this is too big, and that we should descend |
| 188 // b. |
| 189 bool found; |
| 190 if (hasChildren) { |
| 191 int currentPos = pos; |
| 192 // Here comes the tricky part. First, read the children position. |
| 193 const int childrenPos = |
| 194 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( |
| 195 mDictRoot, flags, ¤tPos); |
| 196 if (childrenPos > ptNodePos) { |
| 197 // If the children pos is greater than the position, it means the |
| 198 // previous |
| 199 // PtNode, which position is stored in lastCandidatePtNodePos, was the |
| 200 // right |
| 201 // one. |
| 202 found = true; |
| 203 } else if (1 >= ptNodeCount) { |
| 204 // However if we are on the LAST PtNode of this array, and we have NOT |
| 205 // shot the |
| 206 // position we should descend THIS PtNode. So we trick the |
| 207 // lastCandidatePtNodePos so that we will descend this PtNode, not the |
| 208 // previous |
| 209 // one. |
| 210 lastCandidatePtNodePos = startPos; |
| 211 found = true; |
| 212 } else { |
| 213 // Else, we should continue looking. |
| 214 found = false; |
| 215 } |
| 216 } else { |
| 217 // Even if we don't have children here, we could still be on the last |
| 218 // PtNode of |
| 219 // this array. If this is the case, we should descend the last PtNode |
| 220 // that had |
| 221 // children, and their position is already in lastCandidatePtNodePos. |
| 222 found = (1 >= ptNodeCount); |
| 223 } |
| 224 |
| 225 if (found) { |
| 226 // Okay, we found the PtNode we should descend. Its position is in |
| 227 // the lastCandidatePtNodePos variable, so we just re-read it. |
| 228 if (0 != lastCandidatePtNodePos) { |
| 229 const PatriciaTrieReadingUtils::NodeFlags lastFlags = |
| 230 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( |
| 231 mDictRoot, &lastCandidatePtNodePos); |
| 232 const int lastChar = |
| 233 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( |
| 234 mDictRoot, &lastCandidatePtNodePos); |
| 235 // We copy all the characters in this PtNode to the buffer |
| 236 outCodePoints[wordPos] = lastChar; |
| 237 if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { |
| 238 int nextChar = |
| 239 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( |
| 240 mDictRoot, &lastCandidatePtNodePos); |
| 241 int charCount = maxCodePointCount; |
| 242 while (-1 != nextChar && --charCount > 0) { |
| 243 outCodePoints[++wordPos] = nextChar; |
| 244 nextChar = |
| 245 PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( |
| 246 mDictRoot, &lastCandidatePtNodePos); |
| 247 } |
| 248 } |
| 249 ++wordPos; |
| 250 // Now we only need to branch to the children address. Skip the |
| 251 // probability if |
| 252 // it's there, read pos, and break to resume the search at pos. |
| 253 if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { |
| 254 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( |
| 255 mDictRoot, &lastCandidatePtNodePos); |
| 256 } |
| 257 pos = |
| 258 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( |
| 259 mDictRoot, lastFlags, &lastCandidatePtNodePos); |
| 260 break; |
| 261 } else { |
| 262 // Here is a little tricky part: we come here if we found out that all |
| 263 // children |
| 264 // addresses in this PtNode are bigger than the address we are |
| 265 // searching for. |
| 266 // Should we conclude the word is not in the dictionary? No! It could |
| 267 // still be |
| 268 // one of the remaining PtNodes in this array, so we have to keep |
| 269 // looking in |
| 270 // this array until we find it (or we realize it's not there either, |
| 271 // in which |
| 272 // case it's actually not in the dictionary). Pass the end of this |
| 273 // PtNode, |
| 274 // ready to start the next one. |
| 275 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { |
| 276 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( |
| 277 mDictRoot, flags, &pos); |
| 278 } |
| 279 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { |
| 280 mShortcutListPolicy.skipAllShortcuts(&pos); |
| 281 } |
| 282 if (PatriciaTrieReadingUtils::hasBigrams(flags)) { |
| 283 if (!mBigramListPolicy.skipAllBigrams(&pos)) { |
| 284 AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.", |
| 285 mDictBufferSize, pos); |
| 286 mIsCorrupted = true; |
| 287 ASSERT(false); |
| 288 *outUnigramProbability = NOT_A_PROBABILITY; |
| 289 return 0; |
| 290 } |
| 291 } |
| 292 } |
| 293 } else { |
| 294 // If we did not find it, we should record the last children address for |
| 295 // the next |
| 296 // iteration. |
| 297 if (hasChildren) |
| 298 lastCandidatePtNodePos = startPos; |
| 299 // Now skip the end of this PtNode (children pos and the attributes if |
| 300 // any) so that |
| 301 // our pos is after the end of this PtNode, at the start of the next |
| 302 // one. |
| 303 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { |
| 304 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( |
| 305 mDictRoot, flags, &pos); |
| 306 } |
| 307 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { |
| 308 mShortcutListPolicy.skipAllShortcuts(&pos); |
| 309 } |
| 310 if (PatriciaTrieReadingUtils::hasBigrams(flags)) { |
| 311 if (!mBigramListPolicy.skipAllBigrams(&pos)) { |
| 312 AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.", |
| 313 mDictBufferSize, pos); |
| 314 mIsCorrupted = true; |
| 315 ASSERT(false); |
| 316 *outUnigramProbability = NOT_A_PROBABILITY; |
| 317 return 0; |
| 318 } |
| 319 } |
| 320 } |
| 321 } |
| 322 } |
| 323 // If we have looked through all the PtNodes and found no match, the ptNodePos |
| 324 // is |
| 325 // not the position of a terminal in this dictionary. |
| 326 return 0; |
| 327 } |
| 328 |
| 329 // This function gets the position of the terminal PtNode of the exact matching |
| 330 // word in the |
| 331 // dictionary. If no match is found, it returns NOT_A_DICT_POS. |
| 332 int PatriciaTriePolicy::getTerminalPtNodePositionOfWord( |
| 333 const int* const inWord, |
| 334 const int length, |
| 335 const bool forceLowerCaseSearch) const { |
| 336 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); |
| 337 readingHelper.initWithPtNodeArrayPos(getRootPosition()); |
| 338 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord( |
| 339 inWord, length, forceLowerCaseSearch); |
| 340 if (readingHelper.isError()) { |
| 341 mIsCorrupted = true; |
| 342 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); |
| 343 } |
| 344 return ptNodePos; |
| 345 } |
| 346 |
| 347 int PatriciaTriePolicy::getProbability(const int unigramProbability, |
| 348 const int bigramProbability) const { |
| 349 // Due to space constraints, the probability for bigrams is approximate - the |
| 350 // lower the unigram |
| 351 // probability, the worse the precision. The theoritical maximum error in |
| 352 // resulting probability |
| 353 // is 8 - although in the practice it's never bigger than 3 or 4 in very bad |
| 354 // cases. This means |
| 355 // that sometimes, we'll see some bigrams interverted here, but it can't get |
| 356 // too bad. |
| 357 if (unigramProbability == NOT_A_PROBABILITY) { |
| 358 return NOT_A_PROBABILITY; |
| 359 } else if (bigramProbability == NOT_A_PROBABILITY) { |
| 360 return ProbabilityUtils::backoff(unigramProbability); |
| 361 } else { |
| 362 return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, |
| 363 bigramProbability); |
| 364 } |
| 365 } |
| 366 |
| 367 int PatriciaTriePolicy::getProbabilityOfPtNode( |
| 368 const int* const prevWordsPtNodePos, |
| 369 const int ptNodePos) const { |
| 370 if (ptNodePos == NOT_A_DICT_POS) { |
| 371 return NOT_A_PROBABILITY; |
| 372 } |
| 373 const PtNodeParams ptNodeParams = |
| 374 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); |
| 375 if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) { |
| 376 // If this is not a word, or if it's a blacklisted entry, it should behave |
| 377 // as |
| 378 // having no probability outside of the suggestion process (where it should |
| 379 // be used |
| 380 // for shortcuts). |
| 381 return NOT_A_PROBABILITY; |
| 382 } |
| 383 if (prevWordsPtNodePos) { |
| 384 const int bigramsPosition = |
| 385 getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); |
| 386 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, |
| 387 bigramsPosition); |
| 388 while (bigramsIt.hasNext()) { |
| 389 bigramsIt.next(); |
| 390 if (bigramsIt.getBigramPos() == ptNodePos && |
| 391 bigramsIt.getProbability() != NOT_A_PROBABILITY) { |
| 392 return getProbability(ptNodeParams.getProbability(), |
| 393 bigramsIt.getProbability()); |
| 394 } |
| 395 } |
| 396 return NOT_A_PROBABILITY; |
| 397 } |
| 398 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); |
| 399 } |
| 400 |
| 401 void PatriciaTriePolicy::iterateNgramEntries( |
| 402 const int* const prevWordsPtNodePos, |
| 403 NgramListener* const listener) const { |
| 404 if (!prevWordsPtNodePos) { |
| 405 return; |
| 406 } |
| 407 const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); |
| 408 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, |
| 409 bigramsPosition); |
| 410 while (bigramsIt.hasNext()) { |
| 411 bigramsIt.next(); |
| 412 listener->onVisitEntry(bigramsIt.getProbability(), |
| 413 bigramsIt.getBigramPos()); |
| 414 } |
| 415 } |
| 416 |
| 417 int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { |
| 418 if (ptNodePos == NOT_A_DICT_POS) { |
| 419 return NOT_A_DICT_POS; |
| 420 } |
| 421 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos) |
| 422 .getShortcutPos(); |
| 423 } |
| 424 |
| 425 int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { |
| 426 if (ptNodePos == NOT_A_DICT_POS) { |
| 427 return NOT_A_DICT_POS; |
| 428 } |
| 429 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos) |
| 430 .getBigramsPos(); |
| 431 } |
| 432 |
| 433 int PatriciaTriePolicy::createAndGetLeavingChildNode( |
| 434 const DicNode* const dicNode, |
| 435 const int ptNodePos, |
| 436 DicNodeVector* childDicNodes) const { |
| 437 PatriciaTrieReadingUtils::NodeFlags flags; |
| 438 int mergedNodeCodePointCount = 0; |
| 439 int mergedNodeCodePoints[MAX_WORD_LENGTH]; |
| 440 int probability = NOT_A_PROBABILITY; |
| 441 int childrenPos = NOT_A_DICT_POS; |
| 442 int shortcutPos = NOT_A_DICT_POS; |
| 443 int bigramPos = NOT_A_DICT_POS; |
| 444 int siblingPos = NOT_A_DICT_POS; |
| 445 PatriciaTrieReadingUtils::readPtNodeInfo( |
| 446 mDictRoot, ptNodePos, getShortcutsStructurePolicy(), &mBigramListPolicy, |
| 447 &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability, |
| 448 &childrenPos, &shortcutPos, &bigramPos, &siblingPos); |
| 449 // Skip PtNodes don't start with Unicode code point because they represent |
| 450 // non-word information. |
| 451 if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { |
| 452 childDicNodes->pushLeavingChild( |
| 453 dicNode, ptNodePos, childrenPos, probability, |
| 454 PatriciaTrieReadingUtils::isTerminal(flags), |
| 455 PatriciaTrieReadingUtils::hasChildrenInFlags(flags), |
| 456 PatriciaTrieReadingUtils::isBlacklisted(flags) || |
| 457 PatriciaTrieReadingUtils::isNotAWord(flags), |
| 458 mergedNodeCodePointCount, mergedNodeCodePoints); |
| 459 } |
| 460 return siblingPos; |
| 461 } |
| 462 |
| 463 const WordProperty PatriciaTriePolicy::getWordProperty( |
| 464 const int* const codePoints, |
| 465 const int codePointCount) const { |
| 466 const int ptNodePos = getTerminalPtNodePositionOfWord( |
| 467 codePoints, codePointCount, false /* forceLowerCaseSearch */); |
| 468 if (ptNodePos == NOT_A_DICT_POS) { |
| 469 AKLOGE("getWordProperty was called for invalid word."); |
| 470 return WordProperty(); |
| 471 } |
| 472 const PtNodeParams ptNodeParams = |
| 473 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); |
| 474 std::vector<int> codePointVector( |
| 475 ptNodeParams.getCodePoints(), |
| 476 ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); |
| 477 // Fetch bigram information. |
| 478 std::vector<BigramProperty> bigrams; |
| 479 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); |
| 480 int bigramWord1CodePoints[MAX_WORD_LENGTH]; |
| 481 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos); |
| 482 while (bigramsIt.hasNext()) { |
| 483 // Fetch the next bigram information and forward the iterator. |
| 484 bigramsIt.next(); |
| 485 // Skip the entry if the entry has been deleted. This never happens for ver2 |
| 486 // dicts. |
| 487 if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) { |
| 488 int word1Probability = NOT_A_PROBABILITY; |
| 489 const int word1CodePointCount = |
| 490 getCodePointsAndProbabilityAndReturnCodePointCount( |
| 491 bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints, |
| 492 &word1Probability); |
| 493 const std::vector<int> word1(bigramWord1CodePoints, |
| 494 bigramWord1CodePoints + word1CodePointCount); |
| 495 const int probability = |
| 496 getProbability(word1Probability, bigramsIt.getProbability()); |
| 497 bigrams.emplace_back(&word1, probability, NOT_A_TIMESTAMP /* timestamp */, |
| 498 0 /* level */, 0 /* count */); |
| 499 } |
| 500 } |
| 501 // Fetch shortcut information. |
| 502 std::vector<UnigramProperty::ShortcutProperty> shortcuts; |
| 503 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); |
| 504 if (shortcutPos != NOT_A_DICT_POS) { |
| 505 int shortcutTargetCodePoints[MAX_WORD_LENGTH]; |
| 506 ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer( |
| 507 mDictRoot, &shortcutPos); |
| 508 bool hasNext = true; |
| 509 while (hasNext) { |
| 510 const ShortcutListReadingUtils::ShortcutFlags shortcutFlags = |
| 511 ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot, |
| 512 &shortcutPos); |
| 513 hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags); |
| 514 const int shortcutTargetLength = |
| 515 ShortcutListReadingUtils::readShortcutTarget( |
| 516 mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints, |
| 517 &shortcutPos); |
| 518 const std::vector<int> shortcutTarget( |
| 519 shortcutTargetCodePoints, |
| 520 shortcutTargetCodePoints + shortcutTargetLength); |
| 521 const int shortcutProbability = |
| 522 ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags); |
| 523 shortcuts.emplace_back(&shortcutTarget, shortcutProbability); |
| 524 } |
| 525 } |
| 526 const UnigramProperty unigramProperty( |
| 527 ptNodeParams.representsBeginningOfSentence(), ptNodeParams.isNotAWord(), |
| 528 ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), |
| 529 NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, |
| 530 &shortcuts); |
| 531 return WordProperty(&codePointVector, &unigramProperty, &bigrams); |
| 532 } |
| 533 |
| 534 int PatriciaTriePolicy::getNextWordAndNextToken(const int token, |
| 535 int* const outCodePoints, |
| 536 int* const outCodePointCount) { |
| 537 *outCodePointCount = 0; |
| 538 if (token == 0) { |
| 539 // Start iterating the dictionary. |
| 540 mTerminalPtNodePositionsForIteratingWords.clear(); |
| 541 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions |
| 542 traversePolicy(&mTerminalPtNodePositionsForIteratingWords); |
| 543 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); |
| 544 readingHelper.initWithPtNodeArrayPos(getRootPosition()); |
| 545 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( |
| 546 &traversePolicy); |
| 547 } |
| 548 const int terminalPtNodePositionsVectorSize = |
| 549 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); |
| 550 if (token < 0 || token >= terminalPtNodePositionsVectorSize) { |
| 551 AKLOGE("Given token %d is invalid.", token); |
| 552 return 0; |
| 553 } |
| 554 const int terminalPtNodePos = |
| 555 mTerminalPtNodePositionsForIteratingWords[token]; |
| 556 int unigramProbability = NOT_A_PROBABILITY; |
| 557 *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( |
| 558 terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); |
| 559 const int nextToken = token + 1; |
| 560 if (nextToken >= terminalPtNodePositionsVectorSize) { |
| 561 // All words have been iterated. |
| 562 mTerminalPtNodePositionsForIteratingWords.clear(); |
| 563 return 0; |
| 564 } |
| 565 return nextToken; |
| 566 } |
| 567 |
| 568 } // namespace latinime |
| OLD | NEW |