Index: third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp |
diff --git a/third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..49ef0bbb91aaba9af4561a17c38239907805de08 |
--- /dev/null |
+++ b/third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp |
@@ -0,0 +1,369 @@ |
+/* |
+ * Copyright (C) 2013, The Android Open Source Project |
+ * |
+ * Licensed under the Apache License, Version 2.0 (the "License"); |
+ * you may not use this file except in compliance with the License. |
+ * You may obtain a copy of the License at |
+ * |
+ * http://www.apache.org/licenses/LICENSE-2.0 |
+ * |
+ * Unless required by applicable law or agreed to in writing, software |
+ * distributed under the License is distributed on an "AS IS" BASIS, |
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
+ * See the License for the specific language governing permissions and |
+ * limitations under the License. |
+ */ |
+ |
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" |
+ |
+#include "third_party/prediction/suggest/core/dictionary/property/unigram_property.h" |
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" |
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" |
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" |
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" |
+#include "third_party/prediction/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" |
+#include "third_party/prediction/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" |
+ |
+namespace latinime { |
+ |
+const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; |
+ |
+bool DynamicPtUpdatingHelper::addUnigramWord( |
+ DynamicPtReadingHelper* const readingHelper, |
+ const int* const wordCodePoints, |
+ const int codePointCount, |
+ const UnigramProperty* const unigramProperty, |
+ bool* const outAddedNewUnigram) { |
+ int parentPos = NOT_A_DICT_POS; |
+ while (!readingHelper->isEnd()) { |
+ const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); |
+ if (!ptNodeParams.isValid()) { |
+ break; |
+ } |
+ const int matchedCodePointCount = |
+ readingHelper->getPrevTotalCodePointCount(); |
+ if (!readingHelper->isMatchedCodePoint( |
+ ptNodeParams, 0 /* index */, |
+ wordCodePoints[matchedCodePointCount])) { |
+ // The first code point is different from target code point. Skip this |
+ // node and read |
+ // the next sibling node. |
+ readingHelper->readNextSiblingNode(ptNodeParams); |
+ continue; |
+ } |
+ // Check following merged node code points. |
+ const int nodeCodePointCount = ptNodeParams.getCodePointCount(); |
+ for (int j = 1; j < nodeCodePointCount; ++j) { |
+ const int nextIndex = matchedCodePointCount + j; |
+ if (nextIndex >= codePointCount || |
+ !readingHelper->isMatchedCodePoint( |
+ ptNodeParams, j, wordCodePoints[matchedCodePointCount + j])) { |
+ *outAddedNewUnigram = true; |
+ return reallocatePtNodeAndAddNewPtNodes( |
+ &ptNodeParams, j, unigramProperty, |
+ wordCodePoints + matchedCodePointCount, |
+ codePointCount - matchedCodePointCount); |
+ } |
+ } |
+ // All characters are matched. |
+ if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) { |
+ return setPtNodeProbability(&ptNodeParams, unigramProperty, |
+ outAddedNewUnigram); |
+ } |
+ if (!ptNodeParams.hasChildren()) { |
+ *outAddedNewUnigram = true; |
+ return createChildrenPtNodeArrayAndAChildPtNode( |
+ &ptNodeParams, unigramProperty, |
+ wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams), |
+ codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams)); |
+ } |
+ // Advance to the children nodes. |
+ parentPos = ptNodeParams.getHeadPos(); |
+ readingHelper->readChildNode(ptNodeParams); |
+ } |
+ if (readingHelper->isError()) { |
+ // The dictionary is invalid. |
+ return false; |
+ } |
+ int pos = readingHelper->getPosOfLastForwardLinkField(); |
+ *outAddedNewUnigram = true; |
+ return createAndInsertNodeIntoPtNodeArray( |
+ parentPos, wordCodePoints + readingHelper->getPrevTotalCodePointCount(), |
+ codePointCount - readingHelper->getPrevTotalCodePointCount(), |
+ unigramProperty, &pos); |
+} |
+ |
+bool DynamicPtUpdatingHelper::addNgramEntry( |
+ const PtNodePosArrayView prevWordsPtNodePos, |
+ const int wordPos, |
+ const BigramProperty* const bigramProperty, |
+ bool* const outAddedNewEntry) { |
+ if (prevWordsPtNodePos.empty()) { |
+ return false; |
+ } |
+ ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); |
+ int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; |
+ for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { |
+ prevWordTerminalIds[i] = |
+ mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( |
+ prevWordsPtNodePos[i]).getTerminalId(); |
+ } |
+ const WordIdArrayView prevWordIds(prevWordTerminalIds, |
+ prevWordsPtNodePos.size()); |
+ const int wordId = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( |
+ wordPos).getTerminalId(); |
+ return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, bigramProperty, |
+ outAddedNewEntry); |
+} |
+ |
+bool DynamicPtUpdatingHelper::removeNgramEntry( |
+ const PtNodePosArrayView prevWordsPtNodePos, |
+ const int wordPos) { |
+ if (prevWordsPtNodePos.empty()) { |
+ return false; |
+ } |
+ ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); |
+ int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; |
+ for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { |
+ prevWordTerminalIds[i] = |
+ mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( |
+ prevWordsPtNodePos[i]).getTerminalId(); |
+ } |
+ const WordIdArrayView prevWordIds(prevWordTerminalIds, |
+ prevWordsPtNodePos.size()); |
+ const int wordId = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( |
+ wordPos).getTerminalId(); |
+ return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId); |
+} |
+ |
+bool DynamicPtUpdatingHelper::addShortcutTarget( |
+ const int wordPos, |
+ const int* const targetCodePoints, |
+ const int targetCodePointCount, |
+ const int shortcutProbability) { |
+ const PtNodeParams ptNodeParams( |
+ mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); |
+ return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, |
+ targetCodePointCount, |
+ shortcutProbability); |
+} |
+ |
+bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray( |
+ const int parentPos, |
+ const int* const nodeCodePoints, |
+ const int nodeCodePointCount, |
+ const UnigramProperty* const unigramProperty, |
+ int* const forwardLinkFieldPos) { |
+ const int newPtNodeArrayPos = mBuffer->getTailPosition(); |
+ if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( |
+ mBuffer, newPtNodeArrayPos, forwardLinkFieldPos)) { |
+ return false; |
+ } |
+ return createNewPtNodeArrayWithAChildPtNode( |
+ parentPos, nodeCodePoints, nodeCodePointCount, unigramProperty); |
+} |
+ |
+bool DynamicPtUpdatingHelper::setPtNodeProbability( |
+ const PtNodeParams* const originalPtNodeParams, |
+ const UnigramProperty* const unigramProperty, |
+ bool* const outAddedNewUnigram) { |
+ if (originalPtNodeParams->isTerminal() && |
+ !originalPtNodeParams->isDeleted()) { |
+ // Overwrites the probability. |
+ *outAddedNewUnigram = false; |
+ return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, |
+ unigramProperty); |
+ } else { |
+ // Make the node terminal and write the probability. |
+ *outAddedNewUnigram = true; |
+ const int movedPos = mBuffer->getTailPosition(); |
+ int writingPos = movedPos; |
+ const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams( |
+ originalPtNodeParams, unigramProperty->isNotAWord(), |
+ unigramProperty->isBlacklisted(), true /* isTerminal */, |
+ originalPtNodeParams->getParentPos(), |
+ originalPtNodeParams->getCodePointCount(), |
+ originalPtNodeParams->getCodePoints(), |
+ unigramProperty->getProbability())); |
+ if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition( |
+ &ptNodeParamsToWrite, unigramProperty, &writingPos)) { |
+ return false; |
+ } |
+ if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, |
+ movedPos)) { |
+ return false; |
+ } |
+ } |
+ return true; |
+} |
+ |
+bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( |
+ const PtNodeParams* const parentPtNodeParams, |
+ const UnigramProperty* const unigramProperty, |
+ const int* const codePoints, |
+ const int codePointCount) { |
+ const int newPtNodeArrayPos = mBuffer->getTailPosition(); |
+ if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, |
+ newPtNodeArrayPos)) { |
+ return false; |
+ } |
+ return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), |
+ codePoints, codePointCount, |
+ unigramProperty); |
+} |
+ |
+bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( |
+ const int parentPtNodePos, |
+ const int* const nodeCodePoints, |
+ const int nodeCodePointCount, |
+ const UnigramProperty* const unigramProperty) { |
+ int writingPos = mBuffer->getTailPosition(); |
+ if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( |
+ mBuffer, 1 /* arraySize */, &writingPos)) { |
+ return false; |
+ } |
+ const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( |
+ unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), |
+ true /* isTerminal */, parentPtNodePos, nodeCodePointCount, |
+ nodeCodePoints, unigramProperty->getProbability())); |
+ if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition( |
+ &ptNodeParamsToWrite, unigramProperty, &writingPos)) { |
+ return false; |
+ } |
+ if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( |
+ mBuffer, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { |
+ return false; |
+ } |
+ return true; |
+} |
+ |
+// Returns whether the dictionary updating was succeeded or not. |
+bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( |
+ const PtNodeParams* const reallocatingPtNodeParams, |
+ const int overlappingCodePointCount, |
+ const UnigramProperty* const unigramProperty, |
+ const int* const newNodeCodePoints, |
+ const int newNodeCodePointCount) { |
+ // When addsExtraChild is true, split the reallocating PtNode and add new |
+ // child. |
+ // Reallocating PtNode: abcde, newNode: abcxy. |
+ // abc (1st, not terminal) __ de (2nd) |
+ // \_ xy (extra child, terminal) |
+ // Otherwise, this method makes 1st part terminal and write information in |
+ // unigramProperty. |
+ // Reallocating PtNode: abcde, newNode: abc. |
+ // abc (1st, terminal) __ de (2nd) |
+ const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; |
+ const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); |
+ int writingPos = firstPartOfReallocatedPtNodePos; |
+ // Write the 1st part of the reallocating node. The children position will be |
+ // updated later |
+ // with actual children position. |
+ if (addsExtraChild) { |
+ const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( |
+ false /* isNotAWord */, false /* isBlacklisted */, |
+ false /* isTerminal */, reallocatingPtNodeParams->getParentPos(), |
+ overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(), |
+ NOT_A_PROBABILITY)); |
+ if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, |
+ &writingPos)) { |
+ return false; |
+ } |
+ } else { |
+ const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( |
+ unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), |
+ true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), |
+ overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(), |
+ unigramProperty->getProbability())); |
+ if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition( |
+ &ptNodeParamsToWrite, unigramProperty, &writingPos)) { |
+ return false; |
+ } |
+ } |
+ const int actualChildrenPos = writingPos; |
+ // Create new children PtNode array. |
+ const size_t newPtNodeCount = addsExtraChild ? 2 : 1; |
+ if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition( |
+ mBuffer, newPtNodeCount, &writingPos)) { |
+ return false; |
+ } |
+ // Write the 2nd part of the reallocating node. |
+ const int secondPartOfReallocatedPtNodePos = writingPos; |
+ const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams( |
+ reallocatingPtNodeParams, reallocatingPtNodeParams->isNotAWord(), |
+ reallocatingPtNodeParams->isBlacklisted(), |
+ reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, |
+ reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount, |
+ reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount, |
+ reallocatingPtNodeParams->getProbability())); |
+ if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, |
+ &writingPos)) { |
+ return false; |
+ } |
+ if (addsExtraChild) { |
+ const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( |
+ unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), |
+ true /* isTerminal */, firstPartOfReallocatedPtNodePos, |
+ newNodeCodePointCount - overlappingCodePointCount, |
+ newNodeCodePoints + overlappingCodePointCount, |
+ unigramProperty->getProbability())); |
+ if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition( |
+ &extraChildPtNodeParams, unigramProperty, &writingPos)) { |
+ return false; |
+ } |
+ } |
+ if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition( |
+ mBuffer, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { |
+ return false; |
+ } |
+ // Update original reallocating PtNode as moved. |
+ if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, |
+ firstPartOfReallocatedPtNodePos, |
+ secondPartOfReallocatedPtNodePos)) { |
+ return false; |
+ } |
+ // Load node info. Information of the 1st part will be fetched. |
+ const PtNodeParams ptNodeParams( |
+ mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( |
+ firstPartOfReallocatedPtNodePos)); |
+ // Update children position. |
+ return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, |
+ actualChildrenPos); |
+} |
+ |
+const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( |
+ const PtNodeParams* const originalPtNodeParams, |
+ const bool isNotAWord, |
+ const bool isBlacklisted, |
+ const bool isTerminal, |
+ const int parentPos, |
+ const int codePointCount, |
+ const int* const codePoints, |
+ const int probability) const { |
+ const PatriciaTrieReadingUtils::NodeFlags flags = |
+ PatriciaTrieReadingUtils::createAndGetFlags( |
+ isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */, |
+ false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, |
+ CHILDREN_POSITION_FIELD_SIZE); |
+ return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, |
+ codePoints, probability); |
+} |
+ |
+const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode( |
+ const bool isNotAWord, |
+ const bool isBlacklisted, |
+ const bool isTerminal, |
+ const int parentPos, |
+ const int codePointCount, |
+ const int* const codePoints, |
+ const int probability) const { |
+ const PatriciaTrieReadingUtils::NodeFlags flags = |
+ PatriciaTrieReadingUtils::createAndGetFlags( |
+ isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */, |
+ false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, |
+ CHILDREN_POSITION_FIELD_SIZE); |
+ return PtNodeParams(flags, parentPos, codePointCount, codePoints, |
+ probability); |
+} |
+ |
+} // namespace latinime |