third_party/prediction/suggest/core/result/suggestions_output_utils.cpp - Issue 1247903003: Add spellcheck and word suggestion to the prediction service

Unified Diff: third_party/prediction/suggest/core/result/suggestions_output_utils.cpp

Issue 1247903003: Add spellcheck and word suggestion to the prediction service (Closed) Base URL: https://github.com/domokit/mojo.git@master

Patch Set: Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« services/prediction/key_set.h ('K') | « third_party/prediction/suggest/core/result/suggestions_output_utils.h ('k') | third_party/prediction/suggest/core/session/dic_traverse_session.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/prediction/suggest/core/result/suggestions_output_utils.cpp

diff --git a/third_party/prediction/suggest/core/result/suggestions_output_utils.cpp b/third_party/prediction/suggest/core/result/suggestions_output_utils.cpp

new file mode 100644

index 0000000000000000000000000000000000000000..e35a39051adf133ad157eed3260c4be87ca3e0ef

--- /dev/null

+++ b/third_party/prediction/suggest/core/result/suggestions_output_utils.cpp

@@ -0,0 +1,275 @@

+/*

+ *

+ * Licensed under the Apache License, Version 2.0 (the "License");

+ * you may not use this file except in compliance with the License.

+ * You may obtain a copy of the License at

+ *

+ * http://www.apache.org/licenses/LICENSE-2.0

+ *

+ * Unless required by applicable law or agreed to in writing, software

+ * distributed under the License is distributed on an "AS IS" BASIS,

+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+ * See the License for the specific language governing permissions and

+ * limitations under the License.

+ */

+#include "third_party/prediction/suggest/core/result/suggestions_output_utils.h"

+#include <algorithm>

+#include <vector>

+#include "third_party/prediction/suggest/core/dicnode/dic_node.h"

+#include "third_party/prediction/suggest/core/dicnode/dic_node_utils.h"

+#include "third_party/prediction/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"

+#include "third_party/prediction/suggest/core/dictionary/error_type_utils.h"

+#include "third_party/prediction/suggest/core/policy/scoring.h"

+#include "third_party/prediction/suggest/core/result/suggestion_results.h"

+#include "third_party/prediction/suggest/core/session/dic_traverse_session.h"

+#include "third_party/prediction/suggest/core/suggest_options.h"

+namespace latinime {

+const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;

+/* static */ void SuggestionsOutputUtils::outputSuggestions(

+ const Scoring* const scoringPolicy,

+ DicTraverseSession* traverseSession,

+ const float languageWeight,

+ SuggestionResults* const outSuggestionResults) {

+#if DEBUG_EVALUATE_MOST_PROBABLE_STRING

+ const int terminalSize = 0;

+#else

+ const int terminalSize =

+ traverseSession->getDicTraverseCache()->terminalSize();

+#endif

+ std::vector<DicNode> terminals(terminalSize);

+ for (int index = terminalSize - 1; index >= 0; --index) {

+ traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);

+ }

+ // Compute a language weight when an invalid language weight is passed.

+ // NOT_A_LANGUAGE_WEIGHT (-1) is assumed as an invalid language weight.

+ const float languageWeightToOutputSuggestions =

+ (languageWeight < 0.0f)

+ ? scoringPolicy->getAdjustedLanguageWeight(

+ traverseSession, terminals.data(), terminalSize)

+ : languageWeight;

+ outSuggestionResults->setLanguageWeight(languageWeightToOutputSuggestions);

+ // Force autocorrection for obvious long multi-word suggestions when the top

+ // suggestion is

+ // a long multiple words suggestion.

+ // TODO: Implement a smarter auto-commit method for handling multi-word

+ // suggestions.

+ const bool forceCommitMultiWords =

+ scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop() &&

+ (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT &&

+ !terminals.empty() && terminals.front().hasMultipleWords());

+ // TODO: have partial commit work even with multiple pointers.

+ const bool outputSecondWordFirstLetterInputIndex =

+ traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);

+ const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()

+ ->getHeaderStructurePolicy()

+ ->shouldBoostExactMatches();

+ // Output suggestion results here

+ for (auto& terminalDicNode : terminals) {

+ outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode,

+ languageWeightToOutputSuggestions,

+ boostExactMatches, forceCommitMultiWords,

+ outputSecondWordFirstLetterInputIndex,

+ outSuggestionResults);

+ }

+ scoringPolicy->getMostProbableString(

+ traverseSession, languageWeightToOutputSuggestions, outSuggestionResults);

+/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(

+ const Scoring* const scoringPolicy,

+ DicTraverseSession* traverseSession,

+ const DicNode* const terminalDicNode,

+ const float languageWeight,

+ const bool boostExactMatches,

+ const bool forceCommitMultiWords,

+ const bool outputSecondWordFirstLetterInputIndex,

+ SuggestionResults* const outSuggestionResults) {

+ if (DEBUG_GEO_FULL) {

+ terminalDicNode->dump("OUT:");

+ }

+ const float doubleLetterCost =

+ scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);

+ const float compoundDistance =

+ terminalDicNode->getCompoundDistance(languageWeight) + doubleLetterCost;

+ const bool isPossiblyOffensiveWord =

+ traverseSession->getDictionaryStructurePolicy()->getProbability(

+ terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;

+ const bool isExactMatch =

+ ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());

+ const bool isExactMatchWithIntentionalOmission =

+ ErrorTypeUtils::isExactMatchWithIntentionalOmission(

+ terminalDicNode->getContainedErrorTypes());

+ const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();

+ // Heuristic: We exclude probability=0 first-char-uppercase words from exact

+ // match.

+ // (e.g. "AMD" and "and")

+ const bool isSafeExactMatch =

+ isExactMatch && !(isPossiblyOffensiveWord && isFirstCharUppercase);

+ const int outputTypeFlags =

+ (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) |

+ ((isSafeExactMatch && boostExactMatches)

+ ? Dictionary::KIND_FLAG_EXACT_MATCH

+ : 0) |

+ (isExactMatchWithIntentionalOmission

+ ? Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION

+ : 0);

+ // Entries that are blacklisted or do not represent a word should not be

+ // output.

+ const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();

+ // When we have to block offensive words, non-exact matched offensive words

+ // should not be

+ // output.

+ const bool blockOffensiveWords =

+ traverseSession->getSuggestOptions()->blockOffensiveWords();

+ const bool isBlockedOffensiveWord =

+ blockOffensiveWords && isPossiblyOffensiveWord && !isSafeExactMatch;

+ // Increase output score of top typing suggestion to ensure autocorrection.

+ // TODO: Better integration with java side autocorrection logic.

+ const int finalScore = scoringPolicy->calculateFinalScore(

+ compoundDistance, traverseSession->getInputSize(),

+ terminalDicNode->getContainedErrorTypes(),

+ (forceCommitMultiWords && terminalDicNode->hasMultipleWords()),

+ boostExactMatches);

+ // Don't output invalid or blocked offensive words. However, we still need to

+ // submit their

+ // shortcuts if any.

+ if (isValidWord && !isBlockedOffensiveWord) {

+ int codePoints[MAX_WORD_LENGTH];

+ terminalDicNode->outputResult(codePoints);

+ const int indexToPartialCommit =

+ outputSecondWordFirstLetterInputIndex

+ ? terminalDicNode->getSecondWordFirstInputIndex(

+ traverseSession->getProximityInfoState(0))

+ : NOT_AN_INDEX;

+ outSuggestionResults->addSuggestion(

+ codePoints, terminalDicNode->getTotalNodeCodePointCount(), finalScore,

+ Dictionary::KIND_CORRECTION | outputTypeFlags, indexToPartialCommit,

+ computeFirstWordConfidence(terminalDicNode));

+ }

+ // Output shortcuts.

+ // Shortcut is not supported for multiple words suggestions.

+ // TODO: Check shortcuts during traversal for multiple words suggestions.

+ if (!terminalDicNode->hasMultipleWords()) {

+ BinaryDictionaryShortcutIterator shortcutIt(

+ traverseSession->getDictionaryStructurePolicy()

+ ->getShortcutsStructurePolicy(),

+ traverseSession->getDictionaryStructurePolicy()

+ ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos()));

+ const bool sameAsTyped =

+ scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);

+ outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);

+ }

+/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(

+ const DicNode* const terminalDicNode) {

+ // Get the number of spaces in the first suggestion

+ const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();

+ // Get the number of characters in the first suggestion

+ const int length = terminalDicNode->getTotalNodeCodePointCount();

+ // Get the distance for the first word of the suggestion

+ const float distance =

+ terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();

+ // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.

+ // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is

+ // under 0 or

+ // above 1,000,000 : under 0 just means it's very bad to commit, and above

+ // 1,000,000 means

+ // we are very confident.

+ // Expected space count is 1 ~ 5

+ static const int MIN_EXPECTED_SPACE_COUNT = 1;

+ static const int MAX_EXPECTED_SPACE_COUNT = 5;

+ // Expected length is about 4 ~ 30

+ static const int MIN_EXPECTED_LENGTH = 4;

+ static const int MAX_EXPECTED_LENGTH = 30;

+ // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0

+ static const float MIN_EXPECTED_DISTANCE = 0.0;

+ static const float MAX_EXPECTED_DISTANCE = 2.0;

+ // This is not strict: it's where most stuff will be falling, but it's still

+ // fine if it's

+ // outside these values. We want to output a value that reflects all of these.

+ // Each factor

+ // contributes a bit.

+ // We need at least a space.

+ if (spaceCount < 1)

+ return NOT_A_FIRST_WORD_CONFIDENCE;

+ // The smaller the edit distance, the higher the contribution.

+ // MIN_EXPECTED_DISTANCE means 0

+ // contribution, while MAX_EXPECTED_DISTANCE means full contribution according

+ // to the

+ // weight of the distance. Clamp to avoid overflows.

+ const float clampedDistance =

+ distance < MIN_EXPECTED_DISTANCE

+ ? MIN_EXPECTED_DISTANCE

+ : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;

+ const int distanceContribution =

+ DISTANCE_WEIGHT_FOR_AUTO_COMMIT *

+ (MAX_EXPECTED_DISTANCE - clampedDistance) /

+ (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);

+ // The larger the suggestion length, the larger the contribution.

+ // MIN_EXPECTED_LENGTH is no

+ // contribution, MAX_EXPECTED_LENGTH is full contribution according to the

+ // weight of the

+ // length. Length is guaranteed to be between 1 and 48, so we don't need to

+ // clamp.

+ const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT *

+ (length - MIN_EXPECTED_LENGTH) /

+ (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);

+ // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT

+ // space is no

+ // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution

+ // according to the

+ // weight of the space count.

+ const int spaceContribution =

+ SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT *

+ (spaceCount - MIN_EXPECTED_SPACE_COUNT) /

+ (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);

+ return distanceContribution + lengthContribution + spaceContribution;

+/* static */ void SuggestionsOutputUtils::outputShortcuts(

+ BinaryDictionaryShortcutIterator* const shortcutIt,

+ const int finalScore,

+ const bool sameAsTyped,

+ SuggestionResults* const outSuggestionResults) {

+ int shortcutTarget[MAX_WORD_LENGTH];

+ while (shortcutIt->hasNextShortcutTarget()) {

+ bool isWhilelist;

+ int shortcutTargetStringLength;

+ shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,

+ &shortcutTargetStringLength, &isWhilelist);

+ int shortcutScore;

+ int kind;

+ if (isWhilelist && sameAsTyped) {

+ shortcutScore = S_INT_MAX;

+ kind = Dictionary::KIND_WHITELIST;

+ } else {

+ // shortcut entry's score == its base entry's score - 1

+ shortcutScore = finalScore;

+ // Protection against int underflow

+ shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;

+ kind = Dictionary::KIND_SHORTCUT;

+ }

+ outSuggestionResults->addSuggestion(

+ shortcutTarget, shortcutTargetStringLength,

+ std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,

+ NOT_A_FIRST_WORD_CONFIDENCE);

+ }

+} // namespace latinime