/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "third_party/prediction/suggest/core/suggest.h"
#include "third_party/prediction/suggest/core/dicnode/dic_node.h"
#include "third_party/prediction/suggest/core/dicnode/dic_node_priority_queue.h"
#include "third_party/prediction/suggest/core/dicnode/dic_node_vector.h"
#include "third_party/prediction/suggest/core/dictionary/dictionary.h"
#include "third_party/prediction/suggest/core/dictionary/digraph_utils.h"
#include "third_party/prediction/suggest/core/layout/proximity_info.h"
#include "third_party/prediction/suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "third_party/prediction/suggest/core/policy/traversal.h"
#include "third_party/prediction/suggest/core/policy/weighting.h"
#include "third_party/prediction/suggest/core/result/suggestions_output_utils.h"
#include "third_party/prediction/suggest/core/session/dic_traverse_session.h"

namespace latinime {

// Initialization of class constants.
const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2;

/**
 * Returns a set of suggestions for the given input touch points.
 *
 * Note: Currently does not support concurrent calls across threads. Continuous
 * suggestion is automatically activated for sequential calls that share the
 * same starting input.
 * TODO: Stop detecting continuous suggestion. Start using traverseSession
 * instead.
 */
void Suggest::getSuggestions(
    ProximityInfo* pInfo,
    void* traverseSession,
    int* inputXs,
    int* inputYs,
    int* times,
    int* pointerIds,
    int* inputCodePoints,
    int inputSize,
    const float languageWeight,
    SuggestionResults* const outSuggestionResults) const {
  PROF_OPEN;
  PROF_START(0);
  const float maxSpatialDistance = TRAVERSAL->getMaxSpatialDistance();
  DicTraverseSession* tSession =
      static_cast<DicTraverseSession*>(traverseSession);
  tSession->setupForGetSuggestions(
      pInfo, inputCodePoints, inputSize, inputXs, inputYs, times, pointerIds,
      maxSpatialDistance, TRAVERSAL->getMaxPointerCount());
  // TODO: Add a way to evaluate the cache.

  initializeSearch(tSession);
  PROF_END(0);
  PROF_START(1);

  // Keep expanding search dicNodes until all have terminated.
  while (tSession->getDicTraverseCache()->activeSize() > 0) {
    expandCurrentDicNodes(tSession);
    tSession->getDicTraverseCache()->advanceActiveDicNodes();
    tSession->getDicTraverseCache()->advanceInputIndex(inputSize);
  }
  PROF_END(1);
  PROF_START(2);
  SuggestionsOutputUtils::outputSuggestions(SCORING, tSession, languageWeight,
                                            outSuggestionResults);
  PROF_END(2);
  PROF_CLOSE;
}

/**
 * Initializes the search at the root of the lexicon trie. Note that when
 * possible the search will continue suggestion from where it left off during
 * the last call.
 */
void Suggest::initializeSearch(DicTraverseSession* traverseSession) const {
  if (!traverseSession->getProximityInfoState(0)->isUsed()) {
    return;
  }

  if (traverseSession->getInputSize() > MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE &&
      traverseSession->isContinuousSuggestionPossible()) {
    // Continue suggestion
    traverseSession->getDicTraverseCache()->continueSearch();
  } else {
    // Restart recognition at the root.
    traverseSession->resetCache(
        TRAVERSAL->getMaxCacheSize(traverseSession->getInputSize()),
        TRAVERSAL->getTerminalCacheSize());
    // Create a new dic node here
    DicNode rootNode;
    DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(),
                             traverseSession->getPrevWordsPtNodePos(),
                             &rootNode);
    traverseSession->getDicTraverseCache()->copyPushActive(&rootNode);
  }
}

/**
 * Expands the dicNodes in the current search priority queue by advancing to
 * the possible child nodes based on the next touch point(s) (or no touch
 * points for lookahead).
 */
void Suggest::expandCurrentDicNodes(DicTraverseSession* traverseSession) const {
  const int inputSize = traverseSession->getInputSize();
  DicNodeVector childDicNodes(TRAVERSAL->getDefaultExpandDicNodeSize());
  DicNode correctionDicNode;

  // TODO: Find more efficient caching
  const bool shouldDepthLevelCache =
      TRAVERSAL->shouldDepthLevelCache(traverseSession);
  if (shouldDepthLevelCache) {
    traverseSession->getDicTraverseCache()->updateLastCachedInputIndex();
  }
  if (DEBUG_CACHE) {
    AKLOGI("expandCurrentDicNodes depth level cache = %d, inputSize = %d",
           shouldDepthLevelCache, inputSize);
  }
  while (traverseSession->getDicTraverseCache()->activeSize() > 0) {
    DicNode dicNode;
    traverseSession->getDicTraverseCache()->popActive(&dicNode);
    if (dicNode.isTotalInputSizeExceedingLimit()) {
      return;
    }
    childDicNodes.clear();
    const int point0Index = dicNode.getInputIndex(0);
    const bool canDoLookAheadCorrection =
        TRAVERSAL->canDoLookAheadCorrection(traverseSession, &dicNode);
    const bool isLookAheadCorrection =
        canDoLookAheadCorrection &&
        traverseSession->getDicTraverseCache()->isLookAheadCorrectionInputIndex(
            static_cast<int>(point0Index));
    const bool isCompletion = dicNode.isCompletion(inputSize);

    const bool shouldNodeLevelCache =
        TRAVERSAL->shouldNodeLevelCache(traverseSession, &dicNode);
    if (shouldDepthLevelCache || shouldNodeLevelCache) {
      if (DEBUG_CACHE) {
        dicNode.dump("PUSH_CACHE");
      }
      traverseSession->getDicTraverseCache()->copyPushContinue(&dicNode);
      dicNode.setCached();
    }

    if (dicNode.isInDigraph()) {
      // Finish digraph handling if the node is in the middle of a digraph
      // expansion.
      processDicNodeAsDigraph(traverseSession, &dicNode);
    } else if (isLookAheadCorrection) {
      // The algorithm maintains a small set of "deferred" nodes that have not
      // consumed the latest touch point yet. These are needed to apply
      // look-ahead correction operations that require special handling of the
      // latest touch point. For example, with insertions (e.g., "thiis" ->
      // "this") the latest touch point should not be consumed at all.
      processDicNodeAsTransposition(traverseSession, &dicNode);
      processDicNodeAsInsertion(traverseSession, &dicNode);
    } else {  // !isLookAheadCorrection
      // Only consider typing error corrections if the normalized compound
      // distance is below a spatial distance threshold.
      // NOTE: The threshold may need to be updated if the scoring model
      // changes.
      // TODO: Remove. Do not prune node here.
      const bool allowsErrorCorrections =
          TRAVERSAL->allowsErrorCorrections(&dicNode);
      // Process for handling space substitution (e.g., hevis => he is)
      if (allowsErrorCorrections &&
          TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &dicNode)) {
        createNextWordDicNode(traverseSession, &dicNode,
                              true /* spaceSubstitution */);
      }

      DicNodeUtils::getAllChildDicNodes(
          &dicNode, traverseSession->getDictionaryStructurePolicy(),
          &childDicNodes);

      const int childDicNodesSize = childDicNodes.getSizeAndLock();
      for (int i = 0; i < childDicNodesSize; ++i) {
        DicNode* const childDicNode = childDicNodes[i];
        if (isCompletion) {
          // Handle forward lookahead when the lexicon letter exceeds the
          // input size.
          processDicNodeAsMatch(traverseSession, childDicNode);
          continue;
        }
        if (DigraphUtils::hasDigraphForCodePoint(
                traverseSession->getDictionaryStructurePolicy()
                    ->getHeaderStructurePolicy(),
                childDicNode->getNodeCodePoint())) {
          correctionDicNode.initByCopy(childDicNode);
          correctionDicNode.advanceDigraphIndex();
          processDicNodeAsDigraph(traverseSession, &correctionDicNode);
        }
        if (TRAVERSAL->isOmission(traverseSession, &dicNode, childDicNode,
                                  allowsErrorCorrections)) {
          // TODO: (Gesture) Change weight between omission and substitution
          // errors
          // TODO: (Gesture) Terminal node should not be handled as omission
          correctionDicNode.initByCopy(childDicNode);
          processDicNodeAsOmission(traverseSession, &correctionDicNode);
        }
        const ProximityType proximityType = TRAVERSAL->getProximityType(
            traverseSession, &dicNode, childDicNode);
        switch (proximityType) {
          // TODO: Consider the difference of proximityType here
          case MATCH_CHAR:
          case PROXIMITY_CHAR:
            processDicNodeAsMatch(traverseSession, childDicNode);
            break;
          case ADDITIONAL_PROXIMITY_CHAR:
            if (allowsErrorCorrections) {
              processDicNodeAsAdditionalProximityChar(traverseSession,
                                                      &dicNode, childDicNode);
            }
            break;
          case SUBSTITUTION_CHAR:
            if (allowsErrorCorrections) {
              processDicNodeAsSubstitution(traverseSession, &dicNode,
                                           childDicNode);
            }
            break;
          case UNRELATED_CHAR:
            // Just drop this dicNode and do nothing.
            break;
          default:
            // Just drop this dicNode and do nothing.
            break;
        }
      }

      // Push the dicNode for look-ahead correction
      if (allowsErrorCorrections && canDoLookAheadCorrection) {
        traverseSession->getDicTraverseCache()->copyPushNextActive(&dicNode);
      }
    }
  }
}

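/**
 * Pushes the dicNode to the terminal cache when it represents a complete word.
 * Applies the terminal weighting cost and, when the traversal policy requires
 * consuming all user input, an additional terminal-insertion cost for any
 * touch points the node has not consumed yet. Nodes with a too-high compound
 * distance or without matched/proximity code points are skipped.
 */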
void Suggest::processTerminalDicNode(DicTraverseSession* traverseSession,
                                     DicNode* dicNode) const {
  if (dicNode->getCompoundDistance() >=
      static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) {
    return;
  }
  if (!dicNode->isTerminalDicNode()) {
    return;
  }
  if (dicNode->shouldBeFilteredBySafetyNetForBigram()) {
    return;
  }
  if (!dicNode->hasMatchedOrProximityCodePoints()) {
    return;
  }
  // Create a non-cached node here.
  DicNode terminalDicNode(*dicNode);
  if (TRAVERSAL->needsToTraverseAllUserInput() &&
      dicNode->getInputIndex(0) < traverseSession->getInputSize()) {
    Weighting::addCostAndForwardInputIndex(
        WEIGHTING, CT_TERMINAL_INSERTION, traverseSession, 0, &terminalDicNode,
        traverseSession->getMultiBigramMap());
  }
  Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TERMINAL,
                                         traverseSession, 0, &terminalDicNode,
                                         traverseSession->getMultiBigramMap());
  traverseSession->getDicTraverseCache()->copyPushTerminal(&terminalDicNode);
}

/**
 * Adds the expanded dicNode to the next search priority queue. Also creates an
 * additional next-word search path (via the space omission error correction)
 * if the input dicNode is a terminal.
 */
void Suggest::processExpandedDicNode(DicTraverseSession* traverseSession,
                                     DicNode* dicNode) const {
  processTerminalDicNode(traverseSession, dicNode);
  if (dicNode->getCompoundDistance() <
      static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) {
    if (TRAVERSAL->isSpaceOmissionTerminal(traverseSession, dicNode)) {
      createNextWordDicNode(traverseSession, dicNode,
                            false /* spaceSubstitution */);
    }
    const bool allowsLookAhead =
        !(dicNode->hasMultipleWords() &&
          dicNode->isCompletion(traverseSession->getInputSize()));
    if (dicNode->hasChildren() && allowsLookAhead) {
      traverseSession->getDicTraverseCache()->copyPushNextActive(dicNode);
    }
  }
}

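/**
 * Handles the childDicNode as a match (or proximity match) for the current
 * touch point: applies the match/completion weighting and then processes it as
 * an expanded dicNode.
 */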
void Suggest::processDicNodeAsMatch(DicTraverseSession* traverseSession,
                                    DicNode* childDicNode) const {
  weightChildNode(traverseSession, childDicNode);
  processExpandedDicNode(traverseSession, childDicNode);
}

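/**
 * Handles the childDicNode as an additional-proximity character match: adds
 * the CT_ADDITIONAL_PROXIMITY correction cost, then weights and expands the
 * node.
 */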
void Suggest::processDicNodeAsAdditionalProximityChar(
    DicTraverseSession* traverseSession,
    DicNode* dicNode,
    DicNode* childDicNode) const {
  // Note: Most types of corrections don't need to look up the bigram
  // information since they do not treat the node as a terminal. There is no
  // need to pass the bigram map in these cases.
  Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_ADDITIONAL_PROXIMITY,
                                         traverseSession, dicNode,
                                         childDicNode, 0 /* multiBigramMap */);
  weightChildNode(traverseSession, childDicNode);
  processExpandedDicNode(traverseSession, childDicNode);
}

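/**
 * Handles the childDicNode as a substitution error (e.g., thos => this): adds
 * the CT_SUBSTITUTION correction cost, then weights and expands the node.
 */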
void Suggest::processDicNodeAsSubstitution(DicTraverseSession* traverseSession,
                                           DicNode* dicNode,
                                           DicNode* childDicNode) const {
  Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_SUBSTITUTION,
                                         traverseSession, dicNode,
                                         childDicNode, 0 /* multiBigramMap */);
  weightChildNode(traverseSession, childDicNode);
  processExpandedDicNode(traverseSession, childDicNode);
}

// Process the DicNode codepoint as a digraph. This means that composite glyphs
// like the German u-umlaut are expanded to the transliteration "ue". Note that
// this happens in parallel with the normal non-digraph traversal, so both
// "uber" and "ueber" can be corrected to "[u-umlaut]ber".
void Suggest::processDicNodeAsDigraph(DicTraverseSession* traverseSession,
                                      DicNode* childDicNode) const {
  weightChildNode(traverseSession, childDicNode);
  childDicNode->advanceDigraphIndex();
  processExpandedDicNode(traverseSession, childDicNode);
}

/**
 * Handle the dicNode as an omission error (e.g., ths => this). Skip the
 * current letter and consider matches for all possible next letters. Note that
 * just skipping the current letter without any other conditions tends to flood
 * the search DicNodes cache with omission DicNodes. Instead, check the
 * possible *next* letters after the omission to better limit search to
 * plausible omissions.
 * Note that apostrophes are handled as omissions.
 */
void Suggest::processDicNodeAsOmission(DicTraverseSession* traverseSession,
                                       DicNode* dicNode) const {
  DicNodeVector childDicNodes;
  DicNodeUtils::getAllChildDicNodes(
      dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes);

  const int size = childDicNodes.getSizeAndLock();
  for (int i = 0; i < size; i++) {
    DicNode* const childDicNode = childDicNodes[i];
    // Treat this word as omission
    Weighting::addCostAndForwardInputIndex(
        WEIGHTING, CT_OMISSION, traverseSession, dicNode, childDicNode,
        0 /* multiBigramMap */);
    weightChildNode(traverseSession, childDicNode);
    if (!TRAVERSAL->isPossibleOmissionChildNode(traverseSession, dicNode,
                                                childDicNode)) {
      continue;
    }
    processExpandedDicNode(traverseSession, childDicNode);
  }
}

/**
 * Handle the dicNode as an insertion error (e.g., thiis => this). Skip the
 * current touch point and consider matches for the next touch point.
 */
void Suggest::processDicNodeAsInsertion(DicTraverseSession* traverseSession,
                                        DicNode* dicNode) const {
  const int16_t pointIndex = dicNode->getInputIndex(0);
  DicNodeVector childDicNodes;
  DicNodeUtils::getAllChildDicNodes(
      dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes);
  const int size = childDicNodes.getSizeAndLock();
  for (int i = 0; i < size; i++) {
    if (traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(
            pointIndex + 1) != childDicNodes[i]->getNodeCodePoint()) {
      continue;
    }
    DicNode* const childDicNode = childDicNodes[i];
    Weighting::addCostAndForwardInputIndex(
        WEIGHTING, CT_INSERTION, traverseSession, dicNode, childDicNode,
        0 /* multiBigramMap */);
    processExpandedDicNode(traverseSession, childDicNode);
  }
}

/**
 * Handle the dicNode as a transposition error (e.g., thsi => this). Swap the
 * next two touch points.
 */
void Suggest::processDicNodeAsTransposition(DicTraverseSession* traverseSession,
                                            DicNode* dicNode) const {
  const int16_t pointIndex = dicNode->getInputIndex(0);
  DicNodeVector childDicNodes1;
  DicNodeVector childDicNodes2;
  DicNodeUtils::getAllChildDicNodes(
      dicNode, traverseSession->getDictionaryStructurePolicy(),
      &childDicNodes1);
  const int childSize1 = childDicNodes1.getSizeAndLock();
  for (int i = 0; i < childSize1; i++) {
    const ProximityType matchedId1 =
        traverseSession->getProximityInfoState(0)->getProximityType(
            pointIndex + 1, childDicNodes1[i]->getNodeCodePoint(),
            true /* checkProximityChars */);
    if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId1)) {
      continue;
    }
    if (childDicNodes1[i]->hasChildren()) {
      childDicNodes2.clear();
      DicNodeUtils::getAllChildDicNodes(
          childDicNodes1[i], traverseSession->getDictionaryStructurePolicy(),
          &childDicNodes2);
      const int childSize2 = childDicNodes2.getSizeAndLock();
      for (int j = 0; j < childSize2; j++) {
        DicNode* const childDicNode2 = childDicNodes2[j];
        const ProximityType matchedId2 =
            traverseSession->getProximityInfoState(0)->getProximityType(
                pointIndex, childDicNode2->getNodeCodePoint(),
                true /* checkProximityChars */);
        if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId2)) {
          continue;
        }
        Weighting::addCostAndForwardInputIndex(
            WEIGHTING, CT_TRANSPOSITION, traverseSession, childDicNodes1[i],
            childDicNode2, 0 /* multiBigramMap */);
        processExpandedDicNode(traverseSession, childDicNode2);
      }
    }
  }
}

/**
 * Weight child dicNode by aligning it to the key.
 */
void Suggest::weightChildNode(DicTraverseSession* traverseSession,
                              DicNode* dicNode) const {
  const int inputSize = traverseSession->getInputSize();
  if (dicNode->isCompletion(inputSize)) {
    Weighting::addCostAndForwardInputIndex(
        WEIGHTING, CT_COMPLETION, traverseSession, 0 /* parentDicNode */,
        dicNode, 0 /* multiBigramMap */);
  } else {  // Not a completion; weight as a match for the current touch point.
    Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_MATCH, traverseSession,
                                           0 /* parentDicNode */, dicNode,
                                           0 /* multiBigramMap */);
  }
}

/**
 * Creates a new dicNode that represents a space insertion at the end of the
 * input dicNode. Also incorporates the unigram / bigram score for the ending
 * word into the new dicNode.
 */
void Suggest::createNextWordDicNode(DicTraverseSession* traverseSession,
                                    DicNode* dicNode,
                                    const bool spaceSubstitution) const {
  if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode)) {
    return;
  }

  // Create a non-cached node here.
  DicNode newDicNode;
  DicNodeUtils::initAsRootWithPreviousWord(
      traverseSession->getDictionaryStructurePolicy(), dicNode, &newDicNode);
  const CorrectionType correctionType = spaceSubstitution
                                            ? CT_NEW_WORD_SPACE_SUBSTITUTION
                                            : CT_NEW_WORD_SPACE_OMISSION;
  Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType,
                                         traverseSession, dicNode, &newDicNode,
                                         traverseSession->getMultiBigramMap());
  if (newDicNode.getCompoundDistance() <
      static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) {
    // newDicNode is worth continuing to traverse.
    // CAVEAT: This pruning is important for speed. Remove it once we can
    // afford not to prune here, because this is not the right place to do
    // pruning. Pruning should take place only in DicNodePriorityQueue.
    traverseSession->getDicTraverseCache()->copyPushNextActive(&newDicNode);
  }
}

}  // namespace latinime