Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(193)

Side by Side Diff: third_party/prediction/suggest/core/suggest.cpp

Issue 1247903003: Add spellcheck and word suggestion to the prediction service (Closed) Base URL: https://github.com/domokit/mojo.git@master
Patch Set: Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "third_party/prediction/suggest/core/suggest.h"
18 #include "third_party/prediction/suggest/core/dicnode/dic_node.h"
19 #include "third_party/prediction/suggest/core/dicnode/dic_node_priority_queue.h"
20 #include "third_party/prediction/suggest/core/dicnode/dic_node_vector.h"
21 #include "third_party/prediction/suggest/core/dictionary/dictionary.h"
22 #include "third_party/prediction/suggest/core/dictionary/digraph_utils.h"
23 #include "third_party/prediction/suggest/core/layout/proximity_info.h"
24 #include "third_party/prediction/suggest/core/policy/dictionary_structure_with_b uffer_policy.h"
25 #include "third_party/prediction/suggest/core/policy/traversal.h"
26 #include "third_party/prediction/suggest/core/policy/weighting.h"
27 #include "third_party/prediction/suggest/core/result/suggestions_output_utils.h"
28 #include "third_party/prediction/suggest/core/session/dic_traverse_session.h"
29
30 namespace latinime {
31
32 // Initialization of class constants.
33 const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2;
34
35 /**
36 * Returns a set of suggestions for the given input touch points. The
37 *commitPoint argument indicates
38 * whether to prematurely commit the suggested words up to the given point for
39 *sentence-level
40 * suggestion.
41 *
42 * Note: Currently does not support concurrent calls across threads. Continuous
43 *suggestion is
44 * automatically activated for sequential calls that share the same starting
45 *input.
46 * TODO: Stop detecting continuous suggestion. Start using traverseSession
47 *instead.
48 */
49 void Suggest::getSuggestions(
50 ProximityInfo* pInfo,
51 void* traverseSession,
52 int* inputXs,
53 int* inputYs,
54 int* times,
55 int* pointerIds,
56 int* inputCodePoints,
57 int inputSize,
58 const float languageWeight,
59 SuggestionResults* const outSuggestionResults) const {
60 PROF_OPEN;
61 PROF_START(0);
62 const float maxSpatialDistance = TRAVERSAL->getMaxSpatialDistance();
63 DicTraverseSession* tSession =
64 static_cast<DicTraverseSession*>(traverseSession);
65 tSession->setupForGetSuggestions(
66 pInfo, inputCodePoints, inputSize, inputXs, inputYs, times, pointerIds,
67 maxSpatialDistance, TRAVERSAL->getMaxPointerCount());
68 // TODO: Add the way to evaluate cache
69
70 initializeSearch(tSession);
71 PROF_END(0);
72 PROF_START(1);
73
74 // keep expanding search dicNodes until all have terminated.
75 while (tSession->getDicTraverseCache()->activeSize() > 0) {
76 expandCurrentDicNodes(tSession);
77 tSession->getDicTraverseCache()->advanceActiveDicNodes();
78 tSession->getDicTraverseCache()->advanceInputIndex(inputSize);
79 }
80 PROF_END(1);
81 PROF_START(2);
82 SuggestionsOutputUtils::outputSuggestions(SCORING, tSession, languageWeight,
83 outSuggestionResults);
84 PROF_END(2);
85 PROF_CLOSE;
86 }
87
88 /**
89 * Initializes the search at the root of the lexicon trie. Note that when
90 * possible the search will
91 * continue suggestion from where it left off during the last call.
92 */
93 void Suggest::initializeSearch(DicTraverseSession* traverseSession) const {
94 if (!traverseSession->getProximityInfoState(0)->isUsed()) {
95 return;
96 }
97
98 if (traverseSession->getInputSize() > MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE &&
99 traverseSession->isContinuousSuggestionPossible()) {
100 // Continue suggestion
101 traverseSession->getDicTraverseCache()->continueSearch();
102 } else {
103 // Restart recognition at the root.
104 traverseSession->resetCache(
105 TRAVERSAL->getMaxCacheSize(traverseSession->getInputSize()),
106 TRAVERSAL->getTerminalCacheSize());
107 // Create a new dic node here
108 DicNode rootNode;
109 DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(),
110 traverseSession->getPrevWordsPtNodePos(),
111 &rootNode);
112 traverseSession->getDicTraverseCache()->copyPushActive(&rootNode);
113 }
114 }
115
116 /**
117 * Expands the dicNodes in the current search priority queue by advancing to the
118 * possible child
119 * nodes based on the next touch point(s) (or no touch points for lookahead)
120 */
121 void Suggest::expandCurrentDicNodes(DicTraverseSession* traverseSession) const {
122 const int inputSize = traverseSession->getInputSize();
123 DicNodeVector childDicNodes(TRAVERSAL->getDefaultExpandDicNodeSize());
124 DicNode correctionDicNode;
125
126 // TODO: Find more efficient caching
127 const bool shouldDepthLevelCache =
128 TRAVERSAL->shouldDepthLevelCache(traverseSession);
129 if (shouldDepthLevelCache) {
130 traverseSession->getDicTraverseCache()->updateLastCachedInputIndex();
131 }
132 if (DEBUG_CACHE) {
133 AKLOGI("expandCurrentDicNodes depth level cache = %d, inputSize = %d",
134 shouldDepthLevelCache, inputSize);
135 }
136 while (traverseSession->getDicTraverseCache()->activeSize() > 0) {
137 DicNode dicNode;
138 traverseSession->getDicTraverseCache()->popActive(&dicNode);
139 if (dicNode.isTotalInputSizeExceedingLimit()) {
140 return;
141 }
142 childDicNodes.clear();
143 const int point0Index = dicNode.getInputIndex(0);
144 const bool canDoLookAheadCorrection =
145 TRAVERSAL->canDoLookAheadCorrection(traverseSession, &dicNode);
146 const bool isLookAheadCorrection =
147 canDoLookAheadCorrection &&
148 traverseSession->getDicTraverseCache()->isLookAheadCorrectionInputIndex(
149 static_cast<int>(point0Index));
150 const bool isCompletion = dicNode.isCompletion(inputSize);
151
152 const bool shouldNodeLevelCache =
153 TRAVERSAL->shouldNodeLevelCache(traverseSession, &dicNode);
154 if (shouldDepthLevelCache || shouldNodeLevelCache) {
155 if (DEBUG_CACHE) {
156 dicNode.dump("PUSH_CACHE");
157 }
158 traverseSession->getDicTraverseCache()->copyPushContinue(&dicNode);
159 dicNode.setCached();
160 }
161
162 if (dicNode.isInDigraph()) {
163 // Finish digraph handling if the node is in the middle of a digraph
164 // expansion.
165 processDicNodeAsDigraph(traverseSession, &dicNode);
166 } else if (isLookAheadCorrection) {
167 // The algorithm maintains a small set of "deferred" nodes that have not
168 // consumed the
169 // latest touch point yet. These are needed to apply look-ahead correction
170 // operations
171 // that require special handling of the latest touch point. For example,
172 // with insertions
173 // (e.g., "thiis" -> "this") the latest touch point should not be consumed
174 // at all.
175 processDicNodeAsTransposition(traverseSession, &dicNode);
176 processDicNodeAsInsertion(traverseSession, &dicNode);
177 } else { // !isLookAheadCorrection
178 // Only consider typing error corrections if the normalized compound
179 // distance is
180 // below a spatial distance threshold.
181 // NOTE: the threshold may need to be updated if scoring model changes.
182 // TODO: Remove. Do not prune node here.
183 const bool allowsErrorCorrections =
184 TRAVERSAL->allowsErrorCorrections(&dicNode);
185 // Process for handling space substitution (e.g., hevis => he is)
186 if (allowsErrorCorrections &&
187 TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &dicNode)) {
188 createNextWordDicNode(traverseSession, &dicNode,
189 true /* spaceSubstitution */);
190 }
191
192 DicNodeUtils::getAllChildDicNodes(
193 &dicNode, traverseSession->getDictionaryStructurePolicy(),
194 &childDicNodes);
195
196 const int childDicNodesSize = childDicNodes.getSizeAndLock();
197 for (int i = 0; i < childDicNodesSize; ++i) {
198 DicNode* const childDicNode = childDicNodes[i];
199 if (isCompletion) {
200 // Handle forward lookahead when the lexicon letter exceeds the input
201 // size.
202 processDicNodeAsMatch(traverseSession, childDicNode);
203 continue;
204 }
205 if (DigraphUtils::hasDigraphForCodePoint(
206 traverseSession->getDictionaryStructurePolicy()
207 ->getHeaderStructurePolicy(),
208 childDicNode->getNodeCodePoint())) {
209 correctionDicNode.initByCopy(childDicNode);
210 correctionDicNode.advanceDigraphIndex();
211 processDicNodeAsDigraph(traverseSession, &correctionDicNode);
212 }
213 if (TRAVERSAL->isOmission(traverseSession, &dicNode, childDicNode,
214 allowsErrorCorrections)) {
215 // TODO: (Gesture) Change weight between omission and substitution
216 // errors
217 // TODO: (Gesture) Terminal node should not be handled as omission
218 correctionDicNode.initByCopy(childDicNode);
219 processDicNodeAsOmission(traverseSession, &correctionDicNode);
220 }
221 const ProximityType proximityType = TRAVERSAL->getProximityType(
222 traverseSession, &dicNode, childDicNode);
223 switch (proximityType) {
224 // TODO: Consider the difference of proximityType here
225 case MATCH_CHAR:
226 case PROXIMITY_CHAR:
227 processDicNodeAsMatch(traverseSession, childDicNode);
228 break;
229 case ADDITIONAL_PROXIMITY_CHAR:
230 if (allowsErrorCorrections) {
231 processDicNodeAsAdditionalProximityChar(traverseSession, &dicNode,
232 childDicNode);
233 }
234 break;
235 case SUBSTITUTION_CHAR:
236 if (allowsErrorCorrections) {
237 processDicNodeAsSubstitution(traverseSession, &dicNode,
238 childDicNode);
239 }
240 break;
241 case UNRELATED_CHAR:
242 // Just drop this dicNode and do nothing.
243 break;
244 default:
245 // Just drop this dicNode and do nothing.
246 break;
247 }
248 }
249
250 // Push the dicNode for look-ahead correction
251 if (allowsErrorCorrections && canDoLookAheadCorrection) {
252 traverseSession->getDicTraverseCache()->copyPushNextActive(&dicNode);
253 }
254 }
255 }
256 }
257
258 void Suggest::processTerminalDicNode(DicTraverseSession* traverseSession,
259 DicNode* dicNode) const {
260 if (dicNode->getCompoundDistance() >=
261 static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) {
262 return;
263 }
264 if (!dicNode->isTerminalDicNode()) {
265 return;
266 }
267 if (dicNode->shouldBeFilteredBySafetyNetForBigram()) {
268 return;
269 }
270 if (!dicNode->hasMatchedOrProximityCodePoints()) {
271 return;
272 }
273 // Create a non-cached node here.
274 DicNode terminalDicNode(*dicNode);
275 if (TRAVERSAL->needsToTraverseAllUserInput() &&
276 dicNode->getInputIndex(0) < traverseSession->getInputSize()) {
277 Weighting::addCostAndForwardInputIndex(
278 WEIGHTING, CT_TERMINAL_INSERTION, traverseSession, 0, &terminalDicNode,
279 traverseSession->getMultiBigramMap());
280 }
281 Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TERMINAL,
282 traverseSession, 0, &terminalDicNode,
283 traverseSession->getMultiBigramMap());
284 traverseSession->getDicTraverseCache()->copyPushTerminal(&terminalDicNode);
285 }
286
287 /**
288 * Adds the expanded dicNode to the next search priority queue. Also creates an
289 * additional next word
290 * (by the space omission error correction) search path if input dicNode is on a
291 * terminal.
292 */
293 void Suggest::processExpandedDicNode(DicTraverseSession* traverseSession,
294 DicNode* dicNode) const {
295 processTerminalDicNode(traverseSession, dicNode);
296 if (dicNode->getCompoundDistance() <
297 static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) {
298 if (TRAVERSAL->isSpaceOmissionTerminal(traverseSession, dicNode)) {
299 createNextWordDicNode(traverseSession, dicNode,
300 false /* spaceSubstitution */);
301 }
302 const int allowsLookAhead =
303 !(dicNode->hasMultipleWords() &&
304 dicNode->isCompletion(traverseSession->getInputSize()));
305 if (dicNode->hasChildren() && allowsLookAhead) {
306 traverseSession->getDicTraverseCache()->copyPushNextActive(dicNode);
307 }
308 }
309 }
310
311 void Suggest::processDicNodeAsMatch(DicTraverseSession* traverseSession,
312 DicNode* childDicNode) const {
313 weightChildNode(traverseSession, childDicNode);
314 processExpandedDicNode(traverseSession, childDicNode);
315 }
316
317 void Suggest::processDicNodeAsAdditionalProximityChar(
318 DicTraverseSession* traverseSession,
319 DicNode* dicNode,
320 DicNode* childDicNode) const {
321 // Note: Most types of corrections don't need to look up the bigram
322 // information since they do
323 // not treat the node as a terminal. There is no need to pass the bigram map
324 // in these cases.
325 Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_ADDITIONAL_PROXIMITY,
326 traverseSession, dicNode, childDicNode,
327 0 /* multiBigramMap */);
328 weightChildNode(traverseSession, childDicNode);
329 processExpandedDicNode(traverseSession, childDicNode);
330 }
331
332 void Suggest::processDicNodeAsSubstitution(DicTraverseSession* traverseSession,
333 DicNode* dicNode,
334 DicNode* childDicNode) const {
335 Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_SUBSTITUTION,
336 traverseSession, dicNode, childDicNode,
337 0 /* multiBigramMap */);
338 weightChildNode(traverseSession, childDicNode);
339 processExpandedDicNode(traverseSession, childDicNode);
340 }
341
342 // Process the DicNode codepoint as a digraph. This means that composite glyphs
343 // like the German
344 // u-umlaut is expanded to the transliteration "ue". Note that this happens in
345 // parallel with
346 // the normal non-digraph traversal, so both "uber" and "ueber" can be corrected
347 // to "[u-umlaut]ber".
348 void Suggest::processDicNodeAsDigraph(DicTraverseSession* traverseSession,
349 DicNode* childDicNode) const {
350 weightChildNode(traverseSession, childDicNode);
351 childDicNode->advanceDigraphIndex();
352 processExpandedDicNode(traverseSession, childDicNode);
353 }
354
355 /**
356 * Handle the dicNode as an omission error (e.g., ths => this). Skip the current
357 * letter and consider
358 * matches for all possible next letters. Note that just skipping the current
359 * letter without any
360 * other conditions tends to flood the search DicNodes cache with omission
361 * DicNodes. Instead, check
362 * the possible *next* letters after the omission to better limit search to
363 * plausible omissions.
364 * Note that apostrophes are handled as omissions.
365 */
366 void Suggest::processDicNodeAsOmission(DicTraverseSession* traverseSession,
367 DicNode* dicNode) const {
368 DicNodeVector childDicNodes;
369 DicNodeUtils::getAllChildDicNodes(
370 dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes);
371
372 const int size = childDicNodes.getSizeAndLock();
373 for (int i = 0; i < size; i++) {
374 DicNode* const childDicNode = childDicNodes[i];
375 // Treat this word as omission
376 Weighting::addCostAndForwardInputIndex(
377 WEIGHTING, CT_OMISSION, traverseSession, dicNode, childDicNode,
378 0 /* multiBigramMap */);
379 weightChildNode(traverseSession, childDicNode);
380 if (!TRAVERSAL->isPossibleOmissionChildNode(traverseSession, dicNode,
381 childDicNode)) {
382 continue;
383 }
384 processExpandedDicNode(traverseSession, childDicNode);
385 }
386 }
387
388 /**
389 * Handle the dicNode as an insertion error (e.g., thiis => this). Skip the
390 * current touch point and
391 * consider matches for the next touch point.
392 */
393 void Suggest::processDicNodeAsInsertion(DicTraverseSession* traverseSession,
394 DicNode* dicNode) const {
395 const int16_t pointIndex = dicNode->getInputIndex(0);
396 DicNodeVector childDicNodes;
397 DicNodeUtils::getAllChildDicNodes(
398 dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes);
399 const int size = childDicNodes.getSizeAndLock();
400 for (int i = 0; i < size; i++) {
401 if (traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(
402 pointIndex + 1) != childDicNodes[i]->getNodeCodePoint()) {
403 continue;
404 }
405 DicNode* const childDicNode = childDicNodes[i];
406 Weighting::addCostAndForwardInputIndex(
407 WEIGHTING, CT_INSERTION, traverseSession, dicNode, childDicNode,
408 0 /* multiBigramMap */);
409 processExpandedDicNode(traverseSession, childDicNode);
410 }
411 }
412
413 /**
414 * Handle the dicNode as a transposition error (e.g., thsi => this). Swap the
415 * next two touch points.
416 */
417 void Suggest::processDicNodeAsTransposition(DicTraverseSession* traverseSession,
418 DicNode* dicNode) const {
419 const int16_t pointIndex = dicNode->getInputIndex(0);
420 DicNodeVector childDicNodes1;
421 DicNodeVector childDicNodes2;
422 DicNodeUtils::getAllChildDicNodes(
423 dicNode, traverseSession->getDictionaryStructurePolicy(),
424 &childDicNodes1);
425 const int childSize1 = childDicNodes1.getSizeAndLock();
426 for (int i = 0; i < childSize1; i++) {
427 const ProximityType matchedId1 =
428 traverseSession->getProximityInfoState(0)->getProximityType(
429 pointIndex + 1, childDicNodes1[i]->getNodeCodePoint(),
430 true /* checkProximityChars */);
431 if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId1)) {
432 continue;
433 }
434 if (childDicNodes1[i]->hasChildren()) {
435 childDicNodes2.clear();
436 DicNodeUtils::getAllChildDicNodes(
437 childDicNodes1[i], traverseSession->getDictionaryStructurePolicy(),
438 &childDicNodes2);
439 const int childSize2 = childDicNodes2.getSizeAndLock();
440 for (int j = 0; j < childSize2; j++) {
441 DicNode* const childDicNode2 = childDicNodes2[j];
442 const ProximityType matchedId2 =
443 traverseSession->getProximityInfoState(0)->getProximityType(
444 pointIndex, childDicNode2->getNodeCodePoint(),
445 true /* checkProximityChars */);
446 if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId2)) {
447 continue;
448 }
449 Weighting::addCostAndForwardInputIndex(
450 WEIGHTING, CT_TRANSPOSITION, traverseSession, childDicNodes1[i],
451 childDicNode2, 0 /* multiBigramMap */);
452 processExpandedDicNode(traverseSession, childDicNode2);
453 }
454 }
455 }
456 }
457
458 /**
459 * Weight child dicNode by aligning it to the key
460 */
461 void Suggest::weightChildNode(DicTraverseSession* traverseSession,
462 DicNode* dicNode) const {
463 const int inputSize = traverseSession->getInputSize();
464 if (dicNode->isCompletion(inputSize)) {
465 Weighting::addCostAndForwardInputIndex(
466 WEIGHTING, CT_COMPLETION, traverseSession, 0 /* parentDicNode */,
467 dicNode, 0 /* multiBigramMap */);
468 } else { // completion
469 Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_MATCH, traverseSession,
470 0 /* parentDicNode */, dicNode,
471 0 /* multiBigramMap */);
472 }
473 }
474
475 /**
476 * Creates a new dicNode that represents a space insertion at the end of the
477 * input dicNode. Also
478 * incorporates the unigram / bigram score for the ending word into the new
479 * dicNode.
480 */
481 void Suggest::createNextWordDicNode(DicTraverseSession* traverseSession,
482 DicNode* dicNode,
483 const bool spaceSubstitution) const {
484 if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode)) {
485 return;
486 }
487
488 // Create a non-cached node here.
489 DicNode newDicNode;
490 DicNodeUtils::initAsRootWithPreviousWord(
491 traverseSession->getDictionaryStructurePolicy(), dicNode, &newDicNode);
492 const CorrectionType correctionType = spaceSubstitution
493 ? CT_NEW_WORD_SPACE_SUBSTITUTION
494 : CT_NEW_WORD_SPACE_OMISSION;
495 Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType,
496 traverseSession, dicNode, &newDicNode,
497 traverseSession->getMultiBigramMap());
498 if (newDicNode.getCompoundDistance() <
499 static_cast<float>(MAX_VALUE_FOR_WEIGHTING)) {
500 // newDicNode is worth continuing to traverse.
501 // CAVEAT: This pruning is important for speed. Remove this when we can
502 // afford not to prune
503 // here because here is not the right place to do pruning. Pruning should
504 // take place only
505 // in DicNodePriorityQueue.
506 traverseSession->getDicTraverseCache()->copyPushNextActive(&newDicNode);
507 }
508 }
509 } // namespace latinime
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698