Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(201)

Side by Side Diff: third_party/sqlite/sqlite-src-3170000/ext/misc/spellfix.c

Issue 2747283002: [sql] Import reference version of SQLite 3.17.. (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches. See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
16 */
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
19
20 #ifndef SQLITE_AMALGAMATION
21 # include <string.h>
22 # include <stdio.h>
23 # include <stdlib.h>
24 # include <assert.h>
25 # define ALWAYS(X) 1
26 # define NEVER(X) 0
27 typedef unsigned char u8;
28 typedef unsigned short u16;
29 #endif
30 #include <ctype.h>
31
32 #ifndef SQLITE_OMIT_VIRTUALTABLE
33
34 /*
35 ** Character classes for ASCII characters:
36 **
37 ** 0 '' Silent letters: H W
38 ** 1 'A' Any vowel: A E I O U (Y)
39 ** 2 'B' A bilabeal stop or fricative: B F P V W
40 ** 3 'C' Other fricatives or back stops: C G J K Q S X Z
41 ** 4 'D' Alveolar stops: D T
42 ** 5 'H' Letter H at the beginning of a word
43 ** 6 'L' Glide: L
44 ** 7 'R' Semivowel: R
45 ** 8 'M' Nasals: M N
46 ** 9 'Y' Letter Y at the beginning of a word.
47 ** 10 '9' Digits: 0 1 2 3 4 5 6 7 8 9
48 ** 11 ' ' White space
49 ** 12 '?' Other.
50 */
51 #define CCLASS_SILENT 0
52 #define CCLASS_VOWEL 1
53 #define CCLASS_B 2
54 #define CCLASS_C 3
55 #define CCLASS_D 4
56 #define CCLASS_H 5
57 #define CCLASS_L 6
58 #define CCLASS_R 7
59 #define CCLASS_M 8
60 #define CCLASS_Y 9
61 #define CCLASS_DIGIT 10
62 #define CCLASS_SPACE 11
63 #define CCLASS_OTHER 12
64
65 /*
66 ** The following table gives the character class for non-initial ASCII
67 ** characters.
68 */
69 static const unsigned char midClass[] = {
70 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
71 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
72 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
73 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
74 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER,
75 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
76 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
77 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
78 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
79 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
80 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE,
81 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER,
82 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER,
83 /* ' */ CCLASS_SILENT, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER,
84 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER,
85 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER,
86 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT,
87 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT,
88 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT,
89 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER,
90 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER,
91 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL,
92 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D,
93 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C,
94 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C,
95 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M,
96 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B,
97 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C,
98 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B,
99 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_VOWEL,
100 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER,
101 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER,
102 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B,
103 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL,
104 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT,
105 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C,
106 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M,
107 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C,
108 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D,
109 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B,
110 /* x */ CCLASS_C, /* y */ CCLASS_VOWEL, /* z */ CCLASS_C,
111 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER,
112 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER,
113 };
114 /*
115 ** This tables gives the character class for ASCII characters that form the
116 ** initial character of a word. The only difference from midClass is with
117 ** the letters H, W, and Y.
118 */
119 static const unsigned char initClass[] = {
120 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
121 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
122 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
123 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
124 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER,
125 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
126 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
127 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
128 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
129 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
130 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE,
131 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER,
132 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER,
133 /* ' */ CCLASS_OTHER, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER,
134 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER,
135 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER,
136 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT,
137 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT,
138 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT,
139 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER,
140 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER,
141 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL,
142 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D,
143 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C,
144 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C,
145 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M,
146 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B,
147 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C,
148 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B,
149 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_Y,
150 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER,
151 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER,
152 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B,
153 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL,
154 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT,
155 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C,
156 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M,
157 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C,
158 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D,
159 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B,
160 /* x */ CCLASS_C, /* y */ CCLASS_Y, /* z */ CCLASS_C,
161 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER,
162 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER,
163 };
164
165 /*
166 ** Mapping from the character class number (0-13) to a symbol for each
167 ** character class. Note that initClass[] can be used to map the class
168 ** symbol back into the class number.
169 */
170 static const unsigned char className[] = ".ABCDHLRMY9 ?";
171
172 /*
173 ** Generate a "phonetic hash" from a string of ASCII characters
174 ** in zIn[0..nIn-1].
175 **
176 ** * Map characters by character class as defined above.
177 ** * Omit double-letters
178 ** * Omit vowels beside R and L
179 ** * Omit T when followed by CH
180 ** * Omit W when followed by R
181 ** * Omit D when followed by J or G
182 ** * Omit K in KN or G in GN at the beginning of a word
183 **
184 ** Space to hold the result is obtained from sqlite3_malloc()
185 **
186 ** Return NULL if memory allocation fails.
187 */
188 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){
189 unsigned char *zOut = sqlite3_malloc64( nIn + 1 );
190 int i;
191 int nOut = 0;
192 char cPrev = 0x77;
193 char cPrevX = 0x77;
194 const unsigned char *aClass = initClass;
195
196 if( zOut==0 ) return 0;
197 if( nIn>2 ){
198 switch( zIn[0] ){
199 case 'g':
200 case 'k': {
201 if( zIn[1]=='n' ){ zIn++; nIn--; }
202 break;
203 }
204 }
205 }
206 for(i=0; i<nIn; i++){
207 unsigned char c = zIn[i];
208 if( i+1<nIn ){
209 if( c=='w' && zIn[i+1]=='r' ) continue;
210 if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
211 if( i+2<nIn ){
212 if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
213 }
214 }
215 c = aClass[c&0x7f];
216 if( c==CCLASS_SPACE ) continue;
217 if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
218 aClass = midClass;
219 if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){
220 continue; /* No vowels beside L or R */
221 }
222 if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){
223 nOut--; /* No vowels beside L or R */
224 }
225 cPrev = c;
226 if( c==CCLASS_SILENT ) continue;
227 cPrevX = c;
228 c = className[c];
229 assert( nOut>=0 );
230 if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
231 }
232 zOut[nOut] = 0;
233 return zOut;
234 }
235
236 /*
237 ** This is an SQL function wrapper around phoneticHash(). See
238 ** the description of phoneticHash() for additional information.
239 */
240 static void phoneticHashSqlFunc(
241 sqlite3_context *context,
242 int argc,
243 sqlite3_value **argv
244 ){
245 const unsigned char *zIn;
246 unsigned char *zOut;
247
248 zIn = sqlite3_value_text(argv[0]);
249 if( zIn==0 ) return;
250 zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
251 if( zOut==0 ){
252 sqlite3_result_error_nomem(context);
253 }else{
254 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
255 }
256 }
257
258 /*
259 ** Return the character class number for a character given its
260 ** context.
261 */
262 static char characterClass(char cPrev, char c){
263 return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
264 }
265
266 /*
267 ** Return the cost of inserting or deleting character c immediately
268 ** following character cPrev. If cPrev==0, that means c is the first
269 ** character of the word.
270 */
271 static int insertOrDeleteCost(char cPrev, char c, char cNext){
272 char classC = characterClass(cPrev, c);
273 char classCprev;
274
275 if( classC==CCLASS_SILENT ){
276 /* Insert or delete "silent" characters such as H or W */
277 return 1;
278 }
279 if( cPrev==c ){
280 /* Repeated characters, or miss a repeat */
281 return 10;
282 }
283 if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){
284 return 20; /* Insert a vowel before or after 'r' */
285 }
286 classCprev = characterClass(cPrev, cPrev);
287 if( classC==classCprev ){
288 if( classC==CCLASS_VOWEL ){
289 /* Remove or add a new vowel to a vowel cluster */
290 return 15;
291 }else{
292 /* Remove or add a consonant not in the same class */
293 return 50;
294 }
295 }
296
297 /* any other character insertion or deletion */
298 return 100;
299 }
300
301 /*
302 ** Divide the insertion cost by this factor when appending to the
303 ** end of the word.
304 */
305 #define FINAL_INS_COST_DIV 4
306
307 /*
308 ** Return the cost of substituting cTo in place of cFrom assuming
309 ** the previous character is cPrev. If cPrev==0 then cTo is the first
310 ** character of the word.
311 */
312 static int substituteCost(char cPrev, char cFrom, char cTo){
313 char classFrom, classTo;
314 if( cFrom==cTo ){
315 /* Exact match */
316 return 0;
317 }
318 if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
319 /* differ only in case */
320 return 0;
321 }
322 classFrom = characterClass(cPrev, cFrom);
323 classTo = characterClass(cPrev, cTo);
324 if( classFrom==classTo ){
325 /* Same character class */
326 return 40;
327 }
328 if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
329 && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
330 /* Convert from one consonant to another, but in a different class */
331 return 75;
332 }
333 /* Any other subsitution */
334 return 100;
335 }
336
337 /*
338 ** Given two strings zA and zB which are pure ASCII, return the cost
339 ** of transforming zA into zB. If zA ends with '*' assume that it is
340 ** a prefix of zB and give only minimal penalty for extra characters
341 ** on the end of zB.
342 **
343 ** Smaller numbers mean a closer match.
344 **
345 ** Negative values indicate an error:
346 ** -1 One of the inputs is NULL
347 ** -2 Non-ASCII characters on input
348 ** -3 Unable to allocate memory
349 **
350 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
351 ** of zB that matched the pattern in zA. If zA does not end with a '*',
352 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
353 ** If zA does end in a '*', then it is the number of bytes in the prefix
354 ** of zB that was deemed to match zA.
355 */
356 static int editdist1(const char *zA, const char *zB, int *pnMatch){
357 int nA, nB; /* Number of characters in zA[] and zB[] */
358 int xA, xB; /* Loop counters for zA[] and zB[] */
359 char cA = 0, cB; /* Current character of zA and zB */
360 char cAprev, cBprev; /* Previous character of zA and zB */
361 char cAnext, cBnext; /* Next character in zA and zB */
362 int d; /* North-west cost value */
363 int dc = 0; /* North-west character value */
364 int res; /* Final result */
365 int *m; /* The cost matrix */
366 char *cx; /* Corresponding character values */
367 int *toFree = 0; /* Malloced space */
368 int nMatch = 0;
369 int mStack[60+15]; /* Stack space to use if not too much is needed */
370
371 /* Early out if either input is NULL */
372 if( zA==0 || zB==0 ) return -1;
373
374 /* Skip any common prefix */
375 while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
376 if( pnMatch ) *pnMatch = nMatch;
377 if( zA[0]==0 && zB[0]==0 ) return 0;
378
379 #if 0
380 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
381 #endif
382
383 /* Verify input strings and measure their lengths */
384 for(nA=0; zA[nA]; nA++){
385 if( zA[nA]&0x80 ) return -2;
386 }
387 for(nB=0; zB[nB]; nB++){
388 if( zB[nB]&0x80 ) return -2;
389 }
390
391 /* Special processing if either string is empty */
392 if( nA==0 ){
393 cBprev = (char)dc;
394 for(xB=res=0; (cB = zB[xB])!=0; xB++){
395 res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
396 cBprev = cB;
397 }
398 return res;
399 }
400 if( nB==0 ){
401 cAprev = (char)dc;
402 for(xA=res=0; (cA = zA[xA])!=0; xA++){
403 res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
404 cAprev = cA;
405 }
406 return res;
407 }
408
409 /* A is a prefix of B */
410 if( zA[0]=='*' && zA[1]==0 ) return 0;
411
412 /* Allocate and initialize the Wagner matrix */
413 if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
414 m = mStack;
415 }else{
416 m = toFree = sqlite3_malloc64( (nB+1)*5*sizeof(m[0])/4 );
417 if( m==0 ) return -3;
418 }
419 cx = (char*)&m[nB+1];
420
421 /* Compute the Wagner edit distance */
422 m[0] = 0;
423 cx[0] = (char)dc;
424 cBprev = (char)dc;
425 for(xB=1; xB<=nB; xB++){
426 cBnext = zB[xB];
427 cB = zB[xB-1];
428 cx[xB] = cB;
429 m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
430 cBprev = cB;
431 }
432 cAprev = (char)dc;
433 for(xA=1; xA<=nA; xA++){
434 int lastA = (xA==nA);
435 cA = zA[xA-1];
436 cAnext = zA[xA];
437 if( cA=='*' && lastA ) break;
438 d = m[0];
439 dc = cx[0];
440 m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
441 cBprev = 0;
442 for(xB=1; xB<=nB; xB++){
443 int totalCost, insCost, delCost, subCost, ncx;
444 cB = zB[xB-1];
445 cBnext = zB[xB];
446
447 /* Cost to insert cB */
448 insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
449 if( lastA ) insCost /= FINAL_INS_COST_DIV;
450
451 /* Cost to delete cA */
452 delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
453
454 /* Cost to substitute cA->cB */
455 subCost = substituteCost(cx[xB-1], cA, cB);
456
457 /* Best cost */
458 totalCost = insCost + m[xB-1];
459 ncx = cB;
460 if( (delCost + m[xB])<totalCost ){
461 totalCost = delCost + m[xB];
462 ncx = cA;
463 }
464 if( (subCost + d)<totalCost ){
465 totalCost = subCost + d;
466 }
467
468 #if 0
469 printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
470 " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
471 xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
472 insCost, delCost, subCost, totalCost, ncx?ncx:' ');
473 #endif
474
475 /* Update the matrix */
476 d = m[xB];
477 dc = cx[xB];
478 m[xB] = totalCost;
479 cx[xB] = (char)ncx;
480 cBprev = cB;
481 }
482 cAprev = cA;
483 }
484
485 /* Free the wagner matrix and return the result */
486 if( cA=='*' ){
487 res = m[1];
488 for(xB=1; xB<=nB; xB++){
489 if( m[xB]<res ){
490 res = m[xB];
491 if( pnMatch ) *pnMatch = xB+nMatch;
492 }
493 }
494 }else{
495 res = m[nB];
496 /* In the current implementation, pnMatch is always NULL if zA does
497 ** not end in "*" */
498 assert( pnMatch==0 );
499 }
500 sqlite3_free(toFree);
501 return res;
502 }
503
504 /*
505 ** Function: editdist(A,B)
506 **
507 ** Return the cost of transforming string A into string B. Both strings
508 ** must be pure ASCII text. If A ends with '*' then it is assumed to be
509 ** a prefix of B and extra characters on the end of B have minimal additional
510 ** cost.
511 */
512 static void editdistSqlFunc(
513 sqlite3_context *context,
514 int argc,
515 sqlite3_value **argv
516 ){
517 int res = editdist1(
518 (const char*)sqlite3_value_text(argv[0]),
519 (const char*)sqlite3_value_text(argv[1]),
520 0);
521 if( res<0 ){
522 if( res==(-3) ){
523 sqlite3_result_error_nomem(context);
524 }else if( res==(-2) ){
525 sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
526 }else{
527 sqlite3_result_error(context, "NULL input to editdist()", -1);
528 }
529 }else{
530 sqlite3_result_int(context, res);
531 }
532 }
533
534 /* End of the fixed-cost edit distance implementation
535 ******************************************************************************
536 *****************************************************************************
537 ** Begin: Configurable cost unicode edit distance routines
538 */
539 /* Forward declaration of structures */
540 typedef struct EditDist3Cost EditDist3Cost;
541 typedef struct EditDist3Config EditDist3Config;
542 typedef struct EditDist3Point EditDist3Point;
543 typedef struct EditDist3From EditDist3From;
544 typedef struct EditDist3FromString EditDist3FromString;
545 typedef struct EditDist3To EditDist3To;
546 typedef struct EditDist3ToString EditDist3ToString;
547 typedef struct EditDist3Lang EditDist3Lang;
548
549
550 /*
551 ** An entry in the edit cost table
552 */
553 struct EditDist3Cost {
554 EditDist3Cost *pNext; /* Next cost element */
555 u8 nFrom; /* Number of bytes in aFrom */
556 u8 nTo; /* Number of bytes in aTo */
557 u16 iCost; /* Cost of this transformation */
558 char a[4] ; /* FROM string followed by TO string */
559 /* Additional TO and FROM string bytes appended as necessary */
560 };
561
562 /*
563 ** Edit costs for a particular language ID
564 */
565 struct EditDist3Lang {
566 int iLang; /* Language ID */
567 int iInsCost; /* Default insertion cost */
568 int iDelCost; /* Default deletion cost */
569 int iSubCost; /* Default substitution cost */
570 EditDist3Cost *pCost; /* Costs */
571 };
572
573
574 /*
575 ** The default EditDist3Lang object, with default costs.
576 */
577 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
578
579 /*
580 ** Complete configuration
581 */
582 struct EditDist3Config {
583 int nLang; /* Number of language IDs. Size of a[] */
584 EditDist3Lang *a; /* One for each distinct language ID */
585 };
586
587 /*
588 ** Extra information about each character in the FROM string.
589 */
590 struct EditDist3From {
591 int nSubst; /* Number of substitution cost entries */
592 int nDel; /* Number of deletion cost entries */
593 int nByte; /* Number of bytes in this character */
594 EditDist3Cost **apSubst; /* Array of substitution costs for this element */
595 EditDist3Cost **apDel; /* Array of deletion cost entries */
596 };
597
598 /*
599 ** A precompiled FROM string.
600 *
601 ** In the common case we expect the FROM string to be reused multiple times.
602 ** In other words, the common case will be to measure the edit distance
603 ** from a single origin string to multiple target strings.
604 */
605 struct EditDist3FromString {
606 char *z; /* The complete text of the FROM string */
607 int n; /* Number of characters in the FROM string */
608 int isPrefix; /* True if ends with '*' character */
609 EditDist3From *a; /* Extra info about each char of the FROM string */
610 };
611
612 /*
613 ** Extra information about each character in the TO string.
614 */
615 struct EditDist3To {
616 int nIns; /* Number of insertion cost entries */
617 int nByte; /* Number of bytes in this character */
618 EditDist3Cost **apIns; /* Array of deletion cost entries */
619 };
620
621 /*
622 ** A precompiled FROM string
623 */
624 struct EditDist3ToString {
625 char *z; /* The complete text of the TO string */
626 int n; /* Number of characters in the TO string */
627 EditDist3To *a; /* Extra info about each char of the TO string */
628 };
629
630 /*
631 ** Clear or delete an instance of the object that records all edit-distance
632 ** weights.
633 */
634 static void editDist3ConfigClear(EditDist3Config *p){
635 int i;
636 if( p==0 ) return;
637 for(i=0; i<p->nLang; i++){
638 EditDist3Cost *pCost, *pNext;
639 pCost = p->a[i].pCost;
640 while( pCost ){
641 pNext = pCost->pNext;
642 sqlite3_free(pCost);
643 pCost = pNext;
644 }
645 }
646 sqlite3_free(p->a);
647 memset(p, 0, sizeof(*p));
648 }
649 static void editDist3ConfigDelete(void *pIn){
650 EditDist3Config *p = (EditDist3Config*)pIn;
651 editDist3ConfigClear(p);
652 sqlite3_free(p);
653 }
654
655 /*
656 ** Load all edit-distance weights from a table.
657 */
658 static int editDist3ConfigLoad(
659 EditDist3Config *p, /* The edit distance configuration to load */
660 sqlite3 *db, /* Load from this database */
661 const char *zTable /* Name of the table from which to load */
662 ){
663 sqlite3_stmt *pStmt;
664 int rc, rc2;
665 char *zSql;
666 int iLangPrev = -9999;
667 EditDist3Lang *pLang = 0;
668
669 zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
670 " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
671 if( zSql==0 ) return SQLITE_NOMEM;
672 rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
673 sqlite3_free(zSql);
674 if( rc ) return rc;
675 editDist3ConfigClear(p);
676 while( sqlite3_step(pStmt)==SQLITE_ROW ){
677 int iLang = sqlite3_column_int(pStmt, 0);
678 const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
679 int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
680 const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
681 int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
682 int iCost = sqlite3_column_int(pStmt, 3);
683
684 assert( zFrom!=0 || nFrom==0 );
685 assert( zTo!=0 || nTo==0 );
686 if( nFrom>100 || nTo>100 ) continue;
687 if( iCost<0 ) continue;
688 if( pLang==0 || iLang!=iLangPrev ){
689 EditDist3Lang *pNew;
690 pNew = sqlite3_realloc64(p->a, (p->nLang+1)*sizeof(p->a[0]));
691 if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
692 p->a = pNew;
693 pLang = &p->a[p->nLang];
694 p->nLang++;
695 pLang->iLang = iLang;
696 pLang->iInsCost = 100;
697 pLang->iDelCost = 100;
698 pLang->iSubCost = 150;
699 pLang->pCost = 0;
700 iLangPrev = iLang;
701 }
702 if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
703 pLang->iDelCost = iCost;
704 }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
705 pLang->iInsCost = iCost;
706 }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){
707 pLang->iSubCost = iCost;
708 }else{
709 EditDist3Cost *pCost;
710 int nExtra = nFrom + nTo - 4;
711 if( nExtra<0 ) nExtra = 0;
712 pCost = sqlite3_malloc64( sizeof(*pCost) + nExtra );
713 if( pCost==0 ){ rc = SQLITE_NOMEM; break; }
714 pCost->nFrom = (u8)nFrom;
715 pCost->nTo = (u8)nTo;
716 pCost->iCost = (u16)iCost;
717 memcpy(pCost->a, zFrom, nFrom);
718 memcpy(pCost->a + nFrom, zTo, nTo);
719 pCost->pNext = pLang->pCost;
720 pLang->pCost = pCost;
721 }
722 }
723 rc2 = sqlite3_finalize(pStmt);
724 if( rc==SQLITE_OK ) rc = rc2;
725 return rc;
726 }
727
728 /*
729 ** Return the length (in bytes) of a utf-8 character. Or return a maximum
730 ** of N.
731 */
732 static int utf8Len(unsigned char c, int N){
733 int len = 1;
734 if( c>0x7f ){
735 if( (c&0xe0)==0xc0 ){
736 len = 2;
737 }else if( (c&0xf0)==0xe0 ){
738 len = 3;
739 }else{
740 len = 4;
741 }
742 }
743 if( len>N ) len = N;
744 return len;
745 }
746
747 /*
748 ** Return TRUE (non-zero) if the To side of the given cost matches
749 ** the given string.
750 */
751 static int matchTo(EditDist3Cost *p, const char *z, int n){
752 if( p->nTo>n ) return 0;
753 if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
754 return 1;
755 }
756
757 /*
758 ** Return TRUE (non-zero) if the From side of the given cost matches
759 ** the given string.
760 */
761 static int matchFrom(EditDist3Cost *p, const char *z, int n){
762 assert( p->nFrom<=n );
763 if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
764 return 1;
765 }
766
767 /*
768 ** Return TRUE (non-zero) of the next FROM character and the next TO
769 ** character are the same.
770 */
771 static int matchFromTo(
772 EditDist3FromString *pStr, /* Left hand string */
773 int n1, /* Index of comparison character on the left */
774 const char *z2, /* Right-handl comparison character */
775 int n2 /* Bytes remaining in z2[] */
776 ){
777 int b1 = pStr->a[n1].nByte;
778 if( b1>n2 ) return 0;
779 if( memcmp(pStr->z+n1, z2, b1)!=0 ) return 0;
780 return 1;
781 }
782
783 /*
784 ** Delete an EditDist3FromString objecct
785 */
786 static void editDist3FromStringDelete(EditDist3FromString *p){
787 int i;
788 if( p ){
789 for(i=0; i<p->n; i++){
790 sqlite3_free(p->a[i].apDel);
791 sqlite3_free(p->a[i].apSubst);
792 }
793 sqlite3_free(p);
794 }
795 }
796
797 /*
798 ** Create a EditDist3FromString object.
799 */
800 static EditDist3FromString *editDist3FromStringNew(
801 const EditDist3Lang *pLang,
802 const char *z,
803 int n
804 ){
805 EditDist3FromString *pStr;
806 EditDist3Cost *p;
807 int i;
808
809 if( z==0 ) return 0;
810 if( n<0 ) n = (int)strlen(z);
811 pStr = sqlite3_malloc64( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
812 if( pStr==0 ) return 0;
813 pStr->a = (EditDist3From*)&pStr[1];
814 memset(pStr->a, 0, sizeof(pStr->a[0])*n);
815 pStr->n = n;
816 pStr->z = (char*)&pStr->a[n];
817 memcpy(pStr->z, z, n+1);
818 if( n && z[n-1]=='*' ){
819 pStr->isPrefix = 1;
820 n--;
821 pStr->n--;
822 pStr->z[n] = 0;
823 }else{
824 pStr->isPrefix = 0;
825 }
826
827 for(i=0; i<n; i++){
828 EditDist3From *pFrom = &pStr->a[i];
829 memset(pFrom, 0, sizeof(*pFrom));
830 pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
831 for(p=pLang->pCost; p; p=p->pNext){
832 EditDist3Cost **apNew;
833 if( i+p->nFrom>n ) continue;
834 if( matchFrom(p, z+i, n-i)==0 ) continue;
835 if( p->nTo==0 ){
836 apNew = sqlite3_realloc64(pFrom->apDel,
837 sizeof(*apNew)*(pFrom->nDel+1));
838 if( apNew==0 ) break;
839 pFrom->apDel = apNew;
840 apNew[pFrom->nDel++] = p;
841 }else{
842 apNew = sqlite3_realloc64(pFrom->apSubst,
843 sizeof(*apNew)*(pFrom->nSubst+1));
844 if( apNew==0 ) break;
845 pFrom->apSubst = apNew;
846 apNew[pFrom->nSubst++] = p;
847 }
848 }
849 if( p ){
850 editDist3FromStringDelete(pStr);
851 pStr = 0;
852 break;
853 }
854 }
855 return pStr;
856 }
857
858 /*
859 ** Update entry m[i] such that it is the minimum of its current value
860 ** and m[j]+iCost.
861 **
862 ** If the iCost is 1,000,000 or greater, then consider the cost to be
863 ** infinite and skip the update.
864 */
865 static void updateCost(
866 unsigned int *m,
867 int i,
868 int j,
869 int iCost
870 ){
871 assert( iCost>=0 );
872 if( iCost<10000 ){
873 unsigned int b = m[j] + iCost;
874 if( b<m[i] ) m[i] = b;
875 }
876 }
877
878 /*
879 ** How much stack space (int bytes) to use for Wagner matrix in
880 ** editDist3Core(). If more space than this is required, the entire
881 ** matrix is taken from the heap. To reduce the load on the memory
882 ** allocator, make this value as large as practical for the
883 ** architecture in use.
884 */
885 #ifndef SQLITE_SPELLFIX_STACKALLOC_SZ
886 # define SQLITE_SPELLFIX_STACKALLOC_SZ (1024)
887 #endif
888
889 /* Compute the edit distance between two strings.
890 **
891 ** If an error occurs, return a negative number which is the error code.
892 **
893 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
894 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
895 ** not contain the pattern for a prefix-search, then this is always the number
896 ** of characters in z2. If pFrom does contain a prefix search pattern, then
897 ** it is the number of characters in the prefix of z2 that was deemed to
898 ** match pFrom.
899 */
900 static int editDist3Core(
901 EditDist3FromString *pFrom, /* The FROM string */
902 const char *z2, /* The TO string */
903 int n2, /* Length of the TO string */
904 const EditDist3Lang *pLang, /* Edit weights for a particular language ID */
905 int *pnMatch /* OUT: Characters in matched prefix */
906 ){
907 int k, n;
908 int i1, b1;
909 int i2, b2;
910 EditDist3FromString f = *pFrom;
911 EditDist3To *a2;
912 unsigned int *m;
913 unsigned int *pToFree;
914 int szRow;
915 EditDist3Cost *p;
916 int res;
917 sqlite3_uint64 nByte;
918 unsigned int stackSpace[SQLITE_SPELLFIX_STACKALLOC_SZ/sizeof(unsigned int)];
919
920 /* allocate the Wagner matrix and the aTo[] array for the TO string */
921 n = (f.n+1)*(n2+1);
922 n = (n+1)&~1;
923 nByte = n*sizeof(m[0]) + sizeof(a2[0])*n2;
924 if( nByte<=sizeof(stackSpace) ){
925 m = stackSpace;
926 pToFree = 0;
927 }else{
928 m = pToFree = sqlite3_malloc64( nByte );
929 if( m==0 ) return -1; /* Out of memory */
930 }
931 a2 = (EditDist3To*)&m[n];
932 memset(a2, 0, sizeof(a2[0])*n2);
933
934 /* Fill in the a1[] matrix for all characters of the TO string */
935 for(i2=0; i2<n2; i2++){
936 a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
937 for(p=pLang->pCost; p; p=p->pNext){
938 EditDist3Cost **apNew;
939 if( p->nFrom>0 ) continue;
940 if( i2+p->nTo>n2 ) continue;
941 if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
942 a2[i2].nIns++;
943 apNew = sqlite3_realloc64(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
944 if( apNew==0 ){
945 res = -1; /* Out of memory */
946 goto editDist3Abort;
947 }
948 a2[i2].apIns = apNew;
949 a2[i2].apIns[a2[i2].nIns-1] = p;
950 }
951 }
952
953 /* Prepare to compute the minimum edit distance */
954 szRow = f.n+1;
955 memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
956 m[0] = 0;
957
958 /* First fill in the top-row of the matrix with FROM deletion costs */
959 for(i1=0; i1<f.n; i1 += b1){
960 b1 = f.a[i1].nByte;
961 updateCost(m, i1+b1, i1, pLang->iDelCost);
962 for(k=0; k<f.a[i1].nDel; k++){
963 p = f.a[i1].apDel[k];
964 updateCost(m, i1+p->nFrom, i1, p->iCost);
965 }
966 }
967
968 /* Fill in all subsequent rows, top-to-bottom, left-to-right */
969 for(i2=0; i2<n2; i2 += b2){
970 int rx; /* Starting index for current row */
971 int rxp; /* Starting index for previous row */
972 b2 = a2[i2].nByte;
973 rx = szRow*(i2+b2);
974 rxp = szRow*i2;
975 updateCost(m, rx, rxp, pLang->iInsCost);
976 for(k=0; k<a2[i2].nIns; k++){
977 p = a2[i2].apIns[k];
978 updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
979 }
980 for(i1=0; i1<f.n; i1+=b1){
981 int cx; /* Index of current cell */
982 int cxp; /* Index of cell immediately to the left */
983 int cxd; /* Index of cell to the left and one row above */
984 int cxu; /* Index of cell immediately above */
985 b1 = f.a[i1].nByte;
986 cxp = rx + i1;
987 cx = cxp + b1;
988 cxd = rxp + i1;
989 cxu = cxd + b1;
990 updateCost(m, cx, cxp, pLang->iDelCost);
991 for(k=0; k<f.a[i1].nDel; k++){
992 p = f.a[i1].apDel[k];
993 updateCost(m, cxp+p->nFrom, cxp, p->iCost);
994 }
995 updateCost(m, cx, cxu, pLang->iInsCost);
996 if( matchFromTo(&f, i1, z2+i2, n2-i2) ){
997 updateCost(m, cx, cxd, 0);
998 }
999 updateCost(m, cx, cxd, pLang->iSubCost);
1000 for(k=0; k<f.a[i1].nSubst; k++){
1001 p = f.a[i1].apSubst[k];
1002 if( matchTo(p, z2+i2, n2-i2) ){
1003 updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
1004 }
1005 }
1006 }
1007 }
1008
1009 #if 0 /* Enable for debugging */
1010 printf(" ^");
1011 for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
1012 printf("\n ^:");
1013 for(i1=0; i1<szRow; i1++){
1014 int v = m[i1];
1015 if( v>9999 ) printf(" ****");
1016 else printf(" %4d", v);
1017 }
1018 printf("\n");
1019 for(i2=0; i2<n2; i2++){
1020 printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1021 for(i1=0; i1<szRow; i1++){
1022 int v = m[(i2+1)*szRow+i1];
1023 if( v>9999 ) printf(" ****");
1024 else printf(" %4d", v);
1025 }
1026 printf("\n");
1027 }
1028 #endif
1029
1030 /* Free memory allocations and return the result */
1031 res = (int)m[szRow*(n2+1)-1];
1032 n = n2;
1033 if( f.isPrefix ){
1034 for(i2=1; i2<=n2; i2++){
1035 int b = m[szRow*i2-1];
1036 if( b<=res ){
1037 res = b;
1038 n = i2 - 1;
1039 }
1040 }
1041 }
1042 if( pnMatch ){
1043 int nExtra = 0;
1044 for(k=0; k<n; k++){
1045 if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1046 }
1047 *pnMatch = n - nExtra;
1048 }
1049
1050 editDist3Abort:
1051 for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1052 sqlite3_free(pToFree);
1053 return res;
1054 }
1055
1056 /*
1057 ** Get an appropriate EditDist3Lang object.
1058 */
1059 static const EditDist3Lang *editDist3FindLang(
1060 EditDist3Config *pConfig,
1061 int iLang
1062 ){
1063 int i;
1064 for(i=0; i<pConfig->nLang; i++){
1065 if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1066 }
1067 return &editDist3Lang;
1068 }
1069
1070 /*
1071 ** Function: editdist3(A,B,iLang)
1072 ** editdist3(tablename)
1073 **
1074 ** Return the cost of transforming string A into string B using edit
1075 ** weights for iLang.
1076 **
1077 ** The second form loads edit weights into memory from a table.
1078 */
1079 static void editDist3SqlFunc(
1080 sqlite3_context *context,
1081 int argc,
1082 sqlite3_value **argv
1083 ){
1084 EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1085 sqlite3 *db = sqlite3_context_db_handle(context);
1086 int rc;
1087 if( argc==1 ){
1088 const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1089 rc = editDist3ConfigLoad(pConfig, db, zTable);
1090 if( rc ) sqlite3_result_error_code(context, rc);
1091 }else{
1092 const char *zA = (const char*)sqlite3_value_text(argv[0]);
1093 const char *zB = (const char*)sqlite3_value_text(argv[1]);
1094 int nA = sqlite3_value_bytes(argv[0]);
1095 int nB = sqlite3_value_bytes(argv[1]);
1096 int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1097 const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1098 EditDist3FromString *pFrom;
1099 int dist;
1100
1101 pFrom = editDist3FromStringNew(pLang, zA, nA);
1102 if( pFrom==0 ){
1103 sqlite3_result_error_nomem(context);
1104 return;
1105 }
1106 dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1107 editDist3FromStringDelete(pFrom);
1108 if( dist==(-1) ){
1109 sqlite3_result_error_nomem(context);
1110 }else{
1111 sqlite3_result_int(context, dist);
1112 }
1113 }
1114 }
1115
1116 /*
1117 ** Register the editDist3 function with SQLite
1118 */
1119 static int editDist3Install(sqlite3 *db){
1120 int rc;
1121 EditDist3Config *pConfig = sqlite3_malloc64( sizeof(*pConfig) );
1122 if( pConfig==0 ) return SQLITE_NOMEM;
1123 memset(pConfig, 0, sizeof(*pConfig));
1124 rc = sqlite3_create_function_v2(db, "editdist3",
1125 2, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1126 if( rc==SQLITE_OK ){
1127 rc = sqlite3_create_function_v2(db, "editdist3",
1128 3, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1129 }
1130 if( rc==SQLITE_OK ){
1131 rc = sqlite3_create_function_v2(db, "editdist3",
1132 1, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0,
1133 editDist3ConfigDelete);
1134 }else{
1135 sqlite3_free(pConfig);
1136 }
1137 return rc;
1138 }
1139 /* End configurable cost unicode edit distance routines
1140 ******************************************************************************
1141 ******************************************************************************
1142 ** Begin transliterate unicode-to-ascii implementation
1143 */
1144
1145 #if !SQLITE_AMALGAMATION
1146 /*
1147 ** This lookup table is used to help decode the first byte of
1148 ** a multi-byte UTF8 character.
1149 */
1150 static const unsigned char sqlite3Utf8Trans1[] = {
1151 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1152 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1153 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1154 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1155 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1156 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1157 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1158 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1159 };
1160 #endif
1161
1162 /*
1163 ** Return the value of the first UTF-8 character in the string.
1164 */
1165 static int utf8Read(const unsigned char *z, int n, int *pSize){
1166 int c, i;
1167
1168 /* All callers to this routine (in the current implementation)
1169 ** always have n>0. */
1170 if( NEVER(n==0) ){
1171 c = i = 0;
1172 }else{
1173 c = z[0];
1174 i = 1;
1175 if( c>=0xc0 ){
1176 c = sqlite3Utf8Trans1[c-0xc0];
1177 while( i<n && (z[i] & 0xc0)==0x80 ){
1178 c = (c<<6) + (0x3f & z[i++]);
1179 }
1180 }
1181 }
1182 *pSize = i;
1183 return c;
1184 }
1185
1186 /*
1187 ** Return the number of characters in the utf-8 string in the nIn byte
1188 ** buffer pointed to by zIn.
1189 */
1190 static int utf8Charlen(const char *zIn, int nIn){
1191 int i;
1192 int nChar = 0;
1193 for(i=0; i<nIn; nChar++){
1194 int sz;
1195 utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1196 i += sz;
1197 }
1198 return nChar;
1199 }
1200
1201 /*
1202 ** Table of translations from unicode characters into ASCII.
1203 */
1204 static const struct {
1205 unsigned short int cFrom;
1206 unsigned char cTo0, cTo1;
1207 } translit[] = {
1208 { 0x00A0, 0x20, 0x00 }, /*   to */
1209 { 0x00B5, 0x75, 0x00 }, /* µ to u */
1210 { 0x00C0, 0x41, 0x00 }, /* À to A */
1211 { 0x00C1, 0x41, 0x00 }, /* Á to A */
1212 { 0x00C2, 0x41, 0x00 }, /* Â to A */
1213 { 0x00C3, 0x41, 0x00 }, /* Ã to A */
1214 { 0x00C4, 0x41, 0x65 }, /* Ä to Ae */
1215 { 0x00C5, 0x41, 0x61 }, /* Å to Aa */
1216 { 0x00C6, 0x41, 0x45 }, /* Æ to AE */
1217 { 0x00C7, 0x43, 0x00 }, /* Ç to C */
1218 { 0x00C8, 0x45, 0x00 }, /* È to E */
1219 { 0x00C9, 0x45, 0x00 }, /* É to E */
1220 { 0x00CA, 0x45, 0x00 }, /* Ê to E */
1221 { 0x00CB, 0x45, 0x00 }, /* Ë to E */
1222 { 0x00CC, 0x49, 0x00 }, /* Ì to I */
1223 { 0x00CD, 0x49, 0x00 }, /* Í to I */
1224 { 0x00CE, 0x49, 0x00 }, /* Î to I */
1225 { 0x00CF, 0x49, 0x00 }, /* Ï to I */
1226 { 0x00D0, 0x44, 0x00 }, /* Ð to D */
1227 { 0x00D1, 0x4E, 0x00 }, /* Ñ to N */
1228 { 0x00D2, 0x4F, 0x00 }, /* Ò to O */
1229 { 0x00D3, 0x4F, 0x00 }, /* Ó to O */
1230 { 0x00D4, 0x4F, 0x00 }, /* Ô to O */
1231 { 0x00D5, 0x4F, 0x00 }, /* Õ to O */
1232 { 0x00D6, 0x4F, 0x65 }, /* Ö to Oe */
1233 { 0x00D7, 0x78, 0x00 }, /* × to x */
1234 { 0x00D8, 0x4F, 0x00 }, /* Ø to O */
1235 { 0x00D9, 0x55, 0x00 }, /* Ù to U */
1236 { 0x00DA, 0x55, 0x00 }, /* Ú to U */
1237 { 0x00DB, 0x55, 0x00 }, /* Û to U */
1238 { 0x00DC, 0x55, 0x65 }, /* Ü to Ue */
1239 { 0x00DD, 0x59, 0x00 }, /* Ý to Y */
1240 { 0x00DE, 0x54, 0x68 }, /* Þ to Th */
1241 { 0x00DF, 0x73, 0x73 }, /* ß to ss */
1242 { 0x00E0, 0x61, 0x00 }, /* à to a */
1243 { 0x00E1, 0x61, 0x00 }, /* á to a */
1244 { 0x00E2, 0x61, 0x00 }, /* â to a */
1245 { 0x00E3, 0x61, 0x00 }, /* ã to a */
1246 { 0x00E4, 0x61, 0x65 }, /* ä to ae */
1247 { 0x00E5, 0x61, 0x61 }, /* å to aa */
1248 { 0x00E6, 0x61, 0x65 }, /* æ to ae */
1249 { 0x00E7, 0x63, 0x00 }, /* ç to c */
1250 { 0x00E8, 0x65, 0x00 }, /* è to e */
1251 { 0x00E9, 0x65, 0x00 }, /* é to e */
1252 { 0x00EA, 0x65, 0x00 }, /* ê to e */
1253 { 0x00EB, 0x65, 0x00 }, /* ë to e */
1254 { 0x00EC, 0x69, 0x00 }, /* ì to i */
1255 { 0x00ED, 0x69, 0x00 }, /* í to i */
1256 { 0x00EE, 0x69, 0x00 }, /* î to i */
1257 { 0x00EF, 0x69, 0x00 }, /* ï to i */
1258 { 0x00F0, 0x64, 0x00 }, /* ð to d */
1259 { 0x00F1, 0x6E, 0x00 }, /* ñ to n */
1260 { 0x00F2, 0x6F, 0x00 }, /* ò to o */
1261 { 0x00F3, 0x6F, 0x00 }, /* ó to o */
1262 { 0x00F4, 0x6F, 0x00 }, /* ô to o */
1263 { 0x00F5, 0x6F, 0x00 }, /* õ to o */
1264 { 0x00F6, 0x6F, 0x65 }, /* ö to oe */
1265 { 0x00F7, 0x3A, 0x00 }, /* ÷ to : */
1266 { 0x00F8, 0x6F, 0x00 }, /* ø to o */
1267 { 0x00F9, 0x75, 0x00 }, /* ù to u */
1268 { 0x00FA, 0x75, 0x00 }, /* ú to u */
1269 { 0x00FB, 0x75, 0x00 }, /* û to u */
1270 { 0x00FC, 0x75, 0x65 }, /* ü to ue */
1271 { 0x00FD, 0x79, 0x00 }, /* ý to y */
1272 { 0x00FE, 0x74, 0x68 }, /* þ to th */
1273 { 0x00FF, 0x79, 0x00 }, /* ÿ to y */
1274 { 0x0100, 0x41, 0x00 }, /* Ā to A */
1275 { 0x0101, 0x61, 0x00 }, /* ā to a */
1276 { 0x0102, 0x41, 0x00 }, /* Ă to A */
1277 { 0x0103, 0x61, 0x00 }, /* ă to a */
1278 { 0x0104, 0x41, 0x00 }, /* Ą to A */
1279 { 0x0105, 0x61, 0x00 }, /* ą to a */
1280 { 0x0106, 0x43, 0x00 }, /* Ć to C */
1281 { 0x0107, 0x63, 0x00 }, /* ć to c */
1282 { 0x0108, 0x43, 0x68 }, /* Ĉ to Ch */
1283 { 0x0109, 0x63, 0x68 }, /* ĉ to ch */
1284 { 0x010A, 0x43, 0x00 }, /* Ċ to C */
1285 { 0x010B, 0x63, 0x00 }, /* ċ to c */
1286 { 0x010C, 0x43, 0x00 }, /* Č to C */
1287 { 0x010D, 0x63, 0x00 }, /* č to c */
1288 { 0x010E, 0x44, 0x00 }, /* Ď to D */
1289 { 0x010F, 0x64, 0x00 }, /* ď to d */
1290 { 0x0110, 0x44, 0x00 }, /* Đ to D */
1291 { 0x0111, 0x64, 0x00 }, /* đ to d */
1292 { 0x0112, 0x45, 0x00 }, /* Ē to E */
1293 { 0x0113, 0x65, 0x00 }, /* ē to e */
1294 { 0x0114, 0x45, 0x00 }, /* Ĕ to E */
1295 { 0x0115, 0x65, 0x00 }, /* ĕ to e */
1296 { 0x0116, 0x45, 0x00 }, /* Ė to E */
1297 { 0x0117, 0x65, 0x00 }, /* ė to e */
1298 { 0x0118, 0x45, 0x00 }, /* Ę to E */
1299 { 0x0119, 0x65, 0x00 }, /* ę to e */
1300 { 0x011A, 0x45, 0x00 }, /* Ě to E */
1301 { 0x011B, 0x65, 0x00 }, /* ě to e */
1302 { 0x011C, 0x47, 0x68 }, /* Ĝ to Gh */
1303 { 0x011D, 0x67, 0x68 }, /* ĝ to gh */
1304 { 0x011E, 0x47, 0x00 }, /* Ğ to G */
1305 { 0x011F, 0x67, 0x00 }, /* ğ to g */
1306 { 0x0120, 0x47, 0x00 }, /* Ġ to G */
1307 { 0x0121, 0x67, 0x00 }, /* ġ to g */
1308 { 0x0122, 0x47, 0x00 }, /* Ģ to G */
1309 { 0x0123, 0x67, 0x00 }, /* ģ to g */
1310 { 0x0124, 0x48, 0x68 }, /* Ĥ to Hh */
1311 { 0x0125, 0x68, 0x68 }, /* ĥ to hh */
1312 { 0x0126, 0x48, 0x00 }, /* Ħ to H */
1313 { 0x0127, 0x68, 0x00 }, /* ħ to h */
1314 { 0x0128, 0x49, 0x00 }, /* Ĩ to I */
1315 { 0x0129, 0x69, 0x00 }, /* ĩ to i */
1316 { 0x012A, 0x49, 0x00 }, /* Ī to I */
1317 { 0x012B, 0x69, 0x00 }, /* ī to i */
1318 { 0x012C, 0x49, 0x00 }, /* Ĭ to I */
1319 { 0x012D, 0x69, 0x00 }, /* ĭ to i */
1320 { 0x012E, 0x49, 0x00 }, /* Į to I */
1321 { 0x012F, 0x69, 0x00 }, /* į to i */
1322 { 0x0130, 0x49, 0x00 }, /* İ to I */
1323 { 0x0131, 0x69, 0x00 }, /* ı to i */
1324 { 0x0132, 0x49, 0x4A }, /* IJ to IJ */
1325 { 0x0133, 0x69, 0x6A }, /* ij to ij */
1326 { 0x0134, 0x4A, 0x68 }, /* Ĵ to Jh */
1327 { 0x0135, 0x6A, 0x68 }, /* ĵ to jh */
1328 { 0x0136, 0x4B, 0x00 }, /* Ķ to K */
1329 { 0x0137, 0x6B, 0x00 }, /* ķ to k */
1330 { 0x0138, 0x6B, 0x00 }, /* ĸ to k */
1331 { 0x0139, 0x4C, 0x00 }, /* Ĺ to L */
1332 { 0x013A, 0x6C, 0x00 }, /* ĺ to l */
1333 { 0x013B, 0x4C, 0x00 }, /* Ļ to L */
1334 { 0x013C, 0x6C, 0x00 }, /* ļ to l */
1335 { 0x013D, 0x4C, 0x00 }, /* Ľ to L */
1336 { 0x013E, 0x6C, 0x00 }, /* ľ to l */
1337 { 0x013F, 0x4C, 0x2E }, /* Ŀ to L. */
1338 { 0x0140, 0x6C, 0x2E }, /* ŀ to l. */
1339 { 0x0141, 0x4C, 0x00 }, /* Ł to L */
1340 { 0x0142, 0x6C, 0x00 }, /* ł to l */
1341 { 0x0143, 0x4E, 0x00 }, /* Ń to N */
1342 { 0x0144, 0x6E, 0x00 }, /* ń to n */
1343 { 0x0145, 0x4E, 0x00 }, /* Ņ to N */
1344 { 0x0146, 0x6E, 0x00 }, /* ņ to n */
1345 { 0x0147, 0x4E, 0x00 }, /* Ň to N */
1346 { 0x0148, 0x6E, 0x00 }, /* ň to n */
1347 { 0x0149, 0x27, 0x6E }, /* ʼn to 'n */
1348 { 0x014A, 0x4E, 0x47 }, /* Ŋ to NG */
1349 { 0x014B, 0x6E, 0x67 }, /* ŋ to ng */
1350 { 0x014C, 0x4F, 0x00 }, /* Ō to O */
1351 { 0x014D, 0x6F, 0x00 }, /* ō to o */
1352 { 0x014E, 0x4F, 0x00 }, /* Ŏ to O */
1353 { 0x014F, 0x6F, 0x00 }, /* ŏ to o */
1354 { 0x0150, 0x4F, 0x00 }, /* Ő to O */
1355 { 0x0151, 0x6F, 0x00 }, /* ő to o */
1356 { 0x0152, 0x4F, 0x45 }, /* Œ to OE */
1357 { 0x0153, 0x6F, 0x65 }, /* œ to oe */
1358 { 0x0154, 0x52, 0x00 }, /* Ŕ to R */
1359 { 0x0155, 0x72, 0x00 }, /* ŕ to r */
1360 { 0x0156, 0x52, 0x00 }, /* Ŗ to R */
1361 { 0x0157, 0x72, 0x00 }, /* ŗ to r */
1362 { 0x0158, 0x52, 0x00 }, /* Ř to R */
1363 { 0x0159, 0x72, 0x00 }, /* ř to r */
1364 { 0x015A, 0x53, 0x00 }, /* Ś to S */
1365 { 0x015B, 0x73, 0x00 }, /* ś to s */
1366 { 0x015C, 0x53, 0x68 }, /* Ŝ to Sh */
1367 { 0x015D, 0x73, 0x68 }, /* ŝ to sh */
1368 { 0x015E, 0x53, 0x00 }, /* Ş to S */
1369 { 0x015F, 0x73, 0x00 }, /* ş to s */
1370 { 0x0160, 0x53, 0x00 }, /* Š to S */
1371 { 0x0161, 0x73, 0x00 }, /* š to s */
1372 { 0x0162, 0x54, 0x00 }, /* Ţ to T */
1373 { 0x0163, 0x74, 0x00 }, /* ţ to t */
1374 { 0x0164, 0x54, 0x00 }, /* Ť to T */
1375 { 0x0165, 0x74, 0x00 }, /* ť to t */
1376 { 0x0166, 0x54, 0x00 }, /* Ŧ to T */
1377 { 0x0167, 0x74, 0x00 }, /* ŧ to t */
1378 { 0x0168, 0x55, 0x00 }, /* Ũ to U */
1379 { 0x0169, 0x75, 0x00 }, /* ũ to u */
1380 { 0x016A, 0x55, 0x00 }, /* Ū to U */
1381 { 0x016B, 0x75, 0x00 }, /* ū to u */
1382 { 0x016C, 0x55, 0x00 }, /* Ŭ to U */
1383 { 0x016D, 0x75, 0x00 }, /* ŭ to u */
1384 { 0x016E, 0x55, 0x00 }, /* Ů to U */
1385 { 0x016F, 0x75, 0x00 }, /* ů to u */
1386 { 0x0170, 0x55, 0x00 }, /* Ű to U */
1387 { 0x0171, 0x75, 0x00 }, /* ű to u */
1388 { 0x0172, 0x55, 0x00 }, /* Ų to U */
1389 { 0x0173, 0x75, 0x00 }, /* ų to u */
1390 { 0x0174, 0x57, 0x00 }, /* Ŵ to W */
1391 { 0x0175, 0x77, 0x00 }, /* ŵ to w */
1392 { 0x0176, 0x59, 0x00 }, /* Ŷ to Y */
1393 { 0x0177, 0x79, 0x00 }, /* ŷ to y */
1394 { 0x0178, 0x59, 0x00 }, /* Ÿ to Y */
1395 { 0x0179, 0x5A, 0x00 }, /* Ź to Z */
1396 { 0x017A, 0x7A, 0x00 }, /* ź to z */
1397 { 0x017B, 0x5A, 0x00 }, /* Ż to Z */
1398 { 0x017C, 0x7A, 0x00 }, /* ż to z */
1399 { 0x017D, 0x5A, 0x00 }, /* Ž to Z */
1400 { 0x017E, 0x7A, 0x00 }, /* ž to z */
1401 { 0x017F, 0x73, 0x00 }, /* ſ to s */
1402 { 0x0192, 0x66, 0x00 }, /* ƒ to f */
1403 { 0x0218, 0x53, 0x00 }, /* Ș to S */
1404 { 0x0219, 0x73, 0x00 }, /* ș to s */
1405 { 0x021A, 0x54, 0x00 }, /* Ț to T */
1406 { 0x021B, 0x74, 0x00 }, /* ț to t */
1407 { 0x0386, 0x41, 0x00 }, /* Ά to A */
1408 { 0x0388, 0x45, 0x00 }, /* Έ to E */
1409 { 0x0389, 0x49, 0x00 }, /* Ή to I */
1410 { 0x038A, 0x49, 0x00 }, /* Ί to I */
1411 { 0x038C, 0x4f, 0x00 }, /* Ό to O */
1412 { 0x038E, 0x59, 0x00 }, /* Ύ to Y */
1413 { 0x038F, 0x4f, 0x00 }, /* Ώ to O */
1414 { 0x0390, 0x69, 0x00 }, /* ΐ to i */
1415 { 0x0391, 0x41, 0x00 }, /* Α to A */
1416 { 0x0392, 0x42, 0x00 }, /* Β to B */
1417 { 0x0393, 0x47, 0x00 }, /* Γ to G */
1418 { 0x0394, 0x44, 0x00 }, /* Δ to D */
1419 { 0x0395, 0x45, 0x00 }, /* Ε to E */
1420 { 0x0396, 0x5a, 0x00 }, /* Ζ to Z */
1421 { 0x0397, 0x49, 0x00 }, /* Η to I */
1422 { 0x0398, 0x54, 0x68 }, /* Θ to Th */
1423 { 0x0399, 0x49, 0x00 }, /* Ι to I */
1424 { 0x039A, 0x4b, 0x00 }, /* Κ to K */
1425 { 0x039B, 0x4c, 0x00 }, /* Λ to L */
1426 { 0x039C, 0x4d, 0x00 }, /* Μ to M */
1427 { 0x039D, 0x4e, 0x00 }, /* Ν to N */
1428 { 0x039E, 0x58, 0x00 }, /* Ξ to X */
1429 { 0x039F, 0x4f, 0x00 }, /* Ο to O */
1430 { 0x03A0, 0x50, 0x00 }, /* Π to P */
1431 { 0x03A1, 0x52, 0x00 }, /* Ρ to R */
1432 { 0x03A3, 0x53, 0x00 }, /* Σ to S */
1433 { 0x03A4, 0x54, 0x00 }, /* Τ to T */
1434 { 0x03A5, 0x59, 0x00 }, /* Υ to Y */
1435 { 0x03A6, 0x46, 0x00 }, /* Φ to F */
1436 { 0x03A7, 0x43, 0x68 }, /* Χ to Ch */
1437 { 0x03A8, 0x50, 0x73 }, /* Ψ to Ps */
1438 { 0x03A9, 0x4f, 0x00 }, /* Ω to O */
1439 { 0x03AA, 0x49, 0x00 }, /* Ϊ to I */
1440 { 0x03AB, 0x59, 0x00 }, /* Ϋ to Y */
1441 { 0x03AC, 0x61, 0x00 }, /* ά to a */
1442 { 0x03AD, 0x65, 0x00 }, /* έ to e */
1443 { 0x03AE, 0x69, 0x00 }, /* ή to i */
1444 { 0x03AF, 0x69, 0x00 }, /* ί to i */
1445 { 0x03B1, 0x61, 0x00 }, /* α to a */
1446 { 0x03B2, 0x62, 0x00 }, /* β to b */
1447 { 0x03B3, 0x67, 0x00 }, /* γ to g */
1448 { 0x03B4, 0x64, 0x00 }, /* δ to d */
1449 { 0x03B5, 0x65, 0x00 }, /* ε to e */
1450 { 0x03B6, 0x7a, 0x00 }, /* ζ to z */
1451 { 0x03B7, 0x69, 0x00 }, /* η to i */
1452 { 0x03B8, 0x74, 0x68 }, /* θ to th */
1453 { 0x03B9, 0x69, 0x00 }, /* ι to i */
1454 { 0x03BA, 0x6b, 0x00 }, /* κ to k */
1455 { 0x03BB, 0x6c, 0x00 }, /* λ to l */
1456 { 0x03BC, 0x6d, 0x00 }, /* μ to m */
1457 { 0x03BD, 0x6e, 0x00 }, /* ν to n */
1458 { 0x03BE, 0x78, 0x00 }, /* ξ to x */
1459 { 0x03BF, 0x6f, 0x00 }, /* ο to o */
1460 { 0x03C0, 0x70, 0x00 }, /* π to p */
1461 { 0x03C1, 0x72, 0x00 }, /* ρ to r */
1462 { 0x03C3, 0x73, 0x00 }, /* σ to s */
1463 { 0x03C4, 0x74, 0x00 }, /* τ to t */
1464 { 0x03C5, 0x79, 0x00 }, /* υ to y */
1465 { 0x03C6, 0x66, 0x00 }, /* φ to f */
1466 { 0x03C7, 0x63, 0x68 }, /* χ to ch */
1467 { 0x03C8, 0x70, 0x73 }, /* ψ to ps */
1468 { 0x03C9, 0x6f, 0x00 }, /* ω to o */
1469 { 0x03CA, 0x69, 0x00 }, /* ϊ to i */
1470 { 0x03CB, 0x79, 0x00 }, /* ϋ to y */
1471 { 0x03CC, 0x6f, 0x00 }, /* ό to o */
1472 { 0x03CD, 0x79, 0x00 }, /* ύ to y */
1473 { 0x03CE, 0x69, 0x00 }, /* ώ to i */
1474 { 0x0400, 0x45, 0x00 }, /* Ѐ to E */
1475 { 0x0401, 0x45, 0x00 }, /* Ё to E */
1476 { 0x0402, 0x44, 0x00 }, /* Ђ to D */
1477 { 0x0403, 0x47, 0x00 }, /* Ѓ to G */
1478 { 0x0404, 0x45, 0x00 }, /* Є to E */
1479 { 0x0405, 0x5a, 0x00 }, /* Ѕ to Z */
1480 { 0x0406, 0x49, 0x00 }, /* І to I */
1481 { 0x0407, 0x49, 0x00 }, /* Ї to I */
1482 { 0x0408, 0x4a, 0x00 }, /* Ј to J */
1483 { 0x0409, 0x49, 0x00 }, /* Љ to I */
1484 { 0x040A, 0x4e, 0x00 }, /* Њ to N */
1485 { 0x040B, 0x44, 0x00 }, /* Ћ to D */
1486 { 0x040C, 0x4b, 0x00 }, /* Ќ to K */
1487 { 0x040D, 0x49, 0x00 }, /* Ѝ to I */
1488 { 0x040E, 0x55, 0x00 }, /* Ў to U */
1489 { 0x040F, 0x44, 0x00 }, /* Џ to D */
1490 { 0x0410, 0x41, 0x00 }, /* А to A */
1491 { 0x0411, 0x42, 0x00 }, /* Б to B */
1492 { 0x0412, 0x56, 0x00 }, /* В to V */
1493 { 0x0413, 0x47, 0x00 }, /* Г to G */
1494 { 0x0414, 0x44, 0x00 }, /* Д to D */
1495 { 0x0415, 0x45, 0x00 }, /* Е to E */
1496 { 0x0416, 0x5a, 0x68 }, /* Ж to Zh */
1497 { 0x0417, 0x5a, 0x00 }, /* З to Z */
1498 { 0x0418, 0x49, 0x00 }, /* И to I */
1499 { 0x0419, 0x49, 0x00 }, /* Й to I */
1500 { 0x041A, 0x4b, 0x00 }, /* К to K */
1501 { 0x041B, 0x4c, 0x00 }, /* Л to L */
1502 { 0x041C, 0x4d, 0x00 }, /* М to M */
1503 { 0x041D, 0x4e, 0x00 }, /* Н to N */
1504 { 0x041E, 0x4f, 0x00 }, /* О to O */
1505 { 0x041F, 0x50, 0x00 }, /* П to P */
1506 { 0x0420, 0x52, 0x00 }, /* Р to R */
1507 { 0x0421, 0x53, 0x00 }, /* С to S */
1508 { 0x0422, 0x54, 0x00 }, /* Т to T */
1509 { 0x0423, 0x55, 0x00 }, /* У to U */
1510 { 0x0424, 0x46, 0x00 }, /* Ф to F */
1511 { 0x0425, 0x4b, 0x68 }, /* Х to Kh */
1512 { 0x0426, 0x54, 0x63 }, /* Ц to Tc */
1513 { 0x0427, 0x43, 0x68 }, /* Ч to Ch */
1514 { 0x0428, 0x53, 0x68 }, /* Ш to Sh */
1515 { 0x0429, 0x53, 0x68 }, /* Щ to Shch */
1516 { 0x042A, 0x61, 0x00 }, /* to A */
1517 { 0x042B, 0x59, 0x00 }, /* Ы to Y */
1518 { 0x042C, 0x59, 0x00 }, /* to Y */
1519 { 0x042D, 0x45, 0x00 }, /* Э to E */
1520 { 0x042E, 0x49, 0x75 }, /* Ю to Iu */
1521 { 0x042F, 0x49, 0x61 }, /* Я to Ia */
1522 { 0x0430, 0x61, 0x00 }, /* а to a */
1523 { 0x0431, 0x62, 0x00 }, /* б to b */
1524 { 0x0432, 0x76, 0x00 }, /* в to v */
1525 { 0x0433, 0x67, 0x00 }, /* г to g */
1526 { 0x0434, 0x64, 0x00 }, /* д to d */
1527 { 0x0435, 0x65, 0x00 }, /* е to e */
1528 { 0x0436, 0x7a, 0x68 }, /* ж to zh */
1529 { 0x0437, 0x7a, 0x00 }, /* з to z */
1530 { 0x0438, 0x69, 0x00 }, /* и to i */
1531 { 0x0439, 0x69, 0x00 }, /* й to i */
1532 { 0x043A, 0x6b, 0x00 }, /* к to k */
1533 { 0x043B, 0x6c, 0x00 }, /* л to l */
1534 { 0x043C, 0x6d, 0x00 }, /* м to m */
1535 { 0x043D, 0x6e, 0x00 }, /* н to n */
1536 { 0x043E, 0x6f, 0x00 }, /* о to o */
1537 { 0x043F, 0x70, 0x00 }, /* п to p */
1538 { 0x0440, 0x72, 0x00 }, /* р to r */
1539 { 0x0441, 0x73, 0x00 }, /* с to s */
1540 { 0x0442, 0x74, 0x00 }, /* т to t */
1541 { 0x0443, 0x75, 0x00 }, /* у to u */
1542 { 0x0444, 0x66, 0x00 }, /* ф to f */
1543 { 0x0445, 0x6b, 0x68 }, /* х to kh */
1544 { 0x0446, 0x74, 0x63 }, /* ц to tc */
1545 { 0x0447, 0x63, 0x68 }, /* ч to ch */
1546 { 0x0448, 0x73, 0x68 }, /* ш to sh */
1547 { 0x0449, 0x73, 0x68 }, /* щ to shch */
1548 { 0x044A, 0x61, 0x00 }, /* to a */
1549 { 0x044B, 0x79, 0x00 }, /* ы to y */
1550 { 0x044C, 0x79, 0x00 }, /* to y */
1551 { 0x044D, 0x65, 0x00 }, /* э to e */
1552 { 0x044E, 0x69, 0x75 }, /* ю to iu */
1553 { 0x044F, 0x69, 0x61 }, /* я to ia */
1554 { 0x0450, 0x65, 0x00 }, /* ѐ to e */
1555 { 0x0451, 0x65, 0x00 }, /* ё to e */
1556 { 0x0452, 0x64, 0x00 }, /* ђ to d */
1557 { 0x0453, 0x67, 0x00 }, /* ѓ to g */
1558 { 0x0454, 0x65, 0x00 }, /* є to e */
1559 { 0x0455, 0x7a, 0x00 }, /* ѕ to z */
1560 { 0x0456, 0x69, 0x00 }, /* і to i */
1561 { 0x0457, 0x69, 0x00 }, /* ї to i */
1562 { 0x0458, 0x6a, 0x00 }, /* ј to j */
1563 { 0x0459, 0x69, 0x00 }, /* љ to i */
1564 { 0x045A, 0x6e, 0x00 }, /* њ to n */
1565 { 0x045B, 0x64, 0x00 }, /* ћ to d */
1566 { 0x045C, 0x6b, 0x00 }, /* ќ to k */
1567 { 0x045D, 0x69, 0x00 }, /* ѝ to i */
1568 { 0x045E, 0x75, 0x00 }, /* ў to u */
1569 { 0x045F, 0x64, 0x00 }, /* џ to d */
1570 { 0x1E02, 0x42, 0x00 }, /* Ḃ to B */
1571 { 0x1E03, 0x62, 0x00 }, /* ḃ to b */
1572 { 0x1E0A, 0x44, 0x00 }, /* Ḋ to D */
1573 { 0x1E0B, 0x64, 0x00 }, /* ḋ to d */
1574 { 0x1E1E, 0x46, 0x00 }, /* Ḟ to F */
1575 { 0x1E1F, 0x66, 0x00 }, /* ḟ to f */
1576 { 0x1E40, 0x4D, 0x00 }, /* Ṁ to M */
1577 { 0x1E41, 0x6D, 0x00 }, /* ṁ to m */
1578 { 0x1E56, 0x50, 0x00 }, /* Ṗ to P */
1579 { 0x1E57, 0x70, 0x00 }, /* ṗ to p */
1580 { 0x1E60, 0x53, 0x00 }, /* Ṡ to S */
1581 { 0x1E61, 0x73, 0x00 }, /* ṡ to s */
1582 { 0x1E6A, 0x54, 0x00 }, /* Ṫ to T */
1583 { 0x1E6B, 0x74, 0x00 }, /* ṫ to t */
1584 { 0x1E80, 0x57, 0x00 }, /* Ẁ to W */
1585 { 0x1E81, 0x77, 0x00 }, /* ẁ to w */
1586 { 0x1E82, 0x57, 0x00 }, /* Ẃ to W */
1587 { 0x1E83, 0x77, 0x00 }, /* ẃ to w */
1588 { 0x1E84, 0x57, 0x00 }, /* Ẅ to W */
1589 { 0x1E85, 0x77, 0x00 }, /* ẅ to w */
1590 { 0x1EF2, 0x59, 0x00 }, /* Ỳ to Y */
1591 { 0x1EF3, 0x79, 0x00 }, /* ỳ to y */
1592 { 0xFB00, 0x66, 0x66 }, /* ff to ff */
1593 { 0xFB01, 0x66, 0x69 }, /* fi to fi */
1594 { 0xFB02, 0x66, 0x6C }, /* fl to fl */
1595 { 0xFB05, 0x73, 0x74 }, /* ſt to st */
1596 { 0xFB06, 0x73, 0x74 }, /* st to st */
1597 };
1598
1599 /*
1600 ** Convert the input string from UTF-8 into pure ASCII by converting
1601 ** all non-ASCII characters to some combination of characters in the
1602 ** ASCII subset.
1603 **
1604 ** The returned string might contain more characters than the input.
1605 **
1606 ** Space to hold the returned string comes from sqlite3_malloc() and
1607 ** should be freed by the caller.
1608 */
1609 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1610 unsigned char *zOut = sqlite3_malloc64( nIn*4 + 1 );
1611 int c, sz, nOut;
1612 if( zOut==0 ) return 0;
1613 nOut = 0;
1614 while( nIn>0 ){
1615 c = utf8Read(zIn, nIn, &sz);
1616 zIn += sz;
1617 nIn -= sz;
1618 if( c<=127 ){
1619 zOut[nOut++] = (unsigned char)c;
1620 }else{
1621 int xTop, xBtm, x;
1622 xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1623 xBtm = 0;
1624 while( xTop>=xBtm ){
1625 x = (xTop + xBtm)/2;
1626 if( translit[x].cFrom==c ){
1627 zOut[nOut++] = translit[x].cTo0;
1628 if( translit[x].cTo1 ){
1629 zOut[nOut++] = translit[x].cTo1;
1630 /* Add an extra "ch" after the "sh" for Щ and щ */
1631 if( c==0x0429 || c== 0x0449 ){
1632 zOut[nOut++] = 'c';
1633 zOut[nOut++] = 'h';
1634 }
1635 }
1636 c = 0;
1637 break;
1638 }else if( translit[x].cFrom>c ){
1639 xTop = x-1;
1640 }else{
1641 xBtm = x+1;
1642 }
1643 }
1644 if( c ) zOut[nOut++] = '?';
1645 }
1646 }
1647 zOut[nOut] = 0;
1648 return zOut;
1649 }
1650
1651 /*
1652 ** Return the number of characters in the shortest prefix of the input
1653 ** string that transliterates to an ASCII string nTrans bytes or longer.
1654 ** Or, if the transliteration of the input string is less than nTrans
1655 ** bytes in size, return the number of characters in the input string.
1656 */
1657 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
1658 int i, c, sz, nOut;
1659 int nChar;
1660
1661 i = nOut = 0;
1662 for(nChar=0; i<nIn && nOut<nTrans; nChar++){
1663 c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1664 i += sz;
1665
1666 nOut++;
1667 if( c>=128 ){
1668 int xTop, xBtm, x;
1669 xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1670 xBtm = 0;
1671 while( xTop>=xBtm ){
1672 x = (xTop + xBtm)/2;
1673 if( translit[x].cFrom==c ){
1674 if( translit[x].cTo1 ) nOut++;
1675 if( c==0x0429 || c== 0x0449 ) nOut += 2;
1676 break;
1677 }else if( translit[x].cFrom>c ){
1678 xTop = x-1;
1679 }else{
1680 xBtm = x+1;
1681 }
1682 }
1683 }
1684 }
1685
1686 return nChar;
1687 }
1688
1689
1690 /*
1691 ** spellfix1_translit(X)
1692 **
1693 ** Convert a string that contains non-ASCII Roman characters into
1694 ** pure ASCII.
1695 */
1696 static void transliterateSqlFunc(
1697 sqlite3_context *context,
1698 int argc,
1699 sqlite3_value **argv
1700 ){
1701 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1702 int nIn = sqlite3_value_bytes(argv[0]);
1703 unsigned char *zOut = transliterate(zIn, nIn);
1704 if( zOut==0 ){
1705 sqlite3_result_error_nomem(context);
1706 }else{
1707 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1708 }
1709 }
1710
1711 /*
1712 ** spellfix1_scriptcode(X)
1713 **
1714 ** Try to determine the dominant script used by the word X and return
1715 ** its ISO 15924 numeric code.
1716 **
1717 ** The current implementation only understands the following scripts:
1718 **
1719 ** 215 (Latin)
1720 ** 220 (Cyrillic)
1721 ** 200 (Greek)
1722 **
1723 ** This routine will return 998 if the input X contains characters from
1724 ** two or more of the above scripts or 999 if X contains no characters
1725 ** from any of the above scripts.
1726 */
1727 static void scriptCodeSqlFunc(
1728 sqlite3_context *context,
1729 int argc,
1730 sqlite3_value **argv
1731 ){
1732 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1733 int nIn = sqlite3_value_bytes(argv[0]);
1734 int c, sz;
1735 int scriptMask = 0;
1736 int res;
1737 int seenDigit = 0;
1738 # define SCRIPT_LATIN 0x0001
1739 # define SCRIPT_CYRILLIC 0x0002
1740 # define SCRIPT_GREEK 0x0004
1741 # define SCRIPT_HEBREW 0x0008
1742 # define SCRIPT_ARABIC 0x0010
1743
1744 while( nIn>0 ){
1745 c = utf8Read(zIn, nIn, &sz);
1746 zIn += sz;
1747 nIn -= sz;
1748 if( c<0x02af ){
1749 if( c>=0x80 || midClass[c&0x7f]<CCLASS_DIGIT ){
1750 scriptMask |= SCRIPT_LATIN;
1751 }else if( c>='0' && c<='9' ){
1752 seenDigit = 1;
1753 }
1754 }else if( c>=0x0400 && c<=0x04ff ){
1755 scriptMask |= SCRIPT_CYRILLIC;
1756 }else if( c>=0x0386 && c<=0x03ce ){
1757 scriptMask |= SCRIPT_GREEK;
1758 }else if( c>=0x0590 && c<=0x05ff ){
1759 scriptMask |= SCRIPT_HEBREW;
1760 }else if( c>=0x0600 && c<=0x06ff ){
1761 scriptMask |= SCRIPT_ARABIC;
1762 }
1763 }
1764 if( scriptMask==0 && seenDigit ) scriptMask = SCRIPT_LATIN;
1765 switch( scriptMask ){
1766 case 0: res = 999; break;
1767 case SCRIPT_LATIN: res = 215; break;
1768 case SCRIPT_CYRILLIC: res = 220; break;
1769 case SCRIPT_GREEK: res = 200; break;
1770 case SCRIPT_HEBREW: res = 125; break;
1771 case SCRIPT_ARABIC: res = 160; break;
1772 default: res = 998; break;
1773 }
1774 sqlite3_result_int(context, res);
1775 }
1776
1777 /* End transliterate
1778 ******************************************************************************
1779 ******************************************************************************
1780 ** Begin spellfix1 virtual table.
1781 */
1782
1783 /* Maximum length of a phonehash used for querying the shadow table */
1784 #define SPELLFIX_MX_HASH 32
1785
1786 /* Maximum number of hash strings to examine per query */
1787 #define SPELLFIX_MX_RUN 1
1788
1789 typedef struct spellfix1_vtab spellfix1_vtab;
1790 typedef struct spellfix1_cursor spellfix1_cursor;
1791
1792 /* Fuzzy-search virtual table object */
1793 struct spellfix1_vtab {
1794 sqlite3_vtab base; /* Base class - must be first */
1795 sqlite3 *db; /* Database connection */
1796 char *zDbName; /* Name of database holding this table */
1797 char *zTableName; /* Name of the virtual table */
1798 char *zCostTable; /* Table holding edit-distance cost numbers */
1799 EditDist3Config *pConfig3; /* Parsed edit distance costs */
1800 };
1801
1802 /* Fuzzy-search cursor object */
1803 struct spellfix1_cursor {
1804 sqlite3_vtab_cursor base; /* Base class - must be first */
1805 spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
1806 char *zPattern; /* rhs of MATCH clause */
1807 int idxNum; /* idxNum value passed to xFilter() */
1808 int nRow; /* Number of rows of content */
1809 int nAlloc; /* Number of allocated rows */
1810 int iRow; /* Current row of content */
1811 int iLang; /* Value of the langid= constraint */
1812 int iTop; /* Value of the top= constraint */
1813 int iScope; /* Value of the scope= constraint */
1814 int nSearch; /* Number of vocabulary items checked */
1815 sqlite3_stmt *pFullScan; /* Shadow query for a full table scan */
1816 struct spellfix1_row { /* For each row of content */
1817 sqlite3_int64 iRowid; /* Rowid for this row */
1818 char *zWord; /* Text for this row */
1819 int iRank; /* Rank for this row */
1820 int iDistance; /* Distance from pattern for this row */
1821 int iScore; /* Score for sorting */
1822 int iMatchlen; /* Value of matchlen column (or -1) */
1823 char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
1824 } *a;
1825 };
1826
1827 /*
1828 ** Construct one or more SQL statements from the format string given
1829 ** and then evaluate those statements. The success code is written
1830 ** into *pRc.
1831 **
1832 ** If *pRc is initially non-zero then this routine is a no-op.
1833 */
1834 static void spellfix1DbExec(
1835 int *pRc, /* Success code */
1836 sqlite3 *db, /* Database in which to run SQL */
1837 const char *zFormat, /* Format string for SQL */
1838 ... /* Arguments to the format string */
1839 ){
1840 va_list ap;
1841 char *zSql;
1842 if( *pRc ) return;
1843 va_start(ap, zFormat);
1844 zSql = sqlite3_vmprintf(zFormat, ap);
1845 va_end(ap);
1846 if( zSql==0 ){
1847 *pRc = SQLITE_NOMEM;
1848 }else{
1849 *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1850 sqlite3_free(zSql);
1851 }
1852 }
1853
1854 /*
1855 ** xDisconnect/xDestroy method for the fuzzy-search module.
1856 */
1857 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1858 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1859 int rc = SQLITE_OK;
1860 if( isDestroy ){
1861 sqlite3 *db = p->db;
1862 spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1863 p->zDbName, p->zTableName);
1864 }
1865 if( rc==SQLITE_OK ){
1866 sqlite3_free(p->zTableName);
1867 editDist3ConfigDelete(p->pConfig3);
1868 sqlite3_free(p->zCostTable);
1869 sqlite3_free(p);
1870 }
1871 return rc;
1872 }
1873 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1874 return spellfix1Uninit(0, pVTab);
1875 }
1876 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1877 return spellfix1Uninit(1, pVTab);
1878 }
1879
1880 /*
1881 ** Make a copy of a string. Remove leading and trailing whitespace
1882 ** and dequote it.
1883 */
1884 static char *spellfix1Dequote(const char *zIn){
1885 char *zOut;
1886 int i, j;
1887 char c;
1888 while( isspace((unsigned char)zIn[0]) ) zIn++;
1889 zOut = sqlite3_mprintf("%s", zIn);
1890 if( zOut==0 ) return 0;
1891 i = (int)strlen(zOut);
1892 #if 0 /* The parser will never leave spaces at the end */
1893 while( i>0 && isspace(zOut[i-1]) ){ i--; }
1894 #endif
1895 zOut[i] = 0;
1896 c = zOut[0];
1897 if( c=='\'' || c=='"' ){
1898 for(i=1, j=0; ALWAYS(zOut[i]); i++){
1899 zOut[j++] = zOut[i];
1900 if( zOut[i]==c ){
1901 if( zOut[i+1]==c ){
1902 i++;
1903 }else{
1904 zOut[j-1] = 0;
1905 break;
1906 }
1907 }
1908 }
1909 }
1910 return zOut;
1911 }
1912
1913
1914 /*
1915 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
1916 **
1917 ** argv[0] -> module name ("spellfix1")
1918 ** argv[1] -> database name
1919 ** argv[2] -> table name
1920 ** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
1921 */
1922 static int spellfix1Init(
1923 int isCreate,
1924 sqlite3 *db,
1925 void *pAux,
1926 int argc, const char *const*argv,
1927 sqlite3_vtab **ppVTab,
1928 char **pzErr
1929 ){
1930 spellfix1_vtab *pNew = 0;
1931 /* const char *zModule = argv[0]; // not used */
1932 const char *zDbName = argv[1];
1933 const char *zTableName = argv[2];
1934 int nDbName;
1935 int rc = SQLITE_OK;
1936 int i;
1937
1938 nDbName = (int)strlen(zDbName);
1939 pNew = sqlite3_malloc64( sizeof(*pNew) + nDbName + 1);
1940 if( pNew==0 ){
1941 rc = SQLITE_NOMEM;
1942 }else{
1943 memset(pNew, 0, sizeof(*pNew));
1944 pNew->zDbName = (char*)&pNew[1];
1945 memcpy(pNew->zDbName, zDbName, nDbName+1);
1946 pNew->zTableName = sqlite3_mprintf("%s", zTableName);
1947 pNew->db = db;
1948 if( pNew->zTableName==0 ){
1949 rc = SQLITE_NOMEM;
1950 }else{
1951 rc = sqlite3_declare_vtab(db,
1952 "CREATE TABLE x(word,rank,distance,langid, "
1953 "score, matchlen, phonehash HIDDEN, "
1954 "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
1955 "soundslike HIDDEN, command HIDDEN)"
1956 );
1957 #define SPELLFIX_COL_WORD 0
1958 #define SPELLFIX_COL_RANK 1
1959 #define SPELLFIX_COL_DISTANCE 2
1960 #define SPELLFIX_COL_LANGID 3
1961 #define SPELLFIX_COL_SCORE 4
1962 #define SPELLFIX_COL_MATCHLEN 5
1963 #define SPELLFIX_COL_PHONEHASH 6
1964 #define SPELLFIX_COL_TOP 7
1965 #define SPELLFIX_COL_SCOPE 8
1966 #define SPELLFIX_COL_SRCHCNT 9
1967 #define SPELLFIX_COL_SOUNDSLIKE 10
1968 #define SPELLFIX_COL_COMMAND 11
1969 }
1970 if( rc==SQLITE_OK && isCreate ){
1971 spellfix1DbExec(&rc, db,
1972 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
1973 " id INTEGER PRIMARY KEY,\n"
1974 " rank INT,\n"
1975 " langid INT,\n"
1976 " word TEXT,\n"
1977 " k1 TEXT,\n"
1978 " k2 TEXT\n"
1979 ");\n",
1980 zDbName, zTableName
1981 );
1982 spellfix1DbExec(&rc, db,
1983 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
1984 "ON \"%w_vocab\"(langid,k2);",
1985 zDbName, zTableName, zTableName
1986 );
1987 }
1988 for(i=3; rc==SQLITE_OK && i<argc; i++){
1989 if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
1990 pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
1991 if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
1992 continue;
1993 }
1994 *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
1995 rc = SQLITE_ERROR;
1996 }
1997 }
1998
1999 if( rc && pNew ){
2000 *ppVTab = 0;
2001 spellfix1Uninit(0, &pNew->base);
2002 }else{
2003 *ppVTab = (sqlite3_vtab *)pNew;
2004 }
2005 return rc;
2006 }
2007
2008 /*
2009 ** The xConnect and xCreate methods
2010 */
2011 static int spellfix1Connect(
2012 sqlite3 *db,
2013 void *pAux,
2014 int argc, const char *const*argv,
2015 sqlite3_vtab **ppVTab,
2016 char **pzErr
2017 ){
2018 return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
2019 }
2020 static int spellfix1Create(
2021 sqlite3 *db,
2022 void *pAux,
2023 int argc, const char *const*argv,
2024 sqlite3_vtab **ppVTab,
2025 char **pzErr
2026 ){
2027 return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
2028 }
2029
2030 /*
2031 ** Clear all of the content from a cursor.
2032 */
2033 static void spellfix1ResetCursor(spellfix1_cursor *pCur){
2034 int i;
2035 for(i=0; i<pCur->nRow; i++){
2036 sqlite3_free(pCur->a[i].zWord);
2037 }
2038 pCur->nRow = 0;
2039 pCur->iRow = 0;
2040 pCur->nSearch = 0;
2041 if( pCur->pFullScan ){
2042 sqlite3_finalize(pCur->pFullScan);
2043 pCur->pFullScan = 0;
2044 }
2045 }
2046
2047 /*
2048 ** Resize the cursor to hold up to N rows of content
2049 */
2050 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
2051 struct spellfix1_row *aNew;
2052 assert( N>=pCur->nRow );
2053 aNew = sqlite3_realloc64(pCur->a, sizeof(pCur->a[0])*N);
2054 if( aNew==0 && N>0 ){
2055 spellfix1ResetCursor(pCur);
2056 sqlite3_free(pCur->a);
2057 pCur->nAlloc = 0;
2058 pCur->a = 0;
2059 }else{
2060 pCur->nAlloc = N;
2061 pCur->a = aNew;
2062 }
2063 }
2064
2065
2066 /*
2067 ** Close a fuzzy-search cursor.
2068 */
2069 static int spellfix1Close(sqlite3_vtab_cursor *cur){
2070 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2071 spellfix1ResetCursor(pCur);
2072 spellfix1ResizeCursor(pCur, 0);
2073 sqlite3_free(pCur->zPattern);
2074 sqlite3_free(pCur);
2075 return SQLITE_OK;
2076 }
2077
2078 #define SPELLFIX_IDXNUM_MATCH 0x01 /* word MATCH $str */
2079 #define SPELLFIX_IDXNUM_LANGID 0x02 /* langid == $langid */
2080 #define SPELLFIX_IDXNUM_TOP 0x04 /* top = $top */
2081 #define SPELLFIX_IDXNUM_SCOPE 0x08 /* scope = $scope */
2082 #define SPELLFIX_IDXNUM_DISTLT 0x10 /* distance < $distance */
2083 #define SPELLFIX_IDXNUM_DISTLE 0x20 /* distance <= $distance */
2084 #define SPELLFIX_IDXNUM_ROWID 0x40 /* rowid = $rowid */
2085 #define SPELLFIX_IDXNUM_DIST (0x10|0x20) /* DISTLT and DISTLE */
2086
2087 /*
2088 **
2089 ** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined
2090 ** above.
2091 **
2092 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2093 ** if specified and in that order.
2094 */
2095 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
2096 int iPlan = 0;
2097 int iLangTerm = -1;
2098 int iTopTerm = -1;
2099 int iScopeTerm = -1;
2100 int iDistTerm = -1;
2101 int iRowidTerm = -1;
2102 int i;
2103 const struct sqlite3_index_constraint *pConstraint;
2104 pConstraint = pIdxInfo->aConstraint;
2105 for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
2106 if( pConstraint->usable==0 ) continue;
2107
2108 /* Terms of the form: word MATCH $str */
2109 if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0
2110 && pConstraint->iColumn==SPELLFIX_COL_WORD
2111 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2112 ){
2113 iPlan |= SPELLFIX_IDXNUM_MATCH;
2114 pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2115 pIdxInfo->aConstraintUsage[i].omit = 1;
2116 }
2117
2118 /* Terms of the form: langid = $langid */
2119 if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0
2120 && pConstraint->iColumn==SPELLFIX_COL_LANGID
2121 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2122 ){
2123 iPlan |= SPELLFIX_IDXNUM_LANGID;
2124 iLangTerm = i;
2125 }
2126
2127 /* Terms of the form: top = $top */
2128 if( (iPlan & SPELLFIX_IDXNUM_TOP)==0
2129 && pConstraint->iColumn==SPELLFIX_COL_TOP
2130 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2131 ){
2132 iPlan |= SPELLFIX_IDXNUM_TOP;
2133 iTopTerm = i;
2134 }
2135
2136 /* Terms of the form: scope = $scope */
2137 if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0
2138 && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2139 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2140 ){
2141 iPlan |= SPELLFIX_IDXNUM_SCOPE;
2142 iScopeTerm = i;
2143 }
2144
2145 /* Terms of the form: distance < $dist or distance <= $dist */
2146 if( (iPlan & SPELLFIX_IDXNUM_DIST)==0
2147 && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2148 && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2149 || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2150 ){
2151 if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ){
2152 iPlan |= SPELLFIX_IDXNUM_DISTLT;
2153 }else{
2154 iPlan |= SPELLFIX_IDXNUM_DISTLE;
2155 }
2156 iDistTerm = i;
2157 }
2158
2159 /* Terms of the form: distance < $dist or distance <= $dist */
2160 if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0
2161 && pConstraint->iColumn<0
2162 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2163 ){
2164 iPlan |= SPELLFIX_IDXNUM_ROWID;
2165 iRowidTerm = i;
2166 }
2167 }
2168 if( iPlan&SPELLFIX_IDXNUM_MATCH ){
2169 int idx = 2;
2170 pIdxInfo->idxNum = iPlan;
2171 if( pIdxInfo->nOrderBy==1
2172 && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2173 && pIdxInfo->aOrderBy[0].desc==0
2174 ){
2175 pIdxInfo->orderByConsumed = 1; /* Default order by iScore */
2176 }
2177 if( iPlan&SPELLFIX_IDXNUM_LANGID ){
2178 pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2179 pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2180 }
2181 if( iPlan&SPELLFIX_IDXNUM_TOP ){
2182 pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2183 pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2184 }
2185 if( iPlan&SPELLFIX_IDXNUM_SCOPE ){
2186 pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2187 pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2188 }
2189 if( iPlan&SPELLFIX_IDXNUM_DIST ){
2190 pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2191 pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2192 }
2193 pIdxInfo->estimatedCost = 1e5;
2194 }else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ){
2195 pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID;
2196 pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2197 pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2198 pIdxInfo->estimatedCost = 5;
2199 }else{
2200 pIdxInfo->idxNum = 0;
2201 pIdxInfo->estimatedCost = 1e50;
2202 }
2203 return SQLITE_OK;
2204 }
2205
2206 /*
2207 ** Open a new fuzzy-search cursor.
2208 */
2209 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2210 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2211 spellfix1_cursor *pCur;
2212 pCur = sqlite3_malloc64( sizeof(*pCur) );
2213 if( pCur==0 ) return SQLITE_NOMEM;
2214 memset(pCur, 0, sizeof(*pCur));
2215 pCur->pVTab = p;
2216 *ppCursor = &pCur->base;
2217 return SQLITE_OK;
2218 }
2219
2220 /*
2221 ** Adjust a distance measurement by the words rank in order to show
2222 ** preference to common words.
2223 */
2224 static int spellfix1Score(int iDistance, int iRank){
2225 int iLog2;
2226 for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
2227 return iDistance + 32 - iLog2;
2228 }
2229
2230 /*
2231 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2232 ** that they sort in order of increasing distance.
2233 */
2234 static int SQLITE_CDECL spellfix1RowCompare(const void *A, const void *B){
2235 const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2236 const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2237 return a->iScore - b->iScore;
2238 }
2239
2240 /*
2241 ** A structure used to pass information from spellfix1FilterForMatch()
2242 ** into spellfix1RunQuery().
2243 */
2244 typedef struct MatchQuery {
2245 spellfix1_cursor *pCur; /* The cursor being queried */
2246 sqlite3_stmt *pStmt; /* shadow table query statment */
2247 char zHash[SPELLFIX_MX_HASH]; /* The current phonehash for zPattern */
2248 const char *zPattern; /* Transliterated input string */
2249 int nPattern; /* Length of zPattern */
2250 EditDist3FromString *pMatchStr3; /* Original unicode string */
2251 EditDist3Config *pConfig3; /* Edit-distance cost coefficients */
2252 const EditDist3Lang *pLang; /* The selected language coefficients */
2253 int iLang; /* The language id */
2254 int iScope; /* Default scope */
2255 int iMaxDist; /* Maximum allowed edit distance, or -1 */
2256 int rc; /* Error code */
2257 int nRun; /* Number of prior runs for the same zPattern */
2258 char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH]; /* Prior hashes */
2259 } MatchQuery;
2260
2261 /*
2262 ** Run a query looking for the best matches against zPattern using
2263 ** zHash as the character class seed hash.
2264 */
2265 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
2266 const char *zK1;
2267 const char *zWord;
2268 int iDist;
2269 int iRank;
2270 int iScore;
2271 int iWorst = 0;
2272 int idx;
2273 int idxWorst = -1;
2274 int i;
2275 int iScope = p->iScope;
2276 spellfix1_cursor *pCur = p->pCur;
2277 sqlite3_stmt *pStmt = p->pStmt;
2278 char zHash1[SPELLFIX_MX_HASH];
2279 char zHash2[SPELLFIX_MX_HASH];
2280 char *zClass;
2281 int nClass;
2282 int rc;
2283
2284 if( pCur->a==0 || p->rc ) return; /* Prior memory allocation failure */
2285 zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2286 if( zClass==0 ){
2287 p->rc = SQLITE_NOMEM;
2288 return;
2289 }
2290 nClass = (int)strlen(zClass);
2291 if( nClass>SPELLFIX_MX_HASH-2 ){
2292 nClass = SPELLFIX_MX_HASH-2;
2293 zClass[nClass] = 0;
2294 }
2295 if( nClass<=iScope ){
2296 if( nClass>2 ){
2297 iScope = nClass-1;
2298 }else{
2299 iScope = nClass;
2300 }
2301 }
2302 memcpy(zHash1, zClass, iScope);
2303 sqlite3_free(zClass);
2304 zHash1[iScope] = 0;
2305 memcpy(zHash2, zHash1, iScope);
2306 zHash2[iScope] = 'Z';
2307 zHash2[iScope+1] = 0;
2308 #if SPELLFIX_MX_RUN>1
2309 for(i=0; i<p->nRun; i++){
2310 if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2311 }
2312 #endif
2313 assert( p->nRun<SPELLFIX_MX_RUN );
2314 memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2315 if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2316 || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2317 ){
2318 p->rc = SQLITE_NOMEM;
2319 return;
2320 }
2321 #if SPELLFIX_MX_RUN>1
2322 for(i=0; i<pCur->nRow; i++){
2323 if( pCur->a[i].iScore>iWorst ){
2324 iWorst = pCur->a[i].iScore;
2325 idxWorst = i;
2326 }
2327 }
2328 #endif
2329 while( sqlite3_step(pStmt)==SQLITE_ROW ){
2330 int iMatchlen = -1;
2331 iRank = sqlite3_column_int(pStmt, 2);
2332 if( p->pMatchStr3 ){
2333 int nWord = sqlite3_column_bytes(pStmt, 1);
2334 zWord = (const char*)sqlite3_column_text(pStmt, 1);
2335 iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2336 }else{
2337 zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2338 if( zK1==0 ) continue;
2339 iDist = editdist1(p->zPattern, zK1, 0);
2340 }
2341 if( iDist<0 ){
2342 p->rc = SQLITE_NOMEM;
2343 break;
2344 }
2345 pCur->nSearch++;
2346
2347 /* If there is a "distance < $dist" or "distance <= $dist" constraint,
2348 ** check if this row meets it. If not, jump back up to the top of the
2349 ** loop to process the next row. Otherwise, if the row does match the
2350 ** distance constraint, check if the pCur->a[] array is already full.
2351 ** If it is and no explicit "top = ?" constraint was present in the
2352 ** query, grow the array to ensure there is room for the new entry. */
2353 assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) );
2354 if( p->iMaxDist>=0 ){
2355 if( iDist>p->iMaxDist ) continue;
2356 if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ){
2357 spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2358 if( pCur->a==0 ) break;
2359 }
2360 }
2361
2362 iScore = spellfix1Score(iDist,iRank);
2363 if( pCur->nRow<pCur->nAlloc ){
2364 idx = pCur->nRow;
2365 }else if( iScore<iWorst ){
2366 idx = idxWorst;
2367 sqlite3_free(pCur->a[idx].zWord);
2368 }else{
2369 continue;
2370 }
2371
2372 pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2373 if( pCur->a[idx].zWord==0 ){
2374 p->rc = SQLITE_NOMEM;
2375 break;
2376 }
2377 pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2378 pCur->a[idx].iRank = iRank;
2379 pCur->a[idx].iDistance = iDist;
2380 pCur->a[idx].iScore = iScore;
2381 pCur->a[idx].iMatchlen = iMatchlen;
2382 memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2383 if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2384 if( pCur->nRow==pCur->nAlloc ){
2385 iWorst = pCur->a[0].iScore;
2386 idxWorst = 0;
2387 for(i=1; i<pCur->nRow; i++){
2388 iScore = pCur->a[i].iScore;
2389 if( iWorst<iScore ){
2390 iWorst = iScore;
2391 idxWorst = i;
2392 }
2393 }
2394 }
2395 }
2396 rc = sqlite3_reset(pStmt);
2397 if( rc ) p->rc = rc;
2398 }
2399
2400 /*
2401 ** This version of the xFilter method work if the MATCH term is present
2402 ** and we are doing a scan.
2403 */
2404 static int spellfix1FilterForMatch(
2405 spellfix1_cursor *pCur,
2406 int argc,
2407 sqlite3_value **argv
2408 ){
2409 int idxNum = pCur->idxNum;
2410 const unsigned char *zMatchThis; /* RHS of the MATCH operator */
2411 EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2412 char *zPattern; /* Transliteration of zMatchThis */
2413 int nPattern; /* Length of zPattern */
2414 int iLimit = 20; /* Max number of rows of output */
2415 int iScope = 3; /* Use this many characters of zClass */
2416 int iLang = 0; /* Language code */
2417 char *zSql; /* SQL of shadow table query */
2418 sqlite3_stmt *pStmt = 0; /* Shadow table query */
2419 int rc; /* Result code */
2420 int idx = 1; /* Next available filter parameter */
2421 spellfix1_vtab *p = pCur->pVTab; /* The virtual table that owns pCur */
2422 MatchQuery x; /* For passing info to RunQuery() */
2423
2424 /* Load the cost table if we have not already done so */
2425 if( p->zCostTable!=0 && p->pConfig3==0 ){
2426 p->pConfig3 = sqlite3_malloc64( sizeof(p->pConfig3[0]) );
2427 if( p->pConfig3==0 ) return SQLITE_NOMEM;
2428 memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2429 rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2430 if( rc ) return rc;
2431 }
2432 memset(&x, 0, sizeof(x));
2433 x.iScope = 3; /* Default scope if none specified by "WHERE scope=N" */
2434 x.iMaxDist = -1; /* Maximum allowed edit distance */
2435
2436 if( idxNum&2 ){
2437 iLang = sqlite3_value_int(argv[idx++]);
2438 }
2439 if( idxNum&4 ){
2440 iLimit = sqlite3_value_int(argv[idx++]);
2441 if( iLimit<1 ) iLimit = 1;
2442 }
2443 if( idxNum&8 ){
2444 x.iScope = sqlite3_value_int(argv[idx++]);
2445 if( x.iScope<1 ) x.iScope = 1;
2446 if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2447 }
2448 if( idxNum&(16|32) ){
2449 x.iMaxDist = sqlite3_value_int(argv[idx++]);
2450 if( idxNum&16 ) x.iMaxDist--;
2451 if( x.iMaxDist<0 ) x.iMaxDist = 0;
2452 }
2453 spellfix1ResetCursor(pCur);
2454 spellfix1ResizeCursor(pCur, iLimit);
2455 zMatchThis = sqlite3_value_text(argv[0]);
2456 if( zMatchThis==0 ) return SQLITE_OK;
2457 if( p->pConfig3 ){
2458 x.pLang = editDist3FindLang(p->pConfig3, iLang);
2459 pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2460 if( pMatchStr3==0 ){
2461 x.rc = SQLITE_NOMEM;
2462 goto filter_exit;
2463 }
2464 }else{
2465 x.pLang = 0;
2466 }
2467 zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2468 sqlite3_free(pCur->zPattern);
2469 pCur->zPattern = zPattern;
2470 if( zPattern==0 ){
2471 x.rc = SQLITE_NOMEM;
2472 goto filter_exit;
2473 }
2474 nPattern = (int)strlen(zPattern);
2475 if( zPattern[nPattern-1]=='*' ) nPattern--;
2476 zSql = sqlite3_mprintf(
2477 "SELECT id, word, rank, k1"
2478 " FROM \"%w\".\"%w_vocab\""
2479 " WHERE langid=%d AND k2>=?1 AND k2<?2",
2480 p->zDbName, p->zTableName, iLang
2481 );
2482 if( zSql==0 ){
2483 x.rc = SQLITE_NOMEM;
2484 pStmt = 0;
2485 goto filter_exit;
2486 }
2487 rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2488 sqlite3_free(zSql);
2489 pCur->iLang = iLang;
2490 x.pCur = pCur;
2491 x.pStmt = pStmt;
2492 x.zPattern = zPattern;
2493 x.nPattern = nPattern;
2494 x.pMatchStr3 = pMatchStr3;
2495 x.iLang = iLang;
2496 x.rc = rc;
2497 x.pConfig3 = p->pConfig3;
2498 if( x.rc==SQLITE_OK ){
2499 spellfix1RunQuery(&x, zPattern, nPattern);
2500 }
2501
2502 if( pCur->a ){
2503 qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2504 pCur->iTop = iLimit;
2505 pCur->iScope = iScope;
2506 }else{
2507 x.rc = SQLITE_NOMEM;
2508 }
2509
2510 filter_exit:
2511 sqlite3_finalize(pStmt);
2512 editDist3FromStringDelete(pMatchStr3);
2513 return x.rc;
2514 }
2515
2516 /*
2517 ** This version of xFilter handles a full-table scan case
2518 */
2519 static int spellfix1FilterForFullScan(
2520 spellfix1_cursor *pCur,
2521 int argc,
2522 sqlite3_value **argv
2523 ){
2524 int rc = SQLITE_OK;
2525 int idxNum = pCur->idxNum;
2526 char *zSql;
2527 spellfix1_vtab *pVTab = pCur->pVTab;
2528 spellfix1ResetCursor(pCur);
2529 assert( idxNum==0 || idxNum==64 );
2530 zSql = sqlite3_mprintf(
2531 "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2532 pVTab->zDbName, pVTab->zTableName,
2533 ((idxNum & 64) ? " WHERE rowid=?" : "")
2534 );
2535 if( zSql==0 ) return SQLITE_NOMEM;
2536 rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2537 sqlite3_free(zSql);
2538 if( rc==SQLITE_OK && (idxNum & 64) ){
2539 assert( argc==1 );
2540 rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2541 }
2542 pCur->nRow = pCur->iRow = 0;
2543 if( rc==SQLITE_OK ){
2544 rc = sqlite3_step(pCur->pFullScan);
2545 if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
2546 if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
2547 }else{
2548 pCur->iRow = 0;
2549 }
2550 return rc;
2551 }
2552
2553
2554 /*
2555 ** Called to "rewind" a cursor back to the beginning so that
2556 ** it starts its output over again. Always called at least once
2557 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2558 */
2559 static int spellfix1Filter(
2560 sqlite3_vtab_cursor *cur,
2561 int idxNum, const char *idxStr,
2562 int argc, sqlite3_value **argv
2563 ){
2564 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2565 int rc;
2566 pCur->idxNum = idxNum;
2567 if( idxNum & 1 ){
2568 rc = spellfix1FilterForMatch(pCur, argc, argv);
2569 }else{
2570 rc = spellfix1FilterForFullScan(pCur, argc, argv);
2571 }
2572 return rc;
2573 }
2574
2575
2576 /*
2577 ** Advance a cursor to its next row of output
2578 */
2579 static int spellfix1Next(sqlite3_vtab_cursor *cur){
2580 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2581 int rc = SQLITE_OK;
2582 if( pCur->iRow < pCur->nRow ){
2583 if( pCur->pFullScan ){
2584 rc = sqlite3_step(pCur->pFullScan);
2585 if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2586 if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2587 }else{
2588 pCur->iRow++;
2589 }
2590 }
2591 return rc;
2592 }
2593
2594 /*
2595 ** Return TRUE if we are at the end-of-file
2596 */
2597 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
2598 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2599 return pCur->iRow>=pCur->nRow;
2600 }
2601
2602 /*
2603 ** Return columns from the current row.
2604 */
2605 static int spellfix1Column(
2606 sqlite3_vtab_cursor *cur,
2607 sqlite3_context *ctx,
2608 int i
2609 ){
2610 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2611 if( pCur->pFullScan ){
2612 if( i<=SPELLFIX_COL_LANGID ){
2613 sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2614 }else{
2615 sqlite3_result_null(ctx);
2616 }
2617 return SQLITE_OK;
2618 }
2619 switch( i ){
2620 case SPELLFIX_COL_WORD: {
2621 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2622 break;
2623 }
2624 case SPELLFIX_COL_RANK: {
2625 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2626 break;
2627 }
2628 case SPELLFIX_COL_DISTANCE: {
2629 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2630 break;
2631 }
2632 case SPELLFIX_COL_LANGID: {
2633 sqlite3_result_int(ctx, pCur->iLang);
2634 break;
2635 }
2636 case SPELLFIX_COL_SCORE: {
2637 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2638 break;
2639 }
2640 case SPELLFIX_COL_MATCHLEN: {
2641 int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2642 if( iMatchlen<0 ){
2643 int nPattern = (int)strlen(pCur->zPattern);
2644 char *zWord = pCur->a[pCur->iRow].zWord;
2645 int nWord = (int)strlen(zWord);
2646
2647 if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
2648 char *zTranslit;
2649 int res;
2650 zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2651 if( !zTranslit ) return SQLITE_NOMEM;
2652 res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2653 sqlite3_free(zTranslit);
2654 if( res<0 ) return SQLITE_NOMEM;
2655 iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2656 }else{
2657 iMatchlen = utf8Charlen(zWord, nWord);
2658 }
2659 }
2660
2661 sqlite3_result_int(ctx, iMatchlen);
2662 break;
2663 }
2664 case SPELLFIX_COL_PHONEHASH: {
2665 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2666 break;
2667 }
2668 case SPELLFIX_COL_TOP: {
2669 sqlite3_result_int(ctx, pCur->iTop);
2670 break;
2671 }
2672 case SPELLFIX_COL_SCOPE: {
2673 sqlite3_result_int(ctx, pCur->iScope);
2674 break;
2675 }
2676 case SPELLFIX_COL_SRCHCNT: {
2677 sqlite3_result_int(ctx, pCur->nSearch);
2678 break;
2679 }
2680 default: {
2681 sqlite3_result_null(ctx);
2682 break;
2683 }
2684 }
2685 return SQLITE_OK;
2686 }
2687
2688 /*
2689 ** The rowid.
2690 */
2691 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
2692 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2693 if( pCur->pFullScan ){
2694 *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2695 }else{
2696 *pRowid = pCur->a[pCur->iRow].iRowid;
2697 }
2698 return SQLITE_OK;
2699 }
2700
2701 /*
2702 ** This function is called by the xUpdate() method. It returns a string
2703 ** containing the conflict mode that xUpdate() should use for the current
2704 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE".
2705 */
2706 static const char *spellfix1GetConflict(sqlite3 *db){
2707 static const char *azConflict[] = {
2708 /* Note: Instead of "FAIL" - "ABORT". */
2709 "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE"
2710 };
2711 int eConflict = sqlite3_vtab_on_conflict(db);
2712
2713 assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE
2714 || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT
2715 || eConflict==SQLITE_REPLACE
2716 );
2717 assert( SQLITE_ROLLBACK==1 );
2718 assert( SQLITE_IGNORE==2 );
2719 assert( SQLITE_FAIL==3 );
2720 assert( SQLITE_ABORT==4 );
2721 assert( SQLITE_REPLACE==5 );
2722
2723 return azConflict[eConflict-1];
2724 }
2725
2726 /*
2727 ** The xUpdate() method.
2728 */
2729 static int spellfix1Update(
2730 sqlite3_vtab *pVTab,
2731 int argc,
2732 sqlite3_value **argv,
2733 sqlite_int64 *pRowid
2734 ){
2735 int rc = SQLITE_OK;
2736 sqlite3_int64 rowid, newRowid;
2737 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2738 sqlite3 *db = p->db;
2739
2740 if( argc==1 ){
2741 /* A delete operation on the rowid given by argv[0] */
2742 rowid = *pRowid = sqlite3_value_int64(argv[0]);
2743 spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2744 " WHERE id=%lld",
2745 p->zDbName, p->zTableName, rowid);
2746 }else{
2747 const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2748 int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2749 int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2750 int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2751 const unsigned char *zSoundslike =
2752 sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2753 int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2754 char *zK1, *zK2;
2755 int i;
2756 char c;
2757 const char *zConflict = spellfix1GetConflict(db);
2758
2759 if( zWord==0 ){
2760 /* Inserts of the form: INSERT INTO table(command) VALUES('xyzzy');
2761 ** cause zWord to be NULL, so we look at the "command" column to see
2762 ** what special actions to take */
2763 const char *zCmd =
2764 (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2765 if( zCmd==0 ){
2766 pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2767 p->zTableName);
2768 return SQLITE_CONSTRAINT_NOTNULL;
2769 }
2770 if( strcmp(zCmd,"reset")==0 ){
2771 /* Reset the edit cost table (if there is one). */
2772 editDist3ConfigDelete(p->pConfig3);
2773 p->pConfig3 = 0;
2774 return SQLITE_OK;
2775 }
2776 if( strncmp(zCmd,"edit_cost_table=",16)==0 ){
2777 editDist3ConfigDelete(p->pConfig3);
2778 p->pConfig3 = 0;
2779 sqlite3_free(p->zCostTable);
2780 p->zCostTable = spellfix1Dequote(zCmd+16);
2781 if( p->zCostTable==0 ) return SQLITE_NOMEM;
2782 if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){
2783 sqlite3_free(p->zCostTable);
2784 p->zCostTable = 0;
2785 }
2786 return SQLITE_OK;
2787 }
2788 pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2789 p->zTableName, zCmd);
2790 return SQLITE_ERROR;
2791 }
2792 if( iRank<1 ) iRank = 1;
2793 if( zSoundslike ){
2794 zK1 = (char*)transliterate(zSoundslike, nSoundslike);
2795 }else{
2796 zK1 = (char*)transliterate(zWord, nWord);
2797 }
2798 if( zK1==0 ) return SQLITE_NOMEM;
2799 for(i=0; (c = zK1[i])!=0; i++){
2800 if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
2801 }
2802 zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
2803 if( zK2==0 ){
2804 sqlite3_free(zK1);
2805 return SQLITE_NOMEM;
2806 }
2807 if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
2808 if( sqlite3_value_type(argv[1])==SQLITE_NULL ){
2809 spellfix1DbExec(&rc, db,
2810 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2811 "VALUES(%d,%d,%Q,%Q,%Q)",
2812 p->zDbName, p->zTableName,
2813 iRank, iLang, zWord, zK1, zK2
2814 );
2815 }else{
2816 newRowid = sqlite3_value_int64(argv[1]);
2817 spellfix1DbExec(&rc, db,
2818 "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
2819 "VALUES(%lld,%d,%d,%Q,%Q,%Q)",
2820 zConflict, p->zDbName, p->zTableName,
2821 newRowid, iRank, iLang, zWord, zK1, zK2
2822 );
2823 }
2824 *pRowid = sqlite3_last_insert_rowid(db);
2825 }else{
2826 rowid = sqlite3_value_int64(argv[0]);
2827 newRowid = *pRowid = sqlite3_value_int64(argv[1]);
2828 spellfix1DbExec(&rc, db,
2829 "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2830 " word=%Q, k1=%Q, k2=%Q WHERE id=%lld",
2831 zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang,
2832 zWord, zK1, zK2, rowid
2833 );
2834 }
2835 sqlite3_free(zK1);
2836 sqlite3_free(zK2);
2837 }
2838 return rc;
2839 }
2840
2841 /*
2842 ** Rename the spellfix1 table.
2843 */
2844 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
2845 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2846 sqlite3 *db = p->db;
2847 int rc = SQLITE_OK;
2848 char *zNewName = sqlite3_mprintf("%s", zNew);
2849 if( zNewName==0 ){
2850 return SQLITE_NOMEM;
2851 }
2852 spellfix1DbExec(&rc, db,
2853 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2854 p->zDbName, p->zTableName, zNewName
2855 );
2856 if( rc==SQLITE_OK ){
2857 sqlite3_free(p->zTableName);
2858 p->zTableName = zNewName;
2859 }else{
2860 sqlite3_free(zNewName);
2861 }
2862 return rc;
2863 }
2864
2865
2866 /*
2867 ** A virtual table module that provides fuzzy search.
2868 */
2869 static sqlite3_module spellfix1Module = {
2870 0, /* iVersion */
2871 spellfix1Create, /* xCreate - handle CREATE VIRTUAL TABLE */
2872 spellfix1Connect, /* xConnect - reconnected to an existing table */
2873 spellfix1BestIndex, /* xBestIndex - figure out how to do a query */
2874 spellfix1Disconnect, /* xDisconnect - close a connection */
2875 spellfix1Destroy, /* xDestroy - handle DROP TABLE */
2876 spellfix1Open, /* xOpen - open a cursor */
2877 spellfix1Close, /* xClose - close a cursor */
2878 spellfix1Filter, /* xFilter - configure scan constraints */
2879 spellfix1Next, /* xNext - advance a cursor */
2880 spellfix1Eof, /* xEof - check for end of scan */
2881 spellfix1Column, /* xColumn - read data */
2882 spellfix1Rowid, /* xRowid - read data */
2883 spellfix1Update, /* xUpdate */
2884 0, /* xBegin */
2885 0, /* xSync */
2886 0, /* xCommit */
2887 0, /* xRollback */
2888 0, /* xFindMethod */
2889 spellfix1Rename, /* xRename */
2890 };
2891
2892 /*
2893 ** Register the various functions and the virtual table.
2894 */
2895 static int spellfix1Register(sqlite3 *db){
2896 int rc = SQLITE_OK;
2897 int i;
2898 rc = sqlite3_create_function(db, "spellfix1_translit", 1, SQLITE_UTF8, 0,
2899 transliterateSqlFunc, 0, 0);
2900 if( rc==SQLITE_OK ){
2901 rc = sqlite3_create_function(db, "spellfix1_editdist", 2, SQLITE_UTF8, 0,
2902 editdistSqlFunc, 0, 0);
2903 }
2904 if( rc==SQLITE_OK ){
2905 rc = sqlite3_create_function(db, "spellfix1_phonehash", 1, SQLITE_UTF8, 0,
2906 phoneticHashSqlFunc, 0, 0);
2907 }
2908 if( rc==SQLITE_OK ){
2909 rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1, SQLITE_UTF8, 0,
2910 scriptCodeSqlFunc, 0, 0);
2911 }
2912 if( rc==SQLITE_OK ){
2913 rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
2914 }
2915 if( rc==SQLITE_OK ){
2916 rc = editDist3Install(db);
2917 }
2918
2919 /* Verify sanity of the translit[] table */
2920 for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
2921 assert( translit[i].cFrom<translit[i+1].cFrom );
2922 }
2923
2924 return rc;
2925 }
2926
2927 #endif /* SQLITE_OMIT_VIRTUALTABLE */
2928
2929 /*
2930 ** Extension load function.
2931 */
2932 #ifdef _WIN32
2933 __declspec(dllexport)
2934 #endif
2935 int sqlite3_spellfix_init(
2936 sqlite3 *db,
2937 char **pzErrMsg,
2938 const sqlite3_api_routines *pApi
2939 ){
2940 SQLITE_EXTENSION_INIT2(pApi);
2941 #ifndef SQLITE_OMIT_VIRTUALTABLE
2942 return spellfix1Register(db);
2943 #endif
2944 return SQLITE_OK;
2945 }
OLDNEW
« no previous file with comments | « third_party/sqlite/sqlite-src-3170000/ext/misc/showauth.c ('k') | third_party/sqlite/sqlite-src-3170000/ext/misc/totype.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698