Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(244)

Side by Side Diff: third_party/sqlite/sqlite-src-3100200/ext/misc/spellfix.c

Issue 2846743003: [sql] Remove SQLite 3.10.2 reference directory. (Closed)
Patch Set: Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches. See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
16 */
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
19
20 #ifndef SQLITE_AMALGAMATION
21 # include <string.h>
22 # include <stdio.h>
23 # include <stdlib.h>
24 # include <assert.h>
25 # define ALWAYS(X) 1
26 # define NEVER(X) 0
27 typedef unsigned char u8;
28 typedef unsigned short u16;
29 #endif
30 #include <ctype.h>
31
32 #ifndef SQLITE_OMIT_VIRTUALTABLE
33
34 /*
35 ** Character classes for ASCII characters:
36 **
37 ** 0 '' Silent letters: H W
38 ** 1 'A' Any vowel: A E I O U (Y)
39 ** 2 'B' A bilabeal stop or fricative: B F P V W
40 ** 3 'C' Other fricatives or back stops: C G J K Q S X Z
41 ** 4 'D' Alveolar stops: D T
42 ** 5 'H' Letter H at the beginning of a word
43 ** 6 'L' Glide: L
44 ** 7 'R' Semivowel: R
45 ** 8 'M' Nasals: M N
46 ** 9 'Y' Letter Y at the beginning of a word.
47 ** 10 '9' Digits: 0 1 2 3 4 5 6 7 8 9
48 ** 11 ' ' White space
49 ** 12 '?' Other.
50 */
51 #define CCLASS_SILENT 0
52 #define CCLASS_VOWEL 1
53 #define CCLASS_B 2
54 #define CCLASS_C 3
55 #define CCLASS_D 4
56 #define CCLASS_H 5
57 #define CCLASS_L 6
58 #define CCLASS_R 7
59 #define CCLASS_M 8
60 #define CCLASS_Y 9
61 #define CCLASS_DIGIT 10
62 #define CCLASS_SPACE 11
63 #define CCLASS_OTHER 12
64
65 /*
66 ** The following table gives the character class for non-initial ASCII
67 ** characters.
68 */
69 static const unsigned char midClass[] = {
70 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
71 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
72 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
73 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
74 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER,
75 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
76 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
77 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
78 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
79 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
80 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE,
81 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER,
82 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER,
83 /* ' */ CCLASS_SILENT, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER,
84 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER,
85 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER,
86 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT,
87 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT,
88 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT,
89 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER,
90 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER,
91 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL,
92 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D,
93 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C,
94 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C,
95 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M,
96 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B,
97 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C,
98 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B,
99 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_VOWEL,
100 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER,
101 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER,
102 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B,
103 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL,
104 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT,
105 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C,
106 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M,
107 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C,
108 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D,
109 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B,
110 /* x */ CCLASS_C, /* y */ CCLASS_VOWEL, /* z */ CCLASS_C,
111 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER,
112 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER,
113 };
114 /*
115 ** This tables gives the character class for ASCII characters that form the
116 ** initial character of a word. The only difference from midClass is with
117 ** the letters H, W, and Y.
118 */
119 static const unsigned char initClass[] = {
120 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
121 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
122 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
123 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
124 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER,
125 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
126 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
127 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
128 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
129 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
130 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE,
131 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER,
132 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER,
133 /* ' */ CCLASS_OTHER, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER,
134 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER,
135 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER,
136 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT,
137 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT,
138 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT,
139 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER,
140 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER,
141 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL,
142 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D,
143 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C,
144 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C,
145 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M,
146 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B,
147 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C,
148 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B,
149 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_Y,
150 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER,
151 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER,
152 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B,
153 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL,
154 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT,
155 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C,
156 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M,
157 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C,
158 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D,
159 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B,
160 /* x */ CCLASS_C, /* y */ CCLASS_Y, /* z */ CCLASS_C,
161 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER,
162 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER,
163 };
164
165 /*
166 ** Mapping from the character class number (0-13) to a symbol for each
167 ** character class. Note that initClass[] can be used to map the class
168 ** symbol back into the class number.
169 */
170 static const unsigned char className[] = ".ABCDHLRMY9 ?";
171
172 /*
173 ** Generate a "phonetic hash" from a string of ASCII characters
174 ** in zIn[0..nIn-1].
175 **
176 ** * Map characters by character class as defined above.
177 ** * Omit double-letters
178 ** * Omit vowels beside R and L
179 ** * Omit T when followed by CH
180 ** * Omit W when followed by R
181 ** * Omit D when followed by J or G
182 ** * Omit K in KN or G in GN at the beginning of a word
183 **
184 ** Space to hold the result is obtained from sqlite3_malloc()
185 **
186 ** Return NULL if memory allocation fails.
187 */
188 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){
189 unsigned char *zOut = sqlite3_malloc( nIn + 1 );
190 int i;
191 int nOut = 0;
192 char cPrev = 0x77;
193 char cPrevX = 0x77;
194 const unsigned char *aClass = initClass;
195
196 if( zOut==0 ) return 0;
197 if( nIn>2 ){
198 switch( zIn[0] ){
199 case 'g':
200 case 'k': {
201 if( zIn[1]=='n' ){ zIn++; nIn--; }
202 break;
203 }
204 }
205 }
206 for(i=0; i<nIn; i++){
207 unsigned char c = zIn[i];
208 if( i+1<nIn ){
209 if( c=='w' && zIn[i+1]=='r' ) continue;
210 if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
211 if( i+2<nIn ){
212 if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
213 }
214 }
215 c = aClass[c&0x7f];
216 if( c==CCLASS_SPACE ) continue;
217 if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
218 aClass = midClass;
219 if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){
220 continue; /* No vowels beside L or R */
221 }
222 if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){
223 nOut--; /* No vowels beside L or R */
224 }
225 cPrev = c;
226 if( c==CCLASS_SILENT ) continue;
227 cPrevX = c;
228 c = className[c];
229 assert( nOut>=0 );
230 if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
231 }
232 zOut[nOut] = 0;
233 return zOut;
234 }
235
236 /*
237 ** This is an SQL function wrapper around phoneticHash(). See
238 ** the description of phoneticHash() for additional information.
239 */
240 static void phoneticHashSqlFunc(
241 sqlite3_context *context,
242 int argc,
243 sqlite3_value **argv
244 ){
245 const unsigned char *zIn;
246 unsigned char *zOut;
247
248 zIn = sqlite3_value_text(argv[0]);
249 if( zIn==0 ) return;
250 zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
251 if( zOut==0 ){
252 sqlite3_result_error_nomem(context);
253 }else{
254 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
255 }
256 }
257
258 /*
259 ** Return the character class number for a character given its
260 ** context.
261 */
262 static char characterClass(char cPrev, char c){
263 return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
264 }
265
266 /*
267 ** Return the cost of inserting or deleting character c immediately
268 ** following character cPrev. If cPrev==0, that means c is the first
269 ** character of the word.
270 */
271 static int insertOrDeleteCost(char cPrev, char c, char cNext){
272 char classC = characterClass(cPrev, c);
273 char classCprev;
274
275 if( classC==CCLASS_SILENT ){
276 /* Insert or delete "silent" characters such as H or W */
277 return 1;
278 }
279 if( cPrev==c ){
280 /* Repeated characters, or miss a repeat */
281 return 10;
282 }
283 if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){
284 return 20; /* Insert a vowel before or after 'r' */
285 }
286 classCprev = characterClass(cPrev, cPrev);
287 if( classC==classCprev ){
288 if( classC==CCLASS_VOWEL ){
289 /* Remove or add a new vowel to a vowel cluster */
290 return 15;
291 }else{
292 /* Remove or add a consonant not in the same class */
293 return 50;
294 }
295 }
296
297 /* any other character insertion or deletion */
298 return 100;
299 }
300
301 /*
302 ** Divide the insertion cost by this factor when appending to the
303 ** end of the word.
304 */
305 #define FINAL_INS_COST_DIV 4
306
307 /*
308 ** Return the cost of substituting cTo in place of cFrom assuming
309 ** the previous character is cPrev. If cPrev==0 then cTo is the first
310 ** character of the word.
311 */
312 static int substituteCost(char cPrev, char cFrom, char cTo){
313 char classFrom, classTo;
314 if( cFrom==cTo ){
315 /* Exact match */
316 return 0;
317 }
318 if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
319 /* differ only in case */
320 return 0;
321 }
322 classFrom = characterClass(cPrev, cFrom);
323 classTo = characterClass(cPrev, cTo);
324 if( classFrom==classTo ){
325 /* Same character class */
326 return 40;
327 }
328 if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
329 && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
330 /* Convert from one consonant to another, but in a different class */
331 return 75;
332 }
333 /* Any other subsitution */
334 return 100;
335 }
336
337 /*
338 ** Given two strings zA and zB which are pure ASCII, return the cost
339 ** of transforming zA into zB. If zA ends with '*' assume that it is
340 ** a prefix of zB and give only minimal penalty for extra characters
341 ** on the end of zB.
342 **
343 ** Smaller numbers mean a closer match.
344 **
345 ** Negative values indicate an error:
346 ** -1 One of the inputs is NULL
347 ** -2 Non-ASCII characters on input
348 ** -3 Unable to allocate memory
349 **
350 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
351 ** of zB that matched the pattern in zA. If zA does not end with a '*',
352 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
353 ** If zA does end in a '*', then it is the number of bytes in the prefix
354 ** of zB that was deemed to match zA.
355 */
356 static int editdist1(const char *zA, const char *zB, int *pnMatch){
357 int nA, nB; /* Number of characters in zA[] and zB[] */
358 int xA, xB; /* Loop counters for zA[] and zB[] */
359 char cA = 0, cB; /* Current character of zA and zB */
360 char cAprev, cBprev; /* Previous character of zA and zB */
361 char cAnext, cBnext; /* Next character in zA and zB */
362 int d; /* North-west cost value */
363 int dc = 0; /* North-west character value */
364 int res; /* Final result */
365 int *m; /* The cost matrix */
366 char *cx; /* Corresponding character values */
367 int *toFree = 0; /* Malloced space */
368 int mStack[60+15]; /* Stack space to use if not too much is needed */
369 int nMatch = 0;
370
371 /* Early out if either input is NULL */
372 if( zA==0 || zB==0 ) return -1;
373
374 /* Skip any common prefix */
375 while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
376 if( pnMatch ) *pnMatch = nMatch;
377 if( zA[0]==0 && zB[0]==0 ) return 0;
378
379 #if 0
380 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
381 #endif
382
383 /* Verify input strings and measure their lengths */
384 for(nA=0; zA[nA]; nA++){
385 if( zA[nA]&0x80 ) return -2;
386 }
387 for(nB=0; zB[nB]; nB++){
388 if( zB[nB]&0x80 ) return -2;
389 }
390
391 /* Special processing if either string is empty */
392 if( nA==0 ){
393 cBprev = dc;
394 for(xB=res=0; (cB = zB[xB])!=0; xB++){
395 res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
396 cBprev = cB;
397 }
398 return res;
399 }
400 if( nB==0 ){
401 cAprev = dc;
402 for(xA=res=0; (cA = zA[xA])!=0; xA++){
403 res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
404 cAprev = cA;
405 }
406 return res;
407 }
408
409 /* A is a prefix of B */
410 if( zA[0]=='*' && zA[1]==0 ) return 0;
411
412 /* Allocate and initialize the Wagner matrix */
413 if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
414 m = mStack;
415 }else{
416 m = toFree = sqlite3_malloc( (nB+1)*5*sizeof(m[0])/4 );
417 if( m==0 ) return -3;
418 }
419 cx = (char*)&m[nB+1];
420
421 /* Compute the Wagner edit distance */
422 m[0] = 0;
423 cx[0] = dc;
424 cBprev = dc;
425 for(xB=1; xB<=nB; xB++){
426 cBnext = zB[xB];
427 cB = zB[xB-1];
428 cx[xB] = cB;
429 m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
430 cBprev = cB;
431 }
432 cAprev = dc;
433 for(xA=1; xA<=nA; xA++){
434 int lastA = (xA==nA);
435 cA = zA[xA-1];
436 cAnext = zA[xA];
437 if( cA=='*' && lastA ) break;
438 d = m[0];
439 dc = cx[0];
440 m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
441 cBprev = 0;
442 for(xB=1; xB<=nB; xB++){
443 int totalCost, insCost, delCost, subCost, ncx;
444 cB = zB[xB-1];
445 cBnext = zB[xB];
446
447 /* Cost to insert cB */
448 insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
449 if( lastA ) insCost /= FINAL_INS_COST_DIV;
450
451 /* Cost to delete cA */
452 delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
453
454 /* Cost to substitute cA->cB */
455 subCost = substituteCost(cx[xB-1], cA, cB);
456
457 /* Best cost */
458 totalCost = insCost + m[xB-1];
459 ncx = cB;
460 if( (delCost + m[xB])<totalCost ){
461 totalCost = delCost + m[xB];
462 ncx = cA;
463 }
464 if( (subCost + d)<totalCost ){
465 totalCost = subCost + d;
466 }
467
468 #if 0
469 printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
470 " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
471 xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
472 insCost, delCost, subCost, totalCost, ncx?ncx:' ');
473 #endif
474
475 /* Update the matrix */
476 d = m[xB];
477 dc = cx[xB];
478 m[xB] = totalCost;
479 cx[xB] = ncx;
480 cBprev = cB;
481 }
482 cAprev = cA;
483 }
484
485 /* Free the wagner matrix and return the result */
486 if( cA=='*' ){
487 res = m[1];
488 for(xB=1; xB<=nB; xB++){
489 if( m[xB]<res ){
490 res = m[xB];
491 if( pnMatch ) *pnMatch = xB+nMatch;
492 }
493 }
494 }else{
495 res = m[nB];
496 /* In the current implementation, pnMatch is always NULL if zA does
497 ** not end in "*" */
498 assert( pnMatch==0 );
499 }
500 sqlite3_free(toFree);
501 return res;
502 }
503
504 /*
505 ** Function: editdist(A,B)
506 **
507 ** Return the cost of transforming string A into string B. Both strings
508 ** must be pure ASCII text. If A ends with '*' then it is assumed to be
509 ** a prefix of B and extra characters on the end of B have minimal additional
510 ** cost.
511 */
512 static void editdistSqlFunc(
513 sqlite3_context *context,
514 int argc,
515 sqlite3_value **argv
516 ){
517 int res = editdist1(
518 (const char*)sqlite3_value_text(argv[0]),
519 (const char*)sqlite3_value_text(argv[1]),
520 0);
521 if( res<0 ){
522 if( res==(-3) ){
523 sqlite3_result_error_nomem(context);
524 }else if( res==(-2) ){
525 sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
526 }else{
527 sqlite3_result_error(context, "NULL input to editdist()", -1);
528 }
529 }else{
530 sqlite3_result_int(context, res);
531 }
532 }
533
534 /* End of the fixed-cost edit distance implementation
535 ******************************************************************************
536 *****************************************************************************
537 ** Begin: Configurable cost unicode edit distance routines
538 */
539 /* Forward declaration of structures */
540 typedef struct EditDist3Cost EditDist3Cost;
541 typedef struct EditDist3Config EditDist3Config;
542 typedef struct EditDist3Point EditDist3Point;
543 typedef struct EditDist3From EditDist3From;
544 typedef struct EditDist3FromString EditDist3FromString;
545 typedef struct EditDist3To EditDist3To;
546 typedef struct EditDist3ToString EditDist3ToString;
547 typedef struct EditDist3Lang EditDist3Lang;
548
549
550 /*
551 ** An entry in the edit cost table
552 */
553 struct EditDist3Cost {
554 EditDist3Cost *pNext; /* Next cost element */
555 u8 nFrom; /* Number of bytes in aFrom */
556 u8 nTo; /* Number of bytes in aTo */
557 u16 iCost; /* Cost of this transformation */
558 char a[4] ; /* FROM string followed by TO string */
559 /* Additional TO and FROM string bytes appended as necessary */
560 };
561
562 /*
563 ** Edit costs for a particular language ID
564 */
565 struct EditDist3Lang {
566 int iLang; /* Language ID */
567 int iInsCost; /* Default insertion cost */
568 int iDelCost; /* Default deletion cost */
569 int iSubCost; /* Default substitution cost */
570 EditDist3Cost *pCost; /* Costs */
571 };
572
573
574 /*
575 ** The default EditDist3Lang object, with default costs.
576 */
577 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
578
579 /*
580 ** Complete configuration
581 */
582 struct EditDist3Config {
583 int nLang; /* Number of language IDs. Size of a[] */
584 EditDist3Lang *a; /* One for each distinct language ID */
585 };
586
587 /*
588 ** Extra information about each character in the FROM string.
589 */
590 struct EditDist3From {
591 int nSubst; /* Number of substitution cost entries */
592 int nDel; /* Number of deletion cost entries */
593 int nByte; /* Number of bytes in this character */
594 EditDist3Cost **apSubst; /* Array of substitution costs for this element */
595 EditDist3Cost **apDel; /* Array of deletion cost entries */
596 };
597
598 /*
599 ** A precompiled FROM string.
600 *
601 ** In the common case we expect the FROM string to be reused multiple times.
602 ** In other words, the common case will be to measure the edit distance
603 ** from a single origin string to multiple target strings.
604 */
605 struct EditDist3FromString {
606 char *z; /* The complete text of the FROM string */
607 int n; /* Number of characters in the FROM string */
608 int isPrefix; /* True if ends with '*' character */
609 EditDist3From *a; /* Extra info about each char of the FROM string */
610 };
611
612 /*
613 ** Extra information about each character in the TO string.
614 */
615 struct EditDist3To {
616 int nIns; /* Number of insertion cost entries */
617 int nByte; /* Number of bytes in this character */
618 EditDist3Cost **apIns; /* Array of deletion cost entries */
619 };
620
621 /*
622 ** A precompiled FROM string
623 */
624 struct EditDist3ToString {
625 char *z; /* The complete text of the TO string */
626 int n; /* Number of characters in the TO string */
627 EditDist3To *a; /* Extra info about each char of the TO string */
628 };
629
630 /*
631 ** Clear or delete an instance of the object that records all edit-distance
632 ** weights.
633 */
634 static void editDist3ConfigClear(EditDist3Config *p){
635 int i;
636 if( p==0 ) return;
637 for(i=0; i<p->nLang; i++){
638 EditDist3Cost *pCost, *pNext;
639 pCost = p->a[i].pCost;
640 while( pCost ){
641 pNext = pCost->pNext;
642 sqlite3_free(pCost);
643 pCost = pNext;
644 }
645 }
646 sqlite3_free(p->a);
647 memset(p, 0, sizeof(*p));
648 }
649 static void editDist3ConfigDelete(void *pIn){
650 EditDist3Config *p = (EditDist3Config*)pIn;
651 editDist3ConfigClear(p);
652 sqlite3_free(p);
653 }
654
655 /*
656 ** Load all edit-distance weights from a table.
657 */
658 static int editDist3ConfigLoad(
659 EditDist3Config *p, /* The edit distance configuration to load */
660 sqlite3 *db, /* Load from this database */
661 const char *zTable /* Name of the table from which to load */
662 ){
663 sqlite3_stmt *pStmt;
664 int rc, rc2;
665 char *zSql;
666 int iLangPrev = -9999;
667 EditDist3Lang *pLang = 0;
668
669 zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
670 " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
671 if( zSql==0 ) return SQLITE_NOMEM;
672 rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
673 sqlite3_free(zSql);
674 if( rc ) return rc;
675 editDist3ConfigClear(p);
676 while( sqlite3_step(pStmt)==SQLITE_ROW ){
677 int iLang = sqlite3_column_int(pStmt, 0);
678 const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
679 int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
680 const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
681 int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
682 int iCost = sqlite3_column_int(pStmt, 3);
683
684 assert( zFrom!=0 || nFrom==0 );
685 assert( zTo!=0 || nTo==0 );
686 if( nFrom>100 || nTo>100 ) continue;
687 if( iCost<0 ) continue;
688 if( pLang==0 || iLang!=iLangPrev ){
689 EditDist3Lang *pNew;
690 pNew = sqlite3_realloc(p->a, (p->nLang+1)*sizeof(p->a[0]));
691 if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
692 p->a = pNew;
693 pLang = &p->a[p->nLang];
694 p->nLang++;
695 pLang->iLang = iLang;
696 pLang->iInsCost = 100;
697 pLang->iDelCost = 100;
698 pLang->iSubCost = 150;
699 pLang->pCost = 0;
700 iLangPrev = iLang;
701 }
702 if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
703 pLang->iDelCost = iCost;
704 }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
705 pLang->iInsCost = iCost;
706 }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){
707 pLang->iSubCost = iCost;
708 }else{
709 EditDist3Cost *pCost;
710 int nExtra = nFrom + nTo - 4;
711 if( nExtra<0 ) nExtra = 0;
712 pCost = sqlite3_malloc( sizeof(*pCost) + nExtra );
713 if( pCost==0 ){ rc = SQLITE_NOMEM; break; }
714 pCost->nFrom = nFrom;
715 pCost->nTo = nTo;
716 pCost->iCost = iCost;
717 memcpy(pCost->a, zFrom, nFrom);
718 memcpy(pCost->a + nFrom, zTo, nTo);
719 pCost->pNext = pLang->pCost;
720 pLang->pCost = pCost;
721 }
722 }
723 rc2 = sqlite3_finalize(pStmt);
724 if( rc==SQLITE_OK ) rc = rc2;
725 return rc;
726 }
727
728 /*
729 ** Return the length (in bytes) of a utf-8 character. Or return a maximum
730 ** of N.
731 */
732 static int utf8Len(unsigned char c, int N){
733 int len = 1;
734 if( c>0x7f ){
735 if( (c&0xe0)==0xc0 ){
736 len = 2;
737 }else if( (c&0xf0)==0xe0 ){
738 len = 3;
739 }else{
740 len = 4;
741 }
742 }
743 if( len>N ) len = N;
744 return len;
745 }
746
747 /*
748 ** Return TRUE (non-zero) if the To side of the given cost matches
749 ** the given string.
750 */
751 static int matchTo(EditDist3Cost *p, const char *z, int n){
752 if( p->nTo>n ) return 0;
753 if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
754 return 1;
755 }
756
757 /*
758 ** Return TRUE (non-zero) if the From side of the given cost matches
759 ** the given string.
760 */
761 static int matchFrom(EditDist3Cost *p, const char *z, int n){
762 assert( p->nFrom<=n );
763 if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
764 return 1;
765 }
766
767 /*
768 ** Return TRUE (non-zero) of the next FROM character and the next TO
769 ** character are the same.
770 */
771 static int matchFromTo(
772 EditDist3FromString *pStr, /* Left hand string */
773 int n1, /* Index of comparison character on the left */
774 const char *z2, /* Right-handl comparison character */
775 int n2 /* Bytes remaining in z2[] */
776 ){
777 int b1 = pStr->a[n1].nByte;
778 if( b1>n2 ) return 0;
779 if( memcmp(pStr->z+n1, z2, b1)!=0 ) return 0;
780 return 1;
781 }
782
783 /*
784 ** Delete an EditDist3FromString objecct
785 */
786 static void editDist3FromStringDelete(EditDist3FromString *p){
787 int i;
788 if( p ){
789 for(i=0; i<p->n; i++){
790 sqlite3_free(p->a[i].apDel);
791 sqlite3_free(p->a[i].apSubst);
792 }
793 sqlite3_free(p);
794 }
795 }
796
797 /*
798 ** Create a EditDist3FromString object.
799 */
800 static EditDist3FromString *editDist3FromStringNew(
801 const EditDist3Lang *pLang,
802 const char *z,
803 int n
804 ){
805 EditDist3FromString *pStr;
806 EditDist3Cost *p;
807 int i;
808
809 if( z==0 ) return 0;
810 if( n<0 ) n = (int)strlen(z);
811 pStr = sqlite3_malloc( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
812 if( pStr==0 ) return 0;
813 pStr->a = (EditDist3From*)&pStr[1];
814 memset(pStr->a, 0, sizeof(pStr->a[0])*n);
815 pStr->n = n;
816 pStr->z = (char*)&pStr->a[n];
817 memcpy(pStr->z, z, n+1);
818 if( n && z[n-1]=='*' ){
819 pStr->isPrefix = 1;
820 n--;
821 pStr->n--;
822 pStr->z[n] = 0;
823 }else{
824 pStr->isPrefix = 0;
825 }
826
827 for(i=0; i<n; i++){
828 EditDist3From *pFrom = &pStr->a[i];
829 memset(pFrom, 0, sizeof(*pFrom));
830 pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
831 for(p=pLang->pCost; p; p=p->pNext){
832 EditDist3Cost **apNew;
833 if( i+p->nFrom>n ) continue;
834 if( matchFrom(p, z+i, n-i)==0 ) continue;
835 if( p->nTo==0 ){
836 apNew = sqlite3_realloc(pFrom->apDel,
837 sizeof(*apNew)*(pFrom->nDel+1));
838 if( apNew==0 ) break;
839 pFrom->apDel = apNew;
840 apNew[pFrom->nDel++] = p;
841 }else{
842 apNew = sqlite3_realloc(pFrom->apSubst,
843 sizeof(*apNew)*(pFrom->nSubst+1));
844 if( apNew==0 ) break;
845 pFrom->apSubst = apNew;
846 apNew[pFrom->nSubst++] = p;
847 }
848 }
849 if( p ){
850 editDist3FromStringDelete(pStr);
851 pStr = 0;
852 break;
853 }
854 }
855 return pStr;
856 }
857
858 /*
859 ** Update entry m[i] such that it is the minimum of its current value
860 ** and m[j]+iCost.
861 **
862 ** If the iCost is 1,000,000 or greater, then consider the cost to be
863 ** infinite and skip the update.
864 */
865 static void updateCost(
866 unsigned int *m,
867 int i,
868 int j,
869 int iCost
870 ){
871 assert( iCost>=0 );
872 if( iCost<10000 ){
873 unsigned int b = m[j] + iCost;
874 if( b<m[i] ) m[i] = b;
875 }
876 }
877
878 /* Compute the edit distance between two strings.
879 **
880 ** If an error occurs, return a negative number which is the error code.
881 **
882 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
883 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
884 ** not contain the pattern for a prefix-search, then this is always the number
885 ** of characters in z2. If pFrom does contain a prefix search pattern, then
886 ** it is the number of characters in the prefix of z2 that was deemed to
887 ** match pFrom.
888 */
889 static int editDist3Core(
890 EditDist3FromString *pFrom, /* The FROM string */
891 const char *z2, /* The TO string */
892 int n2, /* Length of the TO string */
893 const EditDist3Lang *pLang, /* Edit weights for a particular language ID */
894 int *pnMatch /* OUT: Characters in matched prefix */
895 ){
896 int k, n;
897 int i1, b1;
898 int i2, b2;
899 EditDist3FromString f = *pFrom;
900 EditDist3To *a2;
901 unsigned int *m;
902 int szRow;
903 EditDist3Cost *p;
904 int res;
905
906 /* allocate the Wagner matrix and the aTo[] array for the TO string */
907 n = (f.n+1)*(n2+1);
908 n = (n+1)&~1;
909 m = sqlite3_malloc( n*sizeof(m[0]) + sizeof(a2[0])*n2 );
910 if( m==0 ) return -1; /* Out of memory */
911 a2 = (EditDist3To*)&m[n];
912 memset(a2, 0, sizeof(a2[0])*n2);
913
914 /* Fill in the a1[] matrix for all characters of the TO string */
915 for(i2=0; i2<n2; i2++){
916 a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
917 for(p=pLang->pCost; p; p=p->pNext){
918 EditDist3Cost **apNew;
919 if( p->nFrom>0 ) continue;
920 if( i2+p->nTo>n2 ) continue;
921 if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
922 a2[i2].nIns++;
923 apNew = sqlite3_realloc(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
924 if( apNew==0 ){
925 res = -1; /* Out of memory */
926 goto editDist3Abort;
927 }
928 a2[i2].apIns = apNew;
929 a2[i2].apIns[a2[i2].nIns-1] = p;
930 }
931 }
932
933 /* Prepare to compute the minimum edit distance */
934 szRow = f.n+1;
935 memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
936 m[0] = 0;
937
938 /* First fill in the top-row of the matrix with FROM deletion costs */
939 for(i1=0; i1<f.n; i1 += b1){
940 b1 = f.a[i1].nByte;
941 updateCost(m, i1+b1, i1, pLang->iDelCost);
942 for(k=0; k<f.a[i1].nDel; k++){
943 p = f.a[i1].apDel[k];
944 updateCost(m, i1+p->nFrom, i1, p->iCost);
945 }
946 }
947
948 /* Fill in all subsequent rows, top-to-bottom, left-to-right */
949 for(i2=0; i2<n2; i2 += b2){
950 int rx; /* Starting index for current row */
951 int rxp; /* Starting index for previous row */
952 b2 = a2[i2].nByte;
953 rx = szRow*(i2+b2);
954 rxp = szRow*i2;
955 updateCost(m, rx, rxp, pLang->iInsCost);
956 for(k=0; k<a2[i2].nIns; k++){
957 p = a2[i2].apIns[k];
958 updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
959 }
960 for(i1=0; i1<f.n; i1+=b1){
961 int cx; /* Index of current cell */
962 int cxp; /* Index of cell immediately to the left */
963 int cxd; /* Index of cell to the left and one row above */
964 int cxu; /* Index of cell immediately above */
965 b1 = f.a[i1].nByte;
966 cxp = rx + i1;
967 cx = cxp + b1;
968 cxd = rxp + i1;
969 cxu = cxd + b1;
970 updateCost(m, cx, cxp, pLang->iDelCost);
971 for(k=0; k<f.a[i1].nDel; k++){
972 p = f.a[i1].apDel[k];
973 updateCost(m, cxp+p->nFrom, cxp, p->iCost);
974 }
975 updateCost(m, cx, cxu, pLang->iInsCost);
976 if( matchFromTo(&f, i1, z2+i2, n2-i2) ){
977 updateCost(m, cx, cxd, 0);
978 }
979 updateCost(m, cx, cxd, pLang->iSubCost);
980 for(k=0; k<f.a[i1].nSubst; k++){
981 p = f.a[i1].apSubst[k];
982 if( matchTo(p, z2+i2, n2-i2) ){
983 updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
984 }
985 }
986 }
987 }
988
989 #if 0 /* Enable for debugging */
990 printf(" ^");
991 for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
992 printf("\n ^:");
993 for(i1=0; i1<szRow; i1++){
994 int v = m[i1];
995 if( v>9999 ) printf(" ****");
996 else printf(" %4d", v);
997 }
998 printf("\n");
999 for(i2=0; i2<n2; i2++){
1000 printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1001 for(i1=0; i1<szRow; i1++){
1002 int v = m[(i2+1)*szRow+i1];
1003 if( v>9999 ) printf(" ****");
1004 else printf(" %4d", v);
1005 }
1006 printf("\n");
1007 }
1008 #endif
1009
1010 /* Free memory allocations and return the result */
1011 res = (int)m[szRow*(n2+1)-1];
1012 n = n2;
1013 if( f.isPrefix ){
1014 for(i2=1; i2<=n2; i2++){
1015 int b = m[szRow*i2-1];
1016 if( b<=res ){
1017 res = b;
1018 n = i2 - 1;
1019 }
1020 }
1021 }
1022 if( pnMatch ){
1023 int nExtra = 0;
1024 for(k=0; k<n; k++){
1025 if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1026 }
1027 *pnMatch = n - nExtra;
1028 }
1029
1030 editDist3Abort:
1031 for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1032 sqlite3_free(m);
1033 return res;
1034 }
1035
1036 /*
1037 ** Get an appropriate EditDist3Lang object.
1038 */
1039 static const EditDist3Lang *editDist3FindLang(
1040 EditDist3Config *pConfig,
1041 int iLang
1042 ){
1043 int i;
1044 for(i=0; i<pConfig->nLang; i++){
1045 if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1046 }
1047 return &editDist3Lang;
1048 }
1049
1050 /*
1051 ** Function: editdist3(A,B,iLang)
1052 ** editdist3(tablename)
1053 **
1054 ** Return the cost of transforming string A into string B using edit
1055 ** weights for iLang.
1056 **
1057 ** The second form loads edit weights into memory from a table.
1058 */
1059 static void editDist3SqlFunc(
1060 sqlite3_context *context,
1061 int argc,
1062 sqlite3_value **argv
1063 ){
1064 EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1065 sqlite3 *db = sqlite3_context_db_handle(context);
1066 int rc;
1067 if( argc==1 ){
1068 const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1069 rc = editDist3ConfigLoad(pConfig, db, zTable);
1070 if( rc ) sqlite3_result_error_code(context, rc);
1071 }else{
1072 const char *zA = (const char*)sqlite3_value_text(argv[0]);
1073 const char *zB = (const char*)sqlite3_value_text(argv[1]);
1074 int nA = sqlite3_value_bytes(argv[0]);
1075 int nB = sqlite3_value_bytes(argv[1]);
1076 int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1077 const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1078 EditDist3FromString *pFrom;
1079 int dist;
1080
1081 pFrom = editDist3FromStringNew(pLang, zA, nA);
1082 if( pFrom==0 ){
1083 sqlite3_result_error_nomem(context);
1084 return;
1085 }
1086 dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1087 editDist3FromStringDelete(pFrom);
1088 if( dist==(-1) ){
1089 sqlite3_result_error_nomem(context);
1090 }else{
1091 sqlite3_result_int(context, dist);
1092 }
1093 }
1094 }
1095
1096 /*
1097 ** Register the editDist3 function with SQLite
1098 */
1099 static int editDist3Install(sqlite3 *db){
1100 int rc;
1101 EditDist3Config *pConfig = sqlite3_malloc( sizeof(*pConfig) );
1102 if( pConfig==0 ) return SQLITE_NOMEM;
1103 memset(pConfig, 0, sizeof(*pConfig));
1104 rc = sqlite3_create_function_v2(db, "editdist3",
1105 2, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1106 if( rc==SQLITE_OK ){
1107 rc = sqlite3_create_function_v2(db, "editdist3",
1108 3, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0, 0);
1109 }
1110 if( rc==SQLITE_OK ){
1111 rc = sqlite3_create_function_v2(db, "editdist3",
1112 1, SQLITE_UTF8, pConfig, editDist3SqlFunc, 0, 0,
1113 editDist3ConfigDelete);
1114 }else{
1115 sqlite3_free(pConfig);
1116 }
1117 return rc;
1118 }
1119 /* End configurable cost unicode edit distance routines
1120 ******************************************************************************
1121 ******************************************************************************
1122 ** Begin transliterate unicode-to-ascii implementation
1123 */
1124
1125 #if !SQLITE_AMALGAMATION
1126 /*
1127 ** This lookup table is used to help decode the first byte of
1128 ** a multi-byte UTF8 character.
1129 */
1130 static const unsigned char sqlite3Utf8Trans1[] = {
1131 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1132 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1133 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1134 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1135 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1136 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1137 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1138 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1139 };
1140 #endif
1141
1142 /*
1143 ** Return the value of the first UTF-8 character in the string.
1144 */
1145 static int utf8Read(const unsigned char *z, int n, int *pSize){
1146 int c, i;
1147
1148 /* All callers to this routine (in the current implementation)
1149 ** always have n>0. */
1150 if( NEVER(n==0) ){
1151 c = i = 0;
1152 }else{
1153 c = z[0];
1154 i = 1;
1155 if( c>=0xc0 ){
1156 c = sqlite3Utf8Trans1[c-0xc0];
1157 while( i<n && (z[i] & 0xc0)==0x80 ){
1158 c = (c<<6) + (0x3f & z[i++]);
1159 }
1160 }
1161 }
1162 *pSize = i;
1163 return c;
1164 }
1165
1166 /*
1167 ** Return the number of characters in the utf-8 string in the nIn byte
1168 ** buffer pointed to by zIn.
1169 */
1170 static int utf8Charlen(const char *zIn, int nIn){
1171 int i;
1172 int nChar = 0;
1173 for(i=0; i<nIn; nChar++){
1174 int sz;
1175 utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1176 i += sz;
1177 }
1178 return nChar;
1179 }
1180
1181 /*
1182 ** Table of translations from unicode characters into ASCII.
1183 */
1184 static const struct {
1185 unsigned short int cFrom;
1186 unsigned char cTo0, cTo1;
1187 } translit[] = {
1188 { 0x00A0, 0x20, 0x00 }, /*   to */
1189 { 0x00B5, 0x75, 0x00 }, /* µ to u */
1190 { 0x00C0, 0x41, 0x00 }, /* À to A */
1191 { 0x00C1, 0x41, 0x00 }, /* Á to A */
1192 { 0x00C2, 0x41, 0x00 }, /* Â to A */
1193 { 0x00C3, 0x41, 0x00 }, /* Ã to A */
1194 { 0x00C4, 0x41, 0x65 }, /* Ä to Ae */
1195 { 0x00C5, 0x41, 0x61 }, /* Å to Aa */
1196 { 0x00C6, 0x41, 0x45 }, /* Æ to AE */
1197 { 0x00C7, 0x43, 0x00 }, /* Ç to C */
1198 { 0x00C8, 0x45, 0x00 }, /* È to E */
1199 { 0x00C9, 0x45, 0x00 }, /* É to E */
1200 { 0x00CA, 0x45, 0x00 }, /* Ê to E */
1201 { 0x00CB, 0x45, 0x00 }, /* Ë to E */
1202 { 0x00CC, 0x49, 0x00 }, /* Ì to I */
1203 { 0x00CD, 0x49, 0x00 }, /* Í to I */
1204 { 0x00CE, 0x49, 0x00 }, /* Î to I */
1205 { 0x00CF, 0x49, 0x00 }, /* Ï to I */
1206 { 0x00D0, 0x44, 0x00 }, /* Ð to D */
1207 { 0x00D1, 0x4E, 0x00 }, /* Ñ to N */
1208 { 0x00D2, 0x4F, 0x00 }, /* Ò to O */
1209 { 0x00D3, 0x4F, 0x00 }, /* Ó to O */
1210 { 0x00D4, 0x4F, 0x00 }, /* Ô to O */
1211 { 0x00D5, 0x4F, 0x00 }, /* Õ to O */
1212 { 0x00D6, 0x4F, 0x65 }, /* Ö to Oe */
1213 { 0x00D7, 0x78, 0x00 }, /* × to x */
1214 { 0x00D8, 0x4F, 0x00 }, /* Ø to O */
1215 { 0x00D9, 0x55, 0x00 }, /* Ù to U */
1216 { 0x00DA, 0x55, 0x00 }, /* Ú to U */
1217 { 0x00DB, 0x55, 0x00 }, /* Û to U */
1218 { 0x00DC, 0x55, 0x65 }, /* Ü to Ue */
1219 { 0x00DD, 0x59, 0x00 }, /* Ý to Y */
1220 { 0x00DE, 0x54, 0x68 }, /* Þ to Th */
1221 { 0x00DF, 0x73, 0x73 }, /* ß to ss */
1222 { 0x00E0, 0x61, 0x00 }, /* à to a */
1223 { 0x00E1, 0x61, 0x00 }, /* á to a */
1224 { 0x00E2, 0x61, 0x00 }, /* â to a */
1225 { 0x00E3, 0x61, 0x00 }, /* ã to a */
1226 { 0x00E4, 0x61, 0x65 }, /* ä to ae */
1227 { 0x00E5, 0x61, 0x61 }, /* å to aa */
1228 { 0x00E6, 0x61, 0x65 }, /* æ to ae */
1229 { 0x00E7, 0x63, 0x00 }, /* ç to c */
1230 { 0x00E8, 0x65, 0x00 }, /* è to e */
1231 { 0x00E9, 0x65, 0x00 }, /* é to e */
1232 { 0x00EA, 0x65, 0x00 }, /* ê to e */
1233 { 0x00EB, 0x65, 0x00 }, /* ë to e */
1234 { 0x00EC, 0x69, 0x00 }, /* ì to i */
1235 { 0x00ED, 0x69, 0x00 }, /* í to i */
1236 { 0x00EE, 0x69, 0x00 }, /* î to i */
1237 { 0x00EF, 0x69, 0x00 }, /* ï to i */
1238 { 0x00F0, 0x64, 0x00 }, /* ð to d */
1239 { 0x00F1, 0x6E, 0x00 }, /* ñ to n */
1240 { 0x00F2, 0x6F, 0x00 }, /* ò to o */
1241 { 0x00F3, 0x6F, 0x00 }, /* ó to o */
1242 { 0x00F4, 0x6F, 0x00 }, /* ô to o */
1243 { 0x00F5, 0x6F, 0x00 }, /* õ to o */
1244 { 0x00F6, 0x6F, 0x65 }, /* ö to oe */
1245 { 0x00F7, 0x3A, 0x00 }, /* ÷ to : */
1246 { 0x00F8, 0x6F, 0x00 }, /* ø to o */
1247 { 0x00F9, 0x75, 0x00 }, /* ù to u */
1248 { 0x00FA, 0x75, 0x00 }, /* ú to u */
1249 { 0x00FB, 0x75, 0x00 }, /* û to u */
1250 { 0x00FC, 0x75, 0x65 }, /* ü to ue */
1251 { 0x00FD, 0x79, 0x00 }, /* ý to y */
1252 { 0x00FE, 0x74, 0x68 }, /* þ to th */
1253 { 0x00FF, 0x79, 0x00 }, /* ÿ to y */
1254 { 0x0100, 0x41, 0x00 }, /* Ā to A */
1255 { 0x0101, 0x61, 0x00 }, /* ā to a */
1256 { 0x0102, 0x41, 0x00 }, /* Ă to A */
1257 { 0x0103, 0x61, 0x00 }, /* ă to a */
1258 { 0x0104, 0x41, 0x00 }, /* Ą to A */
1259 { 0x0105, 0x61, 0x00 }, /* ą to a */
1260 { 0x0106, 0x43, 0x00 }, /* Ć to C */
1261 { 0x0107, 0x63, 0x00 }, /* ć to c */
1262 { 0x0108, 0x43, 0x68 }, /* Ĉ to Ch */
1263 { 0x0109, 0x63, 0x68 }, /* ĉ to ch */
1264 { 0x010A, 0x43, 0x00 }, /* Ċ to C */
1265 { 0x010B, 0x63, 0x00 }, /* ċ to c */
1266 { 0x010C, 0x43, 0x00 }, /* Č to C */
1267 { 0x010D, 0x63, 0x00 }, /* č to c */
1268 { 0x010E, 0x44, 0x00 }, /* Ď to D */
1269 { 0x010F, 0x64, 0x00 }, /* ď to d */
1270 { 0x0110, 0x44, 0x00 }, /* Đ to D */
1271 { 0x0111, 0x64, 0x00 }, /* đ to d */
1272 { 0x0112, 0x45, 0x00 }, /* Ē to E */
1273 { 0x0113, 0x65, 0x00 }, /* ē to e */
1274 { 0x0114, 0x45, 0x00 }, /* Ĕ to E */
1275 { 0x0115, 0x65, 0x00 }, /* ĕ to e */
1276 { 0x0116, 0x45, 0x00 }, /* Ė to E */
1277 { 0x0117, 0x65, 0x00 }, /* ė to e */
1278 { 0x0118, 0x45, 0x00 }, /* Ę to E */
1279 { 0x0119, 0x65, 0x00 }, /* ę to e */
1280 { 0x011A, 0x45, 0x00 }, /* Ě to E */
1281 { 0x011B, 0x65, 0x00 }, /* ě to e */
1282 { 0x011C, 0x47, 0x68 }, /* Ĝ to Gh */
1283 { 0x011D, 0x67, 0x68 }, /* ĝ to gh */
1284 { 0x011E, 0x47, 0x00 }, /* Ğ to G */
1285 { 0x011F, 0x67, 0x00 }, /* ğ to g */
1286 { 0x0120, 0x47, 0x00 }, /* Ġ to G */
1287 { 0x0121, 0x67, 0x00 }, /* ġ to g */
1288 { 0x0122, 0x47, 0x00 }, /* Ģ to G */
1289 { 0x0123, 0x67, 0x00 }, /* ģ to g */
1290 { 0x0124, 0x48, 0x68 }, /* Ĥ to Hh */
1291 { 0x0125, 0x68, 0x68 }, /* ĥ to hh */
1292 { 0x0126, 0x48, 0x00 }, /* Ħ to H */
1293 { 0x0127, 0x68, 0x00 }, /* ħ to h */
1294 { 0x0128, 0x49, 0x00 }, /* Ĩ to I */
1295 { 0x0129, 0x69, 0x00 }, /* ĩ to i */
1296 { 0x012A, 0x49, 0x00 }, /* Ī to I */
1297 { 0x012B, 0x69, 0x00 }, /* ī to i */
1298 { 0x012C, 0x49, 0x00 }, /* Ĭ to I */
1299 { 0x012D, 0x69, 0x00 }, /* ĭ to i */
1300 { 0x012E, 0x49, 0x00 }, /* Į to I */
1301 { 0x012F, 0x69, 0x00 }, /* į to i */
1302 { 0x0130, 0x49, 0x00 }, /* İ to I */
1303 { 0x0131, 0x69, 0x00 }, /* ı to i */
1304 { 0x0132, 0x49, 0x4A }, /* IJ to IJ */
1305 { 0x0133, 0x69, 0x6A }, /* ij to ij */
1306 { 0x0134, 0x4A, 0x68 }, /* Ĵ to Jh */
1307 { 0x0135, 0x6A, 0x68 }, /* ĵ to jh */
1308 { 0x0136, 0x4B, 0x00 }, /* Ķ to K */
1309 { 0x0137, 0x6B, 0x00 }, /* ķ to k */
1310 { 0x0138, 0x6B, 0x00 }, /* ĸ to k */
1311 { 0x0139, 0x4C, 0x00 }, /* Ĺ to L */
1312 { 0x013A, 0x6C, 0x00 }, /* ĺ to l */
1313 { 0x013B, 0x4C, 0x00 }, /* Ļ to L */
1314 { 0x013C, 0x6C, 0x00 }, /* ļ to l */
1315 { 0x013D, 0x4C, 0x00 }, /* Ľ to L */
1316 { 0x013E, 0x6C, 0x00 }, /* ľ to l */
1317 { 0x013F, 0x4C, 0x2E }, /* Ŀ to L. */
1318 { 0x0140, 0x6C, 0x2E }, /* ŀ to l. */
1319 { 0x0141, 0x4C, 0x00 }, /* Ł to L */
1320 { 0x0142, 0x6C, 0x00 }, /* ł to l */
1321 { 0x0143, 0x4E, 0x00 }, /* Ń to N */
1322 { 0x0144, 0x6E, 0x00 }, /* ń to n */
1323 { 0x0145, 0x4E, 0x00 }, /* Ņ to N */
1324 { 0x0146, 0x6E, 0x00 }, /* ņ to n */
1325 { 0x0147, 0x4E, 0x00 }, /* Ň to N */
1326 { 0x0148, 0x6E, 0x00 }, /* ň to n */
1327 { 0x0149, 0x27, 0x6E }, /* ʼn to 'n */
1328 { 0x014A, 0x4E, 0x47 }, /* Ŋ to NG */
1329 { 0x014B, 0x6E, 0x67 }, /* ŋ to ng */
1330 { 0x014C, 0x4F, 0x00 }, /* Ō to O */
1331 { 0x014D, 0x6F, 0x00 }, /* ō to o */
1332 { 0x014E, 0x4F, 0x00 }, /* Ŏ to O */
1333 { 0x014F, 0x6F, 0x00 }, /* ŏ to o */
1334 { 0x0150, 0x4F, 0x00 }, /* Ő to O */
1335 { 0x0151, 0x6F, 0x00 }, /* ő to o */
1336 { 0x0152, 0x4F, 0x45 }, /* Œ to OE */
1337 { 0x0153, 0x6F, 0x65 }, /* œ to oe */
1338 { 0x0154, 0x52, 0x00 }, /* Ŕ to R */
1339 { 0x0155, 0x72, 0x00 }, /* ŕ to r */
1340 { 0x0156, 0x52, 0x00 }, /* Ŗ to R */
1341 { 0x0157, 0x72, 0x00 }, /* ŗ to r */
1342 { 0x0158, 0x52, 0x00 }, /* Ř to R */
1343 { 0x0159, 0x72, 0x00 }, /* ř to r */
1344 { 0x015A, 0x53, 0x00 }, /* Ś to S */
1345 { 0x015B, 0x73, 0x00 }, /* ś to s */
1346 { 0x015C, 0x53, 0x68 }, /* Ŝ to Sh */
1347 { 0x015D, 0x73, 0x68 }, /* ŝ to sh */
1348 { 0x015E, 0x53, 0x00 }, /* Ş to S */
1349 { 0x015F, 0x73, 0x00 }, /* ş to s */
1350 { 0x0160, 0x53, 0x00 }, /* Š to S */
1351 { 0x0161, 0x73, 0x00 }, /* š to s */
1352 { 0x0162, 0x54, 0x00 }, /* Ţ to T */
1353 { 0x0163, 0x74, 0x00 }, /* ţ to t */
1354 { 0x0164, 0x54, 0x00 }, /* Ť to T */
1355 { 0x0165, 0x74, 0x00 }, /* ť to t */
1356 { 0x0166, 0x54, 0x00 }, /* Ŧ to T */
1357 { 0x0167, 0x74, 0x00 }, /* ŧ to t */
1358 { 0x0168, 0x55, 0x00 }, /* Ũ to U */
1359 { 0x0169, 0x75, 0x00 }, /* ũ to u */
1360 { 0x016A, 0x55, 0x00 }, /* Ū to U */
1361 { 0x016B, 0x75, 0x00 }, /* ū to u */
1362 { 0x016C, 0x55, 0x00 }, /* Ŭ to U */
1363 { 0x016D, 0x75, 0x00 }, /* ŭ to u */
1364 { 0x016E, 0x55, 0x00 }, /* Ů to U */
1365 { 0x016F, 0x75, 0x00 }, /* ů to u */
1366 { 0x0170, 0x55, 0x00 }, /* Ű to U */
1367 { 0x0171, 0x75, 0x00 }, /* ű to u */
1368 { 0x0172, 0x55, 0x00 }, /* Ų to U */
1369 { 0x0173, 0x75, 0x00 }, /* ų to u */
1370 { 0x0174, 0x57, 0x00 }, /* Ŵ to W */
1371 { 0x0175, 0x77, 0x00 }, /* ŵ to w */
1372 { 0x0176, 0x59, 0x00 }, /* Ŷ to Y */
1373 { 0x0177, 0x79, 0x00 }, /* ŷ to y */
1374 { 0x0178, 0x59, 0x00 }, /* Ÿ to Y */
1375 { 0x0179, 0x5A, 0x00 }, /* Ź to Z */
1376 { 0x017A, 0x7A, 0x00 }, /* ź to z */
1377 { 0x017B, 0x5A, 0x00 }, /* Ż to Z */
1378 { 0x017C, 0x7A, 0x00 }, /* ż to z */
1379 { 0x017D, 0x5A, 0x00 }, /* Ž to Z */
1380 { 0x017E, 0x7A, 0x00 }, /* ž to z */
1381 { 0x017F, 0x73, 0x00 }, /* ſ to s */
1382 { 0x0192, 0x66, 0x00 }, /* ƒ to f */
1383 { 0x0218, 0x53, 0x00 }, /* Ș to S */
1384 { 0x0219, 0x73, 0x00 }, /* ș to s */
1385 { 0x021A, 0x54, 0x00 }, /* Ț to T */
1386 { 0x021B, 0x74, 0x00 }, /* ț to t */
1387 { 0x0386, 0x41, 0x00 }, /* Ά to A */
1388 { 0x0388, 0x45, 0x00 }, /* Έ to E */
1389 { 0x0389, 0x49, 0x00 }, /* Ή to I */
1390 { 0x038A, 0x49, 0x00 }, /* Ί to I */
1391 { 0x038C, 0x4f, 0x00 }, /* Ό to O */
1392 { 0x038E, 0x59, 0x00 }, /* Ύ to Y */
1393 { 0x038F, 0x4f, 0x00 }, /* Ώ to O */
1394 { 0x0390, 0x69, 0x00 }, /* ΐ to i */
1395 { 0x0391, 0x41, 0x00 }, /* Α to A */
1396 { 0x0392, 0x42, 0x00 }, /* Β to B */
1397 { 0x0393, 0x47, 0x00 }, /* Γ to G */
1398 { 0x0394, 0x44, 0x00 }, /* Δ to D */
1399 { 0x0395, 0x45, 0x00 }, /* Ε to E */
1400 { 0x0396, 0x5a, 0x00 }, /* Ζ to Z */
1401 { 0x0397, 0x49, 0x00 }, /* Η to I */
1402 { 0x0398, 0x54, 0x68 }, /* Θ to Th */
1403 { 0x0399, 0x49, 0x00 }, /* Ι to I */
1404 { 0x039A, 0x4b, 0x00 }, /* Κ to K */
1405 { 0x039B, 0x4c, 0x00 }, /* Λ to L */
1406 { 0x039C, 0x4d, 0x00 }, /* Μ to M */
1407 { 0x039D, 0x4e, 0x00 }, /* Ν to N */
1408 { 0x039E, 0x58, 0x00 }, /* Ξ to X */
1409 { 0x039F, 0x4f, 0x00 }, /* Ο to O */
1410 { 0x03A0, 0x50, 0x00 }, /* Π to P */
1411 { 0x03A1, 0x52, 0x00 }, /* Ρ to R */
1412 { 0x03A3, 0x53, 0x00 }, /* Σ to S */
1413 { 0x03A4, 0x54, 0x00 }, /* Τ to T */
1414 { 0x03A5, 0x59, 0x00 }, /* Υ to Y */
1415 { 0x03A6, 0x46, 0x00 }, /* Φ to F */
1416 { 0x03A7, 0x43, 0x68 }, /* Χ to Ch */
1417 { 0x03A8, 0x50, 0x73 }, /* Ψ to Ps */
1418 { 0x03A9, 0x4f, 0x00 }, /* Ω to O */
1419 { 0x03AA, 0x49, 0x00 }, /* Ϊ to I */
1420 { 0x03AB, 0x59, 0x00 }, /* Ϋ to Y */
1421 { 0x03AC, 0x61, 0x00 }, /* ά to a */
1422 { 0x03AD, 0x65, 0x00 }, /* έ to e */
1423 { 0x03AE, 0x69, 0x00 }, /* ή to i */
1424 { 0x03AF, 0x69, 0x00 }, /* ί to i */
1425 { 0x03B1, 0x61, 0x00 }, /* α to a */
1426 { 0x03B2, 0x62, 0x00 }, /* β to b */
1427 { 0x03B3, 0x67, 0x00 }, /* γ to g */
1428 { 0x03B4, 0x64, 0x00 }, /* δ to d */
1429 { 0x03B5, 0x65, 0x00 }, /* ε to e */
1430 { 0x03B6, 0x7a, 0x00 }, /* ζ to z */
1431 { 0x03B7, 0x69, 0x00 }, /* η to i */
1432 { 0x03B8, 0x74, 0x68 }, /* θ to th */
1433 { 0x03B9, 0x69, 0x00 }, /* ι to i */
1434 { 0x03BA, 0x6b, 0x00 }, /* κ to k */
1435 { 0x03BB, 0x6c, 0x00 }, /* λ to l */
1436 { 0x03BC, 0x6d, 0x00 }, /* μ to m */
1437 { 0x03BD, 0x6e, 0x00 }, /* ν to n */
1438 { 0x03BE, 0x78, 0x00 }, /* ξ to x */
1439 { 0x03BF, 0x6f, 0x00 }, /* ο to o */
1440 { 0x03C0, 0x70, 0x00 }, /* π to p */
1441 { 0x03C1, 0x72, 0x00 }, /* ρ to r */
1442 { 0x03C3, 0x73, 0x00 }, /* σ to s */
1443 { 0x03C4, 0x74, 0x00 }, /* τ to t */
1444 { 0x03C5, 0x79, 0x00 }, /* υ to y */
1445 { 0x03C6, 0x66, 0x00 }, /* φ to f */
1446 { 0x03C7, 0x63, 0x68 }, /* χ to ch */
1447 { 0x03C8, 0x70, 0x73 }, /* ψ to ps */
1448 { 0x03C9, 0x6f, 0x00 }, /* ω to o */
1449 { 0x03CA, 0x69, 0x00 }, /* ϊ to i */
1450 { 0x03CB, 0x79, 0x00 }, /* ϋ to y */
1451 { 0x03CC, 0x6f, 0x00 }, /* ό to o */
1452 { 0x03CD, 0x79, 0x00 }, /* ύ to y */
1453 { 0x03CE, 0x69, 0x00 }, /* ώ to i */
1454 { 0x0400, 0x45, 0x00 }, /* Ѐ to E */
1455 { 0x0401, 0x45, 0x00 }, /* Ё to E */
1456 { 0x0402, 0x44, 0x00 }, /* Ђ to D */
1457 { 0x0403, 0x47, 0x00 }, /* Ѓ to G */
1458 { 0x0404, 0x45, 0x00 }, /* Є to E */
1459 { 0x0405, 0x5a, 0x00 }, /* Ѕ to Z */
1460 { 0x0406, 0x49, 0x00 }, /* І to I */
1461 { 0x0407, 0x49, 0x00 }, /* Ї to I */
1462 { 0x0408, 0x4a, 0x00 }, /* Ј to J */
1463 { 0x0409, 0x49, 0x00 }, /* Љ to I */
1464 { 0x040A, 0x4e, 0x00 }, /* Њ to N */
1465 { 0x040B, 0x44, 0x00 }, /* Ћ to D */
1466 { 0x040C, 0x4b, 0x00 }, /* Ќ to K */
1467 { 0x040D, 0x49, 0x00 }, /* Ѝ to I */
1468 { 0x040E, 0x55, 0x00 }, /* Ў to U */
1469 { 0x040F, 0x44, 0x00 }, /* Џ to D */
1470 { 0x0410, 0x41, 0x00 }, /* А to A */
1471 { 0x0411, 0x42, 0x00 }, /* Б to B */
1472 { 0x0412, 0x56, 0x00 }, /* В to V */
1473 { 0x0413, 0x47, 0x00 }, /* Г to G */
1474 { 0x0414, 0x44, 0x00 }, /* Д to D */
1475 { 0x0415, 0x45, 0x00 }, /* Е to E */
1476 { 0x0416, 0x5a, 0x68 }, /* Ж to Zh */
1477 { 0x0417, 0x5a, 0x00 }, /* З to Z */
1478 { 0x0418, 0x49, 0x00 }, /* И to I */
1479 { 0x0419, 0x49, 0x00 }, /* Й to I */
1480 { 0x041A, 0x4b, 0x00 }, /* К to K */
1481 { 0x041B, 0x4c, 0x00 }, /* Л to L */
1482 { 0x041C, 0x4d, 0x00 }, /* М to M */
1483 { 0x041D, 0x4e, 0x00 }, /* Н to N */
1484 { 0x041E, 0x4f, 0x00 }, /* О to O */
1485 { 0x041F, 0x50, 0x00 }, /* П to P */
1486 { 0x0420, 0x52, 0x00 }, /* Р to R */
1487 { 0x0421, 0x53, 0x00 }, /* С to S */
1488 { 0x0422, 0x54, 0x00 }, /* Т to T */
1489 { 0x0423, 0x55, 0x00 }, /* У to U */
1490 { 0x0424, 0x46, 0x00 }, /* Ф to F */
1491 { 0x0425, 0x4b, 0x68 }, /* Х to Kh */
1492 { 0x0426, 0x54, 0x63 }, /* Ц to Tc */
1493 { 0x0427, 0x43, 0x68 }, /* Ч to Ch */
1494 { 0x0428, 0x53, 0x68 }, /* Ш to Sh */
1495 { 0x0429, 0x53, 0x68 }, /* Щ to Shch */
1496 { 0x042A, 0x61, 0x00 }, /* to A */
1497 { 0x042B, 0x59, 0x00 }, /* Ы to Y */
1498 { 0x042C, 0x59, 0x00 }, /* to Y */
1499 { 0x042D, 0x45, 0x00 }, /* Э to E */
1500 { 0x042E, 0x49, 0x75 }, /* Ю to Iu */
1501 { 0x042F, 0x49, 0x61 }, /* Я to Ia */
1502 { 0x0430, 0x61, 0x00 }, /* а to a */
1503 { 0x0431, 0x62, 0x00 }, /* б to b */
1504 { 0x0432, 0x76, 0x00 }, /* в to v */
1505 { 0x0433, 0x67, 0x00 }, /* г to g */
1506 { 0x0434, 0x64, 0x00 }, /* д to d */
1507 { 0x0435, 0x65, 0x00 }, /* е to e */
1508 { 0x0436, 0x7a, 0x68 }, /* ж to zh */
1509 { 0x0437, 0x7a, 0x00 }, /* з to z */
1510 { 0x0438, 0x69, 0x00 }, /* и to i */
1511 { 0x0439, 0x69, 0x00 }, /* й to i */
1512 { 0x043A, 0x6b, 0x00 }, /* к to k */
1513 { 0x043B, 0x6c, 0x00 }, /* л to l */
1514 { 0x043C, 0x6d, 0x00 }, /* м to m */
1515 { 0x043D, 0x6e, 0x00 }, /* н to n */
1516 { 0x043E, 0x6f, 0x00 }, /* о to o */
1517 { 0x043F, 0x70, 0x00 }, /* п to p */
1518 { 0x0440, 0x72, 0x00 }, /* р to r */
1519 { 0x0441, 0x73, 0x00 }, /* с to s */
1520 { 0x0442, 0x74, 0x00 }, /* т to t */
1521 { 0x0443, 0x75, 0x00 }, /* у to u */
1522 { 0x0444, 0x66, 0x00 }, /* ф to f */
1523 { 0x0445, 0x6b, 0x68 }, /* х to kh */
1524 { 0x0446, 0x74, 0x63 }, /* ц to tc */
1525 { 0x0447, 0x63, 0x68 }, /* ч to ch */
1526 { 0x0448, 0x73, 0x68 }, /* ш to sh */
1527 { 0x0449, 0x73, 0x68 }, /* щ to shch */
1528 { 0x044A, 0x61, 0x00 }, /* to a */
1529 { 0x044B, 0x79, 0x00 }, /* ы to y */
1530 { 0x044C, 0x79, 0x00 }, /* to y */
1531 { 0x044D, 0x65, 0x00 }, /* э to e */
1532 { 0x044E, 0x69, 0x75 }, /* ю to iu */
1533 { 0x044F, 0x69, 0x61 }, /* я to ia */
1534 { 0x0450, 0x65, 0x00 }, /* ѐ to e */
1535 { 0x0451, 0x65, 0x00 }, /* ё to e */
1536 { 0x0452, 0x64, 0x00 }, /* ђ to d */
1537 { 0x0453, 0x67, 0x00 }, /* ѓ to g */
1538 { 0x0454, 0x65, 0x00 }, /* є to e */
1539 { 0x0455, 0x7a, 0x00 }, /* ѕ to z */
1540 { 0x0456, 0x69, 0x00 }, /* і to i */
1541 { 0x0457, 0x69, 0x00 }, /* ї to i */
1542 { 0x0458, 0x6a, 0x00 }, /* ј to j */
1543 { 0x0459, 0x69, 0x00 }, /* љ to i */
1544 { 0x045A, 0x6e, 0x00 }, /* њ to n */
1545 { 0x045B, 0x64, 0x00 }, /* ћ to d */
1546 { 0x045C, 0x6b, 0x00 }, /* ќ to k */
1547 { 0x045D, 0x69, 0x00 }, /* ѝ to i */
1548 { 0x045E, 0x75, 0x00 }, /* ў to u */
1549 { 0x045F, 0x64, 0x00 }, /* џ to d */
1550 { 0x1E02, 0x42, 0x00 }, /* Ḃ to B */
1551 { 0x1E03, 0x62, 0x00 }, /* ḃ to b */
1552 { 0x1E0A, 0x44, 0x00 }, /* Ḋ to D */
1553 { 0x1E0B, 0x64, 0x00 }, /* ḋ to d */
1554 { 0x1E1E, 0x46, 0x00 }, /* Ḟ to F */
1555 { 0x1E1F, 0x66, 0x00 }, /* ḟ to f */
1556 { 0x1E40, 0x4D, 0x00 }, /* Ṁ to M */
1557 { 0x1E41, 0x6D, 0x00 }, /* ṁ to m */
1558 { 0x1E56, 0x50, 0x00 }, /* Ṗ to P */
1559 { 0x1E57, 0x70, 0x00 }, /* ṗ to p */
1560 { 0x1E60, 0x53, 0x00 }, /* Ṡ to S */
1561 { 0x1E61, 0x73, 0x00 }, /* ṡ to s */
1562 { 0x1E6A, 0x54, 0x00 }, /* Ṫ to T */
1563 { 0x1E6B, 0x74, 0x00 }, /* ṫ to t */
1564 { 0x1E80, 0x57, 0x00 }, /* Ẁ to W */
1565 { 0x1E81, 0x77, 0x00 }, /* ẁ to w */
1566 { 0x1E82, 0x57, 0x00 }, /* Ẃ to W */
1567 { 0x1E83, 0x77, 0x00 }, /* ẃ to w */
1568 { 0x1E84, 0x57, 0x00 }, /* Ẅ to W */
1569 { 0x1E85, 0x77, 0x00 }, /* ẅ to w */
1570 { 0x1EF2, 0x59, 0x00 }, /* Ỳ to Y */
1571 { 0x1EF3, 0x79, 0x00 }, /* ỳ to y */
1572 { 0xFB00, 0x66, 0x66 }, /* ff to ff */
1573 { 0xFB01, 0x66, 0x69 }, /* fi to fi */
1574 { 0xFB02, 0x66, 0x6C }, /* fl to fl */
1575 { 0xFB05, 0x73, 0x74 }, /* ſt to st */
1576 { 0xFB06, 0x73, 0x74 }, /* st to st */
1577 };
1578
1579 /*
1580 ** Convert the input string from UTF-8 into pure ASCII by converting
1581 ** all non-ASCII characters to some combination of characters in the
1582 ** ASCII subset.
1583 **
1584 ** The returned string might contain more characters than the input.
1585 **
1586 ** Space to hold the returned string comes from sqlite3_malloc() and
1587 ** should be freed by the caller.
1588 */
1589 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1590 unsigned char *zOut = sqlite3_malloc( nIn*4 + 1 );
1591 int c, sz, nOut;
1592 if( zOut==0 ) return 0;
1593 nOut = 0;
1594 while( nIn>0 ){
1595 c = utf8Read(zIn, nIn, &sz);
1596 zIn += sz;
1597 nIn -= sz;
1598 if( c<=127 ){
1599 zOut[nOut++] = c;
1600 }else{
1601 int xTop, xBtm, x;
1602 xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1603 xBtm = 0;
1604 while( xTop>=xBtm ){
1605 x = (xTop + xBtm)/2;
1606 if( translit[x].cFrom==c ){
1607 zOut[nOut++] = translit[x].cTo0;
1608 if( translit[x].cTo1 ){
1609 zOut[nOut++] = translit[x].cTo1;
1610 /* Add an extra "ch" after the "sh" for Щ and щ */
1611 if( c==0x0429 || c== 0x0449 ){
1612 zOut[nOut++] = 'c';
1613 zOut[nOut++] = 'h';
1614 }
1615 }
1616 c = 0;
1617 break;
1618 }else if( translit[x].cFrom>c ){
1619 xTop = x-1;
1620 }else{
1621 xBtm = x+1;
1622 }
1623 }
1624 if( c ) zOut[nOut++] = '?';
1625 }
1626 }
1627 zOut[nOut] = 0;
1628 return zOut;
1629 }
1630
1631 /*
1632 ** Return the number of characters in the shortest prefix of the input
1633 ** string that transliterates to an ASCII string nTrans bytes or longer.
1634 ** Or, if the transliteration of the input string is less than nTrans
1635 ** bytes in size, return the number of characters in the input string.
1636 */
1637 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
1638 int i, c, sz, nOut;
1639 int nChar;
1640
1641 i = nOut = 0;
1642 for(nChar=0; i<nIn && nOut<nTrans; nChar++){
1643 c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1644 i += sz;
1645
1646 nOut++;
1647 if( c>=128 ){
1648 int xTop, xBtm, x;
1649 xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1650 xBtm = 0;
1651 while( xTop>=xBtm ){
1652 x = (xTop + xBtm)/2;
1653 if( translit[x].cFrom==c ){
1654 if( translit[x].cTo1 ) nOut++;
1655 if( c==0x0429 || c== 0x0449 ) nOut += 2;
1656 break;
1657 }else if( translit[x].cFrom>c ){
1658 xTop = x-1;
1659 }else{
1660 xBtm = x+1;
1661 }
1662 }
1663 }
1664 }
1665
1666 return nChar;
1667 }
1668
1669
1670 /*
1671 ** spellfix1_translit(X)
1672 **
1673 ** Convert a string that contains non-ASCII Roman characters into
1674 ** pure ASCII.
1675 */
1676 static void transliterateSqlFunc(
1677 sqlite3_context *context,
1678 int argc,
1679 sqlite3_value **argv
1680 ){
1681 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1682 int nIn = sqlite3_value_bytes(argv[0]);
1683 unsigned char *zOut = transliterate(zIn, nIn);
1684 if( zOut==0 ){
1685 sqlite3_result_error_nomem(context);
1686 }else{
1687 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1688 }
1689 }
1690
1691 /*
1692 ** spellfix1_scriptcode(X)
1693 **
1694 ** Try to determine the dominant script used by the word X and return
1695 ** its ISO 15924 numeric code.
1696 **
1697 ** The current implementation only understands the following scripts:
1698 **
1699 ** 215 (Latin)
1700 ** 220 (Cyrillic)
1701 ** 200 (Greek)
1702 **
1703 ** This routine will return 998 if the input X contains characters from
1704 ** two or more of the above scripts or 999 if X contains no characters
1705 ** from any of the above scripts.
1706 */
1707 static void scriptCodeSqlFunc(
1708 sqlite3_context *context,
1709 int argc,
1710 sqlite3_value **argv
1711 ){
1712 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1713 int nIn = sqlite3_value_bytes(argv[0]);
1714 int c, sz;
1715 int scriptMask = 0;
1716 int res;
1717 # define SCRIPT_LATIN 0x0001
1718 # define SCRIPT_CYRILLIC 0x0002
1719 # define SCRIPT_GREEK 0x0004
1720 # define SCRIPT_HEBREW 0x0008
1721 # define SCRIPT_ARABIC 0x0010
1722
1723 while( nIn>0 ){
1724 c = utf8Read(zIn, nIn, &sz);
1725 zIn += sz;
1726 nIn -= sz;
1727 if( c<0x02af && (c>=0x80 || midClass[c&0x7f]<CCLASS_DIGIT) ){
1728 scriptMask |= SCRIPT_LATIN;
1729 }else if( c>=0x0400 && c<=0x04ff ){
1730 scriptMask |= SCRIPT_CYRILLIC;
1731 }else if( c>=0x0386 && c<=0x03ce ){
1732 scriptMask |= SCRIPT_GREEK;
1733 }else if( c>=0x0590 && c<=0x05ff ){
1734 scriptMask |= SCRIPT_HEBREW;
1735 }else if( c>=0x0600 && c<=0x06ff ){
1736 scriptMask |= SCRIPT_ARABIC;
1737 }
1738 }
1739 switch( scriptMask ){
1740 case 0: res = 999; break;
1741 case SCRIPT_LATIN: res = 215; break;
1742 case SCRIPT_CYRILLIC: res = 220; break;
1743 case SCRIPT_GREEK: res = 200; break;
1744 case SCRIPT_HEBREW: res = 125; break;
1745 case SCRIPT_ARABIC: res = 160; break;
1746 default: res = 998; break;
1747 }
1748 sqlite3_result_int(context, res);
1749 }
1750
1751 /* End transliterate
1752 ******************************************************************************
1753 ******************************************************************************
1754 ** Begin spellfix1 virtual table.
1755 */
1756
1757 /* Maximum length of a phonehash used for querying the shadow table */
1758 #define SPELLFIX_MX_HASH 8
1759
1760 /* Maximum number of hash strings to examine per query */
1761 #define SPELLFIX_MX_RUN 1
1762
1763 typedef struct spellfix1_vtab spellfix1_vtab;
1764 typedef struct spellfix1_cursor spellfix1_cursor;
1765
1766 /* Fuzzy-search virtual table object */
1767 struct spellfix1_vtab {
1768 sqlite3_vtab base; /* Base class - must be first */
1769 sqlite3 *db; /* Database connection */
1770 char *zDbName; /* Name of database holding this table */
1771 char *zTableName; /* Name of the virtual table */
1772 char *zCostTable; /* Table holding edit-distance cost numbers */
1773 EditDist3Config *pConfig3; /* Parsed edit distance costs */
1774 };
1775
1776 /* Fuzzy-search cursor object */
1777 struct spellfix1_cursor {
1778 sqlite3_vtab_cursor base; /* Base class - must be first */
1779 spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
1780 char *zPattern; /* rhs of MATCH clause */
1781 int idxNum; /* idxNum value passed to xFilter() */
1782 int nRow; /* Number of rows of content */
1783 int nAlloc; /* Number of allocated rows */
1784 int iRow; /* Current row of content */
1785 int iLang; /* Value of the langid= constraint */
1786 int iTop; /* Value of the top= constraint */
1787 int iScope; /* Value of the scope= constraint */
1788 int nSearch; /* Number of vocabulary items checked */
1789 sqlite3_stmt *pFullScan; /* Shadow query for a full table scan */
1790 struct spellfix1_row { /* For each row of content */
1791 sqlite3_int64 iRowid; /* Rowid for this row */
1792 char *zWord; /* Text for this row */
1793 int iRank; /* Rank for this row */
1794 int iDistance; /* Distance from pattern for this row */
1795 int iScore; /* Score for sorting */
1796 int iMatchlen; /* Value of matchlen column (or -1) */
1797 char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
1798 } *a;
1799 };
1800
1801 /*
1802 ** Construct one or more SQL statements from the format string given
1803 ** and then evaluate those statements. The success code is written
1804 ** into *pRc.
1805 **
1806 ** If *pRc is initially non-zero then this routine is a no-op.
1807 */
1808 static void spellfix1DbExec(
1809 int *pRc, /* Success code */
1810 sqlite3 *db, /* Database in which to run SQL */
1811 const char *zFormat, /* Format string for SQL */
1812 ... /* Arguments to the format string */
1813 ){
1814 va_list ap;
1815 char *zSql;
1816 if( *pRc ) return;
1817 va_start(ap, zFormat);
1818 zSql = sqlite3_vmprintf(zFormat, ap);
1819 va_end(ap);
1820 if( zSql==0 ){
1821 *pRc = SQLITE_NOMEM;
1822 }else{
1823 *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1824 sqlite3_free(zSql);
1825 }
1826 }
1827
1828 /*
1829 ** xDisconnect/xDestroy method for the fuzzy-search module.
1830 */
1831 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1832 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1833 int rc = SQLITE_OK;
1834 if( isDestroy ){
1835 sqlite3 *db = p->db;
1836 spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1837 p->zDbName, p->zTableName);
1838 }
1839 if( rc==SQLITE_OK ){
1840 sqlite3_free(p->zTableName);
1841 editDist3ConfigDelete(p->pConfig3);
1842 sqlite3_free(p->zCostTable);
1843 sqlite3_free(p);
1844 }
1845 return rc;
1846 }
1847 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1848 return spellfix1Uninit(0, pVTab);
1849 }
1850 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1851 return spellfix1Uninit(1, pVTab);
1852 }
1853
1854 /*
1855 ** Make a copy of a string. Remove leading and trailing whitespace
1856 ** and dequote it.
1857 */
1858 static char *spellfix1Dequote(const char *zIn){
1859 char *zOut;
1860 int i, j;
1861 char c;
1862 while( isspace((unsigned char)zIn[0]) ) zIn++;
1863 zOut = sqlite3_mprintf("%s", zIn);
1864 if( zOut==0 ) return 0;
1865 i = (int)strlen(zOut);
1866 #if 0 /* The parser will never leave spaces at the end */
1867 while( i>0 && isspace(zOut[i-1]) ){ i--; }
1868 #endif
1869 zOut[i] = 0;
1870 c = zOut[0];
1871 if( c=='\'' || c=='"' ){
1872 for(i=1, j=0; ALWAYS(zOut[i]); i++){
1873 zOut[j++] = zOut[i];
1874 if( zOut[i]==c ){
1875 if( zOut[i+1]==c ){
1876 i++;
1877 }else{
1878 zOut[j-1] = 0;
1879 break;
1880 }
1881 }
1882 }
1883 }
1884 return zOut;
1885 }
1886
1887
1888 /*
1889 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
1890 **
1891 ** argv[0] -> module name ("spellfix1")
1892 ** argv[1] -> database name
1893 ** argv[2] -> table name
1894 ** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
1895 */
1896 static int spellfix1Init(
1897 int isCreate,
1898 sqlite3 *db,
1899 void *pAux,
1900 int argc, const char *const*argv,
1901 sqlite3_vtab **ppVTab,
1902 char **pzErr
1903 ){
1904 spellfix1_vtab *pNew = 0;
1905 /* const char *zModule = argv[0]; // not used */
1906 const char *zDbName = argv[1];
1907 const char *zTableName = argv[2];
1908 int nDbName;
1909 int rc = SQLITE_OK;
1910 int i;
1911
1912 nDbName = (int)strlen(zDbName);
1913 pNew = sqlite3_malloc( sizeof(*pNew) + nDbName + 1);
1914 if( pNew==0 ){
1915 rc = SQLITE_NOMEM;
1916 }else{
1917 memset(pNew, 0, sizeof(*pNew));
1918 pNew->zDbName = (char*)&pNew[1];
1919 memcpy(pNew->zDbName, zDbName, nDbName+1);
1920 pNew->zTableName = sqlite3_mprintf("%s", zTableName);
1921 pNew->db = db;
1922 if( pNew->zTableName==0 ){
1923 rc = SQLITE_NOMEM;
1924 }else{
1925 rc = sqlite3_declare_vtab(db,
1926 "CREATE TABLE x(word,rank,distance,langid, "
1927 "score, matchlen, phonehash HIDDEN, "
1928 "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
1929 "soundslike HIDDEN, command HIDDEN)"
1930 );
1931 #define SPELLFIX_COL_WORD 0
1932 #define SPELLFIX_COL_RANK 1
1933 #define SPELLFIX_COL_DISTANCE 2
1934 #define SPELLFIX_COL_LANGID 3
1935 #define SPELLFIX_COL_SCORE 4
1936 #define SPELLFIX_COL_MATCHLEN 5
1937 #define SPELLFIX_COL_PHONEHASH 6
1938 #define SPELLFIX_COL_TOP 7
1939 #define SPELLFIX_COL_SCOPE 8
1940 #define SPELLFIX_COL_SRCHCNT 9
1941 #define SPELLFIX_COL_SOUNDSLIKE 10
1942 #define SPELLFIX_COL_COMMAND 11
1943 }
1944 if( rc==SQLITE_OK && isCreate ){
1945 spellfix1DbExec(&rc, db,
1946 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
1947 " id INTEGER PRIMARY KEY,\n"
1948 " rank INT,\n"
1949 " langid INT,\n"
1950 " word TEXT,\n"
1951 " k1 TEXT,\n"
1952 " k2 TEXT\n"
1953 ");\n",
1954 zDbName, zTableName
1955 );
1956 spellfix1DbExec(&rc, db,
1957 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
1958 "ON \"%w_vocab\"(langid,k2);",
1959 zDbName, zTableName, zTableName
1960 );
1961 }
1962 for(i=3; rc==SQLITE_OK && i<argc; i++){
1963 if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
1964 pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
1965 if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
1966 continue;
1967 }
1968 *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
1969 rc = SQLITE_ERROR;
1970 }
1971 }
1972
1973 if( rc && pNew ){
1974 *ppVTab = 0;
1975 spellfix1Uninit(0, &pNew->base);
1976 }else{
1977 *ppVTab = (sqlite3_vtab *)pNew;
1978 }
1979 return rc;
1980 }
1981
1982 /*
1983 ** The xConnect and xCreate methods
1984 */
1985 static int spellfix1Connect(
1986 sqlite3 *db,
1987 void *pAux,
1988 int argc, const char *const*argv,
1989 sqlite3_vtab **ppVTab,
1990 char **pzErr
1991 ){
1992 return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
1993 }
1994 static int spellfix1Create(
1995 sqlite3 *db,
1996 void *pAux,
1997 int argc, const char *const*argv,
1998 sqlite3_vtab **ppVTab,
1999 char **pzErr
2000 ){
2001 return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
2002 }
2003
2004 /*
2005 ** Clear all of the content from a cursor.
2006 */
2007 static void spellfix1ResetCursor(spellfix1_cursor *pCur){
2008 int i;
2009 for(i=0; i<pCur->nRow; i++){
2010 sqlite3_free(pCur->a[i].zWord);
2011 }
2012 pCur->nRow = 0;
2013 pCur->iRow = 0;
2014 pCur->nSearch = 0;
2015 if( pCur->pFullScan ){
2016 sqlite3_finalize(pCur->pFullScan);
2017 pCur->pFullScan = 0;
2018 }
2019 }
2020
2021 /*
2022 ** Resize the cursor to hold up to N rows of content
2023 */
2024 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
2025 struct spellfix1_row *aNew;
2026 assert( N>=pCur->nRow );
2027 aNew = sqlite3_realloc(pCur->a, sizeof(pCur->a[0])*N);
2028 if( aNew==0 && N>0 ){
2029 spellfix1ResetCursor(pCur);
2030 sqlite3_free(pCur->a);
2031 pCur->nAlloc = 0;
2032 pCur->a = 0;
2033 }else{
2034 pCur->nAlloc = N;
2035 pCur->a = aNew;
2036 }
2037 }
2038
2039
2040 /*
2041 ** Close a fuzzy-search cursor.
2042 */
2043 static int spellfix1Close(sqlite3_vtab_cursor *cur){
2044 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2045 spellfix1ResetCursor(pCur);
2046 spellfix1ResizeCursor(pCur, 0);
2047 sqlite3_free(pCur->zPattern);
2048 sqlite3_free(pCur);
2049 return SQLITE_OK;
2050 }
2051
2052 #define SPELLFIX_IDXNUM_MATCH 0x01 /* word MATCH $str */
2053 #define SPELLFIX_IDXNUM_LANGID 0x02 /* langid == $langid */
2054 #define SPELLFIX_IDXNUM_TOP 0x04 /* top = $top */
2055 #define SPELLFIX_IDXNUM_SCOPE 0x08 /* scope = $scope */
2056 #define SPELLFIX_IDXNUM_DISTLT 0x10 /* distance < $distance */
2057 #define SPELLFIX_IDXNUM_DISTLE 0x20 /* distance <= $distance */
2058 #define SPELLFIX_IDXNUM_ROWID 0x40 /* rowid = $rowid */
2059 #define SPELLFIX_IDXNUM_DIST (0x10|0x20) /* DISTLT and DISTLE */
2060
2061 /*
2062 **
2063 ** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined
2064 ** above.
2065 **
2066 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2067 ** if specified and in that order.
2068 */
2069 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
2070 int iPlan = 0;
2071 int iLangTerm = -1;
2072 int iTopTerm = -1;
2073 int iScopeTerm = -1;
2074 int iDistTerm = -1;
2075 int iRowidTerm = -1;
2076 int i;
2077 const struct sqlite3_index_constraint *pConstraint;
2078 pConstraint = pIdxInfo->aConstraint;
2079 for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
2080 if( pConstraint->usable==0 ) continue;
2081
2082 /* Terms of the form: word MATCH $str */
2083 if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0
2084 && pConstraint->iColumn==SPELLFIX_COL_WORD
2085 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2086 ){
2087 iPlan |= SPELLFIX_IDXNUM_MATCH;
2088 pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2089 pIdxInfo->aConstraintUsage[i].omit = 1;
2090 }
2091
2092 /* Terms of the form: langid = $langid */
2093 if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0
2094 && pConstraint->iColumn==SPELLFIX_COL_LANGID
2095 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2096 ){
2097 iPlan |= SPELLFIX_IDXNUM_LANGID;
2098 iLangTerm = i;
2099 }
2100
2101 /* Terms of the form: top = $top */
2102 if( (iPlan & SPELLFIX_IDXNUM_TOP)==0
2103 && pConstraint->iColumn==SPELLFIX_COL_TOP
2104 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2105 ){
2106 iPlan |= SPELLFIX_IDXNUM_TOP;
2107 iTopTerm = i;
2108 }
2109
2110 /* Terms of the form: scope = $scope */
2111 if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0
2112 && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2113 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2114 ){
2115 iPlan |= SPELLFIX_IDXNUM_SCOPE;
2116 iScopeTerm = i;
2117 }
2118
2119 /* Terms of the form: distance < $dist or distance <= $dist */
2120 if( (iPlan & SPELLFIX_IDXNUM_DIST)==0
2121 && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2122 && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2123 || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2124 ){
2125 if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ){
2126 iPlan |= SPELLFIX_IDXNUM_DISTLT;
2127 }else{
2128 iPlan |= SPELLFIX_IDXNUM_DISTLE;
2129 }
2130 iDistTerm = i;
2131 }
2132
2133 /* Terms of the form: distance < $dist or distance <= $dist */
2134 if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0
2135 && pConstraint->iColumn<0
2136 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2137 ){
2138 iPlan |= SPELLFIX_IDXNUM_ROWID;
2139 iRowidTerm = i;
2140 }
2141 }
2142 if( iPlan&SPELLFIX_IDXNUM_MATCH ){
2143 int idx = 2;
2144 pIdxInfo->idxNum = iPlan;
2145 if( pIdxInfo->nOrderBy==1
2146 && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2147 && pIdxInfo->aOrderBy[0].desc==0
2148 ){
2149 pIdxInfo->orderByConsumed = 1; /* Default order by iScore */
2150 }
2151 if( iPlan&SPELLFIX_IDXNUM_LANGID ){
2152 pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2153 pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2154 }
2155 if( iPlan&SPELLFIX_IDXNUM_TOP ){
2156 pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2157 pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2158 }
2159 if( iPlan&SPELLFIX_IDXNUM_SCOPE ){
2160 pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2161 pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2162 }
2163 if( iPlan&SPELLFIX_IDXNUM_DIST ){
2164 pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2165 pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2166 }
2167 pIdxInfo->estimatedCost = 1e5;
2168 }else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ){
2169 pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID;
2170 pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2171 pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2172 pIdxInfo->estimatedCost = 5;
2173 }else{
2174 pIdxInfo->idxNum = 0;
2175 pIdxInfo->estimatedCost = 1e50;
2176 }
2177 return SQLITE_OK;
2178 }
2179
2180 /*
2181 ** Open a new fuzzy-search cursor.
2182 */
2183 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2184 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2185 spellfix1_cursor *pCur;
2186 pCur = sqlite3_malloc( sizeof(*pCur) );
2187 if( pCur==0 ) return SQLITE_NOMEM;
2188 memset(pCur, 0, sizeof(*pCur));
2189 pCur->pVTab = p;
2190 *ppCursor = &pCur->base;
2191 return SQLITE_OK;
2192 }
2193
2194 /*
2195 ** Adjust a distance measurement by the words rank in order to show
2196 ** preference to common words.
2197 */
2198 static int spellfix1Score(int iDistance, int iRank){
2199 int iLog2;
2200 for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
2201 return iDistance + 32 - iLog2;
2202 }
2203
2204 /*
2205 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2206 ** that they sort in order of increasing distance.
2207 */
2208 static int spellfix1RowCompare(const void *A, const void *B){
2209 const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2210 const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2211 return a->iScore - b->iScore;
2212 }
2213
2214 /*
2215 ** A structure used to pass information from spellfix1FilterForMatch()
2216 ** into spellfix1RunQuery().
2217 */
2218 typedef struct MatchQuery {
2219 spellfix1_cursor *pCur; /* The cursor being queried */
2220 sqlite3_stmt *pStmt; /* shadow table query statment */
2221 char zHash[SPELLFIX_MX_HASH]; /* The current phonehash for zPattern */
2222 const char *zPattern; /* Transliterated input string */
2223 int nPattern; /* Length of zPattern */
2224 EditDist3FromString *pMatchStr3; /* Original unicode string */
2225 EditDist3Config *pConfig3; /* Edit-distance cost coefficients */
2226 const EditDist3Lang *pLang; /* The selected language coefficients */
2227 int iLang; /* The language id */
2228 int iScope; /* Default scope */
2229 int iMaxDist; /* Maximum allowed edit distance, or -1 */
2230 int rc; /* Error code */
2231 int nRun; /* Number of prior runs for the same zPattern */
2232 char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH]; /* Prior hashes */
2233 } MatchQuery;
2234
2235 /*
2236 ** Run a query looking for the best matches against zPattern using
2237 ** zHash as the character class seed hash.
2238 */
2239 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
2240 const char *zK1;
2241 const char *zWord;
2242 int iDist;
2243 int iRank;
2244 int iScore;
2245 int iWorst = 0;
2246 int idx;
2247 int idxWorst = -1;
2248 int i;
2249 int iScope = p->iScope;
2250 spellfix1_cursor *pCur = p->pCur;
2251 sqlite3_stmt *pStmt = p->pStmt;
2252 char zHash1[SPELLFIX_MX_HASH];
2253 char zHash2[SPELLFIX_MX_HASH];
2254 char *zClass;
2255 int nClass;
2256 int rc;
2257
2258 if( pCur->a==0 || p->rc ) return; /* Prior memory allocation failure */
2259 zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2260 if( zClass==0 ){
2261 p->rc = SQLITE_NOMEM;
2262 return;
2263 }
2264 nClass = (int)strlen(zClass);
2265 if( nClass>SPELLFIX_MX_HASH-2 ){
2266 nClass = SPELLFIX_MX_HASH-2;
2267 zClass[nClass] = 0;
2268 }
2269 if( nClass<=iScope ){
2270 if( nClass>2 ){
2271 iScope = nClass-1;
2272 }else{
2273 iScope = nClass;
2274 }
2275 }
2276 memcpy(zHash1, zClass, iScope);
2277 sqlite3_free(zClass);
2278 zHash1[iScope] = 0;
2279 memcpy(zHash2, zHash1, iScope);
2280 zHash2[iScope] = 'Z';
2281 zHash2[iScope+1] = 0;
2282 #if SPELLFIX_MX_RUN>1
2283 for(i=0; i<p->nRun; i++){
2284 if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2285 }
2286 #endif
2287 assert( p->nRun<SPELLFIX_MX_RUN );
2288 memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2289 if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2290 || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2291 ){
2292 p->rc = SQLITE_NOMEM;
2293 return;
2294 }
2295 #if SPELLFIX_MX_RUN>1
2296 for(i=0; i<pCur->nRow; i++){
2297 if( pCur->a[i].iScore>iWorst ){
2298 iWorst = pCur->a[i].iScore;
2299 idxWorst = i;
2300 }
2301 }
2302 #endif
2303 while( sqlite3_step(pStmt)==SQLITE_ROW ){
2304 int iMatchlen = -1;
2305 iRank = sqlite3_column_int(pStmt, 2);
2306 if( p->pMatchStr3 ){
2307 int nWord = sqlite3_column_bytes(pStmt, 1);
2308 zWord = (const char*)sqlite3_column_text(pStmt, 1);
2309 iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2310 }else{
2311 zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2312 if( zK1==0 ) continue;
2313 iDist = editdist1(p->zPattern, zK1, 0);
2314 }
2315 if( iDist<0 ){
2316 p->rc = SQLITE_NOMEM;
2317 break;
2318 }
2319 pCur->nSearch++;
2320
2321 /* If there is a "distance < $dist" or "distance <= $dist" constraint,
2322 ** check if this row meets it. If not, jump back up to the top of the
2323 ** loop to process the next row. Otherwise, if the row does match the
2324 ** distance constraint, check if the pCur->a[] array is already full.
2325 ** If it is and no explicit "top = ?" constraint was present in the
2326 ** query, grow the array to ensure there is room for the new entry. */
2327 assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) );
2328 if( p->iMaxDist>=0 ){
2329 if( iDist>p->iMaxDist ) continue;
2330 if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ){
2331 spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2332 if( pCur->a==0 ) break;
2333 }
2334 }
2335
2336 iScore = spellfix1Score(iDist,iRank);
2337 if( pCur->nRow<pCur->nAlloc ){
2338 idx = pCur->nRow;
2339 }else if( iScore<iWorst ){
2340 idx = idxWorst;
2341 sqlite3_free(pCur->a[idx].zWord);
2342 }else{
2343 continue;
2344 }
2345
2346 pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2347 if( pCur->a[idx].zWord==0 ){
2348 p->rc = SQLITE_NOMEM;
2349 break;
2350 }
2351 pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2352 pCur->a[idx].iRank = iRank;
2353 pCur->a[idx].iDistance = iDist;
2354 pCur->a[idx].iScore = iScore;
2355 pCur->a[idx].iMatchlen = iMatchlen;
2356 memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2357 if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2358 if( pCur->nRow==pCur->nAlloc ){
2359 iWorst = pCur->a[0].iScore;
2360 idxWorst = 0;
2361 for(i=1; i<pCur->nRow; i++){
2362 iScore = pCur->a[i].iScore;
2363 if( iWorst<iScore ){
2364 iWorst = iScore;
2365 idxWorst = i;
2366 }
2367 }
2368 }
2369 }
2370 rc = sqlite3_reset(pStmt);
2371 if( rc ) p->rc = rc;
2372 }
2373
2374 /*
2375 ** This version of the xFilter method work if the MATCH term is present
2376 ** and we are doing a scan.
2377 */
2378 static int spellfix1FilterForMatch(
2379 spellfix1_cursor *pCur,
2380 int argc,
2381 sqlite3_value **argv
2382 ){
2383 int idxNum = pCur->idxNum;
2384 const unsigned char *zMatchThis; /* RHS of the MATCH operator */
2385 EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2386 char *zPattern; /* Transliteration of zMatchThis */
2387 int nPattern; /* Length of zPattern */
2388 int iLimit = 20; /* Max number of rows of output */
2389 int iScope = 3; /* Use this many characters of zClass */
2390 int iLang = 0; /* Language code */
2391 char *zSql; /* SQL of shadow table query */
2392 sqlite3_stmt *pStmt = 0; /* Shadow table query */
2393 int rc; /* Result code */
2394 int idx = 1; /* Next available filter parameter */
2395 spellfix1_vtab *p = pCur->pVTab; /* The virtual table that owns pCur */
2396 MatchQuery x; /* For passing info to RunQuery() */
2397
2398 /* Load the cost table if we have not already done so */
2399 if( p->zCostTable!=0 && p->pConfig3==0 ){
2400 p->pConfig3 = sqlite3_malloc( sizeof(p->pConfig3[0]) );
2401 if( p->pConfig3==0 ) return SQLITE_NOMEM;
2402 memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2403 rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2404 if( rc ) return rc;
2405 }
2406 memset(&x, 0, sizeof(x));
2407 x.iScope = 3; /* Default scope if none specified by "WHERE scope=N" */
2408 x.iMaxDist = -1; /* Maximum allowed edit distance */
2409
2410 if( idxNum&2 ){
2411 iLang = sqlite3_value_int(argv[idx++]);
2412 }
2413 if( idxNum&4 ){
2414 iLimit = sqlite3_value_int(argv[idx++]);
2415 if( iLimit<1 ) iLimit = 1;
2416 }
2417 if( idxNum&8 ){
2418 x.iScope = sqlite3_value_int(argv[idx++]);
2419 if( x.iScope<1 ) x.iScope = 1;
2420 if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2421 }
2422 if( idxNum&(16|32) ){
2423 x.iMaxDist = sqlite3_value_int(argv[idx++]);
2424 if( idxNum&16 ) x.iMaxDist--;
2425 if( x.iMaxDist<0 ) x.iMaxDist = 0;
2426 }
2427 spellfix1ResetCursor(pCur);
2428 spellfix1ResizeCursor(pCur, iLimit);
2429 zMatchThis = sqlite3_value_text(argv[0]);
2430 if( zMatchThis==0 ) return SQLITE_OK;
2431 if( p->pConfig3 ){
2432 x.pLang = editDist3FindLang(p->pConfig3, iLang);
2433 pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2434 if( pMatchStr3==0 ){
2435 x.rc = SQLITE_NOMEM;
2436 goto filter_exit;
2437 }
2438 }else{
2439 x.pLang = 0;
2440 }
2441 zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2442 sqlite3_free(pCur->zPattern);
2443 pCur->zPattern = zPattern;
2444 if( zPattern==0 ){
2445 x.rc = SQLITE_NOMEM;
2446 goto filter_exit;
2447 }
2448 nPattern = (int)strlen(zPattern);
2449 if( zPattern[nPattern-1]=='*' ) nPattern--;
2450 zSql = sqlite3_mprintf(
2451 "SELECT id, word, rank, k1"
2452 " FROM \"%w\".\"%w_vocab\""
2453 " WHERE langid=%d AND k2>=?1 AND k2<?2",
2454 p->zDbName, p->zTableName, iLang
2455 );
2456 if( zSql==0 ){
2457 x.rc = SQLITE_NOMEM;
2458 pStmt = 0;
2459 goto filter_exit;
2460 }
2461 rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2462 sqlite3_free(zSql);
2463 pCur->iLang = iLang;
2464 x.pCur = pCur;
2465 x.pStmt = pStmt;
2466 x.zPattern = zPattern;
2467 x.nPattern = nPattern;
2468 x.pMatchStr3 = pMatchStr3;
2469 x.iLang = iLang;
2470 x.rc = rc;
2471 x.pConfig3 = p->pConfig3;
2472 if( x.rc==SQLITE_OK ){
2473 spellfix1RunQuery(&x, zPattern, nPattern);
2474 }
2475
2476 if( pCur->a ){
2477 qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2478 pCur->iTop = iLimit;
2479 pCur->iScope = iScope;
2480 }else{
2481 x.rc = SQLITE_NOMEM;
2482 }
2483
2484 filter_exit:
2485 sqlite3_finalize(pStmt);
2486 editDist3FromStringDelete(pMatchStr3);
2487 return x.rc;
2488 }
2489
2490 /*
2491 ** This version of xFilter handles a full-table scan case
2492 */
2493 static int spellfix1FilterForFullScan(
2494 spellfix1_cursor *pCur,
2495 int argc,
2496 sqlite3_value **argv
2497 ){
2498 int rc = SQLITE_OK;
2499 int idxNum = pCur->idxNum;
2500 char *zSql;
2501 spellfix1_vtab *pVTab = pCur->pVTab;
2502 spellfix1ResetCursor(pCur);
2503 assert( idxNum==0 || idxNum==64 );
2504 zSql = sqlite3_mprintf(
2505 "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2506 pVTab->zDbName, pVTab->zTableName,
2507 ((idxNum & 64) ? " WHERE rowid=?" : "")
2508 );
2509 if( zSql==0 ) return SQLITE_NOMEM;
2510 rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2511 sqlite3_free(zSql);
2512 if( rc==SQLITE_OK && (idxNum & 64) ){
2513 assert( argc==1 );
2514 rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2515 }
2516 pCur->nRow = pCur->iRow = 0;
2517 if( rc==SQLITE_OK ){
2518 rc = sqlite3_step(pCur->pFullScan);
2519 if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
2520 if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
2521 }else{
2522 pCur->iRow = 0;
2523 }
2524 return rc;
2525 }
2526
2527
2528 /*
2529 ** Called to "rewind" a cursor back to the beginning so that
2530 ** it starts its output over again. Always called at least once
2531 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2532 */
2533 static int spellfix1Filter(
2534 sqlite3_vtab_cursor *cur,
2535 int idxNum, const char *idxStr,
2536 int argc, sqlite3_value **argv
2537 ){
2538 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2539 int rc;
2540 pCur->idxNum = idxNum;
2541 if( idxNum & 1 ){
2542 rc = spellfix1FilterForMatch(pCur, argc, argv);
2543 }else{
2544 rc = spellfix1FilterForFullScan(pCur, argc, argv);
2545 }
2546 return rc;
2547 }
2548
2549
2550 /*
2551 ** Advance a cursor to its next row of output
2552 */
2553 static int spellfix1Next(sqlite3_vtab_cursor *cur){
2554 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2555 int rc = SQLITE_OK;
2556 if( pCur->iRow < pCur->nRow ){
2557 if( pCur->pFullScan ){
2558 rc = sqlite3_step(pCur->pFullScan);
2559 if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2560 if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2561 }else{
2562 pCur->iRow++;
2563 }
2564 }
2565 return rc;
2566 }
2567
2568 /*
2569 ** Return TRUE if we are at the end-of-file
2570 */
2571 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
2572 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2573 return pCur->iRow>=pCur->nRow;
2574 }
2575
2576 /*
2577 ** Return columns from the current row.
2578 */
2579 static int spellfix1Column(
2580 sqlite3_vtab_cursor *cur,
2581 sqlite3_context *ctx,
2582 int i
2583 ){
2584 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2585 if( pCur->pFullScan ){
2586 if( i<=SPELLFIX_COL_LANGID ){
2587 sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2588 }else{
2589 sqlite3_result_null(ctx);
2590 }
2591 return SQLITE_OK;
2592 }
2593 switch( i ){
2594 case SPELLFIX_COL_WORD: {
2595 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2596 break;
2597 }
2598 case SPELLFIX_COL_RANK: {
2599 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2600 break;
2601 }
2602 case SPELLFIX_COL_DISTANCE: {
2603 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2604 break;
2605 }
2606 case SPELLFIX_COL_LANGID: {
2607 sqlite3_result_int(ctx, pCur->iLang);
2608 break;
2609 }
2610 case SPELLFIX_COL_SCORE: {
2611 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2612 break;
2613 }
2614 case SPELLFIX_COL_MATCHLEN: {
2615 int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2616 if( iMatchlen<0 ){
2617 int nPattern = (int)strlen(pCur->zPattern);
2618 char *zWord = pCur->a[pCur->iRow].zWord;
2619 int nWord = (int)strlen(zWord);
2620
2621 if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
2622 char *zTranslit;
2623 int res;
2624 zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2625 if( !zTranslit ) return SQLITE_NOMEM;
2626 res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2627 sqlite3_free(zTranslit);
2628 if( res<0 ) return SQLITE_NOMEM;
2629 iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2630 }else{
2631 iMatchlen = utf8Charlen(zWord, nWord);
2632 }
2633 }
2634
2635 sqlite3_result_int(ctx, iMatchlen);
2636 break;
2637 }
2638 case SPELLFIX_COL_PHONEHASH: {
2639 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2640 break;
2641 }
2642 case SPELLFIX_COL_TOP: {
2643 sqlite3_result_int(ctx, pCur->iTop);
2644 break;
2645 }
2646 case SPELLFIX_COL_SCOPE: {
2647 sqlite3_result_int(ctx, pCur->iScope);
2648 break;
2649 }
2650 case SPELLFIX_COL_SRCHCNT: {
2651 sqlite3_result_int(ctx, pCur->nSearch);
2652 break;
2653 }
2654 default: {
2655 sqlite3_result_null(ctx);
2656 break;
2657 }
2658 }
2659 return SQLITE_OK;
2660 }
2661
2662 /*
2663 ** The rowid.
2664 */
2665 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
2666 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2667 if( pCur->pFullScan ){
2668 *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2669 }else{
2670 *pRowid = pCur->a[pCur->iRow].iRowid;
2671 }
2672 return SQLITE_OK;
2673 }
2674
2675 /*
2676 ** This function is called by the xUpdate() method. It returns a string
2677 ** containing the conflict mode that xUpdate() should use for the current
2678 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE".
2679 */
2680 static const char *spellfix1GetConflict(sqlite3 *db){
2681 static const char *azConflict[] = {
2682 /* Note: Instead of "FAIL" - "ABORT". */
2683 "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE"
2684 };
2685 int eConflict = sqlite3_vtab_on_conflict(db);
2686
2687 assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE
2688 || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT
2689 || eConflict==SQLITE_REPLACE
2690 );
2691 assert( SQLITE_ROLLBACK==1 );
2692 assert( SQLITE_IGNORE==2 );
2693 assert( SQLITE_FAIL==3 );
2694 assert( SQLITE_ABORT==4 );
2695 assert( SQLITE_REPLACE==5 );
2696
2697 return azConflict[eConflict-1];
2698 }
2699
2700 /*
2701 ** The xUpdate() method.
2702 */
2703 static int spellfix1Update(
2704 sqlite3_vtab *pVTab,
2705 int argc,
2706 sqlite3_value **argv,
2707 sqlite_int64 *pRowid
2708 ){
2709 int rc = SQLITE_OK;
2710 sqlite3_int64 rowid, newRowid;
2711 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2712 sqlite3 *db = p->db;
2713
2714 if( argc==1 ){
2715 /* A delete operation on the rowid given by argv[0] */
2716 rowid = *pRowid = sqlite3_value_int64(argv[0]);
2717 spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2718 " WHERE id=%lld",
2719 p->zDbName, p->zTableName, rowid);
2720 }else{
2721 const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2722 int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2723 int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2724 int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2725 const unsigned char *zSoundslike =
2726 sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2727 int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2728 char *zK1, *zK2;
2729 int i;
2730 char c;
2731 const char *zConflict = spellfix1GetConflict(db);
2732
2733 if( zWord==0 ){
2734 /* Inserts of the form: INSERT INTO table(command) VALUES('xyzzy');
2735 ** cause zWord to be NULL, so we look at the "command" column to see
2736 ** what special actions to take */
2737 const char *zCmd =
2738 (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2739 if( zCmd==0 ){
2740 pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2741 p->zTableName);
2742 return SQLITE_CONSTRAINT_NOTNULL;
2743 }
2744 if( strcmp(zCmd,"reset")==0 ){
2745 /* Reset the edit cost table (if there is one). */
2746 editDist3ConfigDelete(p->pConfig3);
2747 p->pConfig3 = 0;
2748 return SQLITE_OK;
2749 }
2750 if( strncmp(zCmd,"edit_cost_table=",16)==0 ){
2751 editDist3ConfigDelete(p->pConfig3);
2752 p->pConfig3 = 0;
2753 sqlite3_free(p->zCostTable);
2754 p->zCostTable = spellfix1Dequote(zCmd+16);
2755 if( p->zCostTable==0 ) return SQLITE_NOMEM;
2756 if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){
2757 sqlite3_free(p->zCostTable);
2758 p->zCostTable = 0;
2759 }
2760 return SQLITE_OK;
2761 }
2762 pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2763 p->zTableName, zCmd);
2764 return SQLITE_ERROR;
2765 }
2766 if( iRank<1 ) iRank = 1;
2767 if( zSoundslike ){
2768 zK1 = (char*)transliterate(zSoundslike, nSoundslike);
2769 }else{
2770 zK1 = (char*)transliterate(zWord, nWord);
2771 }
2772 if( zK1==0 ) return SQLITE_NOMEM;
2773 for(i=0; (c = zK1[i])!=0; i++){
2774 if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
2775 }
2776 zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
2777 if( zK2==0 ){
2778 sqlite3_free(zK1);
2779 return SQLITE_NOMEM;
2780 }
2781 if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
2782 if( sqlite3_value_type(argv[1])==SQLITE_NULL ){
2783 spellfix1DbExec(&rc, db,
2784 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2785 "VALUES(%d,%d,%Q,%Q,%Q)",
2786 p->zDbName, p->zTableName,
2787 iRank, iLang, zWord, zK1, zK2
2788 );
2789 }else{
2790 newRowid = sqlite3_value_int64(argv[1]);
2791 spellfix1DbExec(&rc, db,
2792 "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
2793 "VALUES(%lld,%d,%d,%Q,%Q,%Q)",
2794 zConflict, p->zDbName, p->zTableName,
2795 newRowid, iRank, iLang, zWord, zK1, zK2
2796 );
2797 }
2798 *pRowid = sqlite3_last_insert_rowid(db);
2799 }else{
2800 rowid = sqlite3_value_int64(argv[0]);
2801 newRowid = *pRowid = sqlite3_value_int64(argv[1]);
2802 spellfix1DbExec(&rc, db,
2803 "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2804 " word=%Q, k1=%Q, k2=%Q WHERE id=%lld",
2805 zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang,
2806 zWord, zK1, zK2, rowid
2807 );
2808 }
2809 sqlite3_free(zK1);
2810 sqlite3_free(zK2);
2811 }
2812 return rc;
2813 }
2814
2815 /*
2816 ** Rename the spellfix1 table.
2817 */
2818 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
2819 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2820 sqlite3 *db = p->db;
2821 int rc = SQLITE_OK;
2822 char *zNewName = sqlite3_mprintf("%s", zNew);
2823 if( zNewName==0 ){
2824 return SQLITE_NOMEM;
2825 }
2826 spellfix1DbExec(&rc, db,
2827 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2828 p->zDbName, p->zTableName, zNewName
2829 );
2830 if( rc==SQLITE_OK ){
2831 sqlite3_free(p->zTableName);
2832 p->zTableName = zNewName;
2833 }else{
2834 sqlite3_free(zNewName);
2835 }
2836 return rc;
2837 }
2838
2839
2840 /*
2841 ** A virtual table module that provides fuzzy search.
2842 */
2843 static sqlite3_module spellfix1Module = {
2844 0, /* iVersion */
2845 spellfix1Create, /* xCreate - handle CREATE VIRTUAL TABLE */
2846 spellfix1Connect, /* xConnect - reconnected to an existing table */
2847 spellfix1BestIndex, /* xBestIndex - figure out how to do a query */
2848 spellfix1Disconnect, /* xDisconnect - close a connection */
2849 spellfix1Destroy, /* xDestroy - handle DROP TABLE */
2850 spellfix1Open, /* xOpen - open a cursor */
2851 spellfix1Close, /* xClose - close a cursor */
2852 spellfix1Filter, /* xFilter - configure scan constraints */
2853 spellfix1Next, /* xNext - advance a cursor */
2854 spellfix1Eof, /* xEof - check for end of scan */
2855 spellfix1Column, /* xColumn - read data */
2856 spellfix1Rowid, /* xRowid - read data */
2857 spellfix1Update, /* xUpdate */
2858 0, /* xBegin */
2859 0, /* xSync */
2860 0, /* xCommit */
2861 0, /* xRollback */
2862 0, /* xFindMethod */
2863 spellfix1Rename, /* xRename */
2864 };
2865
2866 /*
2867 ** Register the various functions and the virtual table.
2868 */
2869 static int spellfix1Register(sqlite3 *db){
2870 int rc = SQLITE_OK;
2871 int i;
2872 rc = sqlite3_create_function(db, "spellfix1_translit", 1, SQLITE_UTF8, 0,
2873 transliterateSqlFunc, 0, 0);
2874 if( rc==SQLITE_OK ){
2875 rc = sqlite3_create_function(db, "spellfix1_editdist", 2, SQLITE_UTF8, 0,
2876 editdistSqlFunc, 0, 0);
2877 }
2878 if( rc==SQLITE_OK ){
2879 rc = sqlite3_create_function(db, "spellfix1_phonehash", 1, SQLITE_UTF8, 0,
2880 phoneticHashSqlFunc, 0, 0);
2881 }
2882 if( rc==SQLITE_OK ){
2883 rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1, SQLITE_UTF8, 0,
2884 scriptCodeSqlFunc, 0, 0);
2885 }
2886 if( rc==SQLITE_OK ){
2887 rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
2888 }
2889 if( rc==SQLITE_OK ){
2890 rc = editDist3Install(db);
2891 }
2892
2893 /* Verify sanity of the translit[] table */
2894 for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
2895 assert( translit[i].cFrom<translit[i+1].cFrom );
2896 }
2897
2898 return rc;
2899 }
2900
2901 #endif /* SQLITE_OMIT_VIRTUALTABLE */
2902
2903 /*
2904 ** Extension load function.
2905 */
2906 #ifdef _WIN32
2907 __declspec(dllexport)
2908 #endif
2909 int sqlite3_spellfix_init(
2910 sqlite3 *db,
2911 char **pzErrMsg,
2912 const sqlite3_api_routines *pApi
2913 ){
2914 SQLITE_EXTENSION_INIT2(pApi);
2915 #ifndef SQLITE_OMIT_VIRTUALTABLE
2916 return spellfix1Register(db);
2917 #endif
2918 return SQLITE_OK;
2919 }
OLDNEW
« no previous file with comments | « third_party/sqlite/sqlite-src-3100200/ext/misc/showauth.c ('k') | third_party/sqlite/sqlite-src-3100200/ext/misc/totype.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698