OLD | NEW |
1 /* | 1 /* |
2 ** 2006 September 30 | 2 ** 2006 September 30 |
3 ** | 3 ** |
4 ** The author disclaims copyright to this source code. In place of | 4 ** The author disclaims copyright to this source code. In place of |
5 ** a legal notice, here is a blessing: | 5 ** a legal notice, here is a blessing: |
6 ** | 6 ** |
7 ** May you do good and not evil. | 7 ** May you do good and not evil. |
8 ** May you find forgiveness for yourself and forgive others. | 8 ** May you find forgiveness for yourself and forgive others. |
9 ** May you share freely, never taking more than you give. | 9 ** May you share freely, never taking more than you give. |
10 ** | 10 ** |
11 ************************************************************************* | 11 ************************************************************************* |
12 ** Implementation of the full-text-search tokenizer that implements | 12 ** Implementation of the full-text-search tokenizer that implements |
13 ** a Porter stemmer. | 13 ** a Porter stemmer. |
14 */ | 14 */ |
15 | 15 |
16 /* | 16 /* |
17 ** The code in this file is only compiled if: | 17 ** The code in this file is only compiled if: |
18 ** | 18 ** |
19 ** * The FTS3 module is being built as an extension | 19 ** * The FTS3 module is being built as an extension |
20 ** (in which case SQLITE_CORE is not defined), or | 20 ** (in which case SQLITE_CORE is not defined), or |
21 ** | 21 ** |
22 ** * The FTS3 module is being built into the core of | 22 ** * The FTS3 module is being built into the core of |
23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). |
24 */ | 24 */ |
25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | 25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
26 | 26 |
| 27 #include "fts3Int.h" |
27 | 28 |
28 #include <assert.h> | 29 #include <assert.h> |
29 #include <stdlib.h> | 30 #include <stdlib.h> |
30 #include <stdio.h> | 31 #include <stdio.h> |
31 #include <string.h> | 32 #include <string.h> |
32 | 33 |
33 #include "fts3_tokenizer.h" | 34 #include "fts3_tokenizer.h" |
34 | 35 |
35 /* | 36 /* |
36 ** Class derived from sqlite3_tokenizer | 37 ** Class derived from sqlite3_tokenizer |
37 */ | 38 */ |
38 typedef struct porter_tokenizer { | 39 typedef struct porter_tokenizer { |
39 sqlite3_tokenizer base; /* Base class */ | 40 sqlite3_tokenizer base; /* Base class */ |
40 } porter_tokenizer; | 41 } porter_tokenizer; |
41 | 42 |
42 /* | 43 /* |
43 ** Class derived from sqlit3_tokenizer_cursor | 44 ** Class derived from sqlit3_tokenizer_cursor |
44 */ | 45 */ |
45 typedef struct porter_tokenizer_cursor { | 46 typedef struct porter_tokenizer_cursor { |
46 sqlite3_tokenizer_cursor base; | 47 sqlite3_tokenizer_cursor base; |
47 const char *zInput; /* input we are tokenizing */ | 48 const char *zInput; /* input we are tokenizing */ |
48 int nInput; /* size of the input */ | 49 int nInput; /* size of the input */ |
49 int iOffset; /* current position in zInput */ | 50 int iOffset; /* current position in zInput */ |
50 int iToken; /* index of next token to be returned */ | 51 int iToken; /* index of next token to be returned */ |
51 char *zToken; /* storage for current token */ | 52 char *zToken; /* storage for current token */ |
52 int nAllocated; /* space allocated to zToken buffer */ | 53 int nAllocated; /* space allocated to zToken buffer */ |
53 } porter_tokenizer_cursor; | 54 } porter_tokenizer_cursor; |
54 | 55 |
55 | 56 |
56 /* Forward declaration */ | |
57 static const sqlite3_tokenizer_module porterTokenizerModule; | |
58 | |
59 | |
60 /* | 57 /* |
61 ** Create a new tokenizer instance. | 58 ** Create a new tokenizer instance. |
62 */ | 59 */ |
63 static int porterCreate( | 60 static int porterCreate( |
64 int argc, const char * const *argv, | 61 int argc, const char * const *argv, |
65 sqlite3_tokenizer **ppTokenizer | 62 sqlite3_tokenizer **ppTokenizer |
66 ){ | 63 ){ |
67 porter_tokenizer *t; | 64 porter_tokenizer *t; |
| 65 |
| 66 UNUSED_PARAMETER(argc); |
| 67 UNUSED_PARAMETER(argv); |
| 68 |
68 t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t)); | 69 t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t)); |
69 if( t==NULL ) return SQLITE_NOMEM; | 70 if( t==NULL ) return SQLITE_NOMEM; |
70 memset(t, 0, sizeof(*t)); | 71 memset(t, 0, sizeof(*t)); |
71 *ppTokenizer = &t->base; | 72 *ppTokenizer = &t->base; |
72 return SQLITE_OK; | 73 return SQLITE_OK; |
73 } | 74 } |
74 | 75 |
75 /* | 76 /* |
76 ** Destroy a tokenizer | 77 ** Destroy a tokenizer |
77 */ | 78 */ |
78 static int porterDestroy(sqlite3_tokenizer *pTokenizer){ | 79 static int porterDestroy(sqlite3_tokenizer *pTokenizer){ |
79 sqlite3_free(pTokenizer); | 80 sqlite3_free(pTokenizer); |
80 return SQLITE_OK; | 81 return SQLITE_OK; |
81 } | 82 } |
82 | 83 |
83 /* | 84 /* |
84 ** Prepare to begin tokenizing a particular string. The input | 85 ** Prepare to begin tokenizing a particular string. The input |
85 ** string to be tokenized is zInput[0..nInput-1]. A cursor | 86 ** string to be tokenized is zInput[0..nInput-1]. A cursor |
86 ** used to incrementally tokenize this string is returned in | 87 ** used to incrementally tokenize this string is returned in |
87 ** *ppCursor. | 88 ** *ppCursor. |
88 */ | 89 */ |
89 static int porterOpen( | 90 static int porterOpen( |
90 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | 91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ |
91 const char *zInput, int nInput, /* String to be tokenized */ | 92 const char *zInput, int nInput, /* String to be tokenized */ |
92 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | 93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ |
93 ){ | 94 ){ |
94 porter_tokenizer_cursor *c; | 95 porter_tokenizer_cursor *c; |
95 | 96 |
| 97 UNUSED_PARAMETER(pTokenizer); |
| 98 |
96 c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); | 99 c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); |
97 if( c==NULL ) return SQLITE_NOMEM; | 100 if( c==NULL ) return SQLITE_NOMEM; |
98 | 101 |
99 c->zInput = zInput; | 102 c->zInput = zInput; |
100 if( zInput==0 ){ | 103 if( zInput==0 ){ |
101 c->nInput = 0; | 104 c->nInput = 0; |
102 }else if( nInput<0 ){ | 105 }else if( nInput<0 ){ |
103 c->nInput = (int)strlen(zInput); | 106 c->nInput = (int)strlen(zInput); |
104 }else{ | 107 }else{ |
105 c->nInput = nInput; | 108 c->nInput = nInput; |
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
226 return *z!=0; | 229 return *z!=0; |
227 } | 230 } |
228 | 231 |
229 /* | 232 /* |
230 ** Return TRUE if the word ends in a double consonant. | 233 ** Return TRUE if the word ends in a double consonant. |
231 ** | 234 ** |
232 ** The text is reversed here. So we are really looking at | 235 ** The text is reversed here. So we are really looking at |
233 ** the first two characters of z[]. | 236 ** the first two characters of z[]. |
234 */ | 237 */ |
235 static int doubleConsonant(const char *z){ | 238 static int doubleConsonant(const char *z){ |
236 return isConsonant(z) && z[0]==z[1] && isConsonant(z+1); | 239 return isConsonant(z) && z[0]==z[1]; |
237 } | 240 } |
238 | 241 |
239 /* | 242 /* |
240 ** Return TRUE if the word ends with three letters which | 243 ** Return TRUE if the word ends with three letters which |
241 ** are consonant-vowel-consonent and where the final consonant | 244 ** are consonant-vowel-consonent and where the final consonant |
242 ** is not 'w', 'x', or 'y'. | 245 ** is not 'w', 'x', or 'y'. |
243 ** | 246 ** |
244 ** The word is reversed here. So we are really checking the | 247 ** The word is reversed here. So we are really checking the |
245 ** first three letters and the first one cannot be in [wxy]. | 248 ** first three letters and the first one cannot be in [wxy]. |
246 */ | 249 */ |
247 static int star_oh(const char *z){ | 250 static int star_oh(const char *z){ |
248 return | 251 return |
249 z[0]!=0 && isConsonant(z) && | 252 isConsonant(z) && |
250 z[0]!='w' && z[0]!='x' && z[0]!='y' && | 253 z[0]!='w' && z[0]!='x' && z[0]!='y' && |
251 z[1]!=0 && isVowel(z+1) && | 254 isVowel(z+1) && |
252 z[2]!=0 && isConsonant(z+2); | 255 isConsonant(z+2); |
253 } | 256 } |
254 | 257 |
255 /* | 258 /* |
256 ** If the word ends with zFrom and xCond() is true for the stem | 259 ** If the word ends with zFrom and xCond() is true for the stem |
257 ** of the word that preceeds the zFrom ending, then change the | 260 ** of the word that preceeds the zFrom ending, then change the |
258 ** ending to zTo. | 261 ** ending to zTo. |
259 ** | 262 ** |
260 ** The input word *pz and zFrom are both in reverse order. zTo | 263 ** The input word *pz and zFrom are both in reverse order. zTo |
261 ** is in normal order. | 264 ** is in normal order. |
262 ** | 265 ** |
(...skipping 23 matching lines...) Expand all Loading... |
286 ** inappropriate. The input word is copied into the output with | 289 ** inappropriate. The input word is copied into the output with |
287 ** US-ASCII case folding. If the input word is too long (more | 290 ** US-ASCII case folding. If the input word is too long (more |
288 ** than 20 bytes if it contains no digits or more than 6 bytes if | 291 ** than 20 bytes if it contains no digits or more than 6 bytes if |
289 ** it contains digits) then word is truncated to 20 or 6 bytes | 292 ** it contains digits) then word is truncated to 20 or 6 bytes |
290 ** by taking 10 or 3 bytes from the beginning and end. | 293 ** by taking 10 or 3 bytes from the beginning and end. |
291 */ | 294 */ |
292 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ | 295 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ |
293 int i, mx, j; | 296 int i, mx, j; |
294 int hasDigit = 0; | 297 int hasDigit = 0; |
295 for(i=0; i<nIn; i++){ | 298 for(i=0; i<nIn; i++){ |
296 int c = zIn[i]; | 299 char c = zIn[i]; |
297 if( c>='A' && c<='Z' ){ | 300 if( c>='A' && c<='Z' ){ |
298 zOut[i] = c - 'A' + 'a'; | 301 zOut[i] = c - 'A' + 'a'; |
299 }else{ | 302 }else{ |
300 if( c>='0' && c<='9' ) hasDigit = 1; | 303 if( c>='0' && c<='9' ) hasDigit = 1; |
301 zOut[i] = c; | 304 zOut[i] = c; |
302 } | 305 } |
303 } | 306 } |
304 mx = hasDigit ? 3 : 10; | 307 mx = hasDigit ? 3 : 10; |
305 if( nIn>mx*2 ){ | 308 if( nIn>mx*2 ){ |
306 for(j=mx, i=nIn-mx; i<nIn; i++, j++){ | 309 for(j=mx, i=nIn-mx; i<nIn; i++, j++){ |
(...skipping 23 matching lines...) Expand all Loading... |
330 ** | 333 ** |
331 ** If the input word contains not digits but does characters not | 334 ** If the input word contains not digits but does characters not |
332 ** in [a-zA-Z] then no stemming is attempted and this routine just | 335 ** in [a-zA-Z] then no stemming is attempted and this routine just |
333 ** copies the input into the input into the output with US-ASCII | 336 ** copies the input into the input into the output with US-ASCII |
334 ** case folding. | 337 ** case folding. |
335 ** | 338 ** |
336 ** Stemming never increases the length of the word. So there is | 339 ** Stemming never increases the length of the word. So there is |
337 ** no chance of overflowing the zOut buffer. | 340 ** no chance of overflowing the zOut buffer. |
338 */ | 341 */ |
339 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ | 342 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ |
340 int i, j, c; | 343 int i, j; |
341 char zReverse[28]; | 344 char zReverse[28]; |
342 char *z, *z2; | 345 char *z, *z2; |
343 if( nIn<3 || nIn>=sizeof(zReverse)-7 ){ | 346 if( nIn<3 || nIn>=(int)sizeof(zReverse)-7 ){ |
344 /* The word is too big or too small for the porter stemmer. | 347 /* The word is too big or too small for the porter stemmer. |
345 ** Fallback to the copy stemmer */ | 348 ** Fallback to the copy stemmer */ |
346 copy_stemmer(zIn, nIn, zOut, pnOut); | 349 copy_stemmer(zIn, nIn, zOut, pnOut); |
347 return; | 350 return; |
348 } | 351 } |
349 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){ | 352 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){ |
350 c = zIn[i]; | 353 char c = zIn[i]; |
351 if( c>='A' && c<='Z' ){ | 354 if( c>='A' && c<='Z' ){ |
352 zReverse[j] = c + 'a' - 'A'; | 355 zReverse[j] = c + 'a' - 'A'; |
353 }else if( c>='a' && c<='z' ){ | 356 }else if( c>='a' && c<='z' ){ |
354 zReverse[j] = c; | 357 zReverse[j] = c; |
355 }else{ | 358 }else{ |
356 /* The use of a character not in [a-zA-Z] means that we fallback | 359 /* The use of a character not in [a-zA-Z] means that we fallback |
357 ** to the copy stemmer */ | 360 ** to the copy stemmer */ |
358 copy_stemmer(zIn, nIn, zOut, pnOut); | 361 copy_stemmer(zIn, nIn, zOut, pnOut); |
359 return; | 362 return; |
360 } | 363 } |
(...skipping 178 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
539 } | 542 } |
540 | 543 |
541 /* Step 5b */ | 544 /* Step 5b */ |
542 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ | 545 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ |
543 z++; | 546 z++; |
544 } | 547 } |
545 | 548 |
546 /* z[] is now the stemmed word in reverse order. Flip it back | 549 /* z[] is now the stemmed word in reverse order. Flip it back |
547 ** around into forward order and return. | 550 ** around into forward order and return. |
548 */ | 551 */ |
549 *pnOut = i = strlen(z); | 552 *pnOut = i = (int)strlen(z); |
550 zOut[i] = 0; | 553 zOut[i] = 0; |
551 while( *z ){ | 554 while( *z ){ |
552 zOut[--i] = *(z++); | 555 zOut[--i] = *(z++); |
553 } | 556 } |
554 } | 557 } |
555 | 558 |
556 /* | 559 /* |
557 ** Characters that can be part of a token. We assume any character | 560 ** Characters that can be part of a token. We assume any character |
558 ** whose value is greater than 0x80 (any UTF character) can be | 561 ** whose value is greater than 0x80 (any UTF character) can be |
559 ** part of a token. In other words, delimiters all must have | 562 ** part of a token. In other words, delimiters all must have |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
594 | 597 |
595 /* Count non-delimiter characters. */ | 598 /* Count non-delimiter characters. */ |
596 iStartOffset = c->iOffset; | 599 iStartOffset = c->iOffset; |
597 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ | 600 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ |
598 c->iOffset++; | 601 c->iOffset++; |
599 } | 602 } |
600 | 603 |
601 if( c->iOffset>iStartOffset ){ | 604 if( c->iOffset>iStartOffset ){ |
602 int n = c->iOffset-iStartOffset; | 605 int n = c->iOffset-iStartOffset; |
603 if( n>c->nAllocated ){ | 606 if( n>c->nAllocated ){ |
| 607 char *pNew; |
604 c->nAllocated = n+20; | 608 c->nAllocated = n+20; |
605 c->zToken = sqlite3_realloc(c->zToken, c->nAllocated); | 609 pNew = sqlite3_realloc(c->zToken, c->nAllocated); |
606 if( c->zToken==NULL ) return SQLITE_NOMEM; | 610 if( !pNew ) return SQLITE_NOMEM; |
| 611 c->zToken = pNew; |
607 } | 612 } |
608 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); | 613 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); |
609 *pzToken = c->zToken; | 614 *pzToken = c->zToken; |
610 *piStartOffset = iStartOffset; | 615 *piStartOffset = iStartOffset; |
611 *piEndOffset = c->iOffset; | 616 *piEndOffset = c->iOffset; |
612 *piPosition = c->iToken++; | 617 *piPosition = c->iToken++; |
613 return SQLITE_OK; | 618 return SQLITE_OK; |
614 } | 619 } |
615 } | 620 } |
616 return SQLITE_DONE; | 621 return SQLITE_DONE; |
(...skipping 15 matching lines...) Expand all Loading... |
632 ** Allocate a new porter tokenizer. Return a pointer to the new | 637 ** Allocate a new porter tokenizer. Return a pointer to the new |
633 ** tokenizer in *ppModule | 638 ** tokenizer in *ppModule |
634 */ | 639 */ |
635 void sqlite3Fts3PorterTokenizerModule( | 640 void sqlite3Fts3PorterTokenizerModule( |
636 sqlite3_tokenizer_module const**ppModule | 641 sqlite3_tokenizer_module const**ppModule |
637 ){ | 642 ){ |
638 *ppModule = &porterTokenizerModule; | 643 *ppModule = &porterTokenizerModule; |
639 } | 644 } |
640 | 645 |
641 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | 646 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |
OLD | NEW |