| OLD | NEW |
| 1 /* | 1 /* |
| 2 ** 2006 September 30 | 2 ** 2006 September 30 |
| 3 ** | 3 ** |
| 4 ** The author disclaims copyright to this source code. In place of | 4 ** The author disclaims copyright to this source code. In place of |
| 5 ** a legal notice, here is a blessing: | 5 ** a legal notice, here is a blessing: |
| 6 ** | 6 ** |
| 7 ** May you do good and not evil. | 7 ** May you do good and not evil. |
| 8 ** May you find forgiveness for yourself and forgive others. | 8 ** May you find forgiveness for yourself and forgive others. |
| 9 ** May you share freely, never taking more than you give. | 9 ** May you share freely, never taking more than you give. |
| 10 ** | 10 ** |
| 11 ************************************************************************* | 11 ************************************************************************* |
| 12 ** Implementation of the full-text-search tokenizer that implements | 12 ** Implementation of the full-text-search tokenizer that implements |
| 13 ** a Porter stemmer. | 13 ** a Porter stemmer. |
| 14 */ | 14 */ |
| 15 | 15 |
| 16 /* | 16 /* |
| 17 ** The code in this file is only compiled if: | 17 ** The code in this file is only compiled if: |
| 18 ** | 18 ** |
| 19 ** * The FTS3 module is being built as an extension | 19 ** * The FTS3 module is being built as an extension |
| 20 ** (in which case SQLITE_CORE is not defined), or | 20 ** (in which case SQLITE_CORE is not defined), or |
| 21 ** | 21 ** |
| 22 ** * The FTS3 module is being built into the core of | 22 ** * The FTS3 module is being built into the core of |
| 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). |
| 24 */ | 24 */ |
| 25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | 25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
| 26 | 26 |
| 27 #include "fts3Int.h" |
| 27 | 28 |
| 28 #include <assert.h> | 29 #include <assert.h> |
| 29 #include <stdlib.h> | 30 #include <stdlib.h> |
| 30 #include <stdio.h> | 31 #include <stdio.h> |
| 31 #include <string.h> | 32 #include <string.h> |
| 32 | 33 |
| 33 #include "fts3_tokenizer.h" | 34 #include "fts3_tokenizer.h" |
| 34 | 35 |
| 35 /* | 36 /* |
| 36 ** Class derived from sqlite3_tokenizer | 37 ** Class derived from sqlite3_tokenizer |
| 37 */ | 38 */ |
| 38 typedef struct porter_tokenizer { | 39 typedef struct porter_tokenizer { |
| 39 sqlite3_tokenizer base; /* Base class */ | 40 sqlite3_tokenizer base; /* Base class */ |
| 40 } porter_tokenizer; | 41 } porter_tokenizer; |
| 41 | 42 |
| 42 /* | 43 /* |
| 43 ** Class derived from sqlit3_tokenizer_cursor | 44 ** Class derived from sqlit3_tokenizer_cursor |
| 44 */ | 45 */ |
| 45 typedef struct porter_tokenizer_cursor { | 46 typedef struct porter_tokenizer_cursor { |
| 46 sqlite3_tokenizer_cursor base; | 47 sqlite3_tokenizer_cursor base; |
| 47 const char *zInput; /* input we are tokenizing */ | 48 const char *zInput; /* input we are tokenizing */ |
| 48 int nInput; /* size of the input */ | 49 int nInput; /* size of the input */ |
| 49 int iOffset; /* current position in zInput */ | 50 int iOffset; /* current position in zInput */ |
| 50 int iToken; /* index of next token to be returned */ | 51 int iToken; /* index of next token to be returned */ |
| 51 char *zToken; /* storage for current token */ | 52 char *zToken; /* storage for current token */ |
| 52 int nAllocated; /* space allocated to zToken buffer */ | 53 int nAllocated; /* space allocated to zToken buffer */ |
| 53 } porter_tokenizer_cursor; | 54 } porter_tokenizer_cursor; |
| 54 | 55 |
| 55 | 56 |
| 56 /* Forward declaration */ | |
| 57 static const sqlite3_tokenizer_module porterTokenizerModule; | |
| 58 | |
| 59 | |
| 60 /* | 57 /* |
| 61 ** Create a new tokenizer instance. | 58 ** Create a new tokenizer instance. |
| 62 */ | 59 */ |
| 63 static int porterCreate( | 60 static int porterCreate( |
| 64 int argc, const char * const *argv, | 61 int argc, const char * const *argv, |
| 65 sqlite3_tokenizer **ppTokenizer | 62 sqlite3_tokenizer **ppTokenizer |
| 66 ){ | 63 ){ |
| 67 porter_tokenizer *t; | 64 porter_tokenizer *t; |
| 65 |
| 66 UNUSED_PARAMETER(argc); |
| 67 UNUSED_PARAMETER(argv); |
| 68 |
| 68 t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t)); | 69 t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t)); |
| 69 if( t==NULL ) return SQLITE_NOMEM; | 70 if( t==NULL ) return SQLITE_NOMEM; |
| 70 memset(t, 0, sizeof(*t)); | 71 memset(t, 0, sizeof(*t)); |
| 71 *ppTokenizer = &t->base; | 72 *ppTokenizer = &t->base; |
| 72 return SQLITE_OK; | 73 return SQLITE_OK; |
| 73 } | 74 } |
| 74 | 75 |
| 75 /* | 76 /* |
| 76 ** Destroy a tokenizer | 77 ** Destroy a tokenizer |
| 77 */ | 78 */ |
| 78 static int porterDestroy(sqlite3_tokenizer *pTokenizer){ | 79 static int porterDestroy(sqlite3_tokenizer *pTokenizer){ |
| 79 sqlite3_free(pTokenizer); | 80 sqlite3_free(pTokenizer); |
| 80 return SQLITE_OK; | 81 return SQLITE_OK; |
| 81 } | 82 } |
| 82 | 83 |
| 83 /* | 84 /* |
| 84 ** Prepare to begin tokenizing a particular string. The input | 85 ** Prepare to begin tokenizing a particular string. The input |
| 85 ** string to be tokenized is zInput[0..nInput-1]. A cursor | 86 ** string to be tokenized is zInput[0..nInput-1]. A cursor |
| 86 ** used to incrementally tokenize this string is returned in | 87 ** used to incrementally tokenize this string is returned in |
| 87 ** *ppCursor. | 88 ** *ppCursor. |
| 88 */ | 89 */ |
| 89 static int porterOpen( | 90 static int porterOpen( |
| 90 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | 91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ |
| 91 const char *zInput, int nInput, /* String to be tokenized */ | 92 const char *zInput, int nInput, /* String to be tokenized */ |
| 92 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | 93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ |
| 93 ){ | 94 ){ |
| 94 porter_tokenizer_cursor *c; | 95 porter_tokenizer_cursor *c; |
| 95 | 96 |
| 97 UNUSED_PARAMETER(pTokenizer); |
| 98 |
| 96 c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); | 99 c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); |
| 97 if( c==NULL ) return SQLITE_NOMEM; | 100 if( c==NULL ) return SQLITE_NOMEM; |
| 98 | 101 |
| 99 c->zInput = zInput; | 102 c->zInput = zInput; |
| 100 if( zInput==0 ){ | 103 if( zInput==0 ){ |
| 101 c->nInput = 0; | 104 c->nInput = 0; |
| 102 }else if( nInput<0 ){ | 105 }else if( nInput<0 ){ |
| 103 c->nInput = (int)strlen(zInput); | 106 c->nInput = (int)strlen(zInput); |
| 104 }else{ | 107 }else{ |
| 105 c->nInput = nInput; | 108 c->nInput = nInput; |
| (...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 226 return *z!=0; | 229 return *z!=0; |
| 227 } | 230 } |
| 228 | 231 |
| 229 /* | 232 /* |
| 230 ** Return TRUE if the word ends in a double consonant. | 233 ** Return TRUE if the word ends in a double consonant. |
| 231 ** | 234 ** |
| 232 ** The text is reversed here. So we are really looking at | 235 ** The text is reversed here. So we are really looking at |
| 233 ** the first two characters of z[]. | 236 ** the first two characters of z[]. |
| 234 */ | 237 */ |
| 235 static int doubleConsonant(const char *z){ | 238 static int doubleConsonant(const char *z){ |
| 236 return isConsonant(z) && z[0]==z[1] && isConsonant(z+1); | 239 return isConsonant(z) && z[0]==z[1]; |
| 237 } | 240 } |
| 238 | 241 |
| 239 /* | 242 /* |
| 240 ** Return TRUE if the word ends with three letters which | 243 ** Return TRUE if the word ends with three letters which |
| 241 ** are consonant-vowel-consonent and where the final consonant | 244 ** are consonant-vowel-consonent and where the final consonant |
| 242 ** is not 'w', 'x', or 'y'. | 245 ** is not 'w', 'x', or 'y'. |
| 243 ** | 246 ** |
| 244 ** The word is reversed here. So we are really checking the | 247 ** The word is reversed here. So we are really checking the |
| 245 ** first three letters and the first one cannot be in [wxy]. | 248 ** first three letters and the first one cannot be in [wxy]. |
| 246 */ | 249 */ |
| 247 static int star_oh(const char *z){ | 250 static int star_oh(const char *z){ |
| 248 return | 251 return |
| 249 z[0]!=0 && isConsonant(z) && | 252 isConsonant(z) && |
| 250 z[0]!='w' && z[0]!='x' && z[0]!='y' && | 253 z[0]!='w' && z[0]!='x' && z[0]!='y' && |
| 251 z[1]!=0 && isVowel(z+1) && | 254 isVowel(z+1) && |
| 252 z[2]!=0 && isConsonant(z+2); | 255 isConsonant(z+2); |
| 253 } | 256 } |
| 254 | 257 |
| 255 /* | 258 /* |
| 256 ** If the word ends with zFrom and xCond() is true for the stem | 259 ** If the word ends with zFrom and xCond() is true for the stem |
| 257 ** of the word that preceeds the zFrom ending, then change the | 260 ** of the word that preceeds the zFrom ending, then change the |
| 258 ** ending to zTo. | 261 ** ending to zTo. |
| 259 ** | 262 ** |
| 260 ** The input word *pz and zFrom are both in reverse order. zTo | 263 ** The input word *pz and zFrom are both in reverse order. zTo |
| 261 ** is in normal order. | 264 ** is in normal order. |
| 262 ** | 265 ** |
| (...skipping 23 matching lines...) Expand all Loading... |
| 286 ** inappropriate. The input word is copied into the output with | 289 ** inappropriate. The input word is copied into the output with |
| 287 ** US-ASCII case folding. If the input word is too long (more | 290 ** US-ASCII case folding. If the input word is too long (more |
| 288 ** than 20 bytes if it contains no digits or more than 6 bytes if | 291 ** than 20 bytes if it contains no digits or more than 6 bytes if |
| 289 ** it contains digits) then word is truncated to 20 or 6 bytes | 292 ** it contains digits) then word is truncated to 20 or 6 bytes |
| 290 ** by taking 10 or 3 bytes from the beginning and end. | 293 ** by taking 10 or 3 bytes from the beginning and end. |
| 291 */ | 294 */ |
| 292 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ | 295 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ |
| 293 int i, mx, j; | 296 int i, mx, j; |
| 294 int hasDigit = 0; | 297 int hasDigit = 0; |
| 295 for(i=0; i<nIn; i++){ | 298 for(i=0; i<nIn; i++){ |
| 296 int c = zIn[i]; | 299 char c = zIn[i]; |
| 297 if( c>='A' && c<='Z' ){ | 300 if( c>='A' && c<='Z' ){ |
| 298 zOut[i] = c - 'A' + 'a'; | 301 zOut[i] = c - 'A' + 'a'; |
| 299 }else{ | 302 }else{ |
| 300 if( c>='0' && c<='9' ) hasDigit = 1; | 303 if( c>='0' && c<='9' ) hasDigit = 1; |
| 301 zOut[i] = c; | 304 zOut[i] = c; |
| 302 } | 305 } |
| 303 } | 306 } |
| 304 mx = hasDigit ? 3 : 10; | 307 mx = hasDigit ? 3 : 10; |
| 305 if( nIn>mx*2 ){ | 308 if( nIn>mx*2 ){ |
| 306 for(j=mx, i=nIn-mx; i<nIn; i++, j++){ | 309 for(j=mx, i=nIn-mx; i<nIn; i++, j++){ |
| (...skipping 23 matching lines...) Expand all Loading... |
| 330 ** | 333 ** |
| 331 ** If the input word contains not digits but does characters not | 334 ** If the input word contains not digits but does characters not |
| 332 ** in [a-zA-Z] then no stemming is attempted and this routine just | 335 ** in [a-zA-Z] then no stemming is attempted and this routine just |
| 333 ** copies the input into the input into the output with US-ASCII | 336 ** copies the input into the input into the output with US-ASCII |
| 334 ** case folding. | 337 ** case folding. |
| 335 ** | 338 ** |
| 336 ** Stemming never increases the length of the word. So there is | 339 ** Stemming never increases the length of the word. So there is |
| 337 ** no chance of overflowing the zOut buffer. | 340 ** no chance of overflowing the zOut buffer. |
| 338 */ | 341 */ |
| 339 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ | 342 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ |
| 340 int i, j, c; | 343 int i, j; |
| 341 char zReverse[28]; | 344 char zReverse[28]; |
| 342 char *z, *z2; | 345 char *z, *z2; |
| 343 if( nIn<3 || nIn>=sizeof(zReverse)-7 ){ | 346 if( nIn<3 || nIn>=(int)sizeof(zReverse)-7 ){ |
| 344 /* The word is too big or too small for the porter stemmer. | 347 /* The word is too big or too small for the porter stemmer. |
| 345 ** Fallback to the copy stemmer */ | 348 ** Fallback to the copy stemmer */ |
| 346 copy_stemmer(zIn, nIn, zOut, pnOut); | 349 copy_stemmer(zIn, nIn, zOut, pnOut); |
| 347 return; | 350 return; |
| 348 } | 351 } |
| 349 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){ | 352 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){ |
| 350 c = zIn[i]; | 353 char c = zIn[i]; |
| 351 if( c>='A' && c<='Z' ){ | 354 if( c>='A' && c<='Z' ){ |
| 352 zReverse[j] = c + 'a' - 'A'; | 355 zReverse[j] = c + 'a' - 'A'; |
| 353 }else if( c>='a' && c<='z' ){ | 356 }else if( c>='a' && c<='z' ){ |
| 354 zReverse[j] = c; | 357 zReverse[j] = c; |
| 355 }else{ | 358 }else{ |
| 356 /* The use of a character not in [a-zA-Z] means that we fallback | 359 /* The use of a character not in [a-zA-Z] means that we fallback |
| 357 ** to the copy stemmer */ | 360 ** to the copy stemmer */ |
| 358 copy_stemmer(zIn, nIn, zOut, pnOut); | 361 copy_stemmer(zIn, nIn, zOut, pnOut); |
| 359 return; | 362 return; |
| 360 } | 363 } |
| (...skipping 178 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 539 } | 542 } |
| 540 | 543 |
| 541 /* Step 5b */ | 544 /* Step 5b */ |
| 542 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ | 545 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ |
| 543 z++; | 546 z++; |
| 544 } | 547 } |
| 545 | 548 |
| 546 /* z[] is now the stemmed word in reverse order. Flip it back | 549 /* z[] is now the stemmed word in reverse order. Flip it back |
| 547 ** around into forward order and return. | 550 ** around into forward order and return. |
| 548 */ | 551 */ |
| 549 *pnOut = i = strlen(z); | 552 *pnOut = i = (int)strlen(z); |
| 550 zOut[i] = 0; | 553 zOut[i] = 0; |
| 551 while( *z ){ | 554 while( *z ){ |
| 552 zOut[--i] = *(z++); | 555 zOut[--i] = *(z++); |
| 553 } | 556 } |
| 554 } | 557 } |
| 555 | 558 |
| 556 /* | 559 /* |
| 557 ** Characters that can be part of a token. We assume any character | 560 ** Characters that can be part of a token. We assume any character |
| 558 ** whose value is greater than 0x80 (any UTF character) can be | 561 ** whose value is greater than 0x80 (any UTF character) can be |
| 559 ** part of a token. In other words, delimiters all must have | 562 ** part of a token. In other words, delimiters all must have |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 594 | 597 |
| 595 /* Count non-delimiter characters. */ | 598 /* Count non-delimiter characters. */ |
| 596 iStartOffset = c->iOffset; | 599 iStartOffset = c->iOffset; |
| 597 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ | 600 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ |
| 598 c->iOffset++; | 601 c->iOffset++; |
| 599 } | 602 } |
| 600 | 603 |
| 601 if( c->iOffset>iStartOffset ){ | 604 if( c->iOffset>iStartOffset ){ |
| 602 int n = c->iOffset-iStartOffset; | 605 int n = c->iOffset-iStartOffset; |
| 603 if( n>c->nAllocated ){ | 606 if( n>c->nAllocated ){ |
| 607 char *pNew; |
| 604 c->nAllocated = n+20; | 608 c->nAllocated = n+20; |
| 605 c->zToken = sqlite3_realloc(c->zToken, c->nAllocated); | 609 pNew = sqlite3_realloc(c->zToken, c->nAllocated); |
| 606 if( c->zToken==NULL ) return SQLITE_NOMEM; | 610 if( !pNew ) return SQLITE_NOMEM; |
| 611 c->zToken = pNew; |
| 607 } | 612 } |
| 608 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); | 613 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); |
| 609 *pzToken = c->zToken; | 614 *pzToken = c->zToken; |
| 610 *piStartOffset = iStartOffset; | 615 *piStartOffset = iStartOffset; |
| 611 *piEndOffset = c->iOffset; | 616 *piEndOffset = c->iOffset; |
| 612 *piPosition = c->iToken++; | 617 *piPosition = c->iToken++; |
| 613 return SQLITE_OK; | 618 return SQLITE_OK; |
| 614 } | 619 } |
| 615 } | 620 } |
| 616 return SQLITE_DONE; | 621 return SQLITE_DONE; |
| (...skipping 15 matching lines...) Expand all Loading... |
| 632 ** Allocate a new porter tokenizer. Return a pointer to the new | 637 ** Allocate a new porter tokenizer. Return a pointer to the new |
| 633 ** tokenizer in *ppModule | 638 ** tokenizer in *ppModule |
| 634 */ | 639 */ |
| 635 void sqlite3Fts3PorterTokenizerModule( | 640 void sqlite3Fts3PorterTokenizerModule( |
| 636 sqlite3_tokenizer_module const**ppModule | 641 sqlite3_tokenizer_module const**ppModule |
| 637 ){ | 642 ){ |
| 638 *ppModule = &porterTokenizerModule; | 643 *ppModule = &porterTokenizerModule; |
| 639 } | 644 } |
| 640 | 645 |
| 641 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | 646 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |
| OLD | NEW |