Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(9)

Side by Side Diff: third_party/sqlite/src/ext/fts3/fts3_porter.c

Issue 6990047: Import SQLite 3.7.6.3. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 9 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 /* 1 /*
2 ** 2006 September 30 2 ** 2006 September 30
3 ** 3 **
4 ** The author disclaims copyright to this source code. In place of 4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing: 5 ** a legal notice, here is a blessing:
6 ** 6 **
7 ** May you do good and not evil. 7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others. 8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give. 9 ** May you share freely, never taking more than you give.
10 ** 10 **
11 ************************************************************************* 11 *************************************************************************
12 ** Implementation of the full-text-search tokenizer that implements 12 ** Implementation of the full-text-search tokenizer that implements
13 ** a Porter stemmer. 13 ** a Porter stemmer.
14 */ 14 */
15 15
16 /* 16 /*
17 ** The code in this file is only compiled if: 17 ** The code in this file is only compiled if:
18 ** 18 **
19 ** * The FTS3 module is being built as an extension 19 ** * The FTS3 module is being built as an extension
20 ** (in which case SQLITE_CORE is not defined), or 20 ** (in which case SQLITE_CORE is not defined), or
21 ** 21 **
22 ** * The FTS3 module is being built into the core of 22 ** * The FTS3 module is being built into the core of
23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
24 */ 24 */
25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) 25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
26 26
27 #include "fts3Int.h"
27 28
28 #include <assert.h> 29 #include <assert.h>
29 #include <stdlib.h> 30 #include <stdlib.h>
30 #include <stdio.h> 31 #include <stdio.h>
31 #include <string.h> 32 #include <string.h>
32 33
33 #include "fts3_tokenizer.h" 34 #include "fts3_tokenizer.h"
34 35
35 /* 36 /*
36 ** Class derived from sqlite3_tokenizer 37 ** Class derived from sqlite3_tokenizer
37 */ 38 */
38 typedef struct porter_tokenizer { 39 typedef struct porter_tokenizer {
39 sqlite3_tokenizer base; /* Base class */ 40 sqlite3_tokenizer base; /* Base class */
40 } porter_tokenizer; 41 } porter_tokenizer;
41 42
42 /* 43 /*
43 ** Class derived from sqlit3_tokenizer_cursor 44 ** Class derived from sqlit3_tokenizer_cursor
44 */ 45 */
45 typedef struct porter_tokenizer_cursor { 46 typedef struct porter_tokenizer_cursor {
46 sqlite3_tokenizer_cursor base; 47 sqlite3_tokenizer_cursor base;
47 const char *zInput; /* input we are tokenizing */ 48 const char *zInput; /* input we are tokenizing */
48 int nInput; /* size of the input */ 49 int nInput; /* size of the input */
49 int iOffset; /* current position in zInput */ 50 int iOffset; /* current position in zInput */
50 int iToken; /* index of next token to be returned */ 51 int iToken; /* index of next token to be returned */
51 char *zToken; /* storage for current token */ 52 char *zToken; /* storage for current token */
52 int nAllocated; /* space allocated to zToken buffer */ 53 int nAllocated; /* space allocated to zToken buffer */
53 } porter_tokenizer_cursor; 54 } porter_tokenizer_cursor;
54 55
55 56
56 /* Forward declaration */
57 static const sqlite3_tokenizer_module porterTokenizerModule;
58
59
60 /* 57 /*
61 ** Create a new tokenizer instance. 58 ** Create a new tokenizer instance.
62 */ 59 */
63 static int porterCreate( 60 static int porterCreate(
64 int argc, const char * const *argv, 61 int argc, const char * const *argv,
65 sqlite3_tokenizer **ppTokenizer 62 sqlite3_tokenizer **ppTokenizer
66 ){ 63 ){
67 porter_tokenizer *t; 64 porter_tokenizer *t;
65
66 UNUSED_PARAMETER(argc);
67 UNUSED_PARAMETER(argv);
68
68 t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t)); 69 t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
69 if( t==NULL ) return SQLITE_NOMEM; 70 if( t==NULL ) return SQLITE_NOMEM;
70 memset(t, 0, sizeof(*t)); 71 memset(t, 0, sizeof(*t));
71 *ppTokenizer = &t->base; 72 *ppTokenizer = &t->base;
72 return SQLITE_OK; 73 return SQLITE_OK;
73 } 74 }
74 75
75 /* 76 /*
76 ** Destroy a tokenizer 77 ** Destroy a tokenizer
77 */ 78 */
78 static int porterDestroy(sqlite3_tokenizer *pTokenizer){ 79 static int porterDestroy(sqlite3_tokenizer *pTokenizer){
79 sqlite3_free(pTokenizer); 80 sqlite3_free(pTokenizer);
80 return SQLITE_OK; 81 return SQLITE_OK;
81 } 82 }
82 83
83 /* 84 /*
84 ** Prepare to begin tokenizing a particular string. The input 85 ** Prepare to begin tokenizing a particular string. The input
85 ** string to be tokenized is zInput[0..nInput-1]. A cursor 86 ** string to be tokenized is zInput[0..nInput-1]. A cursor
86 ** used to incrementally tokenize this string is returned in 87 ** used to incrementally tokenize this string is returned in
87 ** *ppCursor. 88 ** *ppCursor.
88 */ 89 */
89 static int porterOpen( 90 static int porterOpen(
90 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ 91 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
91 const char *zInput, int nInput, /* String to be tokenized */ 92 const char *zInput, int nInput, /* String to be tokenized */
92 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ 93 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
93 ){ 94 ){
94 porter_tokenizer_cursor *c; 95 porter_tokenizer_cursor *c;
95 96
97 UNUSED_PARAMETER(pTokenizer);
98
96 c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); 99 c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
97 if( c==NULL ) return SQLITE_NOMEM; 100 if( c==NULL ) return SQLITE_NOMEM;
98 101
99 c->zInput = zInput; 102 c->zInput = zInput;
100 if( zInput==0 ){ 103 if( zInput==0 ){
101 c->nInput = 0; 104 c->nInput = 0;
102 }else if( nInput<0 ){ 105 }else if( nInput<0 ){
103 c->nInput = (int)strlen(zInput); 106 c->nInput = (int)strlen(zInput);
104 }else{ 107 }else{
105 c->nInput = nInput; 108 c->nInput = nInput;
(...skipping 120 matching lines...) Expand 10 before | Expand all | Expand 10 after
226 return *z!=0; 229 return *z!=0;
227 } 230 }
228 231
229 /* 232 /*
230 ** Return TRUE if the word ends in a double consonant. 233 ** Return TRUE if the word ends in a double consonant.
231 ** 234 **
232 ** The text is reversed here. So we are really looking at 235 ** The text is reversed here. So we are really looking at
233 ** the first two characters of z[]. 236 ** the first two characters of z[].
234 */ 237 */
235 static int doubleConsonant(const char *z){ 238 static int doubleConsonant(const char *z){
236 return isConsonant(z) && z[0]==z[1] && isConsonant(z+1); 239 return isConsonant(z) && z[0]==z[1];
237 } 240 }
238 241
239 /* 242 /*
240 ** Return TRUE if the word ends with three letters which 243 ** Return TRUE if the word ends with three letters which
241 ** are consonant-vowel-consonent and where the final consonant 244 ** are consonant-vowel-consonent and where the final consonant
242 ** is not 'w', 'x', or 'y'. 245 ** is not 'w', 'x', or 'y'.
243 ** 246 **
244 ** The word is reversed here. So we are really checking the 247 ** The word is reversed here. So we are really checking the
245 ** first three letters and the first one cannot be in [wxy]. 248 ** first three letters and the first one cannot be in [wxy].
246 */ 249 */
247 static int star_oh(const char *z){ 250 static int star_oh(const char *z){
248 return 251 return
249 z[0]!=0 && isConsonant(z) && 252 isConsonant(z) &&
250 z[0]!='w' && z[0]!='x' && z[0]!='y' && 253 z[0]!='w' && z[0]!='x' && z[0]!='y' &&
251 z[1]!=0 && isVowel(z+1) && 254 isVowel(z+1) &&
252 z[2]!=0 && isConsonant(z+2); 255 isConsonant(z+2);
253 } 256 }
254 257
255 /* 258 /*
256 ** If the word ends with zFrom and xCond() is true for the stem 259 ** If the word ends with zFrom and xCond() is true for the stem
257 ** of the word that preceeds the zFrom ending, then change the 260 ** of the word that preceeds the zFrom ending, then change the
258 ** ending to zTo. 261 ** ending to zTo.
259 ** 262 **
260 ** The input word *pz and zFrom are both in reverse order. zTo 263 ** The input word *pz and zFrom are both in reverse order. zTo
261 ** is in normal order. 264 ** is in normal order.
262 ** 265 **
(...skipping 23 matching lines...) Expand all
286 ** inappropriate. The input word is copied into the output with 289 ** inappropriate. The input word is copied into the output with
287 ** US-ASCII case folding. If the input word is too long (more 290 ** US-ASCII case folding. If the input word is too long (more
288 ** than 20 bytes if it contains no digits or more than 6 bytes if 291 ** than 20 bytes if it contains no digits or more than 6 bytes if
289 ** it contains digits) then word is truncated to 20 or 6 bytes 292 ** it contains digits) then word is truncated to 20 or 6 bytes
290 ** by taking 10 or 3 bytes from the beginning and end. 293 ** by taking 10 or 3 bytes from the beginning and end.
291 */ 294 */
292 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ 295 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
293 int i, mx, j; 296 int i, mx, j;
294 int hasDigit = 0; 297 int hasDigit = 0;
295 for(i=0; i<nIn; i++){ 298 for(i=0; i<nIn; i++){
296 int c = zIn[i]; 299 char c = zIn[i];
297 if( c>='A' && c<='Z' ){ 300 if( c>='A' && c<='Z' ){
298 zOut[i] = c - 'A' + 'a'; 301 zOut[i] = c - 'A' + 'a';
299 }else{ 302 }else{
300 if( c>='0' && c<='9' ) hasDigit = 1; 303 if( c>='0' && c<='9' ) hasDigit = 1;
301 zOut[i] = c; 304 zOut[i] = c;
302 } 305 }
303 } 306 }
304 mx = hasDigit ? 3 : 10; 307 mx = hasDigit ? 3 : 10;
305 if( nIn>mx*2 ){ 308 if( nIn>mx*2 ){
306 for(j=mx, i=nIn-mx; i<nIn; i++, j++){ 309 for(j=mx, i=nIn-mx; i<nIn; i++, j++){
(...skipping 23 matching lines...) Expand all
330 ** 333 **
331 ** If the input word contains not digits but does characters not 334 ** If the input word contains not digits but does characters not
332 ** in [a-zA-Z] then no stemming is attempted and this routine just 335 ** in [a-zA-Z] then no stemming is attempted and this routine just
333 ** copies the input into the input into the output with US-ASCII 336 ** copies the input into the input into the output with US-ASCII
334 ** case folding. 337 ** case folding.
335 ** 338 **
336 ** Stemming never increases the length of the word. So there is 339 ** Stemming never increases the length of the word. So there is
337 ** no chance of overflowing the zOut buffer. 340 ** no chance of overflowing the zOut buffer.
338 */ 341 */
339 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){ 342 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
340 int i, j, c; 343 int i, j;
341 char zReverse[28]; 344 char zReverse[28];
342 char *z, *z2; 345 char *z, *z2;
343 if( nIn<3 || nIn>=sizeof(zReverse)-7 ){ 346 if( nIn<3 || nIn>=(int)sizeof(zReverse)-7 ){
344 /* The word is too big or too small for the porter stemmer. 347 /* The word is too big or too small for the porter stemmer.
345 ** Fallback to the copy stemmer */ 348 ** Fallback to the copy stemmer */
346 copy_stemmer(zIn, nIn, zOut, pnOut); 349 copy_stemmer(zIn, nIn, zOut, pnOut);
347 return; 350 return;
348 } 351 }
349 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){ 352 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
350 c = zIn[i]; 353 char c = zIn[i];
351 if( c>='A' && c<='Z' ){ 354 if( c>='A' && c<='Z' ){
352 zReverse[j] = c + 'a' - 'A'; 355 zReverse[j] = c + 'a' - 'A';
353 }else if( c>='a' && c<='z' ){ 356 }else if( c>='a' && c<='z' ){
354 zReverse[j] = c; 357 zReverse[j] = c;
355 }else{ 358 }else{
356 /* The use of a character not in [a-zA-Z] means that we fallback 359 /* The use of a character not in [a-zA-Z] means that we fallback
357 ** to the copy stemmer */ 360 ** to the copy stemmer */
358 copy_stemmer(zIn, nIn, zOut, pnOut); 361 copy_stemmer(zIn, nIn, zOut, pnOut);
359 return; 362 return;
360 } 363 }
(...skipping 178 matching lines...) Expand 10 before | Expand all | Expand 10 after
539 } 542 }
540 543
541 /* Step 5b */ 544 /* Step 5b */
542 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){ 545 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
543 z++; 546 z++;
544 } 547 }
545 548
546 /* z[] is now the stemmed word in reverse order. Flip it back 549 /* z[] is now the stemmed word in reverse order. Flip it back
547 ** around into forward order and return. 550 ** around into forward order and return.
548 */ 551 */
549 *pnOut = i = strlen(z); 552 *pnOut = i = (int)strlen(z);
550 zOut[i] = 0; 553 zOut[i] = 0;
551 while( *z ){ 554 while( *z ){
552 zOut[--i] = *(z++); 555 zOut[--i] = *(z++);
553 } 556 }
554 } 557 }
555 558
556 /* 559 /*
557 ** Characters that can be part of a token. We assume any character 560 ** Characters that can be part of a token. We assume any character
558 ** whose value is greater than 0x80 (any UTF character) can be 561 ** whose value is greater than 0x80 (any UTF character) can be
559 ** part of a token. In other words, delimiters all must have 562 ** part of a token. In other words, delimiters all must have
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
594 597
595 /* Count non-delimiter characters. */ 598 /* Count non-delimiter characters. */
596 iStartOffset = c->iOffset; 599 iStartOffset = c->iOffset;
597 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){ 600 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
598 c->iOffset++; 601 c->iOffset++;
599 } 602 }
600 603
601 if( c->iOffset>iStartOffset ){ 604 if( c->iOffset>iStartOffset ){
602 int n = c->iOffset-iStartOffset; 605 int n = c->iOffset-iStartOffset;
603 if( n>c->nAllocated ){ 606 if( n>c->nAllocated ){
607 char *pNew;
604 c->nAllocated = n+20; 608 c->nAllocated = n+20;
605 c->zToken = sqlite3_realloc(c->zToken, c->nAllocated); 609 pNew = sqlite3_realloc(c->zToken, c->nAllocated);
606 if( c->zToken==NULL ) return SQLITE_NOMEM; 610 if( !pNew ) return SQLITE_NOMEM;
611 c->zToken = pNew;
607 } 612 }
608 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes); 613 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
609 *pzToken = c->zToken; 614 *pzToken = c->zToken;
610 *piStartOffset = iStartOffset; 615 *piStartOffset = iStartOffset;
611 *piEndOffset = c->iOffset; 616 *piEndOffset = c->iOffset;
612 *piPosition = c->iToken++; 617 *piPosition = c->iToken++;
613 return SQLITE_OK; 618 return SQLITE_OK;
614 } 619 }
615 } 620 }
616 return SQLITE_DONE; 621 return SQLITE_DONE;
(...skipping 15 matching lines...) Expand all
632 ** Allocate a new porter tokenizer. Return a pointer to the new 637 ** Allocate a new porter tokenizer. Return a pointer to the new
633 ** tokenizer in *ppModule 638 ** tokenizer in *ppModule
634 */ 639 */
635 void sqlite3Fts3PorterTokenizerModule( 640 void sqlite3Fts3PorterTokenizerModule(
636 sqlite3_tokenizer_module const**ppModule 641 sqlite3_tokenizer_module const**ppModule
637 ){ 642 ){
638 *ppModule = &porterTokenizerModule; 643 *ppModule = &porterTokenizerModule;
639 } 644 }
640 645
641 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ 646 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
OLDNEW
« no previous file with comments | « third_party/sqlite/src/ext/fts3/fts3_hash.c ('k') | third_party/sqlite/src/ext/fts3/fts3_snippet.c » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698