third_party/sqlite/src/ext/fts3/fts3_porter.c - Issue 6990047: Import SQLite 3.7.6.3.

Side by Side Diff: third_party/sqlite/src/ext/fts3/fts3_porter.c

Issue 6990047: Import SQLite 3.7.6.3. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 9 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 ** 2006 September 30	2 ** 2006 September 30

3 **	3 **

4 ** The author disclaims copyright to this source code. In place of	4 ** The author disclaims copyright to this source code. In place of

5 ** a legal notice, here is a blessing:	5 ** a legal notice, here is a blessing:

6 **	6 **

7 ** May you do good and not evil.	7 ** May you do good and not evil.

8 ** May you find forgiveness for yourself and forgive others.	8 ** May you find forgiveness for yourself and forgive others.

9 ** May you share freely, never taking more than you give.	9 ** May you share freely, never taking more than you give.

10 **	10 **

11 *************************************************************************	11 *************************************************************************

12 ** Implementation of the full-text-search tokenizer that implements	12 ** Implementation of the full-text-search tokenizer that implements

13 ** a Porter stemmer.	13 ** a Porter stemmer.

14 */	14 */

15	15

16 /*	16 /*

17 ** The code in this file is only compiled if:	17 ** The code in this file is only compiled if:

18 **	18 **

19 ** * The FTS3 module is being built as an extension	19 ** * The FTS3 module is being built as an extension

20 ** (in which case SQLITE_CORE is not defined), or	20 ** (in which case SQLITE_CORE is not defined), or

21 **	21 **

22 ** * The FTS3 module is being built into the core of	22 ** * The FTS3 module is being built into the core of

23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).	23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).

24 */	24 */

25 #if !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3)	25 #if !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3)

26	26

	27 #include "fts3Int.h"

27	28

28 #include <assert.h>	29 #include <assert.h>

29 #include <stdlib.h>	30 #include <stdlib.h>

30 #include <stdio.h>	31 #include <stdio.h>

31 #include <string.h>	32 #include <string.h>

32	33

33 #include "fts3_tokenizer.h"	34 #include "fts3_tokenizer.h"

34	35

35 /*	36 /*

36 ** Class derived from sqlite3_tokenizer	37 ** Class derived from sqlite3_tokenizer

37 */	38 */

38 typedef struct porter_tokenizer {	39 typedef struct porter_tokenizer {

39 sqlite3_tokenizer base; /* Base class */	40 sqlite3_tokenizer base; /* Base class */

40 } porter_tokenizer;	41 } porter_tokenizer;

41	42

42 /*	43 /*

43 ** Class derived from sqlit3_tokenizer_cursor	44 ** Class derived from sqlit3_tokenizer_cursor

44 */	45 */

45 typedef struct porter_tokenizer_cursor {	46 typedef struct porter_tokenizer_cursor {

46 sqlite3_tokenizer_cursor base;	47 sqlite3_tokenizer_cursor base;

47 const char zInput; / input we are tokenizing */	48 const char zInput; / input we are tokenizing */

48 int nInput; /* size of the input */	49 int nInput; /* size of the input */

49 int iOffset; /* current position in zInput */	50 int iOffset; /* current position in zInput */

50 int iToken; /* index of next token to be returned */	51 int iToken; /* index of next token to be returned */

51 char zToken; / storage for current token */	52 char zToken; / storage for current token */

52 int nAllocated; /* space allocated to zToken buffer */	53 int nAllocated; /* space allocated to zToken buffer */

53 } porter_tokenizer_cursor;	54 } porter_tokenizer_cursor;

54	55

55	56

56 /* Forward declaration */

57 static const sqlite3_tokenizer_module porterTokenizerModule;

58

59

60 /*	57 /*

61 ** Create a new tokenizer instance.	58 ** Create a new tokenizer instance.

62 */	59 */

63 static int porterCreate(	60 static int porterCreate(

64 int argc, const char * const *argv,	61 int argc, const char * const *argv,

65 sqlite3_tokenizer **ppTokenizer	62 sqlite3_tokenizer **ppTokenizer

66 ){	63 ){

67 porter_tokenizer *t;	64 porter_tokenizer *t;

	65

	66 UNUSED_PARAMETER(argc);

	67 UNUSED_PARAMETER(argv);

	68

68 t = (porter_tokenizer ) sqlite3_malloc(sizeof(t));	69 t = (porter_tokenizer ) sqlite3_malloc(sizeof(t));

69 if( t==NULL ) return SQLITE_NOMEM;	70 if( t==NULL ) return SQLITE_NOMEM;

70 memset(t, 0, sizeof(*t));	71 memset(t, 0, sizeof(*t));

71 *ppTokenizer = &t->base;	72 *ppTokenizer = &t->base;

72 return SQLITE_OK;	73 return SQLITE_OK;

73 }	74 }

74	75

75 /*	76 /*

76 ** Destroy a tokenizer	77 ** Destroy a tokenizer

77 */	78 */

78 static int porterDestroy(sqlite3_tokenizer *pTokenizer){	79 static int porterDestroy(sqlite3_tokenizer *pTokenizer){

79 sqlite3_free(pTokenizer);	80 sqlite3_free(pTokenizer);

80 return SQLITE_OK;	81 return SQLITE_OK;

81 }	82 }

82	83

83 /*	84 /*

84 ** Prepare to begin tokenizing a particular string. The input	85 ** Prepare to begin tokenizing a particular string. The input

85 ** string to be tokenized is zInput[0..nInput-1]. A cursor	86 ** string to be tokenized is zInput[0..nInput-1]. A cursor

86 ** used to incrementally tokenize this string is returned in	87 ** used to incrementally tokenize this string is returned in

87 ** *ppCursor.	88 ** *ppCursor.

88 */	89 */

89 static int porterOpen(	90 static int porterOpen(

90 sqlite3_tokenizer pTokenizer, / The tokenizer */	91 sqlite3_tokenizer pTokenizer, / The tokenizer */

91 const char zInput, int nInput, / String to be tokenized */	92 const char zInput, int nInput, / String to be tokenized */

92 sqlite3_tokenizer_cursor *ppCursor / OUT: Tokenization cursor */	93 sqlite3_tokenizer_cursor *ppCursor / OUT: Tokenization cursor */

93 ){	94 ){

94 porter_tokenizer_cursor *c;	95 porter_tokenizer_cursor *c;

95	96

	97 UNUSED_PARAMETER(pTokenizer);

	98

96 c = (porter_tokenizer_cursor ) sqlite3_malloc(sizeof(c));	99 c = (porter_tokenizer_cursor ) sqlite3_malloc(sizeof(c));

97 if( c==NULL ) return SQLITE_NOMEM;	100 if( c==NULL ) return SQLITE_NOMEM;

98	101

99 c->zInput = zInput;	102 c->zInput = zInput;

100 if( zInput==0 ){	103 if( zInput==0 ){

101 c->nInput = 0;	104 c->nInput = 0;

102 }else if( nInput<0 ){	105 }else if( nInput<0 ){

103 c->nInput = (int)strlen(zInput);	106 c->nInput = (int)strlen(zInput);

104 }else{	107 }else{

105 c->nInput = nInput;	108 c->nInput = nInput;

(...skipping 120 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
226 return *z!=0;	229 return *z!=0;

227 }	230 }

228	231

229 /*	232 /*

230 ** Return TRUE if the word ends in a double consonant.	233 ** Return TRUE if the word ends in a double consonant.

231 **	234 **

232 ** The text is reversed here. So we are really looking at	235 ** The text is reversed here. So we are really looking at

233 ** the first two characters of z[].	236 ** the first two characters of z[].

234 */	237 */

235 static int doubleConsonant(const char *z){	238 static int doubleConsonant(const char *z){

236 return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);	239 return isConsonant(z) && z[0]==z[1];

237 }	240 }

238	241

239 /*	242 /*

240 ** Return TRUE if the word ends with three letters which	243 ** Return TRUE if the word ends with three letters which

241 ** are consonant-vowel-consonent and where the final consonant	244 ** are consonant-vowel-consonent and where the final consonant

242 ** is not 'w', 'x', or 'y'.	245 ** is not 'w', 'x', or 'y'.

243 **	246 **

244 ** The word is reversed here. So we are really checking the	247 ** The word is reversed here. So we are really checking the

245 ** first three letters and the first one cannot be in [wxy].	248 ** first three letters and the first one cannot be in [wxy].

246 */	249 */

247 static int star_oh(const char *z){	250 static int star_oh(const char *z){

248 return	251 return

249 z[0]!=0 && isConsonant(z) &&	252 isConsonant(z) &&

250 z[0]!='w' && z[0]!='x' && z[0]!='y' &&	253 z[0]!='w' && z[0]!='x' && z[0]!='y' &&

251 z[1]!=0 && isVowel(z+1) &&	254 isVowel(z+1) &&

252 z[2]!=0 && isConsonant(z+2);	255 isConsonant(z+2);

253 }	256 }

254	257

255 /*	258 /*

256 ** If the word ends with zFrom and xCond() is true for the stem	259 ** If the word ends with zFrom and xCond() is true for the stem

257 ** of the word that preceeds the zFrom ending, then change the	260 ** of the word that preceeds the zFrom ending, then change the

258 ** ending to zTo.	261 ** ending to zTo.

259 **	262 **

260 ** The input word *pz and zFrom are both in reverse order. zTo	263 ** The input word *pz and zFrom are both in reverse order. zTo

261 ** is in normal order.	264 ** is in normal order.

262 **	265 **

(...skipping 23 matching lines...) Expand all Loading...
286 ** inappropriate. The input word is copied into the output with	289 ** inappropriate. The input word is copied into the output with

287 ** US-ASCII case folding. If the input word is too long (more	290 ** US-ASCII case folding. If the input word is too long (more

288 ** than 20 bytes if it contains no digits or more than 6 bytes if	291 ** than 20 bytes if it contains no digits or more than 6 bytes if

289 ** it contains digits) then word is truncated to 20 or 6 bytes	292 ** it contains digits) then word is truncated to 20 or 6 bytes

290 ** by taking 10 or 3 bytes from the beginning and end.	293 ** by taking 10 or 3 bytes from the beginning and end.

291 */	294 */

292 static void copy_stemmer(const char zIn, int nIn, char zOut, int *pnOut){	295 static void copy_stemmer(const char zIn, int nIn, char zOut, int *pnOut){

293 int i, mx, j;	296 int i, mx, j;

294 int hasDigit = 0;	297 int hasDigit = 0;

295 for(i=0; i<nIn; i++){	298 for(i=0; i<nIn; i++){

296 int c = zIn[i];	299 char c = zIn[i];

297 if( c>='A' && c<='Z' ){	300 if( c>='A' && c<='Z' ){

298 zOut[i] = c - 'A' + 'a';	301 zOut[i] = c - 'A' + 'a';

299 }else{	302 }else{

300 if( c>='0' && c<='9' ) hasDigit = 1;	303 if( c>='0' && c<='9' ) hasDigit = 1;

301 zOut[i] = c;	304 zOut[i] = c;

302 }	305 }

303 }	306 }

304 mx = hasDigit ? 3 : 10;	307 mx = hasDigit ? 3 : 10;

305 if( nIn>mx*2 ){	308 if( nIn>mx*2 ){

306 for(j=mx, i=nIn-mx; i<nIn; i++, j++){	309 for(j=mx, i=nIn-mx; i<nIn; i++, j++){

(...skipping 23 matching lines...) Expand all Loading...
330 **	333 **

331 ** If the input word contains not digits but does characters not	334 ** If the input word contains not digits but does characters not

332 ** in [a-zA-Z] then no stemming is attempted and this routine just	335 ** in [a-zA-Z] then no stemming is attempted and this routine just

333 ** copies the input into the input into the output with US-ASCII	336 ** copies the input into the input into the output with US-ASCII

334 ** case folding.	337 ** case folding.

335 **	338 **

336 ** Stemming never increases the length of the word. So there is	339 ** Stemming never increases the length of the word. So there is

337 ** no chance of overflowing the zOut buffer.	340 ** no chance of overflowing the zOut buffer.

338 */	341 */

339 static void porter_stemmer(const char zIn, int nIn, char zOut, int *pnOut){	342 static void porter_stemmer(const char zIn, int nIn, char zOut, int *pnOut){

340 int i, j, c;	343 int i, j;

341 char zReverse[28];	344 char zReverse[28];

342 char z, z2;	345 char z, z2;

343 if( nIn<3 \|\| nIn>=sizeof(zReverse)-7 ){	346 if( nIn<3 \|\| nIn>=(int)sizeof(zReverse)-7 ){

344 /* The word is too big or too small for the porter stemmer.	347 /* The word is too big or too small for the porter stemmer.

345 ** Fallback to the copy stemmer */	348 ** Fallback to the copy stemmer */

346 copy_stemmer(zIn, nIn, zOut, pnOut);	349 copy_stemmer(zIn, nIn, zOut, pnOut);

347 return;	350 return;

348 }	351 }

349 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){	352 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){

350 c = zIn[i];	353 char c = zIn[i];

351 if( c>='A' && c<='Z' ){	354 if( c>='A' && c<='Z' ){

352 zReverse[j] = c + 'a' - 'A';	355 zReverse[j] = c + 'a' - 'A';

353 }else if( c>='a' && c<='z' ){	356 }else if( c>='a' && c<='z' ){

354 zReverse[j] = c;	357 zReverse[j] = c;

355 }else{	358 }else{

356 /* The use of a character not in [a-zA-Z] means that we fallback	359 /* The use of a character not in [a-zA-Z] means that we fallback

357 ** to the copy stemmer */	360 ** to the copy stemmer */

358 copy_stemmer(zIn, nIn, zOut, pnOut);	361 copy_stemmer(zIn, nIn, zOut, pnOut);

359 return;	362 return;

360 }	363 }

(...skipping 178 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
539 }	542 }

540	543

541 /* Step 5b */	544 /* Step 5b */

542 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){	545 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){

543 z++;	546 z++;

544 }	547 }

545	548

546 /* z[] is now the stemmed word in reverse order. Flip it back	549 /* z[] is now the stemmed word in reverse order. Flip it back

547 ** around into forward order and return.	550 ** around into forward order and return.

548 */	551 */

549 *pnOut = i = strlen(z);	552 *pnOut = i = (int)strlen(z);

550 zOut[i] = 0;	553 zOut[i] = 0;

551 while( *z ){	554 while( *z ){

552 zOut[--i] = *(z++);	555 zOut[--i] = *(z++);

553 }	556 }

554 }	557 }

555	558

556 /*	559 /*

557 ** Characters that can be part of a token. We assume any character	560 ** Characters that can be part of a token. We assume any character

558 ** whose value is greater than 0x80 (any UTF character) can be	561 ** whose value is greater than 0x80 (any UTF character) can be

559 ** part of a token. In other words, delimiters all must have	562 ** part of a token. In other words, delimiters all must have

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
594	597

595 /* Count non-delimiter characters. */	598 /* Count non-delimiter characters. */

596 iStartOffset = c->iOffset;	599 iStartOffset = c->iOffset;

597 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){	600 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){

598 c->iOffset++;	601 c->iOffset++;

599 }	602 }

600	603

601 if( c->iOffset>iStartOffset ){	604 if( c->iOffset>iStartOffset ){

602 int n = c->iOffset-iStartOffset;	605 int n = c->iOffset-iStartOffset;

603 if( n>c->nAllocated ){	606 if( n>c->nAllocated ){

	607 char *pNew;

604 c->nAllocated = n+20;	608 c->nAllocated = n+20;

605 c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);	609 pNew = sqlite3_realloc(c->zToken, c->nAllocated);

606 if( c->zToken==NULL ) return SQLITE_NOMEM;	610 if( !pNew ) return SQLITE_NOMEM;

	611 c->zToken = pNew;

607 }	612 }

608 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);	613 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);

609 *pzToken = c->zToken;	614 *pzToken = c->zToken;

610 *piStartOffset = iStartOffset;	615 *piStartOffset = iStartOffset;

611 *piEndOffset = c->iOffset;	616 *piEndOffset = c->iOffset;

612 *piPosition = c->iToken++;	617 *piPosition = c->iToken++;

613 return SQLITE_OK;	618 return SQLITE_OK;

614 }	619 }

615 }	620 }

616 return SQLITE_DONE;	621 return SQLITE_DONE;

(...skipping 15 matching lines...) Expand all Loading...
632 ** Allocate a new porter tokenizer. Return a pointer to the new	637 ** Allocate a new porter tokenizer. Return a pointer to the new

633 ** tokenizer in *ppModule	638 ** tokenizer in *ppModule

634 */	639 */

635 void sqlite3Fts3PorterTokenizerModule(	640 void sqlite3Fts3PorterTokenizerModule(

636 sqlite3_tokenizer_module const**ppModule	641 sqlite3_tokenizer_module const**ppModule

637 ){	642 ){

638 *ppModule = &porterTokenizerModule;	643 *ppModule = &porterTokenizerModule;

639 }	644 }

640	645

641 #endif /* !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3) */	646 #endif /* !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3) */

OLD	NEW

« no previous file with comments | « third_party/sqlite/src/ext/fts3/fts3_hash.c ('k') | third_party/sqlite/src/ext/fts3/fts3_snippet.c » ('j') | no next file with comments »