Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(338)

Side by Side Diff: third_party/sqlite/sqlite-src-3100200/ext/fts5/fts5_tokenize.c

Issue 1610543003: [sql] Import reference version of SQLite 3.10.2. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13
14
15 #include "fts5Int.h"
16
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
19 */
20
/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters.
**
** Entry i is 1 if byte value i is a token character ('0'..'9', 'A'..'Z',
** 'a'..'z') and 0 if it is a separator. The table is only ever read (it is
** the template copied into each new tokenizer object), so it is declared
** const.
*/
static const unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
35
/*
** State of one "ascii" tokenizer instance: a per-instance copy of the
** token-character table, indexed by byte value (0..127). A non-zero entry
** marks a token character, zero marks a separator.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];
} AsciiTokenizer;
40
41 static void fts5AsciiAddExceptions(
42 AsciiTokenizer *p,
43 const char *zArg,
44 int bTokenChars
45 ){
46 int i;
47 for(i=0; zArg[i]; i++){
48 if( (zArg[i] & 0x80)==0 ){
49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
50 }
51 }
52 }
53
54 /*
55 ** Delete a "ascii" tokenizer.
56 */
57 static void fts5AsciiDelete(Fts5Tokenizer *p){
58 sqlite3_free(p);
59 }
60
61 /*
62 ** Create an "ascii" tokenizer.
63 */
64 static int fts5AsciiCreate(
65 void *pCtx,
66 const char **azArg, int nArg,
67 Fts5Tokenizer **ppOut
68 ){
69 int rc = SQLITE_OK;
70 AsciiTokenizer *p = 0;
71 if( nArg%2 ){
72 rc = SQLITE_ERROR;
73 }else{
74 p = sqlite3_malloc(sizeof(AsciiTokenizer));
75 if( p==0 ){
76 rc = SQLITE_NOMEM;
77 }else{
78 int i;
79 memset(p, 0, sizeof(AsciiTokenizer));
80 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
81 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
82 const char *zArg = azArg[i+1];
83 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
84 fts5AsciiAddExceptions(p, zArg, 1);
85 }else
86 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
87 fts5AsciiAddExceptions(p, zArg, 0);
88 }else{
89 rc = SQLITE_ERROR;
90 }
91 }
92 if( rc!=SQLITE_OK ){
93 fts5AsciiDelete((Fts5Tokenizer*)p);
94 p = 0;
95 }
96 }
97 }
98
99 *ppOut = (Fts5Tokenizer*)p;
100 return rc;
101 }
102
103
/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper-case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
** No terminator is written. Nothing is copied if nByte<=0.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int nLeft;
  for(nLeft=nByte; nLeft>0; nLeft--){
    char c = *aIn++;
    if( c>='A' && c<='Z' ){
      c += 32;                        /* 'a' - 'A' */
    }
    *aOut++ = c;
  }
}
112
113 /*
114 ** Tokenize some text using the ascii tokenizer.
115 */
116 static int fts5AsciiTokenize(
117 Fts5Tokenizer *pTokenizer,
118 void *pCtx,
119 int flags,
120 const char *pText, int nText,
121 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
122 ){
123 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
124 int rc = SQLITE_OK;
125 int ie;
126 int is = 0;
127
128 char aFold[64];
129 int nFold = sizeof(aFold);
130 char *pFold = aFold;
131 unsigned char *a = p->aTokenChar;
132
133 while( is<nText && rc==SQLITE_OK ){
134 int nByte;
135
136 /* Skip any leading divider characters. */
137 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
138 is++;
139 }
140 if( is==nText ) break;
141
142 /* Count the token characters */
143 ie = is+1;
144 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
145 ie++;
146 }
147
148 /* Fold to lower case */
149 nByte = ie-is;
150 if( nByte>nFold ){
151 if( pFold!=aFold ) sqlite3_free(pFold);
152 pFold = sqlite3_malloc(nByte*2);
153 if( pFold==0 ){
154 rc = SQLITE_NOMEM;
155 break;
156 }
157 nFold = nByte*2;
158 }
159 asciiFold(pFold, &pText[is], nByte);
160
161 /* Invoke the token callback */
162 rc = xToken(pCtx, 0, pFold, nByte, is, ie);
163 is = ie+1;
164 }
165
166 if( pFold!=aFold ) sqlite3_free(pFold);
167 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
168 return rc;
169 }
170
171 /**************************************************************************
172 ** Start of unicode61 tokenizer implementation.
173 */
174
175
/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION

/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xc0) and
** supplies the initial value of the codepoint accumulator for lead bytes
** 0xc0..0xff.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** READ_UTF8(zIn, zTerm, c): read one character starting at zIn, store its
** codepoint in c and advance zIn past the bytes consumed.
**
** The first byte is always consumed. If it is 0xc0 or greater, following
** bytes of the form 10xxxxxx are folded in, six bits each, until a byte of
** another form is seen or zIn reaches zTerm. For such multi-byte input the
** result is replaced by 0xFFFD if it is less than 0x80, if
** (c&0xFFFFF800)==0xD800, or if (c&0xFFFFFFFE)==0xFFFE.
**
** The macro expands to more than one statement and is not wrapped in
** braces, so it must not be used as the body of an unbraced if/else/loop.
** Its arguments are evaluated more than once.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }


/*
** WRITE_UTF8(zOut, c): append the UTF-8 encoding of codepoint c at zOut and
** advance zOut past the 1, 2, 3 or 4 bytes written (for c below 0x80,
** 0x800, 0x10000, and everything else, respectively). The argument c is
** evaluated several times.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }else{                                               \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */
228
/*
** State of one "unicode61" tokenizer instance.
**
** aiException[] is kept sorted in ascending order and lists the non-ASCII
** codepoints whose token/separator classification is the opposite of the
** one reported by sqlite3Fts5UnicodeIsalnum().
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception codepoints */
} Unicode61Tokenizer;
238
239 static int fts5UnicodeAddExceptions(
240 Unicode61Tokenizer *p, /* Tokenizer object */
241 const char *z, /* Characters to treat as exceptions */
242 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */
243 ){
244 int rc = SQLITE_OK;
245 int n = (int)strlen(z);
246 int *aNew;
247
248 if( n>0 ){
249 aNew = (int*)sqlite3_realloc(p->aiException, (n+p->nException)*sizeof(int));
250 if( aNew ){
251 int nNew = p->nException;
252 const unsigned char *zCsr = (const unsigned char*)z;
253 const unsigned char *zTerm = (const unsigned char*)&z[n];
254 while( zCsr<zTerm ){
255 int iCode;
256 int bToken;
257 READ_UTF8(zCsr, zTerm, iCode);
258 if( iCode<128 ){
259 p->aTokenChar[iCode] = (unsigned char)bTokenChars;
260 }else{
261 bToken = sqlite3Fts5UnicodeIsalnum(iCode);
262 assert( (bToken==0 || bToken==1) );
263 assert( (bTokenChars==0 || bTokenChars==1) );
264 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
265 int i;
266 for(i=0; i<nNew; i++){
267 if( aNew[i]>iCode ) break;
268 }
269 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
270 aNew[i] = iCode;
271 nNew++;
272 }
273 }
274 }
275 p->aiException = aNew;
276 p->nException = nNew;
277 }else{
278 rc = SQLITE_NOMEM;
279 }
280 }
281
282 return rc;
283 }
284
285 /*
286 ** Return true if the p->aiException[] array contains the value iCode.
287 */
288 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
289 if( p->nException>0 ){
290 int *a = p->aiException;
291 int iLo = 0;
292 int iHi = p->nException-1;
293
294 while( iHi>=iLo ){
295 int iTest = (iHi + iLo) / 2;
296 if( iCode==a[iTest] ){
297 return 1;
298 }else if( iCode>a[iTest] ){
299 iLo = iTest+1;
300 }else{
301 iHi = iTest-1;
302 }
303 }
304 }
305
306 return 0;
307 }
308
309 /*
310 ** Delete a "unicode61" tokenizer.
311 */
312 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
313 if( pTok ){
314 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
315 sqlite3_free(p->aiException);
316 sqlite3_free(p->aFold);
317 sqlite3_free(p);
318 }
319 return;
320 }
321
322 /*
323 ** Create a "unicode61" tokenizer.
324 */
325 static int fts5UnicodeCreate(
326 void *pCtx,
327 const char **azArg, int nArg,
328 Fts5Tokenizer **ppOut
329 ){
330 int rc = SQLITE_OK; /* Return code */
331 Unicode61Tokenizer *p = 0; /* New tokenizer object */
332
333 if( nArg%2 ){
334 rc = SQLITE_ERROR;
335 }else{
336 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
337 if( p ){
338 int i;
339 memset(p, 0, sizeof(Unicode61Tokenizer));
340 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
341 p->bRemoveDiacritic = 1;
342 p->nFold = 64;
343 p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
344 if( p->aFold==0 ){
345 rc = SQLITE_NOMEM;
346 }
347 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
348 const char *zArg = azArg[i+1];
349 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
350 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
351 rc = SQLITE_ERROR;
352 }
353 p->bRemoveDiacritic = (zArg[0]=='1');
354 }else
355 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
356 rc = fts5UnicodeAddExceptions(p, zArg, 1);
357 }else
358 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
359 rc = fts5UnicodeAddExceptions(p, zArg, 0);
360 }else{
361 rc = SQLITE_ERROR;
362 }
363 }
364 }else{
365 rc = SQLITE_NOMEM;
366 }
367 if( rc!=SQLITE_OK ){
368 fts5UnicodeDelete((Fts5Tokenizer*)p);
369 p = 0;
370 }
371 *ppOut = (Fts5Tokenizer*)p;
372 }
373 return rc;
374 }
375
376 /*
377 ** Return true if, for the purposes of tokenizing with the tokenizer
378 ** passed as the first argument, codepoint iCode is considered a token
379 ** character (not a separator).
380 */
381 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
382 assert( (sqlite3Fts5UnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
383 return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode);
384 }
385
/*
** Tokenize some text using the unicode61 tokenizer.
**
** The nText bytes at pText are read as UTF-8. ASCII-range bytes are
** classified using p->aTokenChar[]; other codepoints using
** fts5UnicodeIsAlnum(). Within a token, diacritic codepoints (as reported
** by sqlite3Fts5UnicodeIsdiacritic()) are also accepted. Each token is
** folded with sqlite3Fts5UnicodeFold() (ASCII 'A'..'Z' are simply shifted
** to lower case) into the tokenizer's aFold buffer and then passed to
** xToken() together with the byte offsets of its start and end in pText.
**
** Tokenization stops when the input is exhausted or xToken() returns
** something other than SQLITE_OK. SQLITE_DONE is translated to SQLITE_OK.
** SQLITE_NOMEM is returned if the fold buffer cannot be enlarged. The
** flags parameter is not referenced.
**
** Control flow note: the labels non_ascii_tokenchar and ascii_tokenchar
** are located inside the token loop. The separator-skipping loop jumps
** straight to them so that the first character of a token, which has
** already been classified (and, for non-ASCII input, already decoded), is
** folded without being examined a second time.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int flags,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer */
  char *aFold = p->aFold;
  int nFold = p->nFold;
  /* Writes are allowed while zOut<=pEnd; WRITE_UTF8 emits at most 4 bytes */
  const char *pEnd = &aFold[nFold-6];

  /* Each iteration of this loop gobbles up a contiguous run of separators,
  ** then the next token.  */
  while( rc==SQLITE_OK ){
    int iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;
    int is;                       /* Byte offset of start of token */
    int ie;                       /* Byte offset just past end of token */

    /* Skip any separator characters. */
    while( 1 ){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( *zCsr & 0x80 ) {
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator character. Or break out of the loop if it is not. */
        is = zCsr - (unsigned char*)pText;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }else{
        if( a[*zCsr] ){
          is = zCsr - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zCsr++;
      }
    }

    /* Run through the tokenchars. Fold them into the output buffer along
    ** the way.  */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit the
      ** largest possible utf-8 character.  */
      if( zOut>pEnd ){
        aFold = sqlite3_malloc(nFold*2);
        if( aFold==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        /* Re-base zOut onto the new buffer before the old one is freed */
        zOut = &aFold[zOut - p->aFold];
        memcpy(aFold, p->aFold, nFold);
        sqlite3_free(p->aFold);
        p->aFold = aFold;
        p->nFold = nFold = nFold*2;
        pEnd = &aFold[nFold-6];
      }

      if( *zCsr & 0x80 ){
        /* An non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
          /* A zero result from the fold contributes nothing to the token */
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break;
      }else{
 ascii_tokenchar:
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      /* Only reached after a token character has been consumed, so ie
      ** always marks the end of the last character that is part of the
      ** token, even when the loop later exits via break. */
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
  }

 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
487
488 /**************************************************************************
489 ** Start of porter stemmer implementation.
490 */
491
492 /* Any tokens larger than this (in bytes) are passed through without
493 ** stemming. */
494 #define FTS5_PORTER_MAX_TOKEN 64
495
496 typedef struct PorterTokenizer PorterTokenizer;
497 struct PorterTokenizer {
498 fts5_tokenizer tokenizer; /* Parent tokenizer module */
499 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
500 char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
501 };
502
503 /*
504 ** Delete a "porter" tokenizer.
505 */
506 static void fts5PorterDelete(Fts5Tokenizer *pTok){
507 if( pTok ){
508 PorterTokenizer *p = (PorterTokenizer*)pTok;
509 if( p->pTokenizer ){
510 p->tokenizer.xDelete(p->pTokenizer);
511 }
512 sqlite3_free(p);
513 }
514 }
515
516 /*
517 ** Create a "porter" tokenizer.
518 */
519 static int fts5PorterCreate(
520 void *pCtx,
521 const char **azArg, int nArg,
522 Fts5Tokenizer **ppOut
523 ){
524 fts5_api *pApi = (fts5_api*)pCtx;
525 int rc = SQLITE_OK;
526 PorterTokenizer *pRet;
527 void *pUserdata = 0;
528 const char *zBase = "unicode61";
529
530 if( nArg>0 ){
531 zBase = azArg[0];
532 }
533
534 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
535 if( pRet ){
536 memset(pRet, 0, sizeof(PorterTokenizer));
537 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
538 }else{
539 rc = SQLITE_NOMEM;
540 }
541 if( rc==SQLITE_OK ){
542 int nArg2 = (nArg>0 ? nArg-1 : 0);
543 const char **azArg2 = (nArg2 ? &azArg[1] : 0);
544 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
545 }
546
547 if( rc!=SQLITE_OK ){
548 fts5PorterDelete((Fts5Tokenizer*)pRet);
549 pRet = 0;
550 }
551 *ppOut = (Fts5Tokenizer*)pRet;
552 return rc;
553 }
554
/*
** Context handed to fts5PorterCb() while the parent tokenizer runs: the
** caller's original callback and its context, plus the scratch buffer in
** which tokens are stemmed.
*/
typedef struct PorterContext {
  void *pCtx;                     /* Context for xToken */
  int (*xToken)(void*, int, const char*, int, int, int);
  char *aBuf;                     /* Stemming scratch buffer */
} PorterContext;
561
/*
** One suffix-rewriting rule: if a word ends in zSuffix and xCond (when
** not NULL) accepts the remaining stem, the suffix is replaced by zOutput.
** nSuffix and nOutput are the lengths of the two strings in bytes.
*/
typedef struct PorterRule {
  const char *zSuffix;
  int nSuffix;
  int (*xCond)(char *zStem, int nStem);
  const char *zOutput;
  int nOutput;
} PorterRule;
570
#if 0
/*
** NOTE: this function is compiled out by the enclosing "#if 0".
**
** Table-driven rule application. aRule[] is an array of rules terminated
** by an entry with zSuffix==0. The first rule whose suffix matches the end
** of the nBuf-byte word in aBuf is selected. If that rule has no xCond
** callback, or the callback returns true for the stem that precedes the
** suffix, the suffix is overwritten with the rule's zOutput, *pnBuf is
** updated to the new length and the index of the rule is returned. In all
** other cases -1 is returned and the word is left unchanged.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;

  /* Find the first rule whose suffix matches the end of the word */
  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
    assert( strlen(p->zOutput)==p->nOutput );
    if( nBuf<p->nSuffix ) continue;
    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
  }

  if( p->zSuffix ){
    int nStem = nBuf - p->nSuffix;
    if( p->xCond==0 || p->xCond(aBuf, nStem) ){
      memcpy(&aBuf[nStem], p->zOutput, p->nOutput);
      *pnBuf = nStem + p->nOutput;
      ret = p - aRule;
    }
  }

  return ret;
}
#endif
596
/*
** Return 1 if c is one of the lower-case vowels a, e, i, o or u, or if c
** is 'y' and bYIsVowel is non-zero. Return 0 otherwise.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel ? 1 : 0;
    default:
      return 0;
  }
}
602
/*
** Scan the nStem-byte string zStem for a vowel that is later followed by a
** consonant. If such a pair exists, return the offset of the byte that
** follows the first consonant found after the first vowel; otherwise
** return 0.
**
** A 'y' counts as a vowel only when the preceding character was classified
** as a consonant. bPrevCons supplies that classification for the
** (notional) character preceding zStem[0].
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int bCons = bPrevCons;
  int i = 0;

  /* Skip leading consonants; stop on the first vowel. */
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons==0 ) break;
    i++;
  }

  /* Step past the vowel and look for the next consonant. */
  i++;
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons ) return i+1;
    i++;
  }
  return 0;
}
618
/*
** porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel-then-consonant sequence.
*/
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
623
/*
** porter rule condition: (m > 1)
**
** True if a second vowel-then-consonant sequence can be found in the part
** of the stem that follows the first one.
*/
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
633
/*
** porter rule condition: (m = 1)
**
** True if the stem contains one vowel-then-consonant sequence and no
** second one in the part of the stem that follows it.
*/
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
643
/*
** porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final consonant
** is not 'w', 'x' or 'y'. The caller must pass nStem>0.
**
** Characters are classified from the start of the stem because whether a
** 'y' is a vowel depends on the classification of the character before
** it. Only the last three classifications matter, so the history mask is
** truncated to three bits on every step. This keeps the shift well-defined
** for stems of any length (an untruncated signed mask would overflow once
** the stem exceeds the width of an int).
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    unsigned int mask = 0;        /* Bit set => consonant; bit 0 is newest */
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) | (unsigned int)bCons) & 0x0007;
    }
    return (mask==0x0005);        /* consonant, vowel, consonant */
  }
}
660
/*
** porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and also satisfies (m > 1). The
** caller must pass nStem>0.
*/
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
667
/*
** porter rule condition: (*v*)
**
** True if the stem contains a vowel. A 'y' counts as a vowel anywhere
** except in the first position.
*/
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i = 0;
  while( i<nStem ){
    if( fts5PorterIsVowel(zStem[i], i!=0) ) return 1;
    i++;
  }
  return 0;
}
678
679
680 /**************************************************************************
681 ***************************************************************************
682 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
683 */
684
/*
** Porter stemmer step 4: remove one of a fixed set of suffixes from the
** nBuf-byte word in aBuf, provided that the stem left behind satisfies
** (m > 1). For the suffix "ion" the stem must additionally end in 's' or
** 't'. On a match *pnBuf is reduced by the size of the suffix. Within each
** case the first suffix that matches is the only one considered.
**
** Always returns 0.
**
** Words shorter than two bytes are returned unchanged without looking at
** the buffer: no rule can match them, and the dispatch below would
** otherwise read aBuf[nBuf-2] from before the start of the buffer. (The
** body of this function is produced by mkportersteps.tcl; the guard should
** be mirrored there if the code is regenerated.)
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
817
818
/*
** Porter stemmer step 1b, second part: if the nBuf-byte word in aBuf is
** longer than two bytes and ends in "at", "bl" or "iz", append an 'e'
** (giving "ate", "ble" or "ize"), add one to *pnBuf and return 1.
** Otherwise leave the word unchanged and return 0. The buffer must have
** room for the extra byte.
**
** Words shorter than two bytes are returned unchanged without looking at
** the buffer: no rule can match them, and the dispatch below would
** otherwise read aBuf[nBuf-2] from before the start of the buffer. (The
** body of this function is produced by mkportersteps.tcl; the guard should
** be mirrored there if the code is regenerated.)
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
851
852
/*
** Porter stemmer step 2: if the nBuf-byte word in aBuf ends in one of a
** fixed set of suffixes (for example "ational", "izer", "ousness") and the
** stem that precedes the suffix satisfies (m > 0), replace the suffix with
** its shorter equivalent and store the new length in *pnBuf. Within each
** case the first suffix that matches is the only one considered.
**
** Always returns 0.
**
** Words shorter than two bytes are returned unchanged without looking at
** the buffer: no rule can match them, and the dispatch below would
** otherwise read aBuf[nBuf-2] from before the start of the buffer. (The
** body of this function is produced by mkportersteps.tcl; the guard should
** be mirrored there if the code is regenerated.)
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

  }
  return ret;
}
998
999
/*
** Porter stemmer step 3: if the nBuf-byte word in aBuf ends in "ical",
** "ness", "icate", "iciti", "ful", "ative" or "alize" and the stem that
** precedes the suffix satisfies (m > 0), shorten or remove the suffix and
** store the new length in *pnBuf.
**
** Always returns 0.
**
** Words shorter than two bytes are returned unchanged without looking at
** the buffer: no rule can match them, and the dispatch below would
** otherwise read aBuf[nBuf-2] from before the start of the buffer. (The
** body of this function is produced by mkportersteps.tcl; the guard should
** be mirrored there if the code is regenerated.)
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
1064
1065
/*
** Porter stemmer step 1b, first part, applied to the nBuf-byte word in
** aBuf:
**
**   "eed" -> "ee"   if the preceding stem satisfies (m > 0); returns 0
**   "ed"  -> ""     if the preceding stem contains a vowel; returns 1
**   "ing" -> ""     if the preceding stem contains a vowel; returns 1
**
** A word ending in "eed" is never considered for the "ed" rule. The new
** length is stored in *pnBuf. A return value of 1 tells the caller that
** the follow-up rules of step 1b should be applied; 0 is returned in all
** other cases.
**
** Words shorter than two bytes are returned unchanged without looking at
** the buffer: no rule can match them, and the dispatch below would
** otherwise read aBuf[nBuf-2] from before the start of the buffer. (The
** body of this function is produced by mkportersteps.tcl; the guard should
** be mirrored there if the code is regenerated.)
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'e':
      if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ee", 2);
          *pnBuf = nBuf - 3 + 2;
        }
      }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_Vowel(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
          ret = 1;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_Vowel(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
          ret = 1;
        }
      }
      break;

  }
  return ret;
}
1097
1098 /*
1099 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1100 ***************************************************************************
1101 **************************************************************************/
1102
/*
** Porter stemmer step 1a: strip plural endings from the word in aBuf,
** whose length is read from and written back to *pnBuf.
**
**   ...sses -> ...ss       (word longer than 4 bytes)
**   ...ies  -> ...i        (word longer than 3 bytes)
**   ...es   -> ...e        (any other word ending in "es")
**   ...ss   -> unchanged
**   ...s    -> ...         (any other word ending in "s")
**
** The buffer contents are never modified, only the length. The word must
** be at least two bytes long.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  int n = *pnBuf;
  char cPrev;

  if( aBuf[n-1]!='s' ) return;

  cPrev = aBuf[n-2];
  if( cPrev=='s' ) return;            /* "...ss" is left alone */
  if( cPrev!='e' ){
    *pnBuf = n-1;                     /* plain plural: drop the 's' */
    return;
  }

  /* The word ends in "es". */
  if( n>4 && aBuf[n-4]=='s' && aBuf[n-3]=='s' ){
    *pnBuf = n-2;                     /* "sses" -> "ss" */
  }else if( n>3 && aBuf[n-3]=='i' ){
    *pnBuf = n-2;                     /* "ies" -> "i" */
  }else{
    *pnBuf = n-1;
  }
}
1120
1121 static int fts5PorterCb(
1122 void *pCtx,
1123 int tflags,
1124 const char *pToken,
1125 int nToken,
1126 int iStart,
1127 int iEnd
1128 ){
1129 PorterContext *p = (PorterContext*)pCtx;
1130
1131 char *aBuf;
1132 int nBuf;
1133
1134 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1135 aBuf = p->aBuf;
1136 nBuf = nToken;
1137 memcpy(aBuf, pToken, nBuf);
1138
1139 /* Step 1. */
1140 fts5PorterStep1A(aBuf, &nBuf);
1141 if( fts5PorterStep1B(aBuf, &nBuf) ){
1142 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1143 char c = aBuf[nBuf-1];
1144 if( fts5PorterIsVowel(c, 0)==0
1145 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1146 ){
1147 nBuf--;
1148 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1149 aBuf[nBuf++] = 'e';
1150 }
1151 }
1152 }
1153
1154 /* Step 1C. */
1155 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1156 aBuf[nBuf-1] = 'i';
1157 }
1158
1159 /* Steps 2 through 4. */
1160 fts5PorterStep2(aBuf, &nBuf);
1161 fts5PorterStep3(aBuf, &nBuf);
1162 fts5PorterStep4(aBuf, &nBuf);
1163
1164 /* Step 5a. */
1165 assert( nBuf>0 );
1166 if( aBuf[nBuf-1]=='e' ){
1167 if( fts5Porter_MGt1(aBuf, nBuf-1)
1168 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1169 ){
1170 nBuf--;
1171 }
1172 }
1173
1174 /* Step 5b. */
1175 if( nBuf>1 && aBuf[nBuf-1]=='l'
1176 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1177 ){
1178 nBuf--;
1179 }
1180
1181 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1182
1183 pass_through:
1184 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1185 }
1186
1187 /*
1188 ** Tokenize using the porter tokenizer.
1189 */
1190 static int fts5PorterTokenize(
1191 Fts5Tokenizer *pTokenizer,
1192 void *pCtx,
1193 int flags,
1194 const char *pText, int nText,
1195 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1196 ){
1197 PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1198 PorterContext sCtx;
1199 sCtx.xToken = xToken;
1200 sCtx.pCtx = pCtx;
1201 sCtx.aBuf = p->aBuf;
1202 return p->tokenizer.xTokenize(
1203 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1204 );
1205 }
1206
1207 /*
1208 ** Register all built-in tokenizers with FTS5.
1209 */
1210 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1211 struct BuiltinTokenizer {
1212 const char *zName;
1213 fts5_tokenizer x;
1214 } aBuiltin[] = {
1215 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1216 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1217 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1218 };
1219
1220 int rc = SQLITE_OK; /* Return code */
1221 int i; /* To iterate through builtin functions */
1222
1223 for(i=0; rc==SQLITE_OK && i<(int)ArraySize(aBuiltin); i++){
1224 rc = pApi->xCreateTokenizer(pApi,
1225 aBuiltin[i].zName,
1226 (void*)pApi,
1227 &aBuiltin[i].x,
1228 0
1229 );
1230 }
1231
1232 return rc;
1233 }
1234
1235
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698