Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(270)

Side by Side Diff: third_party/sqlite/sqlite-src-3170000/ext/fts5/fts5_tokenize.c

Issue 2747283002: [sql] Import reference version of SQLite 3.17.. (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13
14
15 #include "fts5Int.h"
16
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
19 */
20
/*
** Default token-character map for the ASCII range. Entry i is 1 if byte
** value i is a token character and 0 if it is a separator. By default the
** token characters are exactly the ASCII alphanumerics: '0'-'9', 'A'-'Z'
** and 'a'-'z'.
**
** Tokenizer instances memcpy() this table into their own aTokenChar[]
** array and apply any "tokenchars"/"separators" overrides to that private
** copy, so this master table is never written. It is therefore const.
*/
static const unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
35
/*
** State of one "ascii" tokenizer instance: a private token-character map
** indexed by ASCII byte value. A non-zero entry means the byte is part of
** a token; zero means it is a separator.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];
} AsciiTokenizer;
40
41 static void fts5AsciiAddExceptions(
42 AsciiTokenizer *p,
43 const char *zArg,
44 int bTokenChars
45 ){
46 int i;
47 for(i=0; zArg[i]; i++){
48 if( (zArg[i] & 0x80)==0 ){
49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
50 }
51 }
52 }
53
54 /*
55 ** Delete a "ascii" tokenizer.
56 */
57 static void fts5AsciiDelete(Fts5Tokenizer *p){
58 sqlite3_free(p);
59 }
60
61 /*
62 ** Create an "ascii" tokenizer.
63 */
64 static int fts5AsciiCreate(
65 void *pUnused,
66 const char **azArg, int nArg,
67 Fts5Tokenizer **ppOut
68 ){
69 int rc = SQLITE_OK;
70 AsciiTokenizer *p = 0;
71 UNUSED_PARAM(pUnused);
72 if( nArg%2 ){
73 rc = SQLITE_ERROR;
74 }else{
75 p = sqlite3_malloc(sizeof(AsciiTokenizer));
76 if( p==0 ){
77 rc = SQLITE_NOMEM;
78 }else{
79 int i;
80 memset(p, 0, sizeof(AsciiTokenizer));
81 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
82 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
83 const char *zArg = azArg[i+1];
84 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
85 fts5AsciiAddExceptions(p, zArg, 1);
86 }else
87 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
88 fts5AsciiAddExceptions(p, zArg, 0);
89 }else{
90 rc = SQLITE_ERROR;
91 }
92 }
93 if( rc!=SQLITE_OK ){
94 fts5AsciiDelete((Fts5Tokenizer*)p);
95 p = 0;
96 }
97 }
98 }
99
100 *ppOut = (Fts5Tokenizer*)p;
101 return rc;
102 }
103
104
/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper-case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int i = 0;
  while( i<nByte ){
    char ch = aIn[i];
    aOut[i] = (ch>='A' && ch<='Z') ? (char)(ch + ('a' - 'A')) : ch;
    i++;
  }
}
113
114 /*
115 ** Tokenize some text using the ascii tokenizer.
116 */
117 static int fts5AsciiTokenize(
118 Fts5Tokenizer *pTokenizer,
119 void *pCtx,
120 int iUnused,
121 const char *pText, int nText,
122 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
123 ){
124 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
125 int rc = SQLITE_OK;
126 int ie;
127 int is = 0;
128
129 char aFold[64];
130 int nFold = sizeof(aFold);
131 char *pFold = aFold;
132 unsigned char *a = p->aTokenChar;
133
134 UNUSED_PARAM(iUnused);
135
136 while( is<nText && rc==SQLITE_OK ){
137 int nByte;
138
139 /* Skip any leading divider characters. */
140 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
141 is++;
142 }
143 if( is==nText ) break;
144
145 /* Count the token characters */
146 ie = is+1;
147 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
148 ie++;
149 }
150
151 /* Fold to lower case */
152 nByte = ie-is;
153 if( nByte>nFold ){
154 if( pFold!=aFold ) sqlite3_free(pFold);
155 pFold = sqlite3_malloc(nByte*2);
156 if( pFold==0 ){
157 rc = SQLITE_NOMEM;
158 break;
159 }
160 nFold = nByte*2;
161 }
162 asciiFold(pFold, &pText[is], nByte);
163
164 /* Invoke the token callback */
165 rc = xToken(pCtx, 0, pFold, nByte, is, ie);
166 is = ie+1;
167 }
168
169 if( pFold!=aFold ) sqlite3_free(pFold);
170 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
171 return rc;
172 }
173
174 /**************************************************************************
175 ** Start of unicode61 tokenizer implementation.
176 */
177
178
/*
** READ_UTF8 and WRITE_UTF8 below, together with the lookup table they
** use, are copies of the definitions in the sqlite3 source file utf.c.
** They are only needed when this file is compiled outside the
** amalgamation.
*/
#ifndef SQLITE_AMALGAMATION

/*
** Indexed by (lead byte - 0xc0): the initial value READ_UTF8 assigns to
** the code point before the continuation bytes are folded in.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,

  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Read one character starting at zIn into c and advance zIn past it.
** zTerm marks the end of the input. A lead byte >= 0xc0 absorbs every
** following continuation byte (10xxxxxx). Results below 0x80, in the
** range 0xD800..0xDFFF, or equal to 0xFFFE/0xFFFF are replaced by 0xFFFD.
**
** The expansion is two statements and evaluates its arguments more than
** once, so use it only as a full statement with side-effect free
** arguments.
*/
#define READ_UTF8(zIn, zTerm, c)                             \
  c = *(zIn++);                                              \
  if( c>=0xc0 ){                                             \
    c = sqlite3Utf8Trans1[c-0xc0];                           \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){              \
      c = (c<<6) + (0x3f & *(zIn++));                        \
    }                                                        \
    if( c<0x80                                               \
     || (c&0xFFFFF800)==0xD800                               \
     || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }              \
  }

/*
** Append code point c to the buffer at zOut as 1 to 4 bytes of UTF-8,
** advancing zOut past the bytes written. c is evaluated several times.
*/
#define WRITE_UTF8(zOut, c) {                                \
  if( c<0x00080 ){                                           \
    *zOut++ = (unsigned char)(c&0xFF);                       \
  }                                                          \
  else if( c<0x00800 ){                                      \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);           \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);              \
  }                                                          \
  else if( c<0x10000 ){                                      \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);          \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);         \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);              \
  }else{                                                     \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);        \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);        \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);         \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);              \
  }                                                          \
}

#endif /* ifndef SQLITE_AMALGAMATION */
231
/*
** State of one "unicode61" tokenizer instance.
**
** aiException[] holds nException code points, kept in ascending order so
** that it can be binary searched. A code point listed there has its
** default token/separator classification inverted.
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception codepoints */
} Unicode61Tokenizer;
241
242 static int fts5UnicodeAddExceptions(
243 Unicode61Tokenizer *p, /* Tokenizer object */
244 const char *z, /* Characters to treat as exceptions */
245 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */
246 ){
247 int rc = SQLITE_OK;
248 int n = (int)strlen(z);
249 int *aNew;
250
251 if( n>0 ){
252 aNew = (int*)sqlite3_realloc(p->aiException, (n+p->nException)*sizeof(int));
253 if( aNew ){
254 int nNew = p->nException;
255 const unsigned char *zCsr = (const unsigned char*)z;
256 const unsigned char *zTerm = (const unsigned char*)&z[n];
257 while( zCsr<zTerm ){
258 int iCode;
259 int bToken;
260 READ_UTF8(zCsr, zTerm, iCode);
261 if( iCode<128 ){
262 p->aTokenChar[iCode] = (unsigned char)bTokenChars;
263 }else{
264 bToken = sqlite3Fts5UnicodeIsalnum(iCode);
265 assert( (bToken==0 || bToken==1) );
266 assert( (bTokenChars==0 || bTokenChars==1) );
267 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
268 int i;
269 for(i=0; i<nNew; i++){
270 if( aNew[i]>iCode ) break;
271 }
272 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
273 aNew[i] = iCode;
274 nNew++;
275 }
276 }
277 }
278 p->aiException = aNew;
279 p->nException = nNew;
280 }else{
281 rc = SQLITE_NOMEM;
282 }
283 }
284
285 return rc;
286 }
287
288 /*
289 ** Return true if the p->aiException[] array contains the value iCode.
290 */
291 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
292 if( p->nException>0 ){
293 int *a = p->aiException;
294 int iLo = 0;
295 int iHi = p->nException-1;
296
297 while( iHi>=iLo ){
298 int iTest = (iHi + iLo) / 2;
299 if( iCode==a[iTest] ){
300 return 1;
301 }else if( iCode>a[iTest] ){
302 iLo = iTest+1;
303 }else{
304 iHi = iTest-1;
305 }
306 }
307 }
308
309 return 0;
310 }
311
312 /*
313 ** Delete a "unicode61" tokenizer.
314 */
315 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
316 if( pTok ){
317 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
318 sqlite3_free(p->aiException);
319 sqlite3_free(p->aFold);
320 sqlite3_free(p);
321 }
322 return;
323 }
324
325 /*
326 ** Create a "unicode61" tokenizer.
327 */
328 static int fts5UnicodeCreate(
329 void *pUnused,
330 const char **azArg, int nArg,
331 Fts5Tokenizer **ppOut
332 ){
333 int rc = SQLITE_OK; /* Return code */
334 Unicode61Tokenizer *p = 0; /* New tokenizer object */
335
336 UNUSED_PARAM(pUnused);
337
338 if( nArg%2 ){
339 rc = SQLITE_ERROR;
340 }else{
341 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
342 if( p ){
343 int i;
344 memset(p, 0, sizeof(Unicode61Tokenizer));
345 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
346 p->bRemoveDiacritic = 1;
347 p->nFold = 64;
348 p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
349 if( p->aFold==0 ){
350 rc = SQLITE_NOMEM;
351 }
352 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
353 const char *zArg = azArg[i+1];
354 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
355 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
356 rc = SQLITE_ERROR;
357 }
358 p->bRemoveDiacritic = (zArg[0]=='1');
359 }else
360 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
361 rc = fts5UnicodeAddExceptions(p, zArg, 1);
362 }else
363 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
364 rc = fts5UnicodeAddExceptions(p, zArg, 0);
365 }else{
366 rc = SQLITE_ERROR;
367 }
368 }
369 }else{
370 rc = SQLITE_NOMEM;
371 }
372 if( rc!=SQLITE_OK ){
373 fts5UnicodeDelete((Fts5Tokenizer*)p);
374 p = 0;
375 }
376 *ppOut = (Fts5Tokenizer*)p;
377 }
378 return rc;
379 }
380
381 /*
382 ** Return true if, for the purposes of tokenizing with the tokenizer
383 ** passed as the first argument, codepoint iCode is considered a token
384 ** character (not a separator).
385 */
386 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
387 assert( (sqlite3Fts5UnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
388 return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode);
389 }
390
391 static int fts5UnicodeTokenize(
392 Fts5Tokenizer *pTokenizer,
393 void *pCtx,
394 int iUnused,
395 const char *pText, int nText,
396 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
397 ){
398 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
399 int rc = SQLITE_OK;
400 unsigned char *a = p->aTokenChar;
401
402 unsigned char *zTerm = (unsigned char*)&pText[nText];
403 unsigned char *zCsr = (unsigned char *)pText;
404
405 /* Output buffer */
406 char *aFold = p->aFold;
407 int nFold = p->nFold;
408 const char *pEnd = &aFold[nFold-6];
409
410 UNUSED_PARAM(iUnused);
411
412 /* Each iteration of this loop gobbles up a contiguous run of separators,
413 ** then the next token. */
414 while( rc==SQLITE_OK ){
415 int iCode; /* non-ASCII codepoint read from input */
416 char *zOut = aFold;
417 int is;
418 int ie;
419
420 /* Skip any separator characters. */
421 while( 1 ){
422 if( zCsr>=zTerm ) goto tokenize_done;
423 if( *zCsr & 0x80 ) {
424 /* A character outside of the ascii range. Skip past it if it is
425 ** a separator character. Or break out of the loop if it is not. */
426 is = zCsr - (unsigned char*)pText;
427 READ_UTF8(zCsr, zTerm, iCode);
428 if( fts5UnicodeIsAlnum(p, iCode) ){
429 goto non_ascii_tokenchar;
430 }
431 }else{
432 if( a[*zCsr] ){
433 is = zCsr - (unsigned char*)pText;
434 goto ascii_tokenchar;
435 }
436 zCsr++;
437 }
438 }
439
440 /* Run through the tokenchars. Fold them into the output buffer along
441 ** the way. */
442 while( zCsr<zTerm ){
443
444 /* Grow the output buffer so that there is sufficient space to fit the
445 ** largest possible utf-8 character. */
446 if( zOut>pEnd ){
447 aFold = sqlite3_malloc(nFold*2);
448 if( aFold==0 ){
449 rc = SQLITE_NOMEM;
450 goto tokenize_done;
451 }
452 zOut = &aFold[zOut - p->aFold];
453 memcpy(aFold, p->aFold, nFold);
454 sqlite3_free(p->aFold);
455 p->aFold = aFold;
456 p->nFold = nFold = nFold*2;
457 pEnd = &aFold[nFold-6];
458 }
459
460 if( *zCsr & 0x80 ){
461 /* An non-ascii-range character. Fold it into the output buffer if
462 ** it is a token character, or break out of the loop if it is not. */
463 READ_UTF8(zCsr, zTerm, iCode);
464 if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
465 non_ascii_tokenchar:
466 iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
467 if( iCode ) WRITE_UTF8(zOut, iCode);
468 }else{
469 break;
470 }
471 }else if( a[*zCsr]==0 ){
472 /* An ascii-range separator character. End of token. */
473 break;
474 }else{
475 ascii_tokenchar:
476 if( *zCsr>='A' && *zCsr<='Z' ){
477 *zOut++ = *zCsr + 32;
478 }else{
479 *zOut++ = *zCsr;
480 }
481 zCsr++;
482 }
483 ie = zCsr - (unsigned char*)pText;
484 }
485
486 /* Invoke the token callback */
487 rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
488 }
489
490 tokenize_done:
491 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
492 return rc;
493 }
494
495 /**************************************************************************
496 ** Start of porter stemmer implementation.
497 */
498
499 /* Any tokens larger than this (in bytes) are passed through without
500 ** stemming. */
501 #define FTS5_PORTER_MAX_TOKEN 64
502
503 typedef struct PorterTokenizer PorterTokenizer;
504 struct PorterTokenizer {
505 fts5_tokenizer tokenizer; /* Parent tokenizer module */
506 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
507 char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
508 };
509
510 /*
511 ** Delete a "porter" tokenizer.
512 */
513 static void fts5PorterDelete(Fts5Tokenizer *pTok){
514 if( pTok ){
515 PorterTokenizer *p = (PorterTokenizer*)pTok;
516 if( p->pTokenizer ){
517 p->tokenizer.xDelete(p->pTokenizer);
518 }
519 sqlite3_free(p);
520 }
521 }
522
523 /*
524 ** Create a "porter" tokenizer.
525 */
526 static int fts5PorterCreate(
527 void *pCtx,
528 const char **azArg, int nArg,
529 Fts5Tokenizer **ppOut
530 ){
531 fts5_api *pApi = (fts5_api*)pCtx;
532 int rc = SQLITE_OK;
533 PorterTokenizer *pRet;
534 void *pUserdata = 0;
535 const char *zBase = "unicode61";
536
537 if( nArg>0 ){
538 zBase = azArg[0];
539 }
540
541 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
542 if( pRet ){
543 memset(pRet, 0, sizeof(PorterTokenizer));
544 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
545 }else{
546 rc = SQLITE_NOMEM;
547 }
548 if( rc==SQLITE_OK ){
549 int nArg2 = (nArg>0 ? nArg-1 : 0);
550 const char **azArg2 = (nArg2 ? &azArg[1] : 0);
551 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
552 }
553
554 if( rc!=SQLITE_OK ){
555 fts5PorterDelete((Fts5Tokenizer*)pRet);
556 pRet = 0;
557 }
558 *ppOut = (Fts5Tokenizer*)pRet;
559 return rc;
560 }
561
/*
** Context handed to fts5PorterCb() while the parent tokenizer runs: the
** caller's own callback and context pointer, plus the scratch buffer in
** which tokens are stemmed.
*/
typedef struct PorterContext {
  void *pCtx;
  int (*xToken)(void*, int, const char*, int, int, int);
  char *aBuf;
} PorterContext;
568
/*
** One suffix-rewriting rule: if a word ends in zSuffix (nSuffix bytes)
** and xCond, when not NULL, accepts the remaining stem, the suffix is
** replaced by zOutput (nOutput bytes).
*/
typedef struct PorterRule {
  const char *zSuffix;
  int nSuffix;
  int (*xCond)(char *zStem, int nStem);
  const char *zOutput;
  int nOutput;
} PorterRule;
577
#if 0
/*
** Disabled (compiled out) generic rule applier.
**
** aRule[] is a list of rules terminated by an entry with zSuffix==NULL.
** The first rule whose suffix matches the end of aBuf[0..*pnBuf-1] is
** selected. If that rule has no condition, or its condition accepts the
** stem, the suffix is replaced, *pnBuf is updated and the index of the
** rule is returned. In every other case -1 is returned.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int nBuf = *pnBuf;
  PorterRule *pRule = aRule;
  int nStem;

  while( pRule->zSuffix ){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nBuf>=pRule->nSuffix
     && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      break;
    }
    pRule++;
  }
  if( pRule->zSuffix==0 ) return -1;

  nStem = nBuf - pRule->nSuffix;
  if( pRule->xCond && !pRule->xCond(aBuf, nStem) ) return -1;

  memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
  *pnBuf = nStem + pRule->nOutput;
  return pRule - aRule;
}
#endif
603
/*
** Return 1 if c is one of 'a', 'e', 'i', 'o' or 'u', or if c is 'y' and
** bYIsVowel is non-zero. Return 0 otherwise.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel!=0;
    default:
      return 0;
  }
}
609
/*
** Scan zStem[0..nStem-1] for a vowel followed, not necessarily directly,
** by a consonant. If such a pair exists, return the offset one past the
** consonant that ends it. Otherwise return 0.
**
** bPrevCons says whether the character before zStem[0] was a consonant.
** It matters because 'y' only counts as a vowel when it follows a
** consonant.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int i = 0;
  int bCons = bPrevCons;

  /* Scan for a vowel */
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons==0 ) break;
    i++;
  }

  /* Scan for a consonant, starting just after the vowel */
  for(i=i+1; i<nStem; i++){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons ) return i+1;
  }
  return 0;
}
625
/*
** porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel-consonant sequence.
*/
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
630
/*
** porter rule condition: (m > 1)
**
** True if the stem contains at least two vowel-consonant sequences.
*/
static int fts5Porter_MGt1(char *zStem, int nStem){
  int n = fts5PorterGobbleVC(zStem, nStem, 0);
  if( n==0 ) return 0;
  /* The first sequence ended on a consonant, hence bPrevCons==1. */
  return fts5PorterGobbleVC(&zStem[n], nStem-n, 1)!=0;
}
640
/*
** porter rule condition: (m = 1)
**
** True if the stem contains exactly one vowel-consonant sequence.
*/
static int fts5Porter_MEq1(char *zStem, int nStem){
  int n = fts5PorterGobbleVC(zStem, nStem, 0);
  if( n==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[n], nStem-n, 1)==0;
}
650
/*
** porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final
** consonant is not 'w', 'x' or 'y'. The caller must pass nStem>0.
**
** The consonant/vowel history is tracked as a bit mask (1 = consonant).
** Only the last three bits are ever tested, so the mask is truncated to
** three bits on every step. Without that, the left shift of a signed int
** overflows once the stem is longer than 31 characters.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    int mask = 0;
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) + bCons) & 0x0007;
    }
    return (mask==0x0005);
  }
}
667
/*
** porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and contains at least two
** vowel-consonant sequences. nStem must be greater than zero.
*/
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
674
/*
** porter rule condition: (*v*)
**
** True if the stem contains a vowel. 'y' counts as a vowel everywhere
** except in the first position.
*/
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i = 0;
  while( i<nStem ){
    if( fts5PorterIsVowel(zStem[i], i>0) ) return 1;
    i++;
  }
  return 0;
}
685
686
687 /**************************************************************************
688 ***************************************************************************
689 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
690 */
691
692 static int fts5PorterStep4(char *aBuf, int *pnBuf){
693 int ret = 0;
694 int nBuf = *pnBuf;
695 switch( aBuf[nBuf-2] ){
696
697 case 'a':
698 if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
699 if( fts5Porter_MGt1(aBuf, nBuf-2) ){
700 *pnBuf = nBuf - 2;
701 }
702 }
703 break;
704
705 case 'c':
706 if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
707 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
708 *pnBuf = nBuf - 4;
709 }
710 }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
711 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
712 *pnBuf = nBuf - 4;
713 }
714 }
715 break;
716
717 case 'e':
718 if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
719 if( fts5Porter_MGt1(aBuf, nBuf-2) ){
720 *pnBuf = nBuf - 2;
721 }
722 }
723 break;
724
725 case 'i':
726 if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
727 if( fts5Porter_MGt1(aBuf, nBuf-2) ){
728 *pnBuf = nBuf - 2;
729 }
730 }
731 break;
732
733 case 'l':
734 if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
735 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
736 *pnBuf = nBuf - 4;
737 }
738 }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
739 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
740 *pnBuf = nBuf - 4;
741 }
742 }
743 break;
744
745 case 'n':
746 if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
747 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
748 *pnBuf = nBuf - 3;
749 }
750 }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
751 if( fts5Porter_MGt1(aBuf, nBuf-5) ){
752 *pnBuf = nBuf - 5;
753 }
754 }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
755 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
756 *pnBuf = nBuf - 4;
757 }
758 }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
759 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
760 *pnBuf = nBuf - 3;
761 }
762 }
763 break;
764
765 case 'o':
766 if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
767 if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
768 *pnBuf = nBuf - 3;
769 }
770 }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
771 if( fts5Porter_MGt1(aBuf, nBuf-2) ){
772 *pnBuf = nBuf - 2;
773 }
774 }
775 break;
776
777 case 's':
778 if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
779 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
780 *pnBuf = nBuf - 3;
781 }
782 }
783 break;
784
785 case 't':
786 if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
787 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
788 *pnBuf = nBuf - 3;
789 }
790 }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
791 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
792 *pnBuf = nBuf - 3;
793 }
794 }
795 break;
796
797 case 'u':
798 if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
799 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
800 *pnBuf = nBuf - 3;
801 }
802 }
803 break;
804
805 case 'v':
806 if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
807 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
808 *pnBuf = nBuf - 3;
809 }
810 }
811 break;
812
813 case 'z':
814 if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
815 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
816 *pnBuf = nBuf - 3;
817 }
818 }
819 break;
820
821 }
822 return ret;
823 }
824
825
/*
** Porter stemmer step 1b, second part: restore a trailing 'e'.
**
** If the word aBuf[0..*pnBuf-1] is at least 3 bytes long and ends in
** "at", "bl" or "iz", append 'e' (giving "ate", "ble" or "ize"),
** increment *pnBuf and return 1. Otherwise leave everything unchanged
** and return 0.
**
** Words shorter than 3 bytes return before the buffer is examined, so
** aBuf[nBuf-2] is never read outside the buffer.
**
** NOTE: this function was originally emitted by mkportersteps.tcl. Keep
** the generator in step with any change made here.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  const char *zNew = 0;           /* Replacement for the last two bytes */
  char *zTail;

  if( nBuf<3 ) return 0;
  zTail = &aBuf[nBuf-2];

  if( zTail[0]=='a' && zTail[1]=='t' ){
    zNew = "ate";
  }else if( zTail[0]=='b' && zTail[1]=='l' ){
    zNew = "ble";
  }else if( zTail[0]=='i' && zTail[1]=='z' ){
    zNew = "ize";
  }
  if( zNew==0 ) return 0;

  memcpy(zTail, zNew, 3);
  *pnBuf = nBuf + 1;
  return 1;
}
858
859
/*
** Porter stemmer step 2: map double suffixes to single ones.
**
** aBuf[0..*pnBuf-1] is the word. The first rule below whose suffix ends
** the word, leaving a non-empty stem, is selected. If the stem satisfies
** (m > 0), the suffix is overwritten with the rule's replacement and
** *pnBuf is updated. No other rule is tried once a suffix has matched.
** Rules are listed in priority order: "ational" must precede "tional",
** and "ization" must precede "ation". Always returns 0.
**
** Every rule needs a word of at least 4 bytes, so words shorter than 3
** bytes return immediately. That also keeps the aBuf[nBuf-2] lookup
** inside the buffer: the stemmer can reach this step with a one byte
** word.
**
** NOTE: this function was originally emitted by mkportersteps.tcl. Keep
** the generator in step with any change made here.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  static const struct Step2Rule {
    const char *zSuffix;          /* Suffix to match */
    int nSuffix;                  /* Size of zSuffix in bytes */
    const char *zOutput;          /* Replacement text */
    int nOutput;                  /* Size of zOutput in bytes */
  } aRule[] = {
    { "ational", 7, "ate",  3 },
    { "tional",  6, "tion", 4 },
    { "enci",    4, "ence", 4 },
    { "anci",    4, "ance", 4 },
    { "izer",    4, "ize",  3 },
    { "logi",    4, "log",  3 },
    { "bli",     3, "ble",  3 },
    { "alli",    4, "al",   2 },
    { "entli",   5, "ent",  3 },
    { "eli",     3, "e",    1 },
    { "ousli",   5, "ous",  3 },
    { "ization", 7, "ize",  3 },
    { "ation",   5, "ate",  3 },
    { "ator",    4, "ate",  3 },
    { "alism",   5, "al",   2 },
    { "iveness", 7, "ive",  3 },
    { "fulness", 7, "ful",  3 },
    { "ousness", 7, "ous",  3 },
    { "aliti",   5, "al",   2 },
    { "iviti",   5, "ive",  3 },
    { "biliti",  6, "ble",  3 },
  };
  int nBuf = *pnBuf;
  char cKey;
  int i;

  if( nBuf<3 ) return 0;
  cKey = aBuf[nBuf-2];

  for(i=0; i<(int)(sizeof(aRule)/sizeof(aRule[0])); i++){
    const struct Step2Rule *pRule = &aRule[i];
    int nStem = nBuf - pRule->nSuffix;

    /* Cheap rejection on the second-to-last character. */
    if( pRule->zSuffix[pRule->nSuffix-2]!=cKey ) continue;

    if( nStem>0 && 0==memcmp(pRule->zSuffix, &aBuf[nStem], pRule->nSuffix) ){
      if( fts5Porter_MGt0(aBuf, nStem) ){
        memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
        *pnBuf = nStem + pRule->nOutput;
      }
      break;
    }
  }
  return 0;
}
1005
1006
/*
** Porter stemmer step 3: simplify or strip a further set of suffixes.
**
** aBuf[0..*pnBuf-1] is the word. The first rule below whose suffix ends
** the word, leaving a non-empty stem, is selected. If the stem satisfies
** (m > 0), the suffix is replaced by the rule's output (which may be
** empty) and *pnBuf is updated. No other rule is tried once a suffix has
** matched. Always returns 0.
**
** Every rule needs a word of at least 4 bytes, so words shorter than 3
** bytes return immediately. That also keeps the aBuf[nBuf-2] lookup
** inside the buffer: the stemmer can reach this step with a one byte
** word.
**
** NOTE: this function was originally emitted by mkportersteps.tcl. Keep
** the generator in step with any change made here.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  static const struct Step3Rule {
    const char *zSuffix;          /* Suffix to match */
    int nSuffix;                  /* Size of zSuffix in bytes */
    const char *zOutput;          /* Replacement text (may be empty) */
    int nOutput;                  /* Size of zOutput in bytes */
  } aRule[] = {
    { "ical",  4, "ic", 2 },
    { "ness",  4, "",   0 },
    { "icate", 5, "ic", 2 },
    { "iciti", 5, "ic", 2 },
    { "ful",   3, "",   0 },
    { "ative", 5, "",   0 },
    { "alize", 5, "al", 2 },
  };
  int nBuf = *pnBuf;
  char cKey;
  int i;

  if( nBuf<3 ) return 0;
  cKey = aBuf[nBuf-2];

  for(i=0; i<(int)(sizeof(aRule)/sizeof(aRule[0])); i++){
    const struct Step3Rule *pRule = &aRule[i];
    int nStem = nBuf - pRule->nSuffix;

    /* Cheap rejection on the second-to-last character. */
    if( pRule->zSuffix[pRule->nSuffix-2]!=cKey ) continue;

    if( nStem>0 && 0==memcmp(pRule->zSuffix, &aBuf[nStem], pRule->nSuffix) ){
      if( fts5Porter_MGt0(aBuf, nStem) ){
        if( pRule->nOutput>0 ){
          memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
        }
        *pnBuf = nStem + pRule->nOutput;
      }
      break;
    }
  }
  return 0;
}
1071
1072
/*
** Porter stemmer step 1b, first part. aBuf[0..*pnBuf-1] is the word.
**
**   "eed"  ->  "ee"   if the stem satisfies (m > 0);   returns 0
**   "ed"   ->  ""     if the stem contains a vowel;    returns 1
**   "ing"  ->  ""     if the stem contains a vowel;    returns 1
**
** Only the first suffix that matches is considered, and it must leave a
** non-empty stem. If its condition fails, nothing is changed and 0 is
** returned. A return value of 1 tells the caller to run the follow-up
** step.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;

  if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
    if( fts5Porter_MGt0(aBuf, nBuf-3) ){
      memcpy(&aBuf[nBuf-3], "ee", 2);
      *pnBuf = nBuf - 3 + 2;
    }
    return 0;
  }

  if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
    if( fts5Porter_Vowel(aBuf, nBuf-2) ){
      *pnBuf = nBuf - 2;
      return 1;
    }
    return 0;
  }

  if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
    if( fts5Porter_Vowel(aBuf, nBuf-3) ){
      *pnBuf = nBuf - 3;
      return 1;
    }
  }
  return 0;
}
1104
1105 /*
1106 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1107 ***************************************************************************
1108 **************************************************************************/
1109
/*
** Porter stemmer step 1a: plural endings. aBuf[0..*pnBuf-1] is the word,
** which must be at least 2 bytes long.
**
**   "sses" -> "ss"      "ies" -> "i"      "ss" -> "ss"      "s" -> ""
**
** Only *pnBuf is modified; the buffer contents are left alone.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  char cPrev;

  if( aBuf[nBuf-1]!='s' ) return;
  cPrev = aBuf[nBuf-2];
  if( cPrev=='s' ) return;                  /* "...ss": keep as is */

  if( cPrev=='e' ){
    int bSses = (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s');
    int bIes = (nBuf>3 && aBuf[nBuf-3]=='i');
    *pnBuf = (bSses || bIes) ? nBuf-2 : nBuf-1;
  }else{
    *pnBuf = nBuf-1;
  }
}
1127
1128 static int fts5PorterCb(
1129 void *pCtx,
1130 int tflags,
1131 const char *pToken,
1132 int nToken,
1133 int iStart,
1134 int iEnd
1135 ){
1136 PorterContext *p = (PorterContext*)pCtx;
1137
1138 char *aBuf;
1139 int nBuf;
1140
1141 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1142 aBuf = p->aBuf;
1143 nBuf = nToken;
1144 memcpy(aBuf, pToken, nBuf);
1145
1146 /* Step 1. */
1147 fts5PorterStep1A(aBuf, &nBuf);
1148 if( fts5PorterStep1B(aBuf, &nBuf) ){
1149 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1150 char c = aBuf[nBuf-1];
1151 if( fts5PorterIsVowel(c, 0)==0
1152 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1153 ){
1154 nBuf--;
1155 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1156 aBuf[nBuf++] = 'e';
1157 }
1158 }
1159 }
1160
1161 /* Step 1C. */
1162 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1163 aBuf[nBuf-1] = 'i';
1164 }
1165
1166 /* Steps 2 through 4. */
1167 fts5PorterStep2(aBuf, &nBuf);
1168 fts5PorterStep3(aBuf, &nBuf);
1169 fts5PorterStep4(aBuf, &nBuf);
1170
1171 /* Step 5a. */
1172 assert( nBuf>0 );
1173 if( aBuf[nBuf-1]=='e' ){
1174 if( fts5Porter_MGt1(aBuf, nBuf-1)
1175 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1176 ){
1177 nBuf--;
1178 }
1179 }
1180
1181 /* Step 5b. */
1182 if( nBuf>1 && aBuf[nBuf-1]=='l'
1183 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1184 ){
1185 nBuf--;
1186 }
1187
1188 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1189
1190 pass_through:
1191 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1192 }
1193
1194 /*
1195 ** Tokenize using the porter tokenizer.
1196 */
1197 static int fts5PorterTokenize(
1198 Fts5Tokenizer *pTokenizer,
1199 void *pCtx,
1200 int flags,
1201 const char *pText, int nText,
1202 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1203 ){
1204 PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1205 PorterContext sCtx;
1206 sCtx.xToken = xToken;
1207 sCtx.pCtx = pCtx;
1208 sCtx.aBuf = p->aBuf;
1209 return p->tokenizer.xTokenize(
1210 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1211 );
1212 }
1213
1214 /*
1215 ** Register all built-in tokenizers with FTS5.
1216 */
1217 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1218 struct BuiltinTokenizer {
1219 const char *zName;
1220 fts5_tokenizer x;
1221 } aBuiltin[] = {
1222 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1223 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1224 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1225 };
1226
1227 int rc = SQLITE_OK; /* Return code */
1228 int i; /* To iterate through builtin functions */
1229
1230 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1231 rc = pApi->xCreateTokenizer(pApi,
1232 aBuiltin[i].zName,
1233 (void*)pApi,
1234 &aBuiltin[i].x,
1235 0
1236 );
1237 }
1238
1239 return rc;
1240 }
1241
1242
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698