OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ** 2014 May 31 |
| 3 ** |
| 4 ** The author disclaims copyright to this source code. In place of |
| 5 ** a legal notice, here is a blessing: |
| 6 ** |
| 7 ** May you do good and not evil. |
| 8 ** May you find forgiveness for yourself and forgive others. |
| 9 ** May you share freely, never taking more than you give. |
| 10 ** |
| 11 ****************************************************************************** |
| 12 */ |
| 13 |
| 14 |
| 15 #include "fts5Int.h" |
| 16 |
| 17 /************************************************************************** |
| 18 ** Start of ascii tokenizer implementation. |
| 19 */ |
| 20 |
/*
** Default token-character map for the ASCII range, indexed by byte value.
** An entry is 1 for the alphanumeric characters '0'..'9', 'A'..'Z' and
** 'a'..'z', and 0 for every other byte below 0x80.
**
** The table is a read-only template: tokenizers copy it into their own
** aTokenChar[] array before applying any "tokenchars" or "separators"
** exceptions, so it is declared const.
*/
static const unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
| 35 |
/*
** State of an "ascii" tokenizer. aTokenChar[] is indexed by ASCII byte
** value; a non-zero entry marks a token character, a zero entry marks a
** separator.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];
} AsciiTokenizer;
| 40 |
| 41 static void fts5AsciiAddExceptions( |
| 42 AsciiTokenizer *p, |
| 43 const char *zArg, |
| 44 int bTokenChars |
| 45 ){ |
| 46 int i; |
| 47 for(i=0; zArg[i]; i++){ |
| 48 if( (zArg[i] & 0x80)==0 ){ |
| 49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars; |
| 50 } |
| 51 } |
| 52 } |
| 53 |
| 54 /* |
| 55 ** Delete a "ascii" tokenizer. |
| 56 */ |
| 57 static void fts5AsciiDelete(Fts5Tokenizer *p){ |
| 58 sqlite3_free(p); |
| 59 } |
| 60 |
| 61 /* |
| 62 ** Create an "ascii" tokenizer. |
| 63 */ |
| 64 static int fts5AsciiCreate( |
| 65 void *pCtx, |
| 66 const char **azArg, int nArg, |
| 67 Fts5Tokenizer **ppOut |
| 68 ){ |
| 69 int rc = SQLITE_OK; |
| 70 AsciiTokenizer *p = 0; |
| 71 if( nArg%2 ){ |
| 72 rc = SQLITE_ERROR; |
| 73 }else{ |
| 74 p = sqlite3_malloc(sizeof(AsciiTokenizer)); |
| 75 if( p==0 ){ |
| 76 rc = SQLITE_NOMEM; |
| 77 }else{ |
| 78 int i; |
| 79 memset(p, 0, sizeof(AsciiTokenizer)); |
| 80 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar)); |
| 81 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ |
| 82 const char *zArg = azArg[i+1]; |
| 83 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ |
| 84 fts5AsciiAddExceptions(p, zArg, 1); |
| 85 }else |
| 86 if( 0==sqlite3_stricmp(azArg[i], "separators") ){ |
| 87 fts5AsciiAddExceptions(p, zArg, 0); |
| 88 }else{ |
| 89 rc = SQLITE_ERROR; |
| 90 } |
| 91 } |
| 92 if( rc!=SQLITE_OK ){ |
| 93 fts5AsciiDelete((Fts5Tokenizer*)p); |
| 94 p = 0; |
| 95 } |
| 96 } |
| 97 } |
| 98 |
| 99 *ppOut = (Fts5Tokenizer*)p; |
| 100 return rc; |
| 101 } |
| 102 |
| 103 |
/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper-case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    char c = aIn[iByte];
    if( c>='A' && c<='Z' ){
      c += 32;
    }
    aOut[iByte] = c;
    iByte++;
  }
}
| 112 |
| 113 /* |
| 114 ** Tokenize some text using the ascii tokenizer. |
| 115 */ |
| 116 static int fts5AsciiTokenize( |
| 117 Fts5Tokenizer *pTokenizer, |
| 118 void *pCtx, |
| 119 int flags, |
| 120 const char *pText, int nText, |
| 121 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) |
| 122 ){ |
| 123 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer; |
| 124 int rc = SQLITE_OK; |
| 125 int ie; |
| 126 int is = 0; |
| 127 |
| 128 char aFold[64]; |
| 129 int nFold = sizeof(aFold); |
| 130 char *pFold = aFold; |
| 131 unsigned char *a = p->aTokenChar; |
| 132 |
| 133 while( is<nText && rc==SQLITE_OK ){ |
| 134 int nByte; |
| 135 |
| 136 /* Skip any leading divider characters. */ |
| 137 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){ |
| 138 is++; |
| 139 } |
| 140 if( is==nText ) break; |
| 141 |
| 142 /* Count the token characters */ |
| 143 ie = is+1; |
| 144 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){ |
| 145 ie++; |
| 146 } |
| 147 |
| 148 /* Fold to lower case */ |
| 149 nByte = ie-is; |
| 150 if( nByte>nFold ){ |
| 151 if( pFold!=aFold ) sqlite3_free(pFold); |
| 152 pFold = sqlite3_malloc(nByte*2); |
| 153 if( pFold==0 ){ |
| 154 rc = SQLITE_NOMEM; |
| 155 break; |
| 156 } |
| 157 nFold = nByte*2; |
| 158 } |
| 159 asciiFold(pFold, &pText[is], nByte); |
| 160 |
| 161 /* Invoke the token callback */ |
| 162 rc = xToken(pCtx, 0, pFold, nByte, is, ie); |
| 163 is = ie+1; |
| 164 } |
| 165 |
| 166 if( pFold!=aFold ) sqlite3_free(pFold); |
| 167 if( rc==SQLITE_DONE ) rc = SQLITE_OK; |
| 168 return rc; |
| 169 } |
| 170 |
| 171 /************************************************************************** |
| 172 ** Start of unicode61 tokenizer implementation. |
| 173 */ |
| 174 |
| 175 |
| 176 /* |
| 177 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied |
| 178 ** from the sqlite3 source file utf.c. If this file is compiled as part |
| 179 ** of the amalgamation, they are not required. |
| 180 */ |
| 181 #ifndef SQLITE_AMALGAMATION |
| 182 |
/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xc0) and
** supplies the initial value of the codepoint accumulator for a
** multi-byte UTF-8 sequence.
*/
| 183 static const unsigned char sqlite3Utf8Trans1[] = { |
| 184 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 185 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, |
| 186 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, |
| 187 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, |
| 188 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 189 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, |
| 190 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 191 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, |
| 192 }; |
| 193 |
/*
** READ_UTF8(zIn, zTerm, c): read one character from zIn into c and
** advance zIn past it. A byte below 0xc0 is returned as is. For a lead
** byte of 0xc0 or more, continuation bytes (10xxxxxx) are consumed until
** a non-continuation byte or zTerm is reached. A decoded value that is
** below 0x80, in the range 0xD800..0xDFFF, or equal to 0xFFFE or 0xFFFF
** is replaced by 0xFFFD.
**
** The expansion is several statements that are not wrapped in a block,
** so the macro may only be used as a complete statement of its own.
*/
| 194 #define READ_UTF8(zIn, zTerm, c) \ |
| 195 c = *(zIn++); \ |
| 196 if( c>=0xc0 ){ \ |
| 197 c = sqlite3Utf8Trans1[c-0xc0]; \ |
| 198 while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \ |
| 199 c = (c<<6) + (0x3f & *(zIn++)); \ |
| 200 } \ |
| 201 if( c<0x80 \ |
| 202 || (c&0xFFFFF800)==0xD800 \ |
| 203 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ |
| 204 } |
| 205 |
| 206 |
/*
** WRITE_UTF8(zOut, c): append the UTF-8 encoding of codepoint c to zOut,
** advancing zOut by the one to four bytes written.
*/
| 207 #define WRITE_UTF8(zOut, c) { \ |
| 208 if( c<0x00080 ){ \ |
| 209 *zOut++ = (unsigned char)(c&0xFF); \ |
| 210 } \ |
| 211 else if( c<0x00800 ){ \ |
| 212 *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F); \ |
| 213 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \ |
| 214 } \ |
| 215 else if( c<0x10000 ){ \ |
| 216 *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F); \ |
| 217 *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \ |
| 218 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \ |
| 219 }else{ \ |
| 220 *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07); \ |
| 221 *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F); \ |
| 222 *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \ |
| 223 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \ |
| 224 } \ |
| 225 } |
| 226 |
| 227 #endif /* ifndef SQLITE_AMALGAMATION */ |
| 228 |
/*
** State of a "unicode61" tokenizer.
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Exception codepoints, ascending order */
} Unicode61Tokenizer;
| 238 |
| 239 static int fts5UnicodeAddExceptions( |
| 240 Unicode61Tokenizer *p, /* Tokenizer object */ |
| 241 const char *z, /* Characters to treat as exceptions */ |
| 242 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */ |
| 243 ){ |
| 244 int rc = SQLITE_OK; |
| 245 int n = (int)strlen(z); |
| 246 int *aNew; |
| 247 |
| 248 if( n>0 ){ |
| 249 aNew = (int*)sqlite3_realloc(p->aiException, (n+p->nException)*sizeof(int)); |
| 250 if( aNew ){ |
| 251 int nNew = p->nException; |
| 252 const unsigned char *zCsr = (const unsigned char*)z; |
| 253 const unsigned char *zTerm = (const unsigned char*)&z[n]; |
| 254 while( zCsr<zTerm ){ |
| 255 int iCode; |
| 256 int bToken; |
| 257 READ_UTF8(zCsr, zTerm, iCode); |
| 258 if( iCode<128 ){ |
| 259 p->aTokenChar[iCode] = (unsigned char)bTokenChars; |
| 260 }else{ |
| 261 bToken = sqlite3Fts5UnicodeIsalnum(iCode); |
| 262 assert( (bToken==0 || bToken==1) ); |
| 263 assert( (bTokenChars==0 || bTokenChars==1) ); |
| 264 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){ |
| 265 int i; |
| 266 for(i=0; i<nNew; i++){ |
| 267 if( aNew[i]>iCode ) break; |
| 268 } |
| 269 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int)); |
| 270 aNew[i] = iCode; |
| 271 nNew++; |
| 272 } |
| 273 } |
| 274 } |
| 275 p->aiException = aNew; |
| 276 p->nException = nNew; |
| 277 }else{ |
| 278 rc = SQLITE_NOMEM; |
| 279 } |
| 280 } |
| 281 |
| 282 return rc; |
| 283 } |
| 284 |
| 285 /* |
| 286 ** Return true if the p->aiException[] array contains the value iCode. |
| 287 */ |
| 288 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){ |
| 289 if( p->nException>0 ){ |
| 290 int *a = p->aiException; |
| 291 int iLo = 0; |
| 292 int iHi = p->nException-1; |
| 293 |
| 294 while( iHi>=iLo ){ |
| 295 int iTest = (iHi + iLo) / 2; |
| 296 if( iCode==a[iTest] ){ |
| 297 return 1; |
| 298 }else if( iCode>a[iTest] ){ |
| 299 iLo = iTest+1; |
| 300 }else{ |
| 301 iHi = iTest-1; |
| 302 } |
| 303 } |
| 304 } |
| 305 |
| 306 return 0; |
| 307 } |
| 308 |
| 309 /* |
| 310 ** Delete a "unicode61" tokenizer. |
| 311 */ |
| 312 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){ |
| 313 if( pTok ){ |
| 314 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok; |
| 315 sqlite3_free(p->aiException); |
| 316 sqlite3_free(p->aFold); |
| 317 sqlite3_free(p); |
| 318 } |
| 319 return; |
| 320 } |
| 321 |
| 322 /* |
| 323 ** Create a "unicode61" tokenizer. |
| 324 */ |
| 325 static int fts5UnicodeCreate( |
| 326 void *pCtx, |
| 327 const char **azArg, int nArg, |
| 328 Fts5Tokenizer **ppOut |
| 329 ){ |
| 330 int rc = SQLITE_OK; /* Return code */ |
| 331 Unicode61Tokenizer *p = 0; /* New tokenizer object */ |
| 332 |
| 333 if( nArg%2 ){ |
| 334 rc = SQLITE_ERROR; |
| 335 }else{ |
| 336 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer)); |
| 337 if( p ){ |
| 338 int i; |
| 339 memset(p, 0, sizeof(Unicode61Tokenizer)); |
| 340 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar)); |
| 341 p->bRemoveDiacritic = 1; |
| 342 p->nFold = 64; |
| 343 p->aFold = sqlite3_malloc(p->nFold * sizeof(char)); |
| 344 if( p->aFold==0 ){ |
| 345 rc = SQLITE_NOMEM; |
| 346 } |
| 347 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ |
| 348 const char *zArg = azArg[i+1]; |
| 349 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ |
| 350 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){ |
| 351 rc = SQLITE_ERROR; |
| 352 } |
| 353 p->bRemoveDiacritic = (zArg[0]=='1'); |
| 354 }else |
| 355 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ |
| 356 rc = fts5UnicodeAddExceptions(p, zArg, 1); |
| 357 }else |
| 358 if( 0==sqlite3_stricmp(azArg[i], "separators") ){ |
| 359 rc = fts5UnicodeAddExceptions(p, zArg, 0); |
| 360 }else{ |
| 361 rc = SQLITE_ERROR; |
| 362 } |
| 363 } |
| 364 }else{ |
| 365 rc = SQLITE_NOMEM; |
| 366 } |
| 367 if( rc!=SQLITE_OK ){ |
| 368 fts5UnicodeDelete((Fts5Tokenizer*)p); |
| 369 p = 0; |
| 370 } |
| 371 *ppOut = (Fts5Tokenizer*)p; |
| 372 } |
| 373 return rc; |
| 374 } |
| 375 |
| 376 /* |
| 377 ** Return true if, for the purposes of tokenizing with the tokenizer |
| 378 ** passed as the first argument, codepoint iCode is considered a token |
| 379 ** character (not a separator). |
| 380 */ |
| 381 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){ |
| 382 assert( (sqlite3Fts5UnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 ); |
| 383 return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode); |
| 384 } |
| 385 |
/*
** Tokenize some text using the unicode61 tokenizer.
**
** Scans the nText bytes at pText, skipping separator characters and
** collecting runs of token characters. Each token is case-folded into
** p->aFold (which is grown by doubling when nearly full) and reported
** through xToken(pCtx, 0, zFolded, nFolded, is, ie), where is and ie are
** byte offsets into pText. The flags argument is not used.
**
** Returns the first value other than SQLITE_OK returned by xToken, with
** SQLITE_DONE mapped to SQLITE_OK, or SQLITE_NOMEM if the fold buffer
** cannot be grown.
**
** Control flow note: the separator-skipping loop never falls through. It
** exits either to tokenize_done (end of input) or by jumping to one of
** the two labels inside the body of the token loop. The first character
** of every token therefore bypasses the buffer-growth check at the top
** of that loop, at a point where zOut still equals aFold.
*/
| 386 static int fts5UnicodeTokenize( |
| 387 Fts5Tokenizer *pTokenizer, |
| 388 void *pCtx, |
| 389 int flags, |
| 390 const char *pText, int nText, |
| 391 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) |
| 392 ){ |
| 393 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer; |
| 394 int rc = SQLITE_OK; |
| 395 unsigned char *a = p->aTokenChar; |
| 396 |
| 397 unsigned char *zTerm = (unsigned char*)&pText[nText]; |
| 398 unsigned char *zCsr = (unsigned char *)pText; |
| 399 |
| 400 /* Output buffer */ |
| 401 char *aFold = p->aFold; |
| 402 int nFold = p->nFold; |
/* pEnd sits 6 bytes before the end of the buffer; the buffer is grown
** once zOut moves past it. */
| 403 const char *pEnd = &aFold[nFold-6]; |
| 404 |
| 405 /* Each iteration of this loop gobbles up a contiguous run of separators, |
| 406 ** then the next token. */ |
| 407 while( rc==SQLITE_OK ){ |
| 408 int iCode; /* non-ASCII codepoint read from input */ |
| 409 char *zOut = aFold; |
| 410 int is; |
| 411 int ie; |
| 412 |
| 413 /* Skip any separator characters. */ |
| 414 while( 1 ){ |
| 415 if( zCsr>=zTerm ) goto tokenize_done; |
| 416 if( *zCsr & 0x80 ) { |
| 417 /* A character outside of the ascii range. Skip past it if it is |
| 418 ** a separator character. Or break out of the loop if it is not. */ |
| 419 is = zCsr - (unsigned char*)pText; |
| 420 READ_UTF8(zCsr, zTerm, iCode); |
| 421 if( fts5UnicodeIsAlnum(p, iCode) ){ |
| 422 goto non_ascii_tokenchar; |
| 423 } |
| 424 }else{ |
| 425 if( a[*zCsr] ){ |
| 426 is = zCsr - (unsigned char*)pText; |
| 427 goto ascii_tokenchar; |
| 428 } |
| 429 zCsr++; |
| 430 } |
| 431 } |
| 432 |
| 433 /* Run through the tokenchars. Fold them into the output buffer along |
| 434 ** the way. */ |
| 435 while( zCsr<zTerm ){ |
| 436 |
| 437 /* Grow the output buffer so that there is sufficient space to fit the |
| 438 ** largest possible utf-8 character. */ |
| 439 if( zOut>pEnd ){ |
| 440 aFold = sqlite3_malloc(nFold*2); |
| 441 if( aFold==0 ){ |
| 442 rc = SQLITE_NOMEM; |
| 443 goto tokenize_done; |
| 444 } |
/* Rebase zOut onto the new buffer before the old one is freed. */
| 445 zOut = &aFold[zOut - p->aFold]; |
| 446 memcpy(aFold, p->aFold, nFold); |
| 447 sqlite3_free(p->aFold); |
| 448 p->aFold = aFold; |
| 449 p->nFold = nFold = nFold*2; |
| 450 pEnd = &aFold[nFold-6]; |
| 451 } |
| 452 |
| 453 if( *zCsr & 0x80 ){ |
| 454 /* A non-ascii-range character. Fold it into the output buffer if |
| 455 ** it is a token character, or break out of the loop if it is not. */ |
| 456 READ_UTF8(zCsr, zTerm, iCode); |
| 457 if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){ |
| 458 non_ascii_tokenchar: |
| 459 iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic); |
| 460 if( iCode ) WRITE_UTF8(zOut, iCode); |
| 461 }else{ |
| 462 break; |
| 463 } |
| 464 }else if( a[*zCsr]==0 ){ |
| 465 /* An ascii-range separator character. End of token. */ |
| 466 break; |
| 467 }else{ |
| 468 ascii_tokenchar: |
| 469 if( *zCsr>='A' && *zCsr<='Z' ){ |
| 470 *zOut++ = *zCsr + 32; |
| 471 }else{ |
| 472 *zOut++ = *zCsr; |
| 473 } |
| 474 zCsr++; |
| 475 } |
/* Reached only after a token character has been consumed, so ie is the
** offset just past the most recent token character. */
| 476 ie = zCsr - (unsigned char*)pText; |
| 477 } |
| 478 |
| 479 /* Invoke the token callback */ |
| 480 rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie); |
| 481 } |
| 482 |
| 483 tokenize_done: |
| 484 if( rc==SQLITE_DONE ) rc = SQLITE_OK; |
| 485 return rc; |
| 486 } |
| 487 |
| 488 /************************************************************************** |
| 489 ** Start of porter stemmer implementation. |
| 490 */ |
| 491 |
| 492 /* Any tokens larger than this (in bytes) are passed through without |
| 493 ** stemming. */ |
| 494 #define FTS5_PORTER_MAX_TOKEN 64 |
| 495 |
| 496 typedef struct PorterTokenizer PorterTokenizer; |
| 497 struct PorterTokenizer { |
| 498 fts5_tokenizer tokenizer; /* Parent tokenizer module */ |
| 499 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */ |
| 500 char aBuf[FTS5_PORTER_MAX_TOKEN + 64]; |
| 501 }; |
| 502 |
| 503 /* |
| 504 ** Delete a "porter" tokenizer. |
| 505 */ |
| 506 static void fts5PorterDelete(Fts5Tokenizer *pTok){ |
| 507 if( pTok ){ |
| 508 PorterTokenizer *p = (PorterTokenizer*)pTok; |
| 509 if( p->pTokenizer ){ |
| 510 p->tokenizer.xDelete(p->pTokenizer); |
| 511 } |
| 512 sqlite3_free(p); |
| 513 } |
| 514 } |
| 515 |
| 516 /* |
| 517 ** Create a "porter" tokenizer. |
| 518 */ |
| 519 static int fts5PorterCreate( |
| 520 void *pCtx, |
| 521 const char **azArg, int nArg, |
| 522 Fts5Tokenizer **ppOut |
| 523 ){ |
| 524 fts5_api *pApi = (fts5_api*)pCtx; |
| 525 int rc = SQLITE_OK; |
| 526 PorterTokenizer *pRet; |
| 527 void *pUserdata = 0; |
| 528 const char *zBase = "unicode61"; |
| 529 |
| 530 if( nArg>0 ){ |
| 531 zBase = azArg[0]; |
| 532 } |
| 533 |
| 534 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer)); |
| 535 if( pRet ){ |
| 536 memset(pRet, 0, sizeof(PorterTokenizer)); |
| 537 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer); |
| 538 }else{ |
| 539 rc = SQLITE_NOMEM; |
| 540 } |
| 541 if( rc==SQLITE_OK ){ |
| 542 int nArg2 = (nArg>0 ? nArg-1 : 0); |
| 543 const char **azArg2 = (nArg2 ? &azArg[1] : 0); |
| 544 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer); |
| 545 } |
| 546 |
| 547 if( rc!=SQLITE_OK ){ |
| 548 fts5PorterDelete((Fts5Tokenizer*)pRet); |
| 549 pRet = 0; |
| 550 } |
| 551 *ppOut = (Fts5Tokenizer*)pRet; |
| 552 return rc; |
| 553 } |
| 554 |
/*
** Context object that fts5PorterCb() receives as its pCtx argument.
** NOTE(review): pCtx/xToken are presumably the caller's original context
** and token callback, and aBuf the stemming buffer - confirm against the
** code that populates this structure.
*/
typedef struct PorterContext {
  void *pCtx;
  int (*xToken)(void*, int, const char*, int, int, int);
  char *aBuf;
} PorterContext;
| 561 |
/*
** One suffix-rewriting rule: if a word ends with zSuffix and xCond (when
** not NULL) accepts the remaining stem, the suffix is replaced by
** zOutput. nSuffix and nOutput are the lengths of the two strings.
*/
typedef struct PorterRule {
  const char *zSuffix;
  int nSuffix;
  int (*xCond)(char *zStem, int nStem);
  const char *zOutput;
  int nOutput;
} PorterRule;
| 570 |
| 571 #if 0 |
/*
** Compiled out by the surrounding "#if 0".
**
** Table-driven rule application: find the first entry of aRule[] (an
** array terminated by an entry with zSuffix==0) whose suffix matches the
** end of the *pnBuf bytes in aBuf. If that rule has no condition, or its
** condition accepts the stem, replace the suffix with the rule's output
** and update *pnBuf. Returns the index of the rule applied, or -1 if no
** rule was applied.
*/
| 572 static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){ |
| 573 int ret = -1; |
| 574 int nBuf = *pnBuf; |
| 575 PorterRule *p; |
| 576 |
| 577 for(p=aRule; p->zSuffix; p++){ |
| 578 assert( strlen(p->zSuffix)==p->nSuffix ); |
| 579 assert( strlen(p->zOutput)==p->nOutput ); |
| 580 if( nBuf<p->nSuffix ) continue; |
| 581 if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break; |
| 582 } |
| 583 |
| 584 if( p->zSuffix ){ |
| 585 int nStem = nBuf - p->nSuffix; |
| 586 if( p->xCond==0 || p->xCond(aBuf, nStem) ){ |
| 587 memcpy(&aBuf[nStem], p->zOutput, p->nOutput); |
| 588 *pnBuf = nStem + p->nOutput; |
| 589 ret = p - aRule; |
| 590 } |
| 591 } |
| 592 |
| 593 return ret; |
| 594 } |
| 595 #endif |
| 596 |
/*
** Return 1 if c is one of the lower-case vowels 'a', 'e', 'i', 'o', 'u',
** or if c is 'y' and bYIsVowel is non-zero. Return 0 otherwise.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel!=0;
    default:
      return 0;
  }
}
| 602 |
/*
** Scan the nStem bytes at zStem for the first vowel and then for the
** first consonant that follows it. Return the offset just past that
** consonant, or 0 if no such vowel/consonant sequence exists.
**
** bPrevCons says whether the character before zStem[0] was a consonant.
** It is needed because 'y' counts as a vowel only when it follows a
** consonant.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int bCons = bPrevCons;
  int i = 0;

  /* Scan for a vowel */
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons==0 ) break;
    i++;
  }

  /* Scan for a consonant */
  i++;
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons ) return i+1;
    i++;
  }
  return 0;
}
| 618 |
/* porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel followed by a consonant. */
static int fts5Porter_MGt0(char *zStem, int nStem){
  int nGobbled = fts5PorterGobbleVC(zStem, nStem, 0);
  return nGobbled!=0;
}
| 623 |
/* porter rule condition: (m > 1)
**
** True if the stem contains at least two vowel/consonant sequences. */
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  /* The first sequence ended on a consonant, hence bPrevCons==1. */
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
| 633 |
/* porter rule condition: (m = 1)
**
** True if the stem contains exactly one vowel/consonant sequence. */
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  /* The first sequence ended on a consonant, hence bPrevCons==1. */
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
| 643 |
/* porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final consonant
** is not 'w', 'x' or 'y'.
**
** Stems shorter than three bytes cannot match, so they are rejected
** before zStem[nStem-1] is read. Only the classification of the last
** three characters matters; the mask is unsigned and trimmed to three
** bits on every step so that long stems cannot overflow a signed shift.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  unsigned int mask = 0;          /* Low 3 bits: 1 for each consonant */
  int bCons = 0;
  int i;
  char cLast;

  if( nStem<3 ) return 0;
  cLast = zStem[nStem-1];
  if( cLast=='w' || cLast=='x' || cLast=='y' ) return 0;

  /* The whole stem must be scanned because whether a 'y' is a vowel
  ** depends on the character before it. */
  for(i=0; i<nStem; i++){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    assert( bCons==0 || bCons==1 );
    mask = ((mask << 1) | (unsigned int)bCons) & 0x0007;
  }
  return (mask==0x0005);
}
| 660 |
/* porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and satisfies (m > 1). nStem must
** be greater than zero. */
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
| 667 |
/* porter rule condition: (*v*)
**
** True if the stem contains a vowel. A 'y' counts as a vowel anywhere
** except at the very start of the stem. */
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i = 0;
  while( i<nStem ){
    if( fts5PorterIsVowel(zStem[i], i>0) ) return 1;
    i++;
  }
  return 0;
}
| 678 |
| 679 |
| 680 /************************************************************************** |
| 681 *************************************************************************** |
| 682 ** GENERATED CODE STARTS HERE (mkportersteps.tcl) |
| 683 */ |
| 684 |
/*
** Porter stemmer step 4 (generated code).
**
** If the *pnBuf bytes in aBuf end with one of the suffixes tested below,
** and the stem left after removing that suffix satisfies (m > 1) - for
** "ion" the stem must also end in 's' or 't' - the suffix is dropped by
** reducing *pnBuf. Within a case only the first matching suffix is
** considered. The dispatch is on the second-to-last byte of the word.
**
** Always returns 0 (ret is never modified).
**
** NOTE(review): aBuf[nBuf-2] is read unconditionally, so callers are
** presumably required to pass *pnBuf>=2 - confirm at the call sites.
*/
| 685 static int fts5PorterStep4(char *aBuf, int *pnBuf){ |
| 686 int ret = 0; |
| 687 int nBuf = *pnBuf; |
| 688 switch( aBuf[nBuf-2] ){ |
| 689 |
| 690 case 'a': |
| 691 if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){ |
| 692 if( fts5Porter_MGt1(aBuf, nBuf-2) ){ |
| 693 *pnBuf = nBuf - 2; |
| 694 } |
| 695 } |
| 696 break; |
| 697 |
| 698 case 'c': |
| 699 if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){ |
| 700 if( fts5Porter_MGt1(aBuf, nBuf-4) ){ |
| 701 *pnBuf = nBuf - 4; |
| 702 } |
| 703 }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){ |
| 704 if( fts5Porter_MGt1(aBuf, nBuf-4) ){ |
| 705 *pnBuf = nBuf - 4; |
| 706 } |
| 707 } |
| 708 break; |
| 709 |
| 710 case 'e': |
| 711 if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){ |
| 712 if( fts5Porter_MGt1(aBuf, nBuf-2) ){ |
| 713 *pnBuf = nBuf - 2; |
| 714 } |
| 715 } |
| 716 break; |
| 717 |
| 718 case 'i': |
| 719 if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){ |
| 720 if( fts5Porter_MGt1(aBuf, nBuf-2) ){ |
| 721 *pnBuf = nBuf - 2; |
| 722 } |
| 723 } |
| 724 break; |
| 725 |
| 726 case 'l': |
| 727 if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){ |
| 728 if( fts5Porter_MGt1(aBuf, nBuf-4) ){ |
| 729 *pnBuf = nBuf - 4; |
| 730 } |
| 731 }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){ |
| 732 if( fts5Porter_MGt1(aBuf, nBuf-4) ){ |
| 733 *pnBuf = nBuf - 4; |
| 734 } |
| 735 } |
| 736 break; |
| 737 |
| 738 case 'n': |
| 739 if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){ |
| 740 if( fts5Porter_MGt1(aBuf, nBuf-3) ){ |
| 741 *pnBuf = nBuf - 3; |
| 742 } |
| 743 }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){ |
| 744 if( fts5Porter_MGt1(aBuf, nBuf-5) ){ |
| 745 *pnBuf = nBuf - 5; |
| 746 } |
| 747 }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){ |
| 748 if( fts5Porter_MGt1(aBuf, nBuf-4) ){ |
| 749 *pnBuf = nBuf - 4; |
| 750 } |
| 751 }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){ |
| 752 if( fts5Porter_MGt1(aBuf, nBuf-3) ){ |
| 753 *pnBuf = nBuf - 3; |
| 754 } |
| 755 } |
| 756 break; |
| 757 |
| 758 case 'o': |
| 759 if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){ |
| 760 if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){ |
| 761 *pnBuf = nBuf - 3; |
| 762 } |
| 763 }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){ |
| 764 if( fts5Porter_MGt1(aBuf, nBuf-2) ){ |
| 765 *pnBuf = nBuf - 2; |
| 766 } |
| 767 } |
| 768 break; |
| 769 |
| 770 case 's': |
| 771 if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){ |
| 772 if( fts5Porter_MGt1(aBuf, nBuf-3) ){ |
| 773 *pnBuf = nBuf - 3; |
| 774 } |
| 775 } |
| 776 break; |
| 777 |
| 778 case 't': |
| 779 if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){ |
| 780 if( fts5Porter_MGt1(aBuf, nBuf-3) ){ |
| 781 *pnBuf = nBuf - 3; |
| 782 } |
| 783 }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){ |
| 784 if( fts5Porter_MGt1(aBuf, nBuf-3) ){ |
| 785 *pnBuf = nBuf - 3; |
| 786 } |
| 787 } |
| 788 break; |
| 789 |
| 790 case 'u': |
| 791 if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){ |
| 792 if( fts5Porter_MGt1(aBuf, nBuf-3) ){ |
| 793 *pnBuf = nBuf - 3; |
| 794 } |
| 795 } |
| 796 break; |
| 797 |
| 798 case 'v': |
| 799 if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){ |
| 800 if( fts5Porter_MGt1(aBuf, nBuf-3) ){ |
| 801 *pnBuf = nBuf - 3; |
| 802 } |
| 803 } |
| 804 break; |
| 805 |
| 806 case 'z': |
| 807 if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){ |
| 808 if( fts5Porter_MGt1(aBuf, nBuf-3) ){ |
| 809 *pnBuf = nBuf - 3; |
| 810 } |
| 811 } |
| 812 break; |
| 813 |
| 814 } |
| 815 return ret; |
| 816 } |
| 817 |
| 818 |
/*
** Porter stemmer step 1b, second part (generated code).
**
** If the *pnBuf bytes in aBuf are longer than two bytes and end in "at",
** "bl" or "iz", append an 'e' (giving "ate", "ble" or "ize"), increase
** *pnBuf by one and return 1. Otherwise leave the buffer alone and
** return 0.
**
** Because the word can grow by one byte, aBuf must have room for at
** least *pnBuf+1 bytes.
**
** NOTE(review): aBuf[nBuf-2] is read unconditionally, so callers are
** presumably required to pass *pnBuf>=2 - confirm at the call sites.
*/
| 819 static int fts5PorterStep1B2(char *aBuf, int *pnBuf){ |
| 820 int ret = 0; |
| 821 int nBuf = *pnBuf; |
| 822 switch( aBuf[nBuf-2] ){ |
| 823 |
| 824 case 'a': |
| 825 if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){ |
| 826 memcpy(&aBuf[nBuf-2], "ate", 3); |
| 827 *pnBuf = nBuf - 2 + 3; |
| 828 ret = 1; |
| 829 } |
| 830 break; |
| 831 |
| 832 case 'b': |
| 833 if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){ |
| 834 memcpy(&aBuf[nBuf-2], "ble", 3); |
| 835 *pnBuf = nBuf - 2 + 3; |
| 836 ret = 1; |
| 837 } |
| 838 break; |
| 839 |
| 840 case 'i': |
| 841 if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){ |
| 842 memcpy(&aBuf[nBuf-2], "ize", 3); |
| 843 *pnBuf = nBuf - 2 + 3; |
| 844 ret = 1; |
| 845 } |
| 846 break; |
| 847 |
| 848 } |
| 849 return ret; |
| 850 } |
| 851 |
| 852 |
/*
** Porter stemmer step 2 (generated code).
**
** If the *pnBuf bytes in aBuf end with one of the suffixes tested below,
** and the stem left after removing that suffix satisfies (m > 0), the
** suffix is overwritten in place with its replacement and *pnBuf is
** adjusted. No replacement is longer than the suffix it replaces, so the
** word never grows. Within a case only the first matching suffix is
** considered. The dispatch is on the second-to-last byte of the word.
**
** Always returns 0 (ret is never modified).
**
** NOTE(review): aBuf[nBuf-2] is read unconditionally, so callers are
** presumably required to pass *pnBuf>=2 - confirm at the call sites.
*/
| 853 static int fts5PorterStep2(char *aBuf, int *pnBuf){ |
| 854 int ret = 0; |
| 855 int nBuf = *pnBuf; |
| 856 switch( aBuf[nBuf-2] ){ |
| 857 |
| 858 case 'a': |
| 859 if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){ |
| 860 if( fts5Porter_MGt0(aBuf, nBuf-7) ){ |
| 861 memcpy(&aBuf[nBuf-7], "ate", 3); |
| 862 *pnBuf = nBuf - 7 + 3; |
| 863 } |
| 864 }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){ |
| 865 if( fts5Porter_MGt0(aBuf, nBuf-6) ){ |
| 866 memcpy(&aBuf[nBuf-6], "tion", 4); |
| 867 *pnBuf = nBuf - 6 + 4; |
| 868 } |
| 869 } |
| 870 break; |
| 871 |
| 872 case 'c': |
| 873 if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){ |
| 874 if( fts5Porter_MGt0(aBuf, nBuf-4) ){ |
| 875 memcpy(&aBuf[nBuf-4], "ence", 4); |
| 876 *pnBuf = nBuf - 4 + 4; |
| 877 } |
| 878 }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){ |
| 879 if( fts5Porter_MGt0(aBuf, nBuf-4) ){ |
| 880 memcpy(&aBuf[nBuf-4], "ance", 4); |
| 881 *pnBuf = nBuf - 4 + 4; |
| 882 } |
| 883 } |
| 884 break; |
| 885 |
| 886 case 'e': |
| 887 if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){ |
| 888 if( fts5Porter_MGt0(aBuf, nBuf-4) ){ |
| 889 memcpy(&aBuf[nBuf-4], "ize", 3); |
| 890 *pnBuf = nBuf - 4 + 3; |
| 891 } |
| 892 } |
| 893 break; |
| 894 |
| 895 case 'g': |
| 896 if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){ |
| 897 if( fts5Porter_MGt0(aBuf, nBuf-4) ){ |
| 898 memcpy(&aBuf[nBuf-4], "log", 3); |
| 899 *pnBuf = nBuf - 4 + 3; |
| 900 } |
| 901 } |
| 902 break; |
| 903 |
| 904 case 'l': |
| 905 if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){ |
| 906 if( fts5Porter_MGt0(aBuf, nBuf-3) ){ |
| 907 memcpy(&aBuf[nBuf-3], "ble", 3); |
| 908 *pnBuf = nBuf - 3 + 3; |
| 909 } |
| 910 }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){ |
| 911 if( fts5Porter_MGt0(aBuf, nBuf-4) ){ |
| 912 memcpy(&aBuf[nBuf-4], "al", 2); |
| 913 *pnBuf = nBuf - 4 + 2; |
| 914 } |
| 915 }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){ |
| 916 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 917 memcpy(&aBuf[nBuf-5], "ent", 3); |
| 918 *pnBuf = nBuf - 5 + 3; |
| 919 } |
| 920 }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){ |
| 921 if( fts5Porter_MGt0(aBuf, nBuf-3) ){ |
| 922 memcpy(&aBuf[nBuf-3], "e", 1); |
| 923 *pnBuf = nBuf - 3 + 1; |
| 924 } |
| 925 }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){ |
| 926 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 927 memcpy(&aBuf[nBuf-5], "ous", 3); |
| 928 *pnBuf = nBuf - 5 + 3; |
| 929 } |
| 930 } |
| 931 break; |
| 932 |
| 933 case 'o': |
| 934 if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){ |
| 935 if( fts5Porter_MGt0(aBuf, nBuf-7) ){ |
| 936 memcpy(&aBuf[nBuf-7], "ize", 3); |
| 937 *pnBuf = nBuf - 7 + 3; |
| 938 } |
| 939 }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){ |
| 940 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 941 memcpy(&aBuf[nBuf-5], "ate", 3); |
| 942 *pnBuf = nBuf - 5 + 3; |
| 943 } |
| 944 }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){ |
| 945 if( fts5Porter_MGt0(aBuf, nBuf-4) ){ |
| 946 memcpy(&aBuf[nBuf-4], "ate", 3); |
| 947 *pnBuf = nBuf - 4 + 3; |
| 948 } |
| 949 } |
| 950 break; |
| 951 |
| 952 case 's': |
| 953 if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){ |
| 954 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 955 memcpy(&aBuf[nBuf-5], "al", 2); |
| 956 *pnBuf = nBuf - 5 + 2; |
| 957 } |
| 958 }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){ |
| 959 if( fts5Porter_MGt0(aBuf, nBuf-7) ){ |
| 960 memcpy(&aBuf[nBuf-7], "ive", 3); |
| 961 *pnBuf = nBuf - 7 + 3; |
| 962 } |
| 963 }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){ |
| 964 if( fts5Porter_MGt0(aBuf, nBuf-7) ){ |
| 965 memcpy(&aBuf[nBuf-7], "ful", 3); |
| 966 *pnBuf = nBuf - 7 + 3; |
| 967 } |
| 968 }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){ |
| 969 if( fts5Porter_MGt0(aBuf, nBuf-7) ){ |
| 970 memcpy(&aBuf[nBuf-7], "ous", 3); |
| 971 *pnBuf = nBuf - 7 + 3; |
| 972 } |
| 973 } |
| 974 break; |
| 975 |
| 976 case 't': |
| 977 if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){ |
| 978 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 979 memcpy(&aBuf[nBuf-5], "al", 2); |
| 980 *pnBuf = nBuf - 5 + 2; |
| 981 } |
| 982 }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){ |
| 983 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 984 memcpy(&aBuf[nBuf-5], "ive", 3); |
| 985 *pnBuf = nBuf - 5 + 3; |
| 986 } |
| 987 }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){ |
| 988 if( fts5Porter_MGt0(aBuf, nBuf-6) ){ |
| 989 memcpy(&aBuf[nBuf-6], "ble", 3); |
| 990 *pnBuf = nBuf - 6 + 3; |
| 991 } |
| 992 } |
| 993 break; |
| 994 |
| 995 } |
| 996 return ret; |
| 997 } |
| 998 |
| 999 |
/*
** Porter stemmer step 3 (generated code).
**
** If the *pnBuf bytes in aBuf end with one of the suffixes tested below,
** and the stem left after removing that suffix satisfies (m > 0), the
** suffix is either replaced in place by a shorter string or dropped, and
** *pnBuf is adjusted. Within a case only the first matching suffix is
** considered. The dispatch is on the second-to-last byte of the word.
**
** Always returns 0 (ret is never modified).
**
** NOTE(review): aBuf[nBuf-2] is read unconditionally, so callers are
** presumably required to pass *pnBuf>=2 - confirm at the call sites.
*/
| 1000 static int fts5PorterStep3(char *aBuf, int *pnBuf){ |
| 1001 int ret = 0; |
| 1002 int nBuf = *pnBuf; |
| 1003 switch( aBuf[nBuf-2] ){ |
| 1004 |
| 1005 case 'a': |
| 1006 if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){ |
| 1007 if( fts5Porter_MGt0(aBuf, nBuf-4) ){ |
| 1008 memcpy(&aBuf[nBuf-4], "ic", 2); |
| 1009 *pnBuf = nBuf - 4 + 2; |
| 1010 } |
| 1011 } |
| 1012 break; |
| 1013 |
| 1014 case 's': |
| 1015 if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){ |
| 1016 if( fts5Porter_MGt0(aBuf, nBuf-4) ){ |
| 1017 *pnBuf = nBuf - 4; |
| 1018 } |
| 1019 } |
| 1020 break; |
| 1021 |
| 1022 case 't': |
| 1023 if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){ |
| 1024 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 1025 memcpy(&aBuf[nBuf-5], "ic", 2); |
| 1026 *pnBuf = nBuf - 5 + 2; |
| 1027 } |
| 1028 }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){ |
| 1029 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 1030 memcpy(&aBuf[nBuf-5], "ic", 2); |
| 1031 *pnBuf = nBuf - 5 + 2; |
| 1032 } |
| 1033 } |
| 1034 break; |
| 1035 |
| 1036 case 'u': |
| 1037 if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){ |
| 1038 if( fts5Porter_MGt0(aBuf, nBuf-3) ){ |
| 1039 *pnBuf = nBuf - 3; |
| 1040 } |
| 1041 } |
| 1042 break; |
| 1043 |
| 1044 case 'v': |
| 1045 if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){ |
| 1046 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 1047 *pnBuf = nBuf - 5; |
| 1048 } |
| 1049 } |
| 1050 break; |
| 1051 |
| 1052 case 'z': |
| 1053 if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){ |
| 1054 if( fts5Porter_MGt0(aBuf, nBuf-5) ){ |
| 1055 memcpy(&aBuf[nBuf-5], "al", 2); |
| 1056 *pnBuf = nBuf - 5 + 2; |
| 1057 } |
| 1058 } |
| 1059 break; |
| 1060 |
| 1061 } |
| 1062 return ret; |
| 1063 } |
| 1064 |
| 1065 |
/*
** Apply the suffix rules of Porter stemmer step 1B to the word in aBuf[],
** whose length in bytes is *pnBuf on entry:
**
**     eed -> ee          if fts5Porter_MGt0() accepts the stem
**     ed  -> (removed)   if fts5Porter_Vowel() accepts the stem
**     ing -> (removed)   if fts5Porter_Vowel() accepts the stem
**
** A suffix is only considered if the word is strictly longer than it.
** If the word ends in "eed", the "ed" rule is not tried, whether or not
** the "eed" rule was applied.
**
** Return 1 if an "ed" or "ing" suffix was removed, or 0 otherwise
** (including the case where "eed" was rewritten to "ee"). On any change
** *pnBuf is set to the new length.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  const int nIn = *pnBuf;
  const char cPenult = aBuf[nIn-2];   /* Second-to-last byte selects rule */

  if( cPenult=='e' ){
    if( nIn>3 && memcmp(&aBuf[nIn-3], "eed", 3)==0 ){
      if( fts5Porter_MGt0(aBuf, nIn-3) ){
        memcpy(&aBuf[nIn-3], "ee", 2);
        *pnBuf = nIn - 3 + 2;
      }
      return 0;
    }
    if( nIn>2
     && memcmp(&aBuf[nIn-2], "ed", 2)==0
     && fts5Porter_Vowel(aBuf, nIn-2)
    ){
      *pnBuf = nIn - 2;
      return 1;
    }
    return 0;
  }

  if( cPenult=='n'
   && nIn>3
   && memcmp(&aBuf[nIn-3], "ing", 3)==0
   && fts5Porter_Vowel(aBuf, nIn-3)
  ){
    *pnBuf = nIn - 3;
    return 1;
  }

  return 0;
}
| 1097 |
| 1098 /* |
| 1099 ** GENERATED CODE ENDS HERE (mkportersteps.tcl) |
| 1100 *************************************************************************** |
| 1101 **************************************************************************/ |
| 1102 |
/*
** Porter stemmer step 1A: trim a plural-style suffix from the word in
** aBuf[], whose length in bytes is *pnBuf on entry. Only *pnBuf is
** updated; the bytes in aBuf[] are never modified.
**
**     ...sses  (more than 4 bytes)   drop the final 2 bytes
**     ...ies   (more than 3 bytes)   drop the final 2 bytes
**     ...ss                          unchanged
**     ...s     (anything else)       drop the final byte
**
** Words that do not end in 's' are left alone. The caller must supply a
** word of at least two bytes, since the last two bytes are inspected
** without a length check.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nIn = *pnBuf;
  const char *zEnd = &aBuf[nIn];      /* One byte past the end of the word */
  int nDrop;                          /* Number of trailing bytes to drop */

  if( zEnd[-1]!='s' ) return;
  if( zEnd[-2]=='s' ) return;

  nDrop = 1;
  if( zEnd[-2]=='e' ){
    int bSses = (nIn>4 && zEnd[-4]=='s' && zEnd[-3]=='s');
    int bIes = (nIn>3 && zEnd[-3]=='i');
    if( bSses || bIes ) nDrop = 2;
  }
  *pnBuf = nIn - nDrop;
}
| 1120 |
| 1121 static int fts5PorterCb( |
| 1122 void *pCtx, |
| 1123 int tflags, |
| 1124 const char *pToken, |
| 1125 int nToken, |
| 1126 int iStart, |
| 1127 int iEnd |
| 1128 ){ |
| 1129 PorterContext *p = (PorterContext*)pCtx; |
| 1130 |
| 1131 char *aBuf; |
| 1132 int nBuf; |
| 1133 |
| 1134 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through; |
| 1135 aBuf = p->aBuf; |
| 1136 nBuf = nToken; |
| 1137 memcpy(aBuf, pToken, nBuf); |
| 1138 |
| 1139 /* Step 1. */ |
| 1140 fts5PorterStep1A(aBuf, &nBuf); |
| 1141 if( fts5PorterStep1B(aBuf, &nBuf) ){ |
| 1142 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){ |
| 1143 char c = aBuf[nBuf-1]; |
| 1144 if( fts5PorterIsVowel(c, 0)==0 |
| 1145 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2] |
| 1146 ){ |
| 1147 nBuf--; |
| 1148 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){ |
| 1149 aBuf[nBuf++] = 'e'; |
| 1150 } |
| 1151 } |
| 1152 } |
| 1153 |
| 1154 /* Step 1C. */ |
| 1155 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){ |
| 1156 aBuf[nBuf-1] = 'i'; |
| 1157 } |
| 1158 |
| 1159 /* Steps 2 through 4. */ |
| 1160 fts5PorterStep2(aBuf, &nBuf); |
| 1161 fts5PorterStep3(aBuf, &nBuf); |
| 1162 fts5PorterStep4(aBuf, &nBuf); |
| 1163 |
| 1164 /* Step 5a. */ |
| 1165 assert( nBuf>0 ); |
| 1166 if( aBuf[nBuf-1]=='e' ){ |
| 1167 if( fts5Porter_MGt1(aBuf, nBuf-1) |
| 1168 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1)) |
| 1169 ){ |
| 1170 nBuf--; |
| 1171 } |
| 1172 } |
| 1173 |
| 1174 /* Step 5b. */ |
| 1175 if( nBuf>1 && aBuf[nBuf-1]=='l' |
| 1176 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) |
| 1177 ){ |
| 1178 nBuf--; |
| 1179 } |
| 1180 |
| 1181 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd); |
| 1182 |
| 1183 pass_through: |
| 1184 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd); |
| 1185 } |
| 1186 |
| 1187 /* |
| 1188 ** Tokenize using the porter tokenizer. |
| 1189 */ |
| 1190 static int fts5PorterTokenize( |
| 1191 Fts5Tokenizer *pTokenizer, |
| 1192 void *pCtx, |
| 1193 int flags, |
| 1194 const char *pText, int nText, |
| 1195 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) |
| 1196 ){ |
| 1197 PorterTokenizer *p = (PorterTokenizer*)pTokenizer; |
| 1198 PorterContext sCtx; |
| 1199 sCtx.xToken = xToken; |
| 1200 sCtx.pCtx = pCtx; |
| 1201 sCtx.aBuf = p->aBuf; |
| 1202 return p->tokenizer.xTokenize( |
| 1203 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb |
| 1204 ); |
| 1205 } |
| 1206 |
| 1207 /* |
| 1208 ** Register all built-in tokenizers with FTS5. |
| 1209 */ |
| 1210 int sqlite3Fts5TokenizerInit(fts5_api *pApi){ |
| 1211 struct BuiltinTokenizer { |
| 1212 const char *zName; |
| 1213 fts5_tokenizer x; |
| 1214 } aBuiltin[] = { |
| 1215 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}}, |
| 1216 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }}, |
| 1217 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }}, |
| 1218 }; |
| 1219 |
| 1220 int rc = SQLITE_OK; /* Return code */ |
| 1221 int i; /* To iterate through builtin functions */ |
| 1222 |
| 1223 for(i=0; rc==SQLITE_OK && i<(int)ArraySize(aBuiltin); i++){ |
| 1224 rc = pApi->xCreateTokenizer(pApi, |
| 1225 aBuiltin[i].zName, |
| 1226 (void*)pApi, |
| 1227 &aBuiltin[i].x, |
| 1228 0 |
| 1229 ); |
| 1230 } |
| 1231 |
| 1232 return rc; |
| 1233 } |
| 1234 |
| 1235 |
OLD | NEW |