OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ** 2014 May 31 |
| 3 ** |
| 4 ** The author disclaims copyright to this source code. In place of |
| 5 ** a legal notice, here is a blessing: |
| 6 ** |
| 7 ** May you do good and not evil. |
| 8 ** May you find forgiveness for yourself and forgive others. |
| 9 ** May you share freely, never taking more than you give. |
| 10 ** |
| 11 ****************************************************************************** |
| 12 */ |
| 13 |
| 14 |
| 15 #include "fts5Int.h" |
| 16 |
| 17 /************************************************************************** |
| 18 ** Start of ascii tokenizer implementation. |
| 19 */ |
| 20 |
/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters.
**
** The table is indexed by byte value (0..127). An entry of 1 marks the
** byte as a token character; 0 marks it as a separator. Only the digits
** '0'..'9' and the letters 'A'..'Z' and 'a'..'z' are set. New tokenizer
** objects start with a private copy of this table.
*/
static unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
| 35 |
/*
** State of an "ascii" tokenizer instance.
*/
typedef struct AsciiTokenizer AsciiTokenizer;
struct AsciiTokenizer {
  unsigned char aTokenChar[128];  /* Non-zero for ASCII bytes that are token chars */
};
| 40 |
| 41 static void fts5AsciiAddExceptions( |
| 42 AsciiTokenizer *p, |
| 43 const char *zArg, |
| 44 int bTokenChars |
| 45 ){ |
| 46 int i; |
| 47 for(i=0; zArg[i]; i++){ |
| 48 if( (zArg[i] & 0x80)==0 ){ |
| 49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars; |
| 50 } |
| 51 } |
| 52 } |
| 53 |
/*
** Delete an "ascii" tokenizer.
**
** The object passed in is released with sqlite3_free().
*/
static void fts5AsciiDelete(Fts5Tokenizer *p){
  sqlite3_free(p);
}
| 60 |
| 61 /* |
| 62 ** Create an "ascii" tokenizer. |
| 63 */ |
| 64 static int fts5AsciiCreate( |
| 65 void *pUnused, |
| 66 const char **azArg, int nArg, |
| 67 Fts5Tokenizer **ppOut |
| 68 ){ |
| 69 int rc = SQLITE_OK; |
| 70 AsciiTokenizer *p = 0; |
| 71 UNUSED_PARAM(pUnused); |
| 72 if( nArg%2 ){ |
| 73 rc = SQLITE_ERROR; |
| 74 }else{ |
| 75 p = sqlite3_malloc(sizeof(AsciiTokenizer)); |
| 76 if( p==0 ){ |
| 77 rc = SQLITE_NOMEM; |
| 78 }else{ |
| 79 int i; |
| 80 memset(p, 0, sizeof(AsciiTokenizer)); |
| 81 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar)); |
| 82 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ |
| 83 const char *zArg = azArg[i+1]; |
| 84 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ |
| 85 fts5AsciiAddExceptions(p, zArg, 1); |
| 86 }else |
| 87 if( 0==sqlite3_stricmp(azArg[i], "separators") ){ |
| 88 fts5AsciiAddExceptions(p, zArg, 0); |
| 89 }else{ |
| 90 rc = SQLITE_ERROR; |
| 91 } |
| 92 } |
| 93 if( rc!=SQLITE_OK ){ |
| 94 fts5AsciiDelete((Fts5Tokenizer*)p); |
| 95 p = 0; |
| 96 } |
| 97 } |
| 98 } |
| 99 |
| 100 *ppOut = (Fts5Tokenizer*)p; |
| 101 return rc; |
| 102 } |
| 103 |
| 104 |
/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int k = 0;
  while( k<nByte ){
    char ch = aIn[k];
    if( ch>='A' && ch<='Z' ){
      ch += 32;                   /* 'A'..'Z' -> 'a'..'z' */
    }
    aOut[k] = ch;
    k++;
  }
}
| 113 |
| 114 /* |
| 115 ** Tokenize some text using the ascii tokenizer. |
| 116 */ |
| 117 static int fts5AsciiTokenize( |
| 118 Fts5Tokenizer *pTokenizer, |
| 119 void *pCtx, |
| 120 int iUnused, |
| 121 const char *pText, int nText, |
| 122 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) |
| 123 ){ |
| 124 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer; |
| 125 int rc = SQLITE_OK; |
| 126 int ie; |
| 127 int is = 0; |
| 128 |
| 129 char aFold[64]; |
| 130 int nFold = sizeof(aFold); |
| 131 char *pFold = aFold; |
| 132 unsigned char *a = p->aTokenChar; |
| 133 |
| 134 UNUSED_PARAM(iUnused); |
| 135 |
| 136 while( is<nText && rc==SQLITE_OK ){ |
| 137 int nByte; |
| 138 |
| 139 /* Skip any leading divider characters. */ |
| 140 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){ |
| 141 is++; |
| 142 } |
| 143 if( is==nText ) break; |
| 144 |
| 145 /* Count the token characters */ |
| 146 ie = is+1; |
| 147 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){ |
| 148 ie++; |
| 149 } |
| 150 |
| 151 /* Fold to lower case */ |
| 152 nByte = ie-is; |
| 153 if( nByte>nFold ){ |
| 154 if( pFold!=aFold ) sqlite3_free(pFold); |
| 155 pFold = sqlite3_malloc(nByte*2); |
| 156 if( pFold==0 ){ |
| 157 rc = SQLITE_NOMEM; |
| 158 break; |
| 159 } |
| 160 nFold = nByte*2; |
| 161 } |
| 162 asciiFold(pFold, &pText[is], nByte); |
| 163 |
| 164 /* Invoke the token callback */ |
| 165 rc = xToken(pCtx, 0, pFold, nByte, is, ie); |
| 166 is = ie+1; |
| 167 } |
| 168 |
| 169 if( pFold!=aFold ) sqlite3_free(pFold); |
| 170 if( rc==SQLITE_DONE ) rc = SQLITE_OK; |
| 171 return rc; |
| 172 } |
| 173 |
| 174 /************************************************************************** |
| 175 ** Start of unicode61 tokenizer implementation. |
| 176 */ |
| 177 |
| 178 |
| 179 /* |
| 180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied |
| 181 ** from the sqlite3 source file utf.c. If this file is compiled as part |
| 182 ** of the amalgamation, they are not required. |
| 183 */ |
| 184 #ifndef SQLITE_AMALGAMATION |
| 185 |
/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xC0) and
** gives the initial value of the codepoint accumulator for that lead
** byte: the low 5 bits for 0xC0..0xDF, the low 4 bits for 0xE0..0xEF, the
** low 3 bits for 0xF0..0xF7, the low 2 bits for 0xF8..0xFB, the low bit
** for 0xFC..0xFD and zero for 0xFE and 0xFF.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
| 196 |
/*
** Read a single UTF-8 encoded character from zIn, leaving zIn pointing at
** the byte following it, and store the decoded codepoint in c.
**
** If the first byte is 0xC0 or greater, the accumulator is seeded from
** sqlite3Utf8Trans1[] and every following continuation byte (10xxxxxx)
** is folded in, stopping at zTerm. A multi-byte result that is less than
** 0x80, lies in the range 0xD800..0xDFFF, or is 0xFFFE or 0xFFFF is
** replaced by 0xFFFD.
**
** The expansion is a sequence of statements, not a single expression, and
** each argument is evaluated more than once.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }
| 208 |
| 209 |
/*
** Write codepoint c to zOut as UTF-8, advancing zOut past the bytes
** written. One byte is written for values below 0x80, two for values
** below 0x800, three for values below 0x10000 and four otherwise. The
** argument c is evaluated several times.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }else{                                               \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}
| 229 |
| 230 #endif /* ifndef SQLITE_AMALGAMATION */ |
| 231 |
/*
** State of a "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Non-ASCII exception codepoints, kept in
                                  ** ascending order for binary search */
};
| 241 |
| 242 static int fts5UnicodeAddExceptions( |
| 243 Unicode61Tokenizer *p, /* Tokenizer object */ |
| 244 const char *z, /* Characters to treat as exceptions */ |
| 245 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */ |
| 246 ){ |
| 247 int rc = SQLITE_OK; |
| 248 int n = (int)strlen(z); |
| 249 int *aNew; |
| 250 |
| 251 if( n>0 ){ |
| 252 aNew = (int*)sqlite3_realloc(p->aiException, (n+p->nException)*sizeof(int)); |
| 253 if( aNew ){ |
| 254 int nNew = p->nException; |
| 255 const unsigned char *zCsr = (const unsigned char*)z; |
| 256 const unsigned char *zTerm = (const unsigned char*)&z[n]; |
| 257 while( zCsr<zTerm ){ |
| 258 int iCode; |
| 259 int bToken; |
| 260 READ_UTF8(zCsr, zTerm, iCode); |
| 261 if( iCode<128 ){ |
| 262 p->aTokenChar[iCode] = (unsigned char)bTokenChars; |
| 263 }else{ |
| 264 bToken = sqlite3Fts5UnicodeIsalnum(iCode); |
| 265 assert( (bToken==0 || bToken==1) ); |
| 266 assert( (bTokenChars==0 || bTokenChars==1) ); |
| 267 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){ |
| 268 int i; |
| 269 for(i=0; i<nNew; i++){ |
| 270 if( aNew[i]>iCode ) break; |
| 271 } |
| 272 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int)); |
| 273 aNew[i] = iCode; |
| 274 nNew++; |
| 275 } |
| 276 } |
| 277 } |
| 278 p->aiException = aNew; |
| 279 p->nException = nNew; |
| 280 }else{ |
| 281 rc = SQLITE_NOMEM; |
| 282 } |
| 283 } |
| 284 |
| 285 return rc; |
| 286 } |
| 287 |
| 288 /* |
| 289 ** Return true if the p->aiException[] array contains the value iCode. |
| 290 */ |
| 291 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){ |
| 292 if( p->nException>0 ){ |
| 293 int *a = p->aiException; |
| 294 int iLo = 0; |
| 295 int iHi = p->nException-1; |
| 296 |
| 297 while( iHi>=iLo ){ |
| 298 int iTest = (iHi + iLo) / 2; |
| 299 if( iCode==a[iTest] ){ |
| 300 return 1; |
| 301 }else if( iCode>a[iTest] ){ |
| 302 iLo = iTest+1; |
| 303 }else{ |
| 304 iHi = iTest-1; |
| 305 } |
| 306 } |
| 307 } |
| 308 |
| 309 return 0; |
| 310 } |
| 311 |
| 312 /* |
| 313 ** Delete a "unicode61" tokenizer. |
| 314 */ |
| 315 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){ |
| 316 if( pTok ){ |
| 317 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok; |
| 318 sqlite3_free(p->aiException); |
| 319 sqlite3_free(p->aFold); |
| 320 sqlite3_free(p); |
| 321 } |
| 322 return; |
| 323 } |
| 324 |
| 325 /* |
| 326 ** Create a "unicode61" tokenizer. |
| 327 */ |
| 328 static int fts5UnicodeCreate( |
| 329 void *pUnused, |
| 330 const char **azArg, int nArg, |
| 331 Fts5Tokenizer **ppOut |
| 332 ){ |
| 333 int rc = SQLITE_OK; /* Return code */ |
| 334 Unicode61Tokenizer *p = 0; /* New tokenizer object */ |
| 335 |
| 336 UNUSED_PARAM(pUnused); |
| 337 |
| 338 if( nArg%2 ){ |
| 339 rc = SQLITE_ERROR; |
| 340 }else{ |
| 341 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer)); |
| 342 if( p ){ |
| 343 int i; |
| 344 memset(p, 0, sizeof(Unicode61Tokenizer)); |
| 345 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar)); |
| 346 p->bRemoveDiacritic = 1; |
| 347 p->nFold = 64; |
| 348 p->aFold = sqlite3_malloc(p->nFold * sizeof(char)); |
| 349 if( p->aFold==0 ){ |
| 350 rc = SQLITE_NOMEM; |
| 351 } |
| 352 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ |
| 353 const char *zArg = azArg[i+1]; |
| 354 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ |
| 355 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){ |
| 356 rc = SQLITE_ERROR; |
| 357 } |
| 358 p->bRemoveDiacritic = (zArg[0]=='1'); |
| 359 }else |
| 360 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ |
| 361 rc = fts5UnicodeAddExceptions(p, zArg, 1); |
| 362 }else |
| 363 if( 0==sqlite3_stricmp(azArg[i], "separators") ){ |
| 364 rc = fts5UnicodeAddExceptions(p, zArg, 0); |
| 365 }else{ |
| 366 rc = SQLITE_ERROR; |
| 367 } |
| 368 } |
| 369 }else{ |
| 370 rc = SQLITE_NOMEM; |
| 371 } |
| 372 if( rc!=SQLITE_OK ){ |
| 373 fts5UnicodeDelete((Fts5Tokenizer*)p); |
| 374 p = 0; |
| 375 } |
| 376 *ppOut = (Fts5Tokenizer*)p; |
| 377 } |
| 378 return rc; |
| 379 } |
| 380 |
| 381 /* |
| 382 ** Return true if, for the purposes of tokenizing with the tokenizer |
| 383 ** passed as the first argument, codepoint iCode is considered a token |
| 384 ** character (not a separator). |
| 385 */ |
| 386 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){ |
| 387 assert( (sqlite3Fts5UnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 ); |
| 388 return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode); |
| 389 } |
| 390 |
/*
** Tokenize some text using the unicode61 tokenizer.
**
** The nText bytes of UTF-8 at pText are split into tokens. ASCII-range
** bytes are classified using p->aTokenChar[] and other codepoints using
** fts5UnicodeIsAlnum(). Each token is folded into p->aFold (which is
** grown as required) and passed to xToken() together with the byte
** offsets of its start and end within pText.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the fold buffer cannot be
** grown, or the error returned by xToken(). An SQLITE_DONE returned by
** xToken() stops tokenization and is reported as SQLITE_OK.
**
** Control flow note: the separator-skipping loop leaves by jumping
** directly to one of two labels inside the body of the token loop
** (non_ascii_tokenchar or ascii_tokenchar), so that the first character
** of each token is processed without being classified a second time.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int iUnused,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer. pEnd is 6 bytes short of the end of the buffer; once
  ** zOut moves past it the buffer is enlarged before writing more. */
  char *aFold = p->aFold;
  int nFold = p->nFold;
  const char *pEnd = &aFold[nFold-6];

  UNUSED_PARAM(iUnused);

  /* Each iteration of this loop gobbles up a contiguous run of separators,
  ** then the next token.  */
  while( rc==SQLITE_OK ){
    int iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;           /* Write cursor within aFold */
    int is;                       /* Byte offset of start of token */
    int ie;                       /* Byte offset one past end of token */

    /* Skip any separator characters. */
    while( 1 ){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( *zCsr & 0x80 ) {
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator character. Or break out of the loop if it is not. */
        is = zCsr - (unsigned char*)pText;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }else{
        if( a[*zCsr] ){
          is = zCsr - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zCsr++;
      }
    }

    /* Run through the tokenchars. Fold them into the output buffer along
    ** the way.  */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit the
      ** largest possible utf-8 character. The buffer size is doubled and
      ** the existing contents are copied across. */
      if( zOut>pEnd ){
        aFold = sqlite3_malloc(nFold*2);
        if( aFold==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        zOut = &aFold[zOut - p->aFold];
        memcpy(aFold, p->aFold, nFold);
        sqlite3_free(p->aFold);
        p->aFold = aFold;
        p->nFold = nFold = nFold*2;
        pEnd = &aFold[nFold-6];
      }

      if( *zCsr & 0x80 ){
        /* A non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not.
        ** Within a token, a diacritic also counts as a token character. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
          /* Nothing is written if the fold function returns zero. */
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break;
      }else{
 ascii_tokenchar:
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      /* Only reached after a token character has been consumed. */
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
  }

 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
| 494 |
| 495 /************************************************************************** |
| 496 ** Start of porter stemmer implementation. |
| 497 */ |
| 498 |
| 499 /* Any tokens larger than this (in bytes) are passed through without |
| 500 ** stemming. */ |
| 501 #define FTS5_PORTER_MAX_TOKEN 64 |
| 502 |
/*
** State of a "porter" tokenizer instance. A porter tokenizer wraps
** another ("parent") tokenizer.
*/
typedef struct PorterTokenizer PorterTokenizer;
struct PorterTokenizer {
  fts5_tokenizer tokenizer;       /* Parent tokenizer module */
  Fts5Tokenizer *pTokenizer;      /* Parent tokenizer instance */
  char aBuf[FTS5_PORTER_MAX_TOKEN + 64];  /* Scratch buffer; presumably holds
                                          ** the token being stemmed - its
                                          ** users are outside this view */
};
| 509 |
| 510 /* |
| 511 ** Delete a "porter" tokenizer. |
| 512 */ |
| 513 static void fts5PorterDelete(Fts5Tokenizer *pTok){ |
| 514 if( pTok ){ |
| 515 PorterTokenizer *p = (PorterTokenizer*)pTok; |
| 516 if( p->pTokenizer ){ |
| 517 p->tokenizer.xDelete(p->pTokenizer); |
| 518 } |
| 519 sqlite3_free(p); |
| 520 } |
| 521 } |
| 522 |
| 523 /* |
| 524 ** Create a "porter" tokenizer. |
| 525 */ |
| 526 static int fts5PorterCreate( |
| 527 void *pCtx, |
| 528 const char **azArg, int nArg, |
| 529 Fts5Tokenizer **ppOut |
| 530 ){ |
| 531 fts5_api *pApi = (fts5_api*)pCtx; |
| 532 int rc = SQLITE_OK; |
| 533 PorterTokenizer *pRet; |
| 534 void *pUserdata = 0; |
| 535 const char *zBase = "unicode61"; |
| 536 |
| 537 if( nArg>0 ){ |
| 538 zBase = azArg[0]; |
| 539 } |
| 540 |
| 541 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer)); |
| 542 if( pRet ){ |
| 543 memset(pRet, 0, sizeof(PorterTokenizer)); |
| 544 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer); |
| 545 }else{ |
| 546 rc = SQLITE_NOMEM; |
| 547 } |
| 548 if( rc==SQLITE_OK ){ |
| 549 int nArg2 = (nArg>0 ? nArg-1 : 0); |
| 550 const char **azArg2 = (nArg2 ? &azArg[1] : 0); |
| 551 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer); |
| 552 } |
| 553 |
| 554 if( rc!=SQLITE_OK ){ |
| 555 fts5PorterDelete((Fts5Tokenizer*)pRet); |
| 556 pRet = 0; |
| 557 } |
| 558 *ppOut = (Fts5Tokenizer*)pRet; |
| 559 return rc; |
| 560 } |
| 561 |
/*
** Context object for the porter tokenizer's token callback.
** NOTE(review): the code that fills in and reads this structure is not in
** the portion of the file reviewed here; the field notes below are
** inferred from the types and should be confirmed against it.
*/
typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;                     /* presumably the context for xToken() */
  int (*xToken)(void*, int, const char*, int, int, int);  /* Token callback */
  char *aBuf;                     /* presumably a scratch buffer for stemming */
};
| 568 |
/*
** A single suffix-rewriting rule, as consumed by fts5PorterApply(). An
** array of rules is terminated by an entry with zSuffix==0.
*/
typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;            /* Suffix to match at the end of the word */
  int nSuffix;                    /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem);  /* Condition on the stem, or NULL */
  const char *zOutput;            /* Replacement for the suffix */
  int nOutput;                    /* Length of zOutput in bytes */
};
| 577 |
| 578 #if 0 |
/*
** Table-driven rule application. This function sits inside an "#if 0"
** block and so is not compiled.
**
** aRule[] is searched for the first rule whose suffix matches the end of
** the *pnBuf byte word in aBuf. If one is found and its condition (if
** any) accepts the stem, the suffix is overwritten with the rule's output
** and *pnBuf is updated. Returns the index of the rule applied, or -1 if
** no rule was applied.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;

  /* Find the first rule with a matching suffix. */
  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
    assert( strlen(p->zOutput)==p->nOutput );
    if( nBuf<p->nSuffix ) continue;
    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
  }

  /* p->zSuffix is NULL here if the end of the array was reached. */
  if( p->zSuffix ){
    int nStem = nBuf - p->nSuffix;
    if( p->xCond==0 || p->xCond(aBuf, nStem) ){
      memcpy(&aBuf[nStem], p->zOutput, p->nOutput);
      *pnBuf = nStem + p->nOutput;
      ret = p - aRule;
    }
  }

  return ret;
}
| 602 #endif |
| 603 |
/*
** Return 1 if character c is a vowel, or 0 otherwise. The letters a, e,
** i, o and u are always vowels. The letter y is a vowel only if bYIsVowel
** is non-zero.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel!=0;
    default:
      return 0;
  }
}
| 609 |
/*
** Scan the nStem byte string zStem for a vowel followed (not necessarily
** immediately) by a consonant. If such a pair is found, return the offset
** of the byte following the consonant. Otherwise return 0.
**
** bPrevCons seeds the "previous character was a consonant" state. The
** letter 'y' is treated as a vowel only when the character before it was
** a consonant.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int iPos = 0;
  int bCons = bPrevCons;

  /* Scan for a vowel */
  while( iPos<nStem ){
    bCons = !fts5PorterIsVowel(zStem[iPos], bCons);
    if( bCons==0 ) break;
    iPos++;
  }

  /* Scan for a consonant, starting after the vowel */
  iPos++;
  while( iPos<nStem ){
    bCons = !fts5PorterIsVowel(zStem[iPos], bCons);
    if( bCons ) return iPos+1;
    iPos++;
  }
  return 0;
}
| 625 |
/* porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel-consonant sequence. */
static int fts5Porter_MGt0(char *zStem, int nStem){
  int nGobbled = fts5PorterGobbleVC(zStem, nStem, 0);
  return nGobbled!=0;
}
| 630 |
/* porter rule condition: (m > 1)
**
** True if a second vowel-consonant sequence follows the first one. */
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
| 640 |
/* porter rule condition: (m = 1)
**
** True if the stem contains one vowel-consonant sequence and no second
** one after it. */
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
| 650 |
/* porter rule condition: (*o)
**
** Return 1 if the nStem byte stem ends with the pattern
** consonant-vowel-consonant and the final consonant is not 'w', 'x' or
** 'y'. Return 0 otherwise, including for an empty stem.
**
** Each character is classified in turn (the classification of 'y'
** depends on the character before it) and its consonant flag is shifted
** into mask. Only the three most recent flags are retained: shifting an
** unbounded history into a signed int would overflow, which is undefined
** behaviour, once the stem is long enough.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  int i;
  unsigned int mask = 0;          /* Consonant flags of the last 3 chars */
  int bCons = 0;                  /* True if previous char was a consonant */

  if( nStem<1 ) return 0;         /* Never read zStem[-1] */
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }

  for(i=0; i<nStem; i++){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    assert( bCons==0 || bCons==1 );
    mask = ((mask << 1) | (unsigned int)bCons) & 0x0007;
  }

  /* Binary 101: consonant, vowel, consonant. */
  return (mask==0x0005);
}
| 667 |
/* porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and also satisfies (m > 1). The
** stem must not be empty. */
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
| 674 |
/* porter rule condition: (*v*)
**
** True if the stem contains a vowel. The letter 'y' counts as a vowel
** anywhere except in the first position. */
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i;
  if( nStem>0 && fts5PorterIsVowel(zStem[0], 0) ) return 1;
  for(i=1; i<nStem; i++){
    if( fts5PorterIsVowel(zStem[i], 1) ) return 1;
  }
  return 0;
}
| 685 |
| 686 |
| 687 /************************************************************************** |
| 688 *************************************************************************** |
| 689 ** GENERATED CODE STARTS HERE (mkportersteps.tcl) |
| 690 */ |
| 691 |
/*
** Suffix-stripping step "4" of the stemmer. This function lies between
** the "GENERATED CODE" markers (mkportersteps.tcl).
**
** aBuf holds a word of *pnBuf bytes. The second-to-last character selects
** a group of candidate suffixes. Within a group the first suffix that
** matches is the only one considered, so the order of the tests matters
** (for example "ement" is tested before "ment", which is tested before
** "ent"). A suffix only matches if at least one character precedes it.
** When the remaining stem satisfies fts5Porter_MGt1() - or, for "ion",
** fts5Porter_MGt1_and_S_or_T() - the suffix is removed by reducing
** *pnBuf.
**
** The return value is always 0, as ret is never modified.
**
** NOTE(review): aBuf[nBuf-2] is read unconditionally, so *pnBuf is
** assumed to be at least 2 - confirm against the caller.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
| 824 |
| 825 |
/*
** Step "1B2" of the stemmer. This function lies between the "GENERATED
** CODE" markers (mkportersteps.tcl).
**
** If the *pnBuf byte word in aBuf is longer than two characters and ends
** in "at", "bl" or "iz", the ending is rewritten as "ate", "ble" or "ize"
** respectively, *pnBuf is increased by one and 1 is returned. Otherwise
** the word is left untouched and 0 is returned.
**
** A successful rewrite stores one byte beyond the original end of the
** word, so aBuf must have room for at least *pnBuf+1 bytes.
**
** NOTE(review): aBuf[nBuf-2] is read unconditionally, so *pnBuf is
** assumed to be at least 2 - confirm against the caller.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
| 858 |
| 859 |
/*
** Suffix-rewriting step "2" of the stemmer. This function lies between
** the "GENERATED CODE" markers (mkportersteps.tcl).
**
** aBuf holds a word of *pnBuf bytes. The second-to-last character selects
** a group of candidate suffixes. Within a group the first suffix that
** matches is the only one considered, so the order of the tests matters.
** A suffix only matches if at least one character precedes it. When the
** remaining stem satisfies fts5Porter_MGt0(), the suffix is overwritten
** in place with its replacement and *pnBuf is updated. No replacement is
** longer than the suffix it replaces.
**
** The return value is always 0, as ret is never modified.
**
** NOTE(review): aBuf[nBuf-2] is read unconditionally, so *pnBuf is
** assumed to be at least 2 - confirm against the caller.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

  }
  return ret;
}
| 1005 |
| 1006 |
/*
** Suffix-rewriting step "3" of the stemmer. This function lies between
** the "GENERATED CODE" markers (mkportersteps.tcl).
**
** aBuf holds a word of *pnBuf bytes. The second-to-last character selects
** a group of candidate suffixes, of which the first that matches is the
** only one considered. A suffix only matches if at least one character
** precedes it. When the remaining stem satisfies fts5Porter_MGt0(), the
** suffix is either removed ("ness", "ful", "ative") or overwritten in
** place with a shorter replacement, and *pnBuf is updated.
**
** The return value is always 0, as ret is never modified.
**
** NOTE(review): aBuf[nBuf-2] is read unconditionally, so *pnBuf is
** assumed to be at least 2 - confirm against the caller.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
| 1071 |
| 1072 |
/*
** Step 1b of the Porter stemming algorithm.
**
** aBuf holds the token being stemmed and *pnBuf its current length in
** bytes. Three suffixes are considered, chosen by the second-to-last
** character of the token:
**
**     eed -> ee         if fts5Porter_MGt0() accepts the preceding part
**     ed  -> (removed)  if fts5Porter_Vowel() accepts the preceding part
**     ing -> (removed)  if fts5Porter_Vowel() accepts the preceding part
**
** A token ending in "eed" is never tested against the "ed" rule, even when
** the "eed" rule leaves it unchanged.
**
** Returns 1 if an "ed" or "ing" suffix was removed, or 0 otherwise
** (including the case where "eed" was rewritten to "ee").
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  const int nLen = *pnBuf;
  const char cPenult = aBuf[nLen-2];

  if( cPenult=='e' ){
    if( nLen>3 && memcmp(&aBuf[nLen-3], "eed", 3)==0 ){
      if( fts5Porter_MGt0(aBuf, nLen-3) ){
        memcpy(&aBuf[nLen-3], "ee", 2);
        *pnBuf = nLen - 3 + 2;
      }
      return 0;
    }
    if( nLen>2
     && memcmp(&aBuf[nLen-2], "ed", 2)==0
     && fts5Porter_Vowel(aBuf, nLen-2)
    ){
      *pnBuf = nLen - 2;
      return 1;
    }
    return 0;
  }

  if( cPenult=='n'
   && nLen>3
   && memcmp(&aBuf[nLen-3], "ing", 3)==0
   && fts5Porter_Vowel(aBuf, nLen-3)
  ){
    *pnBuf = nLen - 3;
    return 1;
  }

  return 0;
}
| 1104 |
| 1105 /* |
| 1106 ** GENERATED CODE ENDS HERE (mkportersteps.tcl) |
| 1107 *************************************************************************** |
| 1108 **************************************************************************/ |
| 1109 |
/*
** Step 1a of the Porter stemming algorithm: plural removal.
**
** aBuf holds the token being stemmed and *pnBuf its current length in
** bytes. Only *pnBuf is modified; the buffer contents are left as they are.
**
**     sses -> ss    (two bytes dropped, token longer than 4 bytes)
**     ies  -> i     (two bytes dropped, token longer than 3 bytes)
**     ss   -> ss    (unchanged)
**     s    ->       (one byte dropped)
**
** A token ending in "es" that matches neither of the first two rules
** loses just its final 's'.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nLen = *pnBuf;
  int nStrip;

  /* Nothing to do unless the token ends in 's'. */
  if( aBuf[nLen-1]!='s' ) return;

  switch( aBuf[nLen-2] ){
    case 'e':
      if( nLen>4 && aBuf[nLen-4]=='s' && aBuf[nLen-3]=='s' ){
        nStrip = 2;                    /* "sses" */
      }else if( nLen>3 && aBuf[nLen-3]=='i' ){
        nStrip = 2;                    /* "ies" */
      }else{
        nStrip = 1;                    /* some other "es" */
      }
      break;

    case 's':
      nStrip = 0;                      /* "ss" is left alone */
      break;

    default:
      nStrip = 1;                      /* plain trailing "s" */
      break;
  }

  if( nStrip ){
    *pnBuf = nLen - nStrip;
  }
}
| 1127 |
| 1128 static int fts5PorterCb( |
| 1129 void *pCtx, |
| 1130 int tflags, |
| 1131 const char *pToken, |
| 1132 int nToken, |
| 1133 int iStart, |
| 1134 int iEnd |
| 1135 ){ |
| 1136 PorterContext *p = (PorterContext*)pCtx; |
| 1137 |
| 1138 char *aBuf; |
| 1139 int nBuf; |
| 1140 |
| 1141 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through; |
| 1142 aBuf = p->aBuf; |
| 1143 nBuf = nToken; |
| 1144 memcpy(aBuf, pToken, nBuf); |
| 1145 |
| 1146 /* Step 1. */ |
| 1147 fts5PorterStep1A(aBuf, &nBuf); |
| 1148 if( fts5PorterStep1B(aBuf, &nBuf) ){ |
| 1149 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){ |
| 1150 char c = aBuf[nBuf-1]; |
| 1151 if( fts5PorterIsVowel(c, 0)==0 |
| 1152 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2] |
| 1153 ){ |
| 1154 nBuf--; |
| 1155 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){ |
| 1156 aBuf[nBuf++] = 'e'; |
| 1157 } |
| 1158 } |
| 1159 } |
| 1160 |
| 1161 /* Step 1C. */ |
| 1162 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){ |
| 1163 aBuf[nBuf-1] = 'i'; |
| 1164 } |
| 1165 |
| 1166 /* Steps 2 through 4. */ |
| 1167 fts5PorterStep2(aBuf, &nBuf); |
| 1168 fts5PorterStep3(aBuf, &nBuf); |
| 1169 fts5PorterStep4(aBuf, &nBuf); |
| 1170 |
| 1171 /* Step 5a. */ |
| 1172 assert( nBuf>0 ); |
| 1173 if( aBuf[nBuf-1]=='e' ){ |
| 1174 if( fts5Porter_MGt1(aBuf, nBuf-1) |
| 1175 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1)) |
| 1176 ){ |
| 1177 nBuf--; |
| 1178 } |
| 1179 } |
| 1180 |
| 1181 /* Step 5b. */ |
| 1182 if( nBuf>1 && aBuf[nBuf-1]=='l' |
| 1183 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) |
| 1184 ){ |
| 1185 nBuf--; |
| 1186 } |
| 1187 |
| 1188 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd); |
| 1189 |
| 1190 pass_through: |
| 1191 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd); |
| 1192 } |
| 1193 |
| 1194 /* |
| 1195 ** Tokenize using the porter tokenizer. |
| 1196 */ |
| 1197 static int fts5PorterTokenize( |
| 1198 Fts5Tokenizer *pTokenizer, |
| 1199 void *pCtx, |
| 1200 int flags, |
| 1201 const char *pText, int nText, |
| 1202 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) |
| 1203 ){ |
| 1204 PorterTokenizer *p = (PorterTokenizer*)pTokenizer; |
| 1205 PorterContext sCtx; |
| 1206 sCtx.xToken = xToken; |
| 1207 sCtx.pCtx = pCtx; |
| 1208 sCtx.aBuf = p->aBuf; |
| 1209 return p->tokenizer.xTokenize( |
| 1210 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb |
| 1211 ); |
| 1212 } |
| 1213 |
| 1214 /* |
| 1215 ** Register all built-in tokenizers with FTS5. |
| 1216 */ |
| 1217 int sqlite3Fts5TokenizerInit(fts5_api *pApi){ |
| 1218 struct BuiltinTokenizer { |
| 1219 const char *zName; |
| 1220 fts5_tokenizer x; |
| 1221 } aBuiltin[] = { |
| 1222 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}}, |
| 1223 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }}, |
| 1224 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }}, |
| 1225 }; |
| 1226 |
| 1227 int rc = SQLITE_OK; /* Return code */ |
| 1228 int i; /* To iterate through builtin functions */ |
| 1229 |
| 1230 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){ |
| 1231 rc = pApi->xCreateTokenizer(pApi, |
| 1232 aBuiltin[i].zName, |
| 1233 (void*)pApi, |
| 1234 &aBuiltin[i].x, |
| 1235 0 |
| 1236 ); |
| 1237 } |
| 1238 |
| 1239 return rc; |
| 1240 } |
| 1241 |
| 1242 |
OLD | NEW |