OLD | NEW |
| (Empty) |
1 /* | |
2 ** 2014 May 31 | |
3 ** | |
4 ** The author disclaims copyright to this source code. In place of | |
5 ** a legal notice, here is a blessing: | |
6 ** | |
7 ** May you do good and not evil. | |
8 ** May you find forgiveness for yourself and forgive others. | |
9 ** May you share freely, never taking more than you give. | |
10 ** | |
11 ****************************************************************************** | |
12 */ | |
13 | |
14 | |
15 #include "fts5Int.h" | |
16 | |
17 /************************************************************************** | |
18 ** Start of ascii tokenizer implementation. | |
19 */ | |
20 | |
/*
** Default classification of the 128 ASCII code points. Entry N is 1 if
** ASCII character N is a token character and 0 if it is a separator.
** Only the digits '0'..'9' and the letters 'A'..'Z' and 'a'..'z' are
** token characters by default. Tokenizer instances start from a copy of
** this table.
*/
static unsigned char aAsciiTokenChar[128] = {
  /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20..0x2F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30..0x3F: digits '0'..'9' */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,
  /* 0x40..0x4F: '@' (separator), then 'A'..'O' */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50..0x5F: 'P'..'Z' */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,
  /* 0x60..0x6F: '`' (separator), then 'a'..'o' */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70..0x7F: 'p'..'z' */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,
};
35 | |
/*
** An "ascii" tokenizer instance. aTokenChar[] holds one flag per ASCII
** code point: non-zero if the character is part of a token, zero if it
** separates tokens.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];
} AsciiTokenizer;
40 | |
41 static void fts5AsciiAddExceptions( | |
42 AsciiTokenizer *p, | |
43 const char *zArg, | |
44 int bTokenChars | |
45 ){ | |
46 int i; | |
47 for(i=0; zArg[i]; i++){ | |
48 if( (zArg[i] & 0x80)==0 ){ | |
49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars; | |
50 } | |
51 } | |
52 } | |
53 | |
54 /* | |
55 ** Delete a "ascii" tokenizer. | |
56 */ | |
57 static void fts5AsciiDelete(Fts5Tokenizer *p){ | |
58 sqlite3_free(p); | |
59 } | |
60 | |
61 /* | |
62 ** Create an "ascii" tokenizer. | |
63 */ | |
64 static int fts5AsciiCreate( | |
65 void *pCtx, | |
66 const char **azArg, int nArg, | |
67 Fts5Tokenizer **ppOut | |
68 ){ | |
69 int rc = SQLITE_OK; | |
70 AsciiTokenizer *p = 0; | |
71 if( nArg%2 ){ | |
72 rc = SQLITE_ERROR; | |
73 }else{ | |
74 p = sqlite3_malloc(sizeof(AsciiTokenizer)); | |
75 if( p==0 ){ | |
76 rc = SQLITE_NOMEM; | |
77 }else{ | |
78 int i; | |
79 memset(p, 0, sizeof(AsciiTokenizer)); | |
80 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar)); | |
81 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ | |
82 const char *zArg = azArg[i+1]; | |
83 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ | |
84 fts5AsciiAddExceptions(p, zArg, 1); | |
85 }else | |
86 if( 0==sqlite3_stricmp(azArg[i], "separators") ){ | |
87 fts5AsciiAddExceptions(p, zArg, 0); | |
88 }else{ | |
89 rc = SQLITE_ERROR; | |
90 } | |
91 } | |
92 if( rc!=SQLITE_OK ){ | |
93 fts5AsciiDelete((Fts5Tokenizer*)p); | |
94 p = 0; | |
95 } | |
96 } | |
97 } | |
98 | |
99 *ppOut = (Fts5Tokenizer*)p; | |
100 return rc; | |
101 } | |
102 | |
103 | |
/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper-case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    char ch = aIn[iByte];
    aOut[iByte] = (ch>='A' && ch<='Z') ? (char)(ch + 32) : ch;
    iByte++;
  }
}
112 | |
113 /* | |
114 ** Tokenize some text using the ascii tokenizer. | |
115 */ | |
116 static int fts5AsciiTokenize( | |
117 Fts5Tokenizer *pTokenizer, | |
118 void *pCtx, | |
119 int flags, | |
120 const char *pText, int nText, | |
121 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) | |
122 ){ | |
123 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer; | |
124 int rc = SQLITE_OK; | |
125 int ie; | |
126 int is = 0; | |
127 | |
128 char aFold[64]; | |
129 int nFold = sizeof(aFold); | |
130 char *pFold = aFold; | |
131 unsigned char *a = p->aTokenChar; | |
132 | |
133 while( is<nText && rc==SQLITE_OK ){ | |
134 int nByte; | |
135 | |
136 /* Skip any leading divider characters. */ | |
137 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){ | |
138 is++; | |
139 } | |
140 if( is==nText ) break; | |
141 | |
142 /* Count the token characters */ | |
143 ie = is+1; | |
144 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){ | |
145 ie++; | |
146 } | |
147 | |
148 /* Fold to lower case */ | |
149 nByte = ie-is; | |
150 if( nByte>nFold ){ | |
151 if( pFold!=aFold ) sqlite3_free(pFold); | |
152 pFold = sqlite3_malloc(nByte*2); | |
153 if( pFold==0 ){ | |
154 rc = SQLITE_NOMEM; | |
155 break; | |
156 } | |
157 nFold = nByte*2; | |
158 } | |
159 asciiFold(pFold, &pText[is], nByte); | |
160 | |
161 /* Invoke the token callback */ | |
162 rc = xToken(pCtx, 0, pFold, nByte, is, ie); | |
163 is = ie+1; | |
164 } | |
165 | |
166 if( pFold!=aFold ) sqlite3_free(pFold); | |
167 if( rc==SQLITE_DONE ) rc = SQLITE_OK; | |
168 return rc; | |
169 } | |
170 | |
171 /************************************************************************** | |
172 ** Start of unicode61 tokenizer implementation. | |
173 */ | |
174 | |
175 | |
176 /* | |
177 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied | |
178 ** from the sqlite3 source file utf.c. If this file is compiled as part | |
179 ** of the amalgamation, they are not required. | |
180 */ | |
#ifndef SQLITE_AMALGAMATION

/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xC0)
** and gives the initial value of the code point being decoded.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* lead bytes 0xC0..0xDF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* lead bytes 0xE0..0xEF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* lead bytes 0xF0..0xFF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Decode one UTF-8 character starting at zIn and store its code point
** in c. zIn is advanced past the bytes consumed and never past zTerm
** while reading continuation bytes. A decoded value below 0x80, a
** surrogate (0xD800..0xDFFF), or 0xFFFE/0xFFFF is replaced by 0xFFFD.
**
** The expansion is a sequence of statements, not a single expression,
** so this macro must be used as a stand-alone statement inside braces.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
     || (c&0xFFFFF800)==0xD800                             \
     || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }            \
  }

/*
** Encode code point c as UTF-8 at zOut, advancing zOut past the 1 to 4
** bytes written.
*/
#define WRITE_UTF8(zOut, c) {                              \
  if( c<0x00080 ){                                         \
    *zOut++ = (unsigned char)(c&0xFF);                     \
  }else if( c<0x00800 ){                                   \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);         \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);            \
  }else if( c<0x10000 ){                                   \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);        \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);       \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);            \
  }else{                                                   \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);      \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);      \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);       \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);            \
  }                                                        \
}

#endif /* ifndef SQLITE_AMALGAMATION */
228 | |
/*
** A "unicode61" tokenizer instance.
**
** aiException[] is an array of nException non-ASCII code points, kept
** in ascending order, whose token/separator classification is the
** opposite of what sqlite3Fts5UnicodeIsalnum() reports.
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception code points */
} Unicode61Tokenizer;
238 | |
239 static int fts5UnicodeAddExceptions( | |
240 Unicode61Tokenizer *p, /* Tokenizer object */ | |
241 const char *z, /* Characters to treat as exceptions */ | |
242 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */ | |
243 ){ | |
244 int rc = SQLITE_OK; | |
245 int n = (int)strlen(z); | |
246 int *aNew; | |
247 | |
248 if( n>0 ){ | |
249 aNew = (int*)sqlite3_realloc(p->aiException, (n+p->nException)*sizeof(int)); | |
250 if( aNew ){ | |
251 int nNew = p->nException; | |
252 const unsigned char *zCsr = (const unsigned char*)z; | |
253 const unsigned char *zTerm = (const unsigned char*)&z[n]; | |
254 while( zCsr<zTerm ){ | |
255 int iCode; | |
256 int bToken; | |
257 READ_UTF8(zCsr, zTerm, iCode); | |
258 if( iCode<128 ){ | |
259 p->aTokenChar[iCode] = (unsigned char)bTokenChars; | |
260 }else{ | |
261 bToken = sqlite3Fts5UnicodeIsalnum(iCode); | |
262 assert( (bToken==0 || bToken==1) ); | |
263 assert( (bTokenChars==0 || bTokenChars==1) ); | |
264 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){ | |
265 int i; | |
266 for(i=0; i<nNew; i++){ | |
267 if( aNew[i]>iCode ) break; | |
268 } | |
269 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int)); | |
270 aNew[i] = iCode; | |
271 nNew++; | |
272 } | |
273 } | |
274 } | |
275 p->aiException = aNew; | |
276 p->nException = nNew; | |
277 }else{ | |
278 rc = SQLITE_NOMEM; | |
279 } | |
280 } | |
281 | |
282 return rc; | |
283 } | |
284 | |
285 /* | |
286 ** Return true if the p->aiException[] array contains the value iCode. | |
287 */ | |
288 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){ | |
289 if( p->nException>0 ){ | |
290 int *a = p->aiException; | |
291 int iLo = 0; | |
292 int iHi = p->nException-1; | |
293 | |
294 while( iHi>=iLo ){ | |
295 int iTest = (iHi + iLo) / 2; | |
296 if( iCode==a[iTest] ){ | |
297 return 1; | |
298 }else if( iCode>a[iTest] ){ | |
299 iLo = iTest+1; | |
300 }else{ | |
301 iHi = iTest-1; | |
302 } | |
303 } | |
304 } | |
305 | |
306 return 0; | |
307 } | |
308 | |
309 /* | |
310 ** Delete a "unicode61" tokenizer. | |
311 */ | |
312 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){ | |
313 if( pTok ){ | |
314 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok; | |
315 sqlite3_free(p->aiException); | |
316 sqlite3_free(p->aFold); | |
317 sqlite3_free(p); | |
318 } | |
319 return; | |
320 } | |
321 | |
322 /* | |
323 ** Create a "unicode61" tokenizer. | |
324 */ | |
325 static int fts5UnicodeCreate( | |
326 void *pCtx, | |
327 const char **azArg, int nArg, | |
328 Fts5Tokenizer **ppOut | |
329 ){ | |
330 int rc = SQLITE_OK; /* Return code */ | |
331 Unicode61Tokenizer *p = 0; /* New tokenizer object */ | |
332 | |
333 if( nArg%2 ){ | |
334 rc = SQLITE_ERROR; | |
335 }else{ | |
336 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer)); | |
337 if( p ){ | |
338 int i; | |
339 memset(p, 0, sizeof(Unicode61Tokenizer)); | |
340 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar)); | |
341 p->bRemoveDiacritic = 1; | |
342 p->nFold = 64; | |
343 p->aFold = sqlite3_malloc(p->nFold * sizeof(char)); | |
344 if( p->aFold==0 ){ | |
345 rc = SQLITE_NOMEM; | |
346 } | |
347 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){ | |
348 const char *zArg = azArg[i+1]; | |
349 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){ | |
350 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){ | |
351 rc = SQLITE_ERROR; | |
352 } | |
353 p->bRemoveDiacritic = (zArg[0]=='1'); | |
354 }else | |
355 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){ | |
356 rc = fts5UnicodeAddExceptions(p, zArg, 1); | |
357 }else | |
358 if( 0==sqlite3_stricmp(azArg[i], "separators") ){ | |
359 rc = fts5UnicodeAddExceptions(p, zArg, 0); | |
360 }else{ | |
361 rc = SQLITE_ERROR; | |
362 } | |
363 } | |
364 }else{ | |
365 rc = SQLITE_NOMEM; | |
366 } | |
367 if( rc!=SQLITE_OK ){ | |
368 fts5UnicodeDelete((Fts5Tokenizer*)p); | |
369 p = 0; | |
370 } | |
371 *ppOut = (Fts5Tokenizer*)p; | |
372 } | |
373 return rc; | |
374 } | |
375 | |
376 /* | |
377 ** Return true if, for the purposes of tokenizing with the tokenizer | |
378 ** passed as the first argument, codepoint iCode is considered a token | |
379 ** character (not a separator). | |
380 */ | |
381 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){ | |
382 assert( (sqlite3Fts5UnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 ); | |
383 return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode); | |
384 } | |
385 | |
386 static int fts5UnicodeTokenize( | |
387 Fts5Tokenizer *pTokenizer, | |
388 void *pCtx, | |
389 int flags, | |
390 const char *pText, int nText, | |
391 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) | |
392 ){ | |
393 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer; | |
394 int rc = SQLITE_OK; | |
395 unsigned char *a = p->aTokenChar; | |
396 | |
397 unsigned char *zTerm = (unsigned char*)&pText[nText]; | |
398 unsigned char *zCsr = (unsigned char *)pText; | |
399 | |
400 /* Output buffer */ | |
401 char *aFold = p->aFold; | |
402 int nFold = p->nFold; | |
403 const char *pEnd = &aFold[nFold-6]; | |
404 | |
405 /* Each iteration of this loop gobbles up a contiguous run of separators, | |
406 ** then the next token. */ | |
407 while( rc==SQLITE_OK ){ | |
408 int iCode; /* non-ASCII codepoint read from input */ | |
409 char *zOut = aFold; | |
410 int is; | |
411 int ie; | |
412 | |
413 /* Skip any separator characters. */ | |
414 while( 1 ){ | |
415 if( zCsr>=zTerm ) goto tokenize_done; | |
416 if( *zCsr & 0x80 ) { | |
417 /* A character outside of the ascii range. Skip past it if it is | |
418 ** a separator character. Or break out of the loop if it is not. */ | |
419 is = zCsr - (unsigned char*)pText; | |
420 READ_UTF8(zCsr, zTerm, iCode); | |
421 if( fts5UnicodeIsAlnum(p, iCode) ){ | |
422 goto non_ascii_tokenchar; | |
423 } | |
424 }else{ | |
425 if( a[*zCsr] ){ | |
426 is = zCsr - (unsigned char*)pText; | |
427 goto ascii_tokenchar; | |
428 } | |
429 zCsr++; | |
430 } | |
431 } | |
432 | |
433 /* Run through the tokenchars. Fold them into the output buffer along | |
434 ** the way. */ | |
435 while( zCsr<zTerm ){ | |
436 | |
437 /* Grow the output buffer so that there is sufficient space to fit the | |
438 ** largest possible utf-8 character. */ | |
439 if( zOut>pEnd ){ | |
440 aFold = sqlite3_malloc(nFold*2); | |
441 if( aFold==0 ){ | |
442 rc = SQLITE_NOMEM; | |
443 goto tokenize_done; | |
444 } | |
445 zOut = &aFold[zOut - p->aFold]; | |
446 memcpy(aFold, p->aFold, nFold); | |
447 sqlite3_free(p->aFold); | |
448 p->aFold = aFold; | |
449 p->nFold = nFold = nFold*2; | |
450 pEnd = &aFold[nFold-6]; | |
451 } | |
452 | |
453 if( *zCsr & 0x80 ){ | |
454 /* An non-ascii-range character. Fold it into the output buffer if | |
455 ** it is a token character, or break out of the loop if it is not. */ | |
456 READ_UTF8(zCsr, zTerm, iCode); | |
457 if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){ | |
458 non_ascii_tokenchar: | |
459 iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic); | |
460 if( iCode ) WRITE_UTF8(zOut, iCode); | |
461 }else{ | |
462 break; | |
463 } | |
464 }else if( a[*zCsr]==0 ){ | |
465 /* An ascii-range separator character. End of token. */ | |
466 break; | |
467 }else{ | |
468 ascii_tokenchar: | |
469 if( *zCsr>='A' && *zCsr<='Z' ){ | |
470 *zOut++ = *zCsr + 32; | |
471 }else{ | |
472 *zOut++ = *zCsr; | |
473 } | |
474 zCsr++; | |
475 } | |
476 ie = zCsr - (unsigned char*)pText; | |
477 } | |
478 | |
479 /* Invoke the token callback */ | |
480 rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie); | |
481 } | |
482 | |
483 tokenize_done: | |
484 if( rc==SQLITE_DONE ) rc = SQLITE_OK; | |
485 return rc; | |
486 } | |
487 | |
488 /************************************************************************** | |
489 ** Start of porter stemmer implementation. | |
490 */ | |
491 | |
492 /* Any tokens larger than this (in bytes) are passed through without | |
493 ** stemming. */ | |
494 #define FTS5_PORTER_MAX_TOKEN 64 | |
495 | |
496 typedef struct PorterTokenizer PorterTokenizer; | |
497 struct PorterTokenizer { | |
498 fts5_tokenizer tokenizer; /* Parent tokenizer module */ | |
499 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */ | |
500 char aBuf[FTS5_PORTER_MAX_TOKEN + 64]; | |
501 }; | |
502 | |
503 /* | |
504 ** Delete a "porter" tokenizer. | |
505 */ | |
506 static void fts5PorterDelete(Fts5Tokenizer *pTok){ | |
507 if( pTok ){ | |
508 PorterTokenizer *p = (PorterTokenizer*)pTok; | |
509 if( p->pTokenizer ){ | |
510 p->tokenizer.xDelete(p->pTokenizer); | |
511 } | |
512 sqlite3_free(p); | |
513 } | |
514 } | |
515 | |
516 /* | |
517 ** Create a "porter" tokenizer. | |
518 */ | |
519 static int fts5PorterCreate( | |
520 void *pCtx, | |
521 const char **azArg, int nArg, | |
522 Fts5Tokenizer **ppOut | |
523 ){ | |
524 fts5_api *pApi = (fts5_api*)pCtx; | |
525 int rc = SQLITE_OK; | |
526 PorterTokenizer *pRet; | |
527 void *pUserdata = 0; | |
528 const char *zBase = "unicode61"; | |
529 | |
530 if( nArg>0 ){ | |
531 zBase = azArg[0]; | |
532 } | |
533 | |
534 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer)); | |
535 if( pRet ){ | |
536 memset(pRet, 0, sizeof(PorterTokenizer)); | |
537 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer); | |
538 }else{ | |
539 rc = SQLITE_NOMEM; | |
540 } | |
541 if( rc==SQLITE_OK ){ | |
542 int nArg2 = (nArg>0 ? nArg-1 : 0); | |
543 const char **azArg2 = (nArg2 ? &azArg[1] : 0); | |
544 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer); | |
545 } | |
546 | |
547 if( rc!=SQLITE_OK ){ | |
548 fts5PorterDelete((Fts5Tokenizer*)pRet); | |
549 pRet = 0; | |
550 } | |
551 *ppOut = (Fts5Tokenizer*)pRet; | |
552 return rc; | |
553 } | |
554 | |
/*
** Context carried by the porter token callback.
** NOTE(review): presumably pCtx/xToken are the caller's original context
** and callback, and aBuf the stemming buffer - confirm against
** fts5PorterCb() and the porter xTokenize method.
*/
typedef struct PorterContext {
  void *pCtx;
  int (*xToken)(void*, int, const char*, int, int, int);
  char *aBuf;
} PorterContext;
561 | |
/*
** One suffix-rewriting rule: if the token ends in zSuffix (nSuffix
** bytes) and xCond is NULL or returns true for the remaining stem, the
** suffix is replaced by zOutput (nOutput bytes).
*/
typedef struct PorterRule {
  const char *zSuffix;
  int nSuffix;
  int (*xCond)(char *zStem, int nStem);
  const char *zOutput;
  int nOutput;
} PorterRule;
570 | |
#if 0
/*
** (Disabled code.) Apply the first rule in aRule[] whose suffix matches
** the end of the nBuf byte token in aBuf. The rule list is terminated by
** an entry with zSuffix==0. If the matching rule's condition holds, the
** suffix is rewritten in place, *pnBuf is updated and the index of the
** rule is returned. Otherwise -1 is returned.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int nBuf = *pnBuf;
  PorterRule *pRule = aRule;
  int nStem;

  /* Find the first rule whose suffix matches */
  while( pRule->zSuffix ){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nBuf>=pRule->nSuffix
     && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      break;
    }
    pRule++;
  }
  if( pRule->zSuffix==0 ) return -1;

  /* Check the rule's condition against the stem */
  nStem = nBuf - pRule->nSuffix;
  if( pRule->xCond && !pRule->xCond(aBuf, nStem) ) return -1;

  memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
  *pnBuf = nStem + pRule->nOutput;
  return pRule - aRule;
}
#endif
596 | |
/*
** Return 1 if c is one of the lower-case vowels a, e, i, o or u, or if
** c is 'y' and bYIsVowel is non-zero. Return 0 otherwise.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel ? 1 : 0;
    default:
      return 0;
  }
}
602 | |
/*
** Scan the nStem byte string zStem for a vowel that is followed, at any
** later position, by a consonant. If found, return the offset of the
** byte following that consonant. Otherwise return 0.
**
** A 'y' counts as a vowel only if the character before it was a
** consonant. bPrevCons supplies that state for zStem[0].
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int bCons = bPrevCons;
  int i = 0;

  /* Scan for a vowel */
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( !bCons ) break;
    i++;
  }

  /* Scan for a consonant, starting after the vowel */
  i++;
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons ) return i+1;
    i++;
  }
  return 0;
}
618 | |
/*
** porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel-consonant sequence.
*/
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
623 | |
/*
** porter rule condition: (m > 1)
**
** True if the stem contains at least two vowel-consonant sequences.
*/
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
633 | |
/*
** porter rule condition: (m = 1)
**
** True if the stem contains exactly one vowel-consonant sequence.
*/
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
643 | |
/*
** porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final
** consonant is not 'w', 'x' or 'y'.
**
** A stem shorter than three characters can never match, so it is
** rejected up front. This also avoids reading zStem[-1] when nStem is 0.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  int i;
  int mask = 0;                   /* Classes of the last 3 chars, 1==consonant */
  int bCons = 0;
  char cLast;

  if( nStem<3 ) return 0;
  cLast = zStem[nStem-1];
  if( cLast=='w' || cLast=='x' || cLast=='y' ) return 0;

  /* The whole stem must be scanned, because whether a 'y' is a vowel
  ** depends on the class of the character before it. Only the three most
  ** recent classes are kept in mask. Without the "& 0x7", the signed
  ** left-shift would overflow for stems of 32 or more characters. */
  for(i=0; i<nStem; i++){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    assert( bCons==0 || bCons==1 );
    mask = ((mask << 1) | bCons) & 0x0007;
  }
  return (mask==0x0005);
}
660 | |
/*
** porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and contains at least two
** vowel-consonant sequences. nStem must be greater than zero.
*/
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
667 | |
/*
** porter rule condition: (*v*)
**
** True if the stem contains a vowel. A 'y' counts as a vowel anywhere
** except in the first position.
*/
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i = 0;
  while( i<nStem ){
    if( fts5PorterIsVowel(zStem[i], i>0) ) return 1;
    i++;
  }
  return 0;
}
678 | |
679 | |
680 /************************************************************************** | |
681 *************************************************************************** | |
682 ** GENERATED CODE STARTS HERE (mkportersteps.tcl) | |
683 */ | |
684 | |
/*
** Porter stemmer step 4. If the nBuf byte token in aBuf ends with one of
** the suffixes below and the remaining stem satisfies (m > 1), the
** suffix is removed by reducing *pnBuf. For "ion" the stem must also end
** in 's' or 't'. Within a case, only the first suffix that matches is
** considered. Always returns 0.
**
** This function sits in the section generated by mkportersteps.tcl. The
** nBuf<2 guard was added by hand and should be carried into the
** generator as well.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;

  /* The switch below reads aBuf[nBuf-2]. No rule can match a token of
  ** fewer than 3 bytes, so bail out rather than read before the buffer. */
  if( nBuf<2 ) return 0;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    default:
      break;
  }
  return 0;
}
817 | |
818 | |
/*
** Porter stemmer step 1b, follow-up rules. If the nBuf byte token in
** aBuf (which must be longer than 2 bytes) ends in "at", "bl" or "iz",
** an 'e' is appended in place, *pnBuf is increased by one and 1 is
** returned. Otherwise the token is left unchanged and 0 is returned.
**
** aBuf must have room for one byte beyond the current token.
**
** This function sits in the section generated by mkportersteps.tcl. The
** nBuf<2 guard was added by hand and should be carried into the
** generator as well.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* The switch below reads aBuf[nBuf-2]. No rule can match a token of
  ** fewer than 3 bytes, so bail out rather than read before the buffer. */
  if( nBuf<2 ) return 0;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    default:
      break;
  }
  return ret;
}
851 | |
852 | |
/*
** Porter stemmer step 2. If the nBuf byte token in aBuf ends with one of
** the suffixes below and the remaining stem satisfies (m > 0), the
** suffix is replaced in place by its shorter form and *pnBuf is updated.
** Within a case, only the first suffix that matches is considered.
** Always returns 0.
**
** This function sits in the section generated by mkportersteps.tcl. The
** nBuf<2 guard was added by hand and should be carried into the
** generator as well.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;

  /* The switch below reads aBuf[nBuf-2]. No rule can match a token of
  ** fewer than 4 bytes, so bail out rather than read before the buffer. */
  if( nBuf<2 ) return 0;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

    default:
      break;
  }
  return 0;
}
998 | |
999 | |
/*
** Porter stemmer step 3. If the nBuf byte token in aBuf ends with one of
** the suffixes below and the remaining stem satisfies (m > 0), the
** suffix is replaced in place by its shorter form (or removed) and
** *pnBuf is updated. Within a case, only the first suffix that matches
** is considered. Always returns 0.
**
** This function sits in the section generated by mkportersteps.tcl. The
** nBuf<2 guard was added by hand and should be carried into the
** generator as well.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;

  /* The switch below reads aBuf[nBuf-2]. No rule can match a token of
  ** fewer than 4 bytes, so bail out rather than read before the buffer. */
  if( nBuf<2 ) return 0;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    default:
      break;
  }
  return 0;
}
1064 | |
1065 | |
/*
** Step 1B of the porter stemmer. aBuf[] holds the word being stemmed and
** *pnBuf its current length in bytes.
**
**   *  If the word ends in "eed" (and is longer than that), the suffix is
**      shortened to "ee" provided fts5Porter_MGt0() accepts the stem. No
**      other rule is tried in this case and 0 is returned.
**
**   *  Otherwise, if the word ends in "ed" or "ing" (and is longer than
**      the suffix), the suffix is removed provided fts5Porter_Vowel()
**      accepts the stem. In this case 1 is returned.
**
** In all other cases the word is left untouched and 0 is returned.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  const int nWord = *pnBuf;
  const char cKey = aBuf[nWord-2];     /* Second to last character of word */
  int nStrip = 0;                      /* Size of removable suffix, or 0 */

  if( cKey=='e' ){
    if( nWord>3 && memcmp("eed", &aBuf[nWord-3], 3)==0 ){
      if( fts5Porter_MGt0(aBuf, nWord-3) ){
        memcpy(&aBuf[nWord-3], "ee", 2);
        *pnBuf = nWord - 3 + 2;
      }
      return 0;
    }
    if( nWord>2 && memcmp("ed", &aBuf[nWord-2], 2)==0 ){
      nStrip = 2;
    }
  }else if( cKey=='n' ){
    if( nWord>3 && memcmp("ing", &aBuf[nWord-3], 3)==0 ){
      nStrip = 3;
    }
  }

  if( nStrip==0 ) return 0;
  if( !fts5Porter_Vowel(aBuf, nWord-nStrip) ) return 0;

  *pnBuf = nWord - nStrip;
  return 1;
}
1097 | |
1098 /* | |
1099 ** GENERATED CODE ENDS HERE (mkportersteps.tcl) | |
1100 *************************************************************************** | |
1101 **************************************************************************/ | |
1102 | |
/*
** Step 1A of the porter stemmer: trim a plural-style trailing "s".
** aBuf[] holds the word and *pnBuf its length in bytes. The buffer
** contents are never modified, only *pnBuf is.
**
**   "sses" (word longer than 4 bytes)  ->  drop 2 bytes
**   "ies"  (word longer than 3 bytes)  ->  drop 2 bytes
**   any other word ending in "es"      ->  drop 1 byte
**   "ss"                               ->  unchanged
**   any other word ending in "s"       ->  drop 1 byte
**
** The word must be at least 2 bytes in size, as aBuf[nBuf-2] is read
** whenever the final character is 's'.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nWord = *pnBuf;
  char cPrev;                     /* Character before the trailing 's' */
  int nDrop;                      /* Number of bytes to remove */

  if( aBuf[nWord-1]!='s' ) return;

  cPrev = aBuf[nWord-2];
  if( cPrev=='e' ){
    int bSses = (nWord>4 && aBuf[nWord-4]=='s' && aBuf[nWord-3]=='s');
    int bIes = (nWord>3 && aBuf[nWord-3]=='i');
    nDrop = (bSses || bIes) ? 2 : 1;
  }else if( cPrev=='s' ){
    nDrop = 0;
  }else{
    nDrop = 1;
  }

  if( nDrop>0 ){
    *pnBuf = nWord - nDrop;
  }
}
1120 | |
1121 static int fts5PorterCb( | |
1122 void *pCtx, | |
1123 int tflags, | |
1124 const char *pToken, | |
1125 int nToken, | |
1126 int iStart, | |
1127 int iEnd | |
1128 ){ | |
1129 PorterContext *p = (PorterContext*)pCtx; | |
1130 | |
1131 char *aBuf; | |
1132 int nBuf; | |
1133 | |
1134 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through; | |
1135 aBuf = p->aBuf; | |
1136 nBuf = nToken; | |
1137 memcpy(aBuf, pToken, nBuf); | |
1138 | |
1139 /* Step 1. */ | |
1140 fts5PorterStep1A(aBuf, &nBuf); | |
1141 if( fts5PorterStep1B(aBuf, &nBuf) ){ | |
1142 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){ | |
1143 char c = aBuf[nBuf-1]; | |
1144 if( fts5PorterIsVowel(c, 0)==0 | |
1145 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2] | |
1146 ){ | |
1147 nBuf--; | |
1148 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){ | |
1149 aBuf[nBuf++] = 'e'; | |
1150 } | |
1151 } | |
1152 } | |
1153 | |
1154 /* Step 1C. */ | |
1155 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){ | |
1156 aBuf[nBuf-1] = 'i'; | |
1157 } | |
1158 | |
1159 /* Steps 2 through 4. */ | |
1160 fts5PorterStep2(aBuf, &nBuf); | |
1161 fts5PorterStep3(aBuf, &nBuf); | |
1162 fts5PorterStep4(aBuf, &nBuf); | |
1163 | |
1164 /* Step 5a. */ | |
1165 assert( nBuf>0 ); | |
1166 if( aBuf[nBuf-1]=='e' ){ | |
1167 if( fts5Porter_MGt1(aBuf, nBuf-1) | |
1168 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1)) | |
1169 ){ | |
1170 nBuf--; | |
1171 } | |
1172 } | |
1173 | |
1174 /* Step 5b. */ | |
1175 if( nBuf>1 && aBuf[nBuf-1]=='l' | |
1176 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) | |
1177 ){ | |
1178 nBuf--; | |
1179 } | |
1180 | |
1181 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd); | |
1182 | |
1183 pass_through: | |
1184 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd); | |
1185 } | |
1186 | |
1187 /* | |
1188 ** Tokenize using the porter tokenizer. | |
1189 */ | |
1190 static int fts5PorterTokenize( | |
1191 Fts5Tokenizer *pTokenizer, | |
1192 void *pCtx, | |
1193 int flags, | |
1194 const char *pText, int nText, | |
1195 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd) | |
1196 ){ | |
1197 PorterTokenizer *p = (PorterTokenizer*)pTokenizer; | |
1198 PorterContext sCtx; | |
1199 sCtx.xToken = xToken; | |
1200 sCtx.pCtx = pCtx; | |
1201 sCtx.aBuf = p->aBuf; | |
1202 return p->tokenizer.xTokenize( | |
1203 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb | |
1204 ); | |
1205 } | |
1206 | |
1207 /* | |
1208 ** Register all built-in tokenizers with FTS5. | |
1209 */ | |
1210 int sqlite3Fts5TokenizerInit(fts5_api *pApi){ | |
1211 struct BuiltinTokenizer { | |
1212 const char *zName; | |
1213 fts5_tokenizer x; | |
1214 } aBuiltin[] = { | |
1215 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}}, | |
1216 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }}, | |
1217 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }}, | |
1218 }; | |
1219 | |
1220 int rc = SQLITE_OK; /* Return code */ | |
1221 int i; /* To iterate through builtin functions */ | |
1222 | |
1223 for(i=0; rc==SQLITE_OK && i<(int)ArraySize(aBuiltin); i++){ | |
1224 rc = pApi->xCreateTokenizer(pApi, | |
1225 aBuiltin[i].zName, | |
1226 (void*)pApi, | |
1227 &aBuiltin[i].x, | |
1228 0 | |
1229 ); | |
1230 } | |
1231 | |
1232 return rc; | |
1233 } | |
1234 | |
1235 | |
OLD | NEW |