OLD | NEW |
1 /* | 1 /* |
2 ** 2006 September 30 | 2 ** 2006 September 30 |
3 ** | 3 ** |
4 ** The author disclaims copyright to this source code. In place of | 4 ** The author disclaims copyright to this source code. In place of |
5 ** a legal notice, here is a blessing: | 5 ** a legal notice, here is a blessing: |
6 ** | 6 ** |
7 ** May you do good and not evil. | 7 ** May you do good and not evil. |
8 ** May you find forgiveness for yourself and forgive others. | 8 ** May you find forgiveness for yourself and forgive others. |
9 ** May you share freely, never taking more than you give. | 9 ** May you share freely, never taking more than you give. |
10 ** | 10 ** |
11 ************************************************************************* | 11 ************************************************************************* |
12 ** Implementation of the full-text-search tokenizer that implements | 12 ** Implementation of the full-text-search tokenizer that implements |
13 ** a Porter stemmer. | 13 ** a Porter stemmer. |
14 */ | 14 */ |
15 | 15 |
16 /* | 16 /* |
17 ** The code in this file is only compiled if: | 17 ** The code in this file is only compiled if: |
18 ** | 18 ** |
19 ** * The FTS3 module is being built as an extension | 19 ** * The FTS3 module is being built as an extension |
20 ** (in which case SQLITE_CORE is not defined), or | 20 ** (in which case SQLITE_CORE is not defined), or |
21 ** | 21 ** |
22 ** * The FTS3 module is being built into the core of | 22 ** * The FTS3 module is being built into the core of |
23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). |
24 */ | 24 */ |
| 25 #include "fts3Int.h" |
25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | 26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
26 | 27 |
27 #include "fts3Int.h" | |
28 | |
29 #include <assert.h> | 28 #include <assert.h> |
30 #include <stdlib.h> | 29 #include <stdlib.h> |
31 #include <stdio.h> | 30 #include <stdio.h> |
32 #include <string.h> | 31 #include <string.h> |
33 | 32 |
34 #include "fts3_tokenizer.h" | 33 #include "fts3_tokenizer.h" |
35 | 34 |
36 /* | 35 /* |
37 ** Class derived from sqlite3_tokenizer | 36 ** Class derived from sqlite3_tokenizer |
38 */ | 37 */ |
39 typedef struct porter_tokenizer { | 38 typedef struct porter_tokenizer { |
40 sqlite3_tokenizer base; /* Base class */ | 39 sqlite3_tokenizer base; /* Base class */ |
41 } porter_tokenizer; | 40 } porter_tokenizer; |
42 | 41 |
43 /* | 42 /* |
44 ** Class derived from sqlit3_tokenizer_cursor | 43 ** Class derived from sqlite3_tokenizer_cursor |
45 */ | 44 */ |
46 typedef struct porter_tokenizer_cursor { | 45 typedef struct porter_tokenizer_cursor { |
47 sqlite3_tokenizer_cursor base; | 46 sqlite3_tokenizer_cursor base; |
48 const char *zInput; /* input we are tokenizing */ | 47 const char *zInput; /* input we are tokenizing */ |
49 int nInput; /* size of the input */ | 48 int nInput; /* size of the input */ |
50 int iOffset; /* current position in zInput */ | 49 int iOffset; /* current position in zInput */ |
51 int iToken; /* index of next token to be returned */ | 50 int iToken; /* index of next token to be returned */ |
52 char *zToken; /* storage for current token */ | 51 char *zToken; /* storage for current token */ |
53 int nAllocated; /* space allocated to zToken buffer */ | 52 int nAllocated; /* space allocated to zToken buffer */ |
54 } porter_tokenizer_cursor; | 53 } porter_tokenizer_cursor; |
(...skipping 342 matching lines...) |
397 } | 396 } |
398 | 397 |
399 /* Step 1c */ | 398 /* Step 1c */ |
400 if( z[0]=='y' && hasVowel(z+1) ){ | 399 if( z[0]=='y' && hasVowel(z+1) ){ |
401 z[0] = 'i'; | 400 z[0] = 'i'; |
402 } | 401 } |
403 | 402 |
404 /* Step 2 */ | 403 /* Step 2 */ |
405 switch( z[1] ){ | 404 switch( z[1] ){ |
406 case 'a': | 405 case 'a': |
407 stem(&z, "lanoita", "ate", m_gt_0) || | 406 if( !stem(&z, "lanoita", "ate", m_gt_0) ){ |
408 stem(&z, "lanoit", "tion", m_gt_0); | 407 stem(&z, "lanoit", "tion", m_gt_0); |
| 408 } |
409 break; | 409 break; |
410 case 'c': | 410 case 'c': |
411 stem(&z, "icne", "ence", m_gt_0) || | 411 if( !stem(&z, "icne", "ence", m_gt_0) ){ |
412 stem(&z, "icna", "ance", m_gt_0); | 412 stem(&z, "icna", "ance", m_gt_0); |
| 413 } |
413 break; | 414 break; |
414 case 'e': | 415 case 'e': |
415 stem(&z, "rezi", "ize", m_gt_0); | 416 stem(&z, "rezi", "ize", m_gt_0); |
416 break; | 417 break; |
417 case 'g': | 418 case 'g': |
418 stem(&z, "igol", "log", m_gt_0); | 419 stem(&z, "igol", "log", m_gt_0); |
419 break; | 420 break; |
420 case 'l': | 421 case 'l': |
421 stem(&z, "ilb", "ble", m_gt_0) || | 422 if( !stem(&z, "ilb", "ble", m_gt_0) |
422 stem(&z, "illa", "al", m_gt_0) || | 423 && !stem(&z, "illa", "al", m_gt_0) |
423 stem(&z, "iltne", "ent", m_gt_0) || | 424 && !stem(&z, "iltne", "ent", m_gt_0) |
424 stem(&z, "ile", "e", m_gt_0) || | 425 && !stem(&z, "ile", "e", m_gt_0) |
425 stem(&z, "ilsuo", "ous", m_gt_0); | 426 ){ |
| 427 stem(&z, "ilsuo", "ous", m_gt_0); |
| 428 } |
426 break; | 429 break; |
427 case 'o': | 430 case 'o': |
428 stem(&z, "noitazi", "ize", m_gt_0) || | 431 if( !stem(&z, "noitazi", "ize", m_gt_0) |
429 stem(&z, "noita", "ate", m_gt_0) || | 432 && !stem(&z, "noita", "ate", m_gt_0) |
430 stem(&z, "rota", "ate", m_gt_0); | 433 ){ |
| 434 stem(&z, "rota", "ate", m_gt_0); |
| 435 } |
431 break; | 436 break; |
432 case 's': | 437 case 's': |
433 stem(&z, "msila", "al", m_gt_0) || | 438 if( !stem(&z, "msila", "al", m_gt_0) |
434 stem(&z, "ssenevi", "ive", m_gt_0) || | 439 && !stem(&z, "ssenevi", "ive", m_gt_0) |
435 stem(&z, "ssenluf", "ful", m_gt_0) || | 440 && !stem(&z, "ssenluf", "ful", m_gt_0) |
436 stem(&z, "ssensuo", "ous", m_gt_0); | 441 ){ |
| 442 stem(&z, "ssensuo", "ous", m_gt_0); |
| 443 } |
437 break; | 444 break; |
438 case 't': | 445 case 't': |
439 stem(&z, "itila", "al", m_gt_0) || | 446 if( !stem(&z, "itila", "al", m_gt_0) |
440 stem(&z, "itivi", "ive", m_gt_0) || | 447 && !stem(&z, "itivi", "ive", m_gt_0) |
441 stem(&z, "itilib", "ble", m_gt_0); | 448 ){ |
| 449 stem(&z, "itilib", "ble", m_gt_0); |
| 450 } |
442 break; | 451 break; |
443 } | 452 } |
444 | 453 |
445 /* Step 3 */ | 454 /* Step 3 */ |
446 switch( z[0] ){ | 455 switch( z[0] ){ |
447 case 'e': | 456 case 'e': |
448 stem(&z, "etaci", "ic", m_gt_0) || | 457 if( !stem(&z, "etaci", "ic", m_gt_0) |
449 stem(&z, "evita", "", m_gt_0) || | 458 && !stem(&z, "evita", "", m_gt_0) |
450 stem(&z, "ezila", "al", m_gt_0); | 459 ){ |
| 460 stem(&z, "ezila", "al", m_gt_0); |
| 461 } |
451 break; | 462 break; |
452 case 'i': | 463 case 'i': |
453 stem(&z, "itici", "ic", m_gt_0); | 464 stem(&z, "itici", "ic", m_gt_0); |
454 break; | 465 break; |
455 case 'l': | 466 case 'l': |
456 stem(&z, "laci", "ic", m_gt_0) || | 467 if( !stem(&z, "laci", "ic", m_gt_0) ){ |
457 stem(&z, "luf", "", m_gt_0); | 468 stem(&z, "luf", "", m_gt_0); |
| 469 } |
458 break; | 470 break; |
459 case 's': | 471 case 's': |
460 stem(&z, "ssen", "", m_gt_0); | 472 stem(&z, "ssen", "", m_gt_0); |
461 break; | 473 break; |
462 } | 474 } |
463 | 475 |
464 /* Step 4 */ | 476 /* Step 4 */ |
465 switch( z[1] ){ | 477 switch( z[1] ){ |
466 case 'a': | 478 case 'a': |
467 if( z[0]=='l' && m_gt_1(z+2) ){ | 479 if( z[0]=='l' && m_gt_1(z+2) ){ |
(...skipping 20 matching lines...) |
488 z += 4; | 500 z += 4; |
489 } | 501 } |
490 break; | 502 break; |
491 case 'n': | 503 case 'n': |
492 if( z[0]=='t' ){ | 504 if( z[0]=='t' ){ |
493 if( z[2]=='a' ){ | 505 if( z[2]=='a' ){ |
494 if( m_gt_1(z+3) ){ | 506 if( m_gt_1(z+3) ){ |
495 z += 3; | 507 z += 3; |
496 } | 508 } |
497 }else if( z[2]=='e' ){ | 509 }else if( z[2]=='e' ){ |
498 stem(&z, "tneme", "", m_gt_1) || | 510 if( !stem(&z, "tneme", "", m_gt_1) |
499 stem(&z, "tnem", "", m_gt_1) || | 511 && !stem(&z, "tnem", "", m_gt_1) |
500 stem(&z, "tne", "", m_gt_1); | 512 ){ |
| 513 stem(&z, "tne", "", m_gt_1); |
| 514 } |
501 } | 515 } |
502 } | 516 } |
503 break; | 517 break; |
504 case 'o': | 518 case 'o': |
505 if( z[0]=='u' ){ | 519 if( z[0]=='u' ){ |
506 if( m_gt_1(z+2) ){ | 520 if( m_gt_1(z+2) ){ |
507 z += 2; | 521 z += 2; |
508 } | 522 } |
509 }else if( z[3]=='s' || z[3]=='t' ){ | 523 }else if( z[3]=='s' || z[3]=='t' ){ |
510 stem(&z, "noi", "", m_gt_1); | 524 stem(&z, "noi", "", m_gt_1); |
511 } | 525 } |
512 break; | 526 break; |
513 case 's': | 527 case 's': |
514 if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){ | 528 if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){ |
515 z += 3; | 529 z += 3; |
516 } | 530 } |
517 break; | 531 break; |
518 case 't': | 532 case 't': |
519 stem(&z, "eta", "", m_gt_1) || | 533 if( !stem(&z, "eta", "", m_gt_1) ){ |
520 stem(&z, "iti", "", m_gt_1); | 534 stem(&z, "iti", "", m_gt_1); |
| 535 } |
521 break; | 536 break; |
522 case 'u': | 537 case 'u': |
523 if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){ | 538 if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){ |
524 z += 3; | 539 z += 3; |
525 } | 540 } |
526 break; | 541 break; |
527 case 'v': | 542 case 'v': |
528 case 'z': | 543 case 'z': |
529 if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){ | 544 if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){ |
530 z += 3; | 545 z += 3; |
(...skipping 93 matching lines...) |
624 /* | 639 /* |
625 ** The set of routines that implement the porter-stemmer tokenizer | 640 ** The set of routines that implement the porter-stemmer tokenizer |
626 */ | 641 */ |
627 static const sqlite3_tokenizer_module porterTokenizerModule = { | 642 static const sqlite3_tokenizer_module porterTokenizerModule = { |
628 0, | 643 0, |
629 porterCreate, | 644 porterCreate, |
630 porterDestroy, | 645 porterDestroy, |
631 porterOpen, | 646 porterOpen, |
632 porterClose, | 647 porterClose, |
633 porterNext, | 648 porterNext, |
| 649 0 |
634 }; | 650 }; |
635 | 651 |
636 /* | 652 /* |
637 ** Allocate a new porter tokenizer. Return a pointer to the new | 653 ** Allocate a new porter tokenizer. Return a pointer to the new |
638 ** tokenizer in *ppModule | 654 ** tokenizer in *ppModule |
639 */ | 655 */ |
640 void sqlite3Fts3PorterTokenizerModule( | 656 void sqlite3Fts3PorterTokenizerModule( |
641 sqlite3_tokenizer_module const**ppModule | 657 sqlite3_tokenizer_module const**ppModule |
642 ){ | 658 ){ |
643 *ppModule = &porterTokenizerModule; | 659 *ppModule = &porterTokenizerModule; |
644 } | 660 } |
645 | 661 |
646 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | 662 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |
OLD | NEW |