| OLD | NEW |
| 1 /* | 1 /* |
| 2 ** 2006 September 30 | 2 ** 2006 September 30 |
| 3 ** | 3 ** |
| 4 ** The author disclaims copyright to this source code. In place of | 4 ** The author disclaims copyright to this source code. In place of |
| 5 ** a legal notice, here is a blessing: | 5 ** a legal notice, here is a blessing: |
| 6 ** | 6 ** |
| 7 ** May you do good and not evil. | 7 ** May you do good and not evil. |
| 8 ** May you find forgiveness for yourself and forgive others. | 8 ** May you find forgiveness for yourself and forgive others. |
| 9 ** May you share freely, never taking more than you give. | 9 ** May you share freely, never taking more than you give. |
| 10 ** | 10 ** |
| 11 ************************************************************************* | 11 ************************************************************************* |
| 12 ** Implementation of the full-text-search tokenizer that implements | 12 ** Implementation of the full-text-search tokenizer that implements |
| 13 ** a Porter stemmer. | 13 ** a Porter stemmer. |
| 14 */ | 14 */ |
| 15 | 15 |
| 16 /* | 16 /* |
| 17 ** The code in this file is only compiled if: | 17 ** The code in this file is only compiled if: |
| 18 ** | 18 ** |
| 19 ** * The FTS3 module is being built as an extension | 19 ** * The FTS3 module is being built as an extension |
| 20 ** (in which case SQLITE_CORE is not defined), or | 20 ** (in which case SQLITE_CORE is not defined), or |
| 21 ** | 21 ** |
| 22 ** * The FTS3 module is being built into the core of | 22 ** * The FTS3 module is being built into the core of |
| 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). | 23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). |
| 24 */ | 24 */ |
| 25 #include "fts3Int.h" |
| 25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | 26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
| 26 | 27 |
| 27 #include "fts3Int.h" | |
| 28 | |
| 29 #include <assert.h> | 28 #include <assert.h> |
| 30 #include <stdlib.h> | 29 #include <stdlib.h> |
| 31 #include <stdio.h> | 30 #include <stdio.h> |
| 32 #include <string.h> | 31 #include <string.h> |
| 33 | 32 |
| 34 #include "fts3_tokenizer.h" | 33 #include "fts3_tokenizer.h" |
| 35 | 34 |
| 36 /* | 35 /* |
| 37 ** Class derived from sqlite3_tokenizer | 36 ** Class derived from sqlite3_tokenizer |
| 38 */ | 37 */ |
| 39 typedef struct porter_tokenizer { | 38 typedef struct porter_tokenizer { |
| 40 sqlite3_tokenizer base; /* Base class */ | 39 sqlite3_tokenizer base; /* Base class */ |
| 41 } porter_tokenizer; | 40 } porter_tokenizer; |
| 42 | 41 |
| 43 /* | 42 /* |
| 44 ** Class derived from sqlit3_tokenizer_cursor | 43 ** Class derived from sqlite3_tokenizer_cursor |
| 45 */ | 44 */ |
| 46 typedef struct porter_tokenizer_cursor { | 45 typedef struct porter_tokenizer_cursor { |
| 47 sqlite3_tokenizer_cursor base; | 46 sqlite3_tokenizer_cursor base; |
| 48 const char *zInput; /* input we are tokenizing */ | 47 const char *zInput; /* input we are tokenizing */ |
| 49 int nInput; /* size of the input */ | 48 int nInput; /* size of the input */ |
| 50 int iOffset; /* current position in zInput */ | 49 int iOffset; /* current position in zInput */ |
| 51 int iToken; /* index of next token to be returned */ | 50 int iToken; /* index of next token to be returned */ |
| 52 char *zToken; /* storage for current token */ | 51 char *zToken; /* storage for current token */ |
| 53 int nAllocated; /* space allocated to zToken buffer */ | 52 int nAllocated; /* space allocated to zToken buffer */ |
| 54 } porter_tokenizer_cursor; | 53 } porter_tokenizer_cursor; |
| (...skipping 342 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 397 } | 396 } |
| 398 | 397 |
| 399 /* Step 1c */ | 398 /* Step 1c */ |
| 400 if( z[0]=='y' && hasVowel(z+1) ){ | 399 if( z[0]=='y' && hasVowel(z+1) ){ |
| 401 z[0] = 'i'; | 400 z[0] = 'i'; |
| 402 } | 401 } |
| 403 | 402 |
| 404 /* Step 2 */ | 403 /* Step 2 */ |
| 405 switch( z[1] ){ | 404 switch( z[1] ){ |
| 406 case 'a': | 405 case 'a': |
| 407 stem(&z, "lanoita", "ate", m_gt_0) || | 406 if( !stem(&z, "lanoita", "ate", m_gt_0) ){ |
| 408 stem(&z, "lanoit", "tion", m_gt_0); | 407 stem(&z, "lanoit", "tion", m_gt_0); |
| 408 } |
| 409 break; | 409 break; |
| 410 case 'c': | 410 case 'c': |
| 411 stem(&z, "icne", "ence", m_gt_0) || | 411 if( !stem(&z, "icne", "ence", m_gt_0) ){ |
| 412 stem(&z, "icna", "ance", m_gt_0); | 412 stem(&z, "icna", "ance", m_gt_0); |
| 413 } |
| 413 break; | 414 break; |
| 414 case 'e': | 415 case 'e': |
| 415 stem(&z, "rezi", "ize", m_gt_0); | 416 stem(&z, "rezi", "ize", m_gt_0); |
| 416 break; | 417 break; |
| 417 case 'g': | 418 case 'g': |
| 418 stem(&z, "igol", "log", m_gt_0); | 419 stem(&z, "igol", "log", m_gt_0); |
| 419 break; | 420 break; |
| 420 case 'l': | 421 case 'l': |
| 421 stem(&z, "ilb", "ble", m_gt_0) || | 422 if( !stem(&z, "ilb", "ble", m_gt_0) |
| 422 stem(&z, "illa", "al", m_gt_0) || | 423 && !stem(&z, "illa", "al", m_gt_0) |
| 423 stem(&z, "iltne", "ent", m_gt_0) || | 424 && !stem(&z, "iltne", "ent", m_gt_0) |
| 424 stem(&z, "ile", "e", m_gt_0) || | 425 && !stem(&z, "ile", "e", m_gt_0) |
| 425 stem(&z, "ilsuo", "ous", m_gt_0); | 426 ){ |
| 427 stem(&z, "ilsuo", "ous", m_gt_0); |
| 428 } |
| 426 break; | 429 break; |
| 427 case 'o': | 430 case 'o': |
| 428 stem(&z, "noitazi", "ize", m_gt_0) || | 431 if( !stem(&z, "noitazi", "ize", m_gt_0) |
| 429 stem(&z, "noita", "ate", m_gt_0) || | 432 && !stem(&z, "noita", "ate", m_gt_0) |
| 430 stem(&z, "rota", "ate", m_gt_0); | 433 ){ |
| 434 stem(&z, "rota", "ate", m_gt_0); |
| 435 } |
| 431 break; | 436 break; |
| 432 case 's': | 437 case 's': |
| 433 stem(&z, "msila", "al", m_gt_0) || | 438 if( !stem(&z, "msila", "al", m_gt_0) |
| 434 stem(&z, "ssenevi", "ive", m_gt_0) || | 439 && !stem(&z, "ssenevi", "ive", m_gt_0) |
| 435 stem(&z, "ssenluf", "ful", m_gt_0) || | 440 && !stem(&z, "ssenluf", "ful", m_gt_0) |
| 436 stem(&z, "ssensuo", "ous", m_gt_0); | 441 ){ |
| 442 stem(&z, "ssensuo", "ous", m_gt_0); |
| 443 } |
| 437 break; | 444 break; |
| 438 case 't': | 445 case 't': |
| 439 stem(&z, "itila", "al", m_gt_0) || | 446 if( !stem(&z, "itila", "al", m_gt_0) |
| 440 stem(&z, "itivi", "ive", m_gt_0) || | 447 && !stem(&z, "itivi", "ive", m_gt_0) |
| 441 stem(&z, "itilib", "ble", m_gt_0); | 448 ){ |
| 449 stem(&z, "itilib", "ble", m_gt_0); |
| 450 } |
| 442 break; | 451 break; |
| 443 } | 452 } |
| 444 | 453 |
| 445 /* Step 3 */ | 454 /* Step 3 */ |
| 446 switch( z[0] ){ | 455 switch( z[0] ){ |
| 447 case 'e': | 456 case 'e': |
| 448 stem(&z, "etaci", "ic", m_gt_0) || | 457 if( !stem(&z, "etaci", "ic", m_gt_0) |
| 449 stem(&z, "evita", "", m_gt_0) || | 458 && !stem(&z, "evita", "", m_gt_0) |
| 450 stem(&z, "ezila", "al", m_gt_0); | 459 ){ |
| 460 stem(&z, "ezila", "al", m_gt_0); |
| 461 } |
| 451 break; | 462 break; |
| 452 case 'i': | 463 case 'i': |
| 453 stem(&z, "itici", "ic", m_gt_0); | 464 stem(&z, "itici", "ic", m_gt_0); |
| 454 break; | 465 break; |
| 455 case 'l': | 466 case 'l': |
| 456 stem(&z, "laci", "ic", m_gt_0) || | 467 if( !stem(&z, "laci", "ic", m_gt_0) ){ |
| 457 stem(&z, "luf", "", m_gt_0); | 468 stem(&z, "luf", "", m_gt_0); |
| 469 } |
| 458 break; | 470 break; |
| 459 case 's': | 471 case 's': |
| 460 stem(&z, "ssen", "", m_gt_0); | 472 stem(&z, "ssen", "", m_gt_0); |
| 461 break; | 473 break; |
| 462 } | 474 } |
| 463 | 475 |
| 464 /* Step 4 */ | 476 /* Step 4 */ |
| 465 switch( z[1] ){ | 477 switch( z[1] ){ |
| 466 case 'a': | 478 case 'a': |
| 467 if( z[0]=='l' && m_gt_1(z+2) ){ | 479 if( z[0]=='l' && m_gt_1(z+2) ){ |
| (...skipping 20 matching lines...) Expand all Loading... |
| 488 z += 4; | 500 z += 4; |
| 489 } | 501 } |
| 490 break; | 502 break; |
| 491 case 'n': | 503 case 'n': |
| 492 if( z[0]=='t' ){ | 504 if( z[0]=='t' ){ |
| 493 if( z[2]=='a' ){ | 505 if( z[2]=='a' ){ |
| 494 if( m_gt_1(z+3) ){ | 506 if( m_gt_1(z+3) ){ |
| 495 z += 3; | 507 z += 3; |
| 496 } | 508 } |
| 497 }else if( z[2]=='e' ){ | 509 }else if( z[2]=='e' ){ |
| 498 stem(&z, "tneme", "", m_gt_1) || | 510 if( !stem(&z, "tneme", "", m_gt_1) |
| 499 stem(&z, "tnem", "", m_gt_1) || | 511 && !stem(&z, "tnem", "", m_gt_1) |
| 500 stem(&z, "tne", "", m_gt_1); | 512 ){ |
| 513 stem(&z, "tne", "", m_gt_1); |
| 514 } |
| 501 } | 515 } |
| 502 } | 516 } |
| 503 break; | 517 break; |
| 504 case 'o': | 518 case 'o': |
| 505 if( z[0]=='u' ){ | 519 if( z[0]=='u' ){ |
| 506 if( m_gt_1(z+2) ){ | 520 if( m_gt_1(z+2) ){ |
| 507 z += 2; | 521 z += 2; |
| 508 } | 522 } |
| 509 }else if( z[3]=='s' || z[3]=='t' ){ | 523 }else if( z[3]=='s' || z[3]=='t' ){ |
| 510 stem(&z, "noi", "", m_gt_1); | 524 stem(&z, "noi", "", m_gt_1); |
| 511 } | 525 } |
| 512 break; | 526 break; |
| 513 case 's': | 527 case 's': |
| 514 if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){ | 528 if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){ |
| 515 z += 3; | 529 z += 3; |
| 516 } | 530 } |
| 517 break; | 531 break; |
| 518 case 't': | 532 case 't': |
| 519 stem(&z, "eta", "", m_gt_1) || | 533 if( !stem(&z, "eta", "", m_gt_1) ){ |
| 520 stem(&z, "iti", "", m_gt_1); | 534 stem(&z, "iti", "", m_gt_1); |
| 535 } |
| 521 break; | 536 break; |
| 522 case 'u': | 537 case 'u': |
| 523 if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){ | 538 if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){ |
| 524 z += 3; | 539 z += 3; |
| 525 } | 540 } |
| 526 break; | 541 break; |
| 527 case 'v': | 542 case 'v': |
| 528 case 'z': | 543 case 'z': |
| 529 if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){ | 544 if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){ |
| 530 z += 3; | 545 z += 3; |
| (...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 624 /* | 639 /* |
| 625 ** The set of routines that implement the porter-stemmer tokenizer | 640 ** The set of routines that implement the porter-stemmer tokenizer |
| 626 */ | 641 */ |
| 627 static const sqlite3_tokenizer_module porterTokenizerModule = { | 642 static const sqlite3_tokenizer_module porterTokenizerModule = { |
| 628 0, | 643 0, |
| 629 porterCreate, | 644 porterCreate, |
| 630 porterDestroy, | 645 porterDestroy, |
| 631 porterOpen, | 646 porterOpen, |
| 632 porterClose, | 647 porterClose, |
| 633 porterNext, | 648 porterNext, |
| 649 0 |
| 634 }; | 650 }; |
| 635 | 651 |
| 636 /* | 652 /* |
| 637 ** Allocate a new porter tokenizer. Return a pointer to the new | 653 ** Allocate a new porter tokenizer. Return a pointer to the new |
| 638 ** tokenizer in *ppModule | 654 ** tokenizer in *ppModule |
| 639 */ | 655 */ |
| 640 void sqlite3Fts3PorterTokenizerModule( | 656 void sqlite3Fts3PorterTokenizerModule( |
| 641 sqlite3_tokenizer_module const**ppModule | 657 sqlite3_tokenizer_module const**ppModule |
| 642 ){ | 658 ){ |
| 643 *ppModule = &porterTokenizerModule; | 659 *ppModule = &porterTokenizerModule; |
| 644 } | 660 } |
| 645 | 661 |
| 646 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | 662 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |
| OLD | NEW |