third_party/sqlite/sqlite-src-3100200/ext/fts3/fts3_porter.c - Issue 2846743003: [sql] Remove SQLite 3.10.2 reference directory.

Side by Side Diff: third_party/sqlite/sqlite-src-3100200/ext/fts3/fts3_porter.c

Issue 2846743003: [sql] Remove SQLite 3.10.2 reference directory. (Closed)

Patch Set: Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/sqlite/sqlite-src-3100200/ext/fts3/fts3_icu.c ('k') | third_party/sqlite/sqlite-src-3100200/ext/fts3/fts3_snippet.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 /*

2 ** 2006 September 30

3 **

4 ** The author disclaims copyright to this source code. In place of

5 ** a legal notice, here is a blessing:

6 **

7 ** May you do good and not evil.

8 ** May you find forgiveness for yourself and forgive others.

9 ** May you share freely, never taking more than you give.

10 **

11 *************************************************************************

12 ** Implementation of the full-text-search tokenizer that implements

13 ** a Porter stemmer.

14 */

15

16 /*

17 ** The code in this file is only compiled if:

18 **

19 ** * The FTS3 module is being built as an extension

20 ** (in which case SQLITE_CORE is not defined), or

21 **

22 ** * The FTS3 module is being built into the core of

23 ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).

24 */

25 #include "fts3Int.h"

26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

27

28 #include <assert.h>

29 #include <stdlib.h>

30 #include <stdio.h>

31 #include <string.h>

32

33 #include "fts3_tokenizer.h"

34

35 /*

36 ** Class derived from sqlite3_tokenizer

37 */

38 typedef struct porter_tokenizer {

39 sqlite3_tokenizer base; /* Base class */

40 } porter_tokenizer;

41

42 /*

43 ** Class derived from sqlite3_tokenizer_cursor

44 */

45 typedef struct porter_tokenizer_cursor {

46 sqlite3_tokenizer_cursor base;

47 const char *zInput; /* input we are tokenizing */

48 int nInput; /* size of the input */

49 int iOffset; /* current position in zInput */

50 int iToken; /* index of next token to be returned */

51 char *zToken; /* storage for current token */

52 int nAllocated; /* space allocated to zToken buffer */

53 } porter_tokenizer_cursor;

54

55

56 /*

57 ** Create a new tokenizer instance.

58 */

59 static int porterCreate(

60 int argc, const char * const *argv,

61 sqlite3_tokenizer **ppTokenizer

62 ){

63 porter_tokenizer *t;

64

65 UNUSED_PARAMETER(argc);

66 UNUSED_PARAMETER(argv);

67

68 t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));

69 if( t==NULL ) return SQLITE_NOMEM;

70 memset(t, 0, sizeof(*t));

71 *ppTokenizer = &t->base;

72 return SQLITE_OK;

73 }

74

75 /*

76 ** Destroy a tokenizer

77 */

78 static int porterDestroy(sqlite3_tokenizer *pTokenizer){

79 sqlite3_free(pTokenizer);

80 return SQLITE_OK;

81 }

82

83 /*

84 ** Prepare to begin tokenizing a particular string. The input

85 ** string to be tokenized is zInput[0..nInput-1]. A cursor

86 ** used to incrementally tokenize this string is returned in

87 ** *ppCursor.

88 */

89 static int porterOpen(

90 sqlite3_tokenizer *pTokenizer, /* The tokenizer */

91 const char *zInput, int nInput, /* String to be tokenized */

92 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */

93 ){

94 porter_tokenizer_cursor *c;

95

96 UNUSED_PARAMETER(pTokenizer);

97

98 c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));

99 if( c==NULL ) return SQLITE_NOMEM;

100

101 c->zInput = zInput;

102 if( zInput==0 ){

103 c->nInput = 0;

104 }else if( nInput<0 ){

105 c->nInput = (int)strlen(zInput);

106 }else{

107 c->nInput = nInput;

108 }

109 c->iOffset = 0; /* start tokenizing at the beginning */

110 c->iToken = 0;

111 c->zToken = NULL; /* no space allocated, yet. */

112 c->nAllocated = 0;

113

114 *ppCursor = &c->base;

115 return SQLITE_OK;

116 }

117

118 /*

119 ** Close a tokenization cursor previously opened by a call to

120 ** porterOpen() above.

121 */

122 static int porterClose(sqlite3_tokenizer_cursor *pCursor){

123 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;

124 sqlite3_free(c->zToken);

125 sqlite3_free(c);

126 return SQLITE_OK;

127 }

128 /*

129 ** Vowel or consonant

130 */

131 static const char cType[] = {

132 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,

133 1, 1, 1, 2, 1

134 };

135

136 /*

137 ** isConsonant() and isVowel() determine if their first character in

138 ** the string they point to is a consonant or a vowel, according

139 ** to Porter rules.

140 **

141 ** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.

142 ** 'Y' is a consonant unless it follows another consonant,

143 ** in which case it is a vowel.

144 **

145 ** In these routines, the letters are in reverse order. So the 'y' rule

146 ** is that 'y' is a consonant unless it is followed by another

147 ** consonant.

148 */

149 static int isVowel(const char*);

150 static int isConsonant(const char *z){

151 int j;

152 char x = *z;

153 if( x==0 ) return 0;

154 assert( x>='a' && x<='z' );

155 j = cType[x-'a'];

156 if( j<2 ) return j;

157 return z[1]==0 || isVowel(z + 1);

158 }

159 static int isVowel(const char *z){

160 int j;

161 char x = *z;

162 if( x==0 ) return 0;

163 assert( x>='a' && x<='z' );

164 j = cType[x-'a'];

165 if( j<2 ) return 1-j;

166 return isConsonant(z + 1);

167 }

168

169 /*

170 ** Let any sequence of one or more vowels be represented by V and let

171 ** C be sequence of one or more consonants. Then every word can be

172 ** represented as:

173 **

174 ** [C] (VC){m} [V]

175 **

176 ** In prose: A word is an optional consonant followed by zero or

177 ** more vowel-consonant pairs followed by an optional vowel. "m" is the

178 ** number of vowel consonant pairs. This routine computes the value

179 ** of m for the first i bytes of a word.

180 **

181 ** Return true if the m-value for z is 1 or more. In other words,

182 ** return true if z contains at least one vowel that is followed

183 ** by a consonant.

184 **

185 ** In this routine z[] is in reverse order. So we are really looking

186 ** for an instance of a consonant followed by a vowel.

187 */

188 static int m_gt_0(const char *z){

189 while( isVowel(z) ){ z++; }

190 if( *z==0 ) return 0;

191 while( isConsonant(z) ){ z++; }

192 return *z!=0;

193 }

194

195 /* Like m_gt_0 above except we are looking for a value of m which is

196 ** exactly 1

197 */

198 static int m_eq_1(const char *z){

199 while( isVowel(z) ){ z++; }

200 if( *z==0 ) return 0;

201 while( isConsonant(z) ){ z++; }

202 if( *z==0 ) return 0;

203 while( isVowel(z) ){ z++; }

204 if( *z==0 ) return 1;

205 while( isConsonant(z) ){ z++; }

206 return *z==0;

207 }

208

209 /* Like m_gt_0 above except we are looking for a value of m>1 instead

210 ** of m>0

211 */

212 static int m_gt_1(const char *z){

213 while( isVowel(z) ){ z++; }

214 if( *z==0 ) return 0;

215 while( isConsonant(z) ){ z++; }

216 if( *z==0 ) return 0;

217 while( isVowel(z) ){ z++; }

218 if( *z==0 ) return 0;

219 while( isConsonant(z) ){ z++; }

220 return *z!=0;

221 }

222

223 /*

224 ** Return TRUE if there is a vowel anywhere within z[0..n-1]

225 */

226 static int hasVowel(const char *z){

227 while( isConsonant(z) ){ z++; }

228 return *z!=0;

229 }

230

231 /*

232 ** Return TRUE if the word ends in a double consonant.

233 **

234 ** The text is reversed here. So we are really looking at

235 ** the first two characters of z[].

236 */

237 static int doubleConsonant(const char *z){

238 return isConsonant(z) && z[0]==z[1];

239 }

240

241 /*

242 ** Return TRUE if the word ends with three letters which

243 ** are consonant-vowel-consonant and where the final consonant

244 ** is not 'w', 'x', or 'y'.

245 **

246 ** The word is reversed here. So we are really checking the

247 ** first three letters and the first one cannot be in [wxy].

248 */

249 static int star_oh(const char *z){

250 return

251 isConsonant(z) &&

252 z[0]!='w' && z[0]!='x' && z[0]!='y' &&

253 isVowel(z+1) &&

254 isConsonant(z+2);

255 }

256

257 /*

258 ** If the word ends with zFrom and xCond() is true for the stem

259 ** of the word that precedes the zFrom ending, then change the

260 ** ending to zTo.

261 **

262 ** The input word *pz and zFrom are both in reverse order. zTo

263 ** is in normal order.

264 **

265 ** Return TRUE if zFrom matches. Return FALSE if zFrom does not

266 ** match. Note that TRUE is returned even if xCond() fails and

267 ** no substitution occurs.

268 */

269 static int stem(

270 char **pz, /* The word being stemmed (Reversed) */

271 const char *zFrom, /* If the ending matches this... (Reversed) */

272 const char *zTo, /* ... change the ending to this (not reversed) */

273 int (*xCond)(const char*) /* Condition that must be true */

274 ){

275 char *z = *pz;

276 while( *zFrom && *zFrom==*z ){ z++; zFrom++; }

277 if( *zFrom!=0 ) return 0;

278 if( xCond && !xCond(z) ) return 1;

279 while( *zTo ){

280 *(--z) = *(zTo++);

281 }

282 *pz = z;

283 return 1;

284 }

285

286 /*

287 ** This is the fallback stemmer used when the porter stemmer is

288 ** inappropriate. The input word is copied into the output with

289 ** US-ASCII case folding. If the input word is too long (more

290 ** than 20 bytes if it contains no digits or more than 6 bytes if

291 ** it contains digits) then word is truncated to 20 or 6 bytes

292 ** by taking 10 or 3 bytes from the beginning and end.

293 */

294 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){

295 int i, mx, j;

296 int hasDigit = 0;

297 for(i=0; i<nIn; i++){

298 char c = zIn[i];

299 if( c>='A' && c<='Z' ){

300 zOut[i] = c - 'A' + 'a';

301 }else{

302 if( c>='0' && c<='9' ) hasDigit = 1;

303 zOut[i] = c;

304 }

305 }

306 mx = hasDigit ? 3 : 10;

307 if( nIn>mx*2 ){

308 for(j=mx, i=nIn-mx; i<nIn; i++, j++){

309 zOut[j] = zOut[i];

310 }

311 i = j;

312 }

313 zOut[i] = 0;

314 *pnOut = i;

315 }

316

317

318 /*

319 ** Stem the input word zIn[0..nIn-1]. Store the output in zOut.

320 ** zOut is at least big enough to hold nIn bytes. Write the actual

321 ** size of the output word (exclusive of the '\0' terminator) into *pnOut.

322 **

323 ** Any upper-case characters in the US-ASCII character set ([A-Z])

324 ** are converted to lower case. Upper-case UTF characters are

325 ** unchanged.

326 **

327 ** Words that are longer than about 20 bytes are stemmed by retaining

328 ** a few bytes from the beginning and the end of the word. If the

329 ** word contains digits, 3 bytes are taken from the beginning and

330 ** 3 bytes from the end. For long words without digits, 10 bytes

331 ** are taken from each end. US-ASCII case folding still applies.

332 **

333 ** If the input word contains no digits but does contain characters not

334 ** in [a-zA-Z] then no stemming is attempted and this routine just

335 ** copies the input into the output with US-ASCII

336 ** case folding.

337 **

338 ** Stemming never increases the length of the word. So there is

339 ** no chance of overflowing the zOut buffer.

340 */

341 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){

342 int i, j;

343 char zReverse[28];

344 char *z, *z2;

345 if( nIn<3 || nIn>=(int)sizeof(zReverse)-7 ){

346 /* The word is too big or too small for the porter stemmer.

347 ** Fallback to the copy stemmer */

348 copy_stemmer(zIn, nIn, zOut, pnOut);

349 return;

350 }

351 for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){

352 char c = zIn[i];

353 if( c>='A' && c<='Z' ){

354 zReverse[j] = c + 'a' - 'A';

355 }else if( c>='a' && c<='z' ){

356 zReverse[j] = c;

357 }else{

358 /* The use of a character not in [a-zA-Z] means that we fallback

359 ** to the copy stemmer */

360 copy_stemmer(zIn, nIn, zOut, pnOut);

361 return;

362 }

363 }

364 memset(&zReverse[sizeof(zReverse)-5], 0, 5);

365 z = &zReverse[j+1];

366

367

368 /* Step 1a */

369 if( z[0]=='s' ){

370 if(

371 !stem(&z, "sess", "ss", 0) &&

372 !stem(&z, "sei", "i", 0) &&

373 !stem(&z, "ss", "ss", 0)

374 ){

375 z++;

376 }

377 }

378

379 /* Step 1b */

380 z2 = z;

381 if( stem(&z, "dee", "ee", m_gt_0) ){

382 /* Do nothing. The work was all in the test */

383 }else if(

384 (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))

385 && z!=z2

386 ){

387 if( stem(&z, "ta", "ate", 0) ||

388 stem(&z, "lb", "ble", 0) ||

389 stem(&z, "zi", "ize", 0) ){

390 /* Do nothing. The work was all in the test */

391 }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){

392 z++;

393 }else if( m_eq_1(z) && star_oh(z) ){

394 *(--z) = 'e';

395 }

396 }

397

398 /* Step 1c */

399 if( z[0]=='y' && hasVowel(z+1) ){

400 z[0] = 'i';

401 }

402

403 /* Step 2 */

404 switch( z[1] ){

405 case 'a':

406 if( !stem(&z, "lanoita", "ate", m_gt_0) ){

407 stem(&z, "lanoit", "tion", m_gt_0);

408 }

409 break;

410 case 'c':

411 if( !stem(&z, "icne", "ence", m_gt_0) ){

412 stem(&z, "icna", "ance", m_gt_0);

413 }

414 break;

415 case 'e':

416 stem(&z, "rezi", "ize", m_gt_0);

417 break;

418 case 'g':

419 stem(&z, "igol", "log", m_gt_0);

420 break;

421 case 'l':

422 if( !stem(&z, "ilb", "ble", m_gt_0)

423 && !stem(&z, "illa", "al", m_gt_0)

424 && !stem(&z, "iltne", "ent", m_gt_0)

425 && !stem(&z, "ile", "e", m_gt_0)

426 ){

427 stem(&z, "ilsuo", "ous", m_gt_0);

428 }

429 break;

430 case 'o':

431 if( !stem(&z, "noitazi", "ize", m_gt_0)

432 && !stem(&z, "noita", "ate", m_gt_0)

433 ){

434 stem(&z, "rota", "ate", m_gt_0);

435 }

436 break;

437 case 's':

438 if( !stem(&z, "msila", "al", m_gt_0)

439 && !stem(&z, "ssenevi", "ive", m_gt_0)

440 && !stem(&z, "ssenluf", "ful", m_gt_0)

441 ){

442 stem(&z, "ssensuo", "ous", m_gt_0);

443 }

444 break;

445 case 't':

446 if( !stem(&z, "itila", "al", m_gt_0)

447 && !stem(&z, "itivi", "ive", m_gt_0)

448 ){

449 stem(&z, "itilib", "ble", m_gt_0);

450 }

451 break;

452 }

453

454 /* Step 3 */

455 switch( z[0] ){

456 case 'e':

457 if( !stem(&z, "etaci", "ic", m_gt_0)

458 && !stem(&z, "evita", "", m_gt_0)

459 ){

460 stem(&z, "ezila", "al", m_gt_0);

461 }

462 break;

463 case 'i':

464 stem(&z, "itici", "ic", m_gt_0);

465 break;

466 case 'l':

467 if( !stem(&z, "laci", "ic", m_gt_0) ){

468 stem(&z, "luf", "", m_gt_0);

469 }

470 break;

471 case 's':

472 stem(&z, "ssen", "", m_gt_0);

473 break;

474 }

475

476 /* Step 4 */

477 switch( z[1] ){

478 case 'a':

479 if( z[0]=='l' && m_gt_1(z+2) ){

480 z += 2;

481 }

482 break;

483 case 'c':

484 if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){

485 z += 4;

486 }

487 break;

488 case 'e':

489 if( z[0]=='r' && m_gt_1(z+2) ){

490 z += 2;

491 }

492 break;

493 case 'i':

494 if( z[0]=='c' && m_gt_1(z+2) ){

495 z += 2;

496 }

497 break;

498 case 'l':

499 if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){

500 z += 4;

501 }

502 break;

503 case 'n':

504 if( z[0]=='t' ){

505 if( z[2]=='a' ){

506 if( m_gt_1(z+3) ){

507 z += 3;

508 }

509 }else if( z[2]=='e' ){

510 if( !stem(&z, "tneme", "", m_gt_1)

511 && !stem(&z, "tnem", "", m_gt_1)

512 ){

513 stem(&z, "tne", "", m_gt_1);

514 }

515 }

516 }

517 break;

518 case 'o':

519 if( z[0]=='u' ){

520 if( m_gt_1(z+2) ){

521 z += 2;

522 }

523 }else if( z[3]=='s' || z[3]=='t' ){

524 stem(&z, "noi", "", m_gt_1);

525 }

526 break;

527 case 's':

528 if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){

529 z += 3;

530 }

531 break;

532 case 't':

533 if( !stem(&z, "eta", "", m_gt_1) ){

534 stem(&z, "iti", "", m_gt_1);

535 }

536 break;

537 case 'u':

538 if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){

539 z += 3;

540 }

541 break;

542 case 'v':

543 case 'z':

544 if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){

545 z += 3;

546 }

547 break;

548 }

549

550 /* Step 5a */

551 if( z[0]=='e' ){

552 if( m_gt_1(z+1) ){

553 z++;

554 }else if( m_eq_1(z+1) && !star_oh(z+1) ){

555 z++;

556 }

557 }

558

559 /* Step 5b */

560 if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){

561 z++;

562 }

563

564 /* z[] is now the stemmed word in reverse order. Flip it back

565 ** around into forward order and return.

566 */

567 *pnOut = i = (int)strlen(z);

568 zOut[i] = 0;

569 while( *z ){

570 zOut[--i] = *(z++);

571 }

572 }

573

574 /*

575 ** Characters that can be part of a token. We assume any character

576 ** whose value is greater than 0x80 (any UTF character) can be

577 ** part of a token. In other words, delimiters all must have

578 ** values of 0x7f or lower.

579 */

580 static const char porterIdChar[] = {

581 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */

582 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */

583 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */

584 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */

585 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */

586 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */

587 };

588 #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))

589

590 /*

591 ** Extract the next token from a tokenization cursor. The cursor must

592 ** have been opened by a prior call to porterOpen().

593 */

594 static int porterNext(

595 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */

596 const char **pzToken, /* OUT: *pzToken is the token text */

597 int *pnBytes, /* OUT: Number of bytes in token */

598 int *piStartOffset, /* OUT: Starting offset of token */

599 int *piEndOffset, /* OUT: Ending offset of token */

600 int *piPosition /* OUT: Position integer of token */

601 ){

602 porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;

603 const char *z = c->zInput;

604

605 while( c->iOffset<c->nInput ){

606 int iStartOffset, ch;

607

608 /* Scan past delimiter characters */

609 while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){

610 c->iOffset++;

611 }

612

613 /* Count non-delimiter characters. */

614 iStartOffset = c->iOffset;

615 while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){

616 c->iOffset++;

617 }

618

619 if( c->iOffset>iStartOffset ){

620 int n = c->iOffset-iStartOffset;

621 if( n>c->nAllocated ){

622 char *pNew;

623 c->nAllocated = n+20;

624 pNew = sqlite3_realloc(c->zToken, c->nAllocated);

625 if( !pNew ) return SQLITE_NOMEM;

626 c->zToken = pNew;

627 }

628 porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);

629 *pzToken = c->zToken;

630 *piStartOffset = iStartOffset;

631 *piEndOffset = c->iOffset;

632 *piPosition = c->iToken++;

633 return SQLITE_OK;

634 }

635 }

636 return SQLITE_DONE;

637 }

638

639 /*

640 ** The set of routines that implement the porter-stemmer tokenizer

641 */

642 static const sqlite3_tokenizer_module porterTokenizerModule = {

643 0,

644 porterCreate,

645 porterDestroy,

646 porterOpen,

647 porterClose,

648 porterNext,

649 0

650 };

651

652 /*

653 ** Allocate a new porter tokenizer. Return a pointer to the new

654 ** tokenizer in *ppModule

655 */

656 void sqlite3Fts3PorterTokenizerModule(

657 sqlite3_tokenizer_module const**ppModule

658 ){

659 *ppModule = &porterTokenizerModule;

660 }

661

662 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

OLD	NEW