/*
** 2012 May 24
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** Implementation of the "unicode" full-text-search tokenizer.
*/

#ifndef SQLITE_DISABLE_FTS3_UNICODE

#include "fts3Int.h"
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "fts3_tokenizer.h"

/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION

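/*
** This lookup table is used to help decode the first byte of a multi-byte
** UTF-8 character: for a lead byte b (0xC0 or greater), entry (b - 0xC0)
** holds the payload bits that b contributes to the decoded codepoint.
*/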
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }         \
  }

#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (u8)(c&0xFF);                            \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }else{                                               \
    *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
    *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */
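
/*
** Illustrative sketch (not part of the original tokenizer): decoding a
** UTF-8 string one codepoint at a time with READ_UTF8 and re-encoding it
** with WRITE_UTF8.  It assumes the two macros above (or the equivalent
** definitions from utf.c when building the amalgamation) are visible, and
** that zOutBuf is large enough - 3*nIn bytes covers the worst case, where
** every input byte decodes to the three-byte replacement character U+FFFD.
*/
static int exampleUtf8RoundTrip(const char *zIn, int nIn, char *zOutBuf){
  const unsigned char *z = (const unsigned char *)zIn;
  const unsigned char *zTerm = &z[nIn];
  unsigned char *zOut = (unsigned char *)zOutBuf;
  int c;

  while( z<zTerm ){
    READ_UTF8(z, zTerm, c);       /* Decode the next codepoint into c */
    WRITE_UTF8(zOut, c);          /* Re-encode c as UTF-8 */
  }
  return (int)(zOut - (unsigned char *)zOutBuf);  /* Bytes written */
}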

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

struct unicode_tokenizer {
  sqlite3_tokenizer base;
  int bRemoveDiacritic;
  int nException;
  int *aiException;
};

struct unicode_cursor {
  sqlite3_tokenizer_cursor base;
  const unsigned char *aInput;    /* Input text being tokenized */
  int nInput;                     /* Size of aInput[] in bytes */
  int iOff;                       /* Current offset within aInput[] */
  int iToken;                     /* Index of next token to be returned */
  char *zToken;                   /* storage for current token */
  int nAlloc;                     /* space allocated at zToken */
};


/*
** Destroy a tokenizer allocated by unicodeCreate().
*/
static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
  if( pTokenizer ){
    unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
    sqlite3_free(p->aiException);
    sqlite3_free(p);
  }
  return SQLITE_OK;
}

/*
** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
** statement has specified that the tokenizer for this table shall consider
** all characters in string zIn/nIn to be separators (if bAlnum==0) or
** token characters (if bAlnum==1).
**
** For each codepoint in the zIn/nIn string, this function checks if the
** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
** If so, no action is taken. Otherwise, the codepoint is added to the
** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
** codepoints in the aiException[] array.
**
** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
** It is not possible to change the behavior of the tokenizer with respect
** to these codepoints.
*/
static int unicodeAddExceptions(
  unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
  int bAlnum,                     /* Replace Isalnum() return value with this */
  const char *zIn,                /* Array of characters to make exceptions */
  int nIn                         /* Length of z in bytes */
){
  const unsigned char *z = (const unsigned char *)zIn;
  const unsigned char *zTerm = &z[nIn];
  int iCode;
  int nEntry = 0;

  assert( bAlnum==0 || bAlnum==1 );

  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
    assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
    if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum
     && sqlite3FtsUnicodeIsdiacritic(iCode)==0
    ){
      nEntry++;
    }
  }

  if( nEntry ){
    int *aNew;                    /* New aiException[] array */
    int nNew;                     /* Number of valid entries in array aNew[] */

    aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
    if( aNew==0 ) return SQLITE_NOMEM;
    nNew = p->nException;

    z = (const unsigned char *)zIn;
    while( z<zTerm ){
      READ_UTF8(z, zTerm, iCode);
      if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum
       && sqlite3FtsUnicodeIsdiacritic(iCode)==0
      ){
        int i, j;
        for(i=0; i<nNew && aNew[i]<iCode; i++);
        for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
        aNew[i] = iCode;
        nNew++;
      }
    }
    p->aiException = aNew;
    p->nException = nNew;
  }

  return SQLITE_OK;
}
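
/*
** As an example of the effect of unicodeAddExceptions(): if the table is
** created with the option tokenchars=-_ (see unicodeCreate() below), this
** function is called with bAlnum==1 and zIn=="-_".  Both U+002D ('-') and
** U+005F ('_') are separators by default and neither is a diacritic, so
** after the call aiException[] holds {0x2D, 0x5F} in ascending order and
** nException==2.
*/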

/*
** Return true if the p->aiException[] array contains the value iCode.
*/
static int unicodeIsException(unicode_tokenizer *p, int iCode){
  if( p->nException>0 ){
    int *a = p->aiException;
    int iLo = 0;
    int iHi = p->nException-1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( iCode==a[iTest] ){
        return 1;
      }else if( iCode>a[iTest] ){
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
  }

  return 0;
}

/*
** Return true if, for the purposes of tokenization, codepoint iCode is
** considered a token character (not a separator).
*/
static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
  assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
  return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
}
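
/*
** For example, with the default classification sqlite3FtsUnicodeIsalnum()
** returns 0 for a full-stop (U+002E), so '.' is treated as a separator.
** If the table was created with tokenchars=. then U+002E is present in
** aiException[], unicodeIsException() returns 1, and the XOR in
** unicodeIsAlnum() above yields 1, so '.' becomes a token character.
*/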

/*
** Create a new tokenizer instance.
*/
static int unicodeCreate(
  int nArg,                       /* Size of array argv[] */
  const char * const *azArg,      /* Tokenizer creation arguments */
  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
){
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
  int rc = SQLITE_OK;

  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->bRemoveDiacritic = 1;

  for(i=0; rc==SQLITE_OK && i<nArg; i++){
    const char *z = azArg[i];
    int n = (int)strlen(z);

    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
      pNew->bRemoveDiacritic = 1;
    }
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->bRemoveDiacritic = 0;
    }
    else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
    }
    else if( n>=11 && memcmp("separators=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
    }
    else{
      /* Unrecognized argument */
      rc = SQLITE_ERROR;
    }
  }

  if( rc!=SQLITE_OK ){
    unicodeDestroy((sqlite3_tokenizer *)pNew);
    pNew = 0;
  }
  *pp = (sqlite3_tokenizer *)pNew;
  return rc;
}
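
/*
** Illustrative sketch (not part of the original tokenizer): one way the
** arguments parsed by unicodeCreate() are typically supplied from SQL.
** This assumes the module is registered with FTS4 under the name
** "unicode61", as the FTS3/FTS4 core does; each double-quoted argument
** following the tokenizer name is passed through to azArg[] above.  The
** table and column names used here are hypothetical.
*/
static int exampleCreateUnicode61Table(sqlite3 *db){
  /* "remove_diacritics=0" and "tokenchars=-_" arrive as azArg[0] and
  ** azArg[1] in unicodeCreate(). */
  return sqlite3_exec(db,
      "CREATE VIRTUAL TABLE docs USING fts4(body, "
      "tokenize=unicode61 \"remove_diacritics=0\" \"tokenchars=-_\");",
      0, 0, 0
  );
}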

/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is aInput[0..nInput-1].  A cursor
** used to incrementally tokenize this string is returned in
** *pp.
*/
static int unicodeOpen(
  sqlite3_tokenizer *p,           /* The tokenizer */
  const char *aInput,             /* Input string */
  int nInput,                     /* Size of string aInput in bytes */
  sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
){
  unicode_cursor *pCsr;

  pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
  if( pCsr==0 ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(unicode_cursor));

  pCsr->aInput = (const unsigned char *)aInput;
  if( aInput==0 ){
    pCsr->nInput = 0;
  }else if( nInput<0 ){
    pCsr->nInput = (int)strlen(aInput);
  }else{
    pCsr->nInput = nInput;
  }

  *pp = &pCsr->base;
  UNUSED_PARAMETER(p);
  return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to
** unicodeOpen() above.
*/
static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
  unicode_cursor *pCsr = (unicode_cursor *) pCursor;
  sqlite3_free(pCsr->zToken);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.  The cursor must
** have been opened by a prior call to unicodeOpen().
*/
static int unicodeNext(
  sqlite3_tokenizer_cursor *pC,   /* Cursor returned by unicodeOpen */
  const char **paToken,           /* OUT: Token text */
  int *pnToken,                   /* OUT: Number of bytes at *paToken */
  int *piStart,                   /* OUT: Starting offset of token */
  int *piEnd,                     /* OUT: Ending offset of token */
  int *piPos                      /* OUT: Position integer of token */
){
  unicode_cursor *pCsr = (unicode_cursor *)pC;
  unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
  int iCode = 0;
  char *zOut;
  const unsigned char *z = &pCsr->aInput[pCsr->iOff];
  const unsigned char *zStart = z;
  const unsigned char *zEnd;
  const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];

  /* Scan past any delimiter characters before the start of the next token.
  ** Return SQLITE_DONE early if this takes us all the way to the end of
  ** the input.  */
  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
    if( unicodeIsAlnum(p, iCode) ) break;
    zStart = z;
  }
  if( zStart>=zTerm ) return SQLITE_DONE;

  zOut = pCsr->zToken;
  do {
    int iOut;

    /* Grow the output buffer if required. */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( !zNew ) return SQLITE_NOMEM;
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }

    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);
  }while( unicodeIsAlnum(p, iCode)
       || sqlite3FtsUnicodeIsdiacritic(iCode)
  );

  /* Set the output variables and return. */
  pCsr->iOff = (int)(z - pCsr->aInput);
  *paToken = pCsr->zToken;
  *pnToken = (int)(zOut - pCsr->zToken);
  *piStart = (int)(zStart - pCsr->aInput);
  *piEnd = (int)(zEnd - pCsr->aInput);
  *piPos = pCsr->iToken++;
  return SQLITE_OK;
}
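
/*
** For example, with the default options ("remove_diacritics=1", no
** exception characters) successive calls to unicodeNext() on the input
** "Déjà-Vu 123" return the three tokens "deja", "vu" and "123".  The
** token text is case-folded and stripped of diacritics, while *piStart
** and *piEnd always refer to byte offsets within the original input.
*/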

/*
** Set *ppModule to a pointer to the sqlite3_tokenizer_module
** structure for the unicode tokenizer.
*/
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
  static const sqlite3_tokenizer_module module = {
    0,
    unicodeCreate,
    unicodeDestroy,
    unicodeOpen,
    unicodeClose,
    unicodeNext,
    0,
  };
  *ppModule = &module;
}
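
/*
** Illustrative sketch (not part of the original tokenizer): driving the
** module returned by sqlite3Fts3UnicodeTokenizer() directly, without going
** through the FTS3 core.  As the core itself does, the caller must set
** sqlite3_tokenizer_cursor.pTokenizer after xOpen() returns, because
** unicodeNext() uses it to locate the tokenizer object.  Error handling
** is abbreviated for clarity.
*/
static int exampleTokenizeString(const char *zText, int nText){
  sqlite3_tokenizer_module const *pModule = 0;
  sqlite3_tokenizer *pTokenizer = 0;
  sqlite3_tokenizer_cursor *pCsr = 0;
  const char *zToken;
  int nToken, iStart, iEnd, iPos;
  int rc;

  sqlite3Fts3UnicodeTokenizer(&pModule);
  rc = pModule->xCreate(0, 0, &pTokenizer);      /* No tokenizer arguments */
  if( rc!=SQLITE_OK ) return rc;
  pTokenizer->pModule = pModule;

  rc = pModule->xOpen(pTokenizer, zText, nText, &pCsr);
  if( rc==SQLITE_OK ){
    pCsr->pTokenizer = pTokenizer;               /* Required by unicodeNext() */
    while( SQLITE_OK==(rc = pModule->xNext(
        pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos
    )) ){
      printf("token %d: %.*s (bytes %d..%d)\n", iPos, nToken, zToken, iStart, iEnd);
    }
    if( rc==SQLITE_DONE ) rc = SQLITE_OK;        /* End of input is not an error */
    pModule->xClose(pCsr);
  }
  pModule->xDestroy(pTokenizer);
  return rc;
}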

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
#endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */