/*
** 2012 May 24
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** Implementation of the "unicode" full-text-search tokenizer.
*/

#ifndef SQLITE_DISABLE_FTS3_UNICODE

#include "fts3Int.h"
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "fts3_tokenizer.h"

/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION

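/*
** Lookup table used by READ_UTF8 below. For the lead byte of a multi-byte
** UTF-8 character, entry (lead_byte - 0xC0) holds the payload bits of that
** byte with the length-marker bits stripped (e.g. 0xC3 -> 0x03, 0xE2 -> 0x02).
*/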
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

#define READ_UTF8(zIn, zTerm, c)                      \
  c = *(zIn++);                                       \
  if( c>=0xc0 ){                                      \
    c = sqlite3Utf8Trans1[c-0xc0];                    \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){       \
      c = (c<<6) + (0x3f & *(zIn++));                 \
    }                                                 \
    if( c<0x80                                        \
        || (c&0xFFFFF800)==0xD800                     \
        || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }    \
  }

#define WRITE_UTF8(zOut, c) {                         \
  if( c<0x00080 ){                                    \
    *zOut++ = (u8)(c&0xFF);                           \
  }                                                   \
  else if( c<0x00800 ){                               \
    *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);               \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                  \
  }                                                   \
  else if( c<0x10000 ){                               \
    *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);              \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);             \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                  \
  }else{                                              \
    *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);            \
    *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);            \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);             \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                  \
  }                                                   \
}
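
/*
** For example, the codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE) is
** stored as the two bytes 0xC3 0xA9. READ_UTF8 maps the lead byte through
** sqlite3Utf8Trans1[] (0xC3-0xC0 -> 0x03) and then shifts in the low six
** bits of each trailing byte: (0x03<<6) + (0xA9 & 0x3F) == 0xE9. WRITE_UTF8
** performs the reverse: 0xC0 + ((0xE9>>6) & 0x1F) == 0xC3 followed by
** 0x80 + (0xE9 & 0x3F) == 0xA9. A single byte below 0xC0 is returned
** unchanged, and a multi-byte sequence that decodes to a surrogate, to
** 0xFFFE/0xFFFF, or to a value below 0x80 is replaced by U+FFFD.
*/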

#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

struct unicode_tokenizer {
  sqlite3_tokenizer base;         /* Base class. Must be first */
  int bRemoveDiacritic;           /* True to strip diacritics when folding */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted list of exception codepoints */
};

struct unicode_cursor {
  sqlite3_tokenizer_cursor base;
  const unsigned char *aInput;    /* Input text being tokenized */
  int nInput;                     /* Size of aInput[] in bytes */
  int iOff;                       /* Current offset within aInput[] */
  int iToken;                     /* Index of next token to be returned */
  char *zToken;                   /* Storage for current token */
  int nAlloc;                     /* Space allocated at zToken */
};


/*
** Destroy a tokenizer allocated by unicodeCreate().
*/
static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
  if( pTokenizer ){
    unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
    sqlite3_free(p->aiException);
    sqlite3_free(p);
  }
  return SQLITE_OK;
}

/*
** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
** statement has specified that the tokenizer for this table shall consider
** all characters in string zIn/nIn to be separators (if bAlnum==0) or
** token characters (if bAlnum==1).
**
** For each codepoint in the zIn/nIn string, this function checks if the
** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
** If so, no action is taken. Otherwise, the codepoint is added to the
** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
** codepoints in the aiException[] array.
**
** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
** It is not possible to change the behavior of the tokenizer with respect
** to these codepoints.
*/
static int unicodeAddExceptions(
  unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
  int bAlnum,                     /* Replace Isalnum() return value with this */
  const char *zIn,                /* Array of characters to make exceptions */
  int nIn                         /* Length of zIn in bytes */
){
  const unsigned char *z = (const unsigned char *)zIn;
  const unsigned char *zTerm = &z[nIn];
  int iCode;
  int nEntry = 0;

  assert( bAlnum==0 || bAlnum==1 );

  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
    assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
    if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum
     && sqlite3FtsUnicodeIsdiacritic(iCode)==0
    ){
      nEntry++;
    }
  }

  if( nEntry ){
    int *aNew;                    /* New aiException[] array */
    int nNew;                     /* Number of valid entries in array aNew[] */

    aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
    if( aNew==0 ) return SQLITE_NOMEM;
    nNew = p->nException;

    z = (const unsigned char *)zIn;
    while( z<zTerm ){
      READ_UTF8(z, zTerm, iCode);
      if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum
       && sqlite3FtsUnicodeIsdiacritic(iCode)==0
      ){
        int i, j;
        for(i=0; i<nNew && aNew[i]<iCode; i++);
        for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
        aNew[i] = iCode;
        nNew++;
      }
    }
    p->aiException = aNew;
    p->nException = nNew;
  }

  return SQLITE_OK;
}
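
/*
** For example, if the tokenizer was created with the argument
** "tokenchars=.-", this function is called with bAlnum==1 and zIn pointing
** to the two-byte string ".-". Since sqlite3FtsUnicodeIsalnum() returns 0
** for both '.' (U+002E) and '-' (U+002D), both codepoints are inserted into
** aiException[] (kept in sorted order so that the binary search in
** unicodeIsException() works) and are treated as token characters from
** then on.
*/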

/*
** Return true if the p->aiException[] array contains the value iCode.
*/
static int unicodeIsException(unicode_tokenizer *p, int iCode){
  if( p->nException>0 ){
    int *a = p->aiException;
    int iLo = 0;
    int iHi = p->nException-1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( iCode==a[iTest] ){
        return 1;
      }else if( iCode>a[iTest] ){
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
  }

  return 0;
}

/*
** Return true if, for the purposes of tokenization, codepoint iCode is
** considered a token character (not a separator).
*/
static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
  assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
  return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
}
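
/*
** Continuing the example above: with "tokenchars=.-" in effect,
** sqlite3FtsUnicodeIsalnum('.') returns 0 but unicodeIsException() returns
** 1, so the XOR evaluates to 1 and '.' is treated as part of a token.
** Conversely, a codepoint that is normally alphanumeric but is named in a
** "separators=" option has its result of 1 flipped to 0.
*/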

/*
** Create a new tokenizer instance.
*/
static int unicodeCreate(
  int nArg,                       /* Size of array azArg[] */
  const char * const *azArg,      /* Tokenizer creation arguments */
  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
){
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
  int rc = SQLITE_OK;

  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->bRemoveDiacritic = 1;

  for(i=0; rc==SQLITE_OK && i<nArg; i++){
    const char *z = azArg[i];
    int n = (int)strlen(z);

    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
      pNew->bRemoveDiacritic = 1;
    }
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->bRemoveDiacritic = 0;
    }
    else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
    }
    else if( n>=11 && memcmp("separators=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
    }
    else{
      /* Unrecognized argument */
      rc = SQLITE_ERROR;
    }
  }

  if( rc!=SQLITE_OK ){
    unicodeDestroy((sqlite3_tokenizer *)pNew);
    pNew = 0;
  }
  *pp = (sqlite3_tokenizer *)pNew;
  return rc;
}
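
/*
** Each entry in azArg[] arrives as a complete "option=value" string. For
** instance, a tokenizer created with the three arguments
** "remove_diacritics=0", "tokenchars=.-" and "separators=X" keeps
** diacritics intact, treats '.' and '-' as token characters, and treats
** 'X' as a separator. Any argument that does not match one of the four
** forms tested above causes SQLITE_ERROR to be returned and the
** half-constructed tokenizer to be freed.
*/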

/*
** Prepare to begin tokenizing a particular string. The input string to
** be tokenized is aInput[0..nInput-1]. A cursor used to incrementally
** tokenize this string is returned in *pp.
*/
static int unicodeOpen(
  sqlite3_tokenizer *p,           /* The tokenizer */
  const char *aInput,             /* Input string */
  int nInput,                     /* Size of aInput in bytes (or negative if nul-terminated) */
  sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
){
  unicode_cursor *pCsr;

  pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
  if( pCsr==0 ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(unicode_cursor));

  pCsr->aInput = (const unsigned char *)aInput;
  if( aInput==0 ){
    pCsr->nInput = 0;
  }else if( nInput<0 ){
    pCsr->nInput = (int)strlen(aInput);
  }else{
    pCsr->nInput = nInput;
  }

  *pp = &pCsr->base;
  UNUSED_PARAMETER(p);
  return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to
** unicodeOpen() above.
*/
static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
  unicode_cursor *pCsr = (unicode_cursor *) pCursor;
  sqlite3_free(pCsr->zToken);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to unicodeOpen().
*/
static int unicodeNext(
  sqlite3_tokenizer_cursor *pC,   /* Cursor returned by unicodeOpen */
  const char **paToken,           /* OUT: Token text */
  int *pnToken,                   /* OUT: Number of bytes at *paToken */
  int *piStart,                   /* OUT: Starting offset of token */
  int *piEnd,                     /* OUT: Ending offset of token */
  int *piPos                      /* OUT: Position integer of token */
){
  unicode_cursor *pCsr = (unicode_cursor *)pC;
  unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
  int iCode = 0;
  char *zOut;
  const unsigned char *z = &pCsr->aInput[pCsr->iOff];
  const unsigned char *zStart = z;
  const unsigned char *zEnd;
  const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];

  /* Scan past any delimiter characters before the start of the next token.
  ** Return SQLITE_DONE early if this takes us all the way to the end of
  ** the input. */
  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
    if( unicodeIsAlnum(p, iCode) ) break;
    zStart = z;
  }
  if( zStart>=zTerm ) return SQLITE_DONE;

  zOut = pCsr->zToken;
  do {
    int iOut;

    /* Grow the output buffer if required. */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( !zNew ) return SQLITE_NOMEM;
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }

    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);
  }while( unicodeIsAlnum(p, iCode)
       || sqlite3FtsUnicodeIsdiacritic(iCode)
  );

  /* Set the output variables and return. */
  pCsr->iOff = (int)(z - pCsr->aInput);
  *paToken = pCsr->zToken;
  *pnToken = (int)(zOut - pCsr->zToken);
  *piStart = (int)(zStart - pCsr->aInput);
  *piEnd = (int)(zEnd - pCsr->aInput);
  *piPos = pCsr->iToken++;
  return SQLITE_OK;
}
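
/*
** Example: tokenizing the 11-byte input "Right--HERE" with the default
** settings produces two tokens. The first call returns "right" with
** *piStart==0, *piEnd==5 and *piPos==0. The second call skips the two '-'
** separators and returns the case-folded token "here" with *piStart==7,
** *piEnd==11 and *piPos==1. The third call returns SQLITE_DONE. With
** remove_diacritics=1 (the default) the folded output also has diacritics
** stripped, so an input such as "Häuser" typically becomes the single
** token "hauser".
*/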

/*
** Set *ppModule to a pointer to the sqlite3_tokenizer_module
** structure for the unicode tokenizer.
*/
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
  static const sqlite3_tokenizer_module module = {
    0,
    unicodeCreate,
    unicodeDestroy,
    unicodeOpen,
    unicodeClose,
    unicodeNext,
    0,
  };
  *ppModule = &module;
}
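
/*
** fts3.c obtains this module through the function above and, in builds
** where this tokenizer is enabled, normally registers it under the name
** "unicode61". Assuming that name, a table using this tokenizer can be
** declared with, for example:
**
**   CREATE VIRTUAL TABLE txt USING fts4(
**     tokenize=unicode61 "remove_diacritics=0" "tokenchars=.-"
**   );
**
** Each double-quoted string following the tokenizer name is passed to
** unicodeCreate() as one entry of the azArg[] array.
*/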

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
#endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */