/*
** 2012 May 24
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** Implementation of the "unicode" full-text-search tokenizer.
*/

#ifndef SQLITE_DISABLE_FTS3_UNICODE

#include "fts3Int.h"
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "fts3_tokenizer.h"

/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION

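/*
** Lookup table used by READ_UTF8 below. For the lead byte of a multi-byte
** UTF-8 character, entry (lead_byte - 0xC0) holds the payload bits of that
** byte with the length-marker bits stripped (e.g. 0xC3 -> 0x03, 0xE2 -> 0x02).
*/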
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

#define READ_UTF8(zIn, zTerm, c)                      \
  c = *(zIn++);                                       \
  if( c>=0xc0 ){                                      \
    c = sqlite3Utf8Trans1[c-0xc0];                    \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){       \
      c = (c<<6) + (0x3f & *(zIn++));                 \
    }                                                 \
    if( c<0x80                                        \
        || (c&0xFFFFF800)==0xD800                     \
        || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; }    \
  }

#define WRITE_UTF8(zOut, c) {                         \
  if( c<0x00080 ){                                    \
    *zOut++ = (u8)(c&0xFF);                           \
  }                                                   \
  else if( c<0x00800 ){                               \
    *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);               \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                  \
  }                                                   \
  else if( c<0x10000 ){                               \
    *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);              \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);             \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                  \
  }else{                                              \
    *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);            \
    *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);            \
    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);             \
    *zOut++ = 0x80 + (u8)(c & 0x3F);                  \
  }                                                   \
}
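
/*
** For example, the codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE) is
** stored as the two bytes 0xC3 0xA9. READ_UTF8 maps the lead byte through
** sqlite3Utf8Trans1[] (0xC3-0xC0 -> 0x03) and then shifts in the low six
** bits of each trailing byte: (0x03<<6) + (0xA9 & 0x3F) == 0xE9. WRITE_UTF8
** performs the reverse: 0xC0 + ((0xE9>>6) & 0x1F) == 0xC3 followed by
** 0x80 + (0xE9 & 0x3F) == 0xA9. A single byte below 0xC0 is returned
** unchanged, and a multi-byte sequence that decodes to a surrogate, to
** 0xFFFE/0xFFFF, or to a value below 0x80 is replaced by U+FFFD.
*/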

#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

struct unicode_tokenizer {
  sqlite3_tokenizer base;         /* Base class. Must be first */
  int bRemoveDiacritic;           /* True to strip diacritics when folding */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted list of exception codepoints */
};

struct unicode_cursor {
  sqlite3_tokenizer_cursor base;
  const unsigned char *aInput;    /* Input text being tokenized */
  int nInput;                     /* Size of aInput[] in bytes */
  int iOff;                       /* Current offset within aInput[] */
  int iToken;                     /* Index of next token to be returned */
  char *zToken;                   /* Storage for current token */
  int nAlloc;                     /* Space allocated at zToken */
};


/*
** Destroy a tokenizer allocated by unicodeCreate().
*/
static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
  if( pTokenizer ){
    unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
    sqlite3_free(p->aiException);
    sqlite3_free(p);
  }
  return SQLITE_OK;
}

/*
** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
** statement has specified that the tokenizer for this table shall consider
** all characters in string zIn/nIn to be separators (if bAlnum==0) or
** token characters (if bAlnum==1).
**
** For each codepoint in the zIn/nIn string, this function checks if the
** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
** If so, no action is taken. Otherwise, the codepoint is added to the
** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
** codepoints in the aiException[] array.
**
** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
** It is not possible to change the behavior of the tokenizer with respect
** to these codepoints.
*/
static int unicodeAddExceptions(
  unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
  int bAlnum,                     /* Replace Isalnum() return value with this */
  const char *zIn,                /* Array of characters to make exceptions */
  int nIn                         /* Length of zIn in bytes */
){
  const unsigned char *z = (const unsigned char *)zIn;
  const unsigned char *zTerm = &z[nIn];
  int iCode;
  int nEntry = 0;

  assert( bAlnum==0 || bAlnum==1 );

  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
    assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
    if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum
     && sqlite3FtsUnicodeIsdiacritic(iCode)==0
    ){
      nEntry++;
    }
  }

  if( nEntry ){
    int *aNew;                    /* New aiException[] array */
    int nNew;                     /* Number of valid entries in array aNew[] */

    aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
    if( aNew==0 ) return SQLITE_NOMEM;
    nNew = p->nException;

    z = (const unsigned char *)zIn;
    while( z<zTerm ){
      READ_UTF8(z, zTerm, iCode);
      if( sqlite3FtsUnicodeIsalnum(iCode)!=bAlnum
       && sqlite3FtsUnicodeIsdiacritic(iCode)==0
      ){
        int i, j;
        for(i=0; i<nNew && aNew[i]<iCode; i++);
        for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
        aNew[i] = iCode;
        nNew++;
      }
    }
    p->aiException = aNew;
    p->nException = nNew;
  }

  return SQLITE_OK;
}
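
/*
** For example, if the tokenizer was created with the argument
** "tokenchars=.-", this function is called with bAlnum==1 and zIn pointing
** to the two-byte string ".-". Since sqlite3FtsUnicodeIsalnum() returns 0
** for both '.' (U+002E) and '-' (U+002D), both codepoints are inserted into
** aiException[] (kept in sorted order so that the binary search in
** unicodeIsException() works) and are treated as token characters from
** then on.
*/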

/*
** Return true if the p->aiException[] array contains the value iCode.
*/
static int unicodeIsException(unicode_tokenizer *p, int iCode){
  if( p->nException>0 ){
    int *a = p->aiException;
    int iLo = 0;
    int iHi = p->nException-1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( iCode==a[iTest] ){
        return 1;
      }else if( iCode>a[iTest] ){
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
  }

  return 0;
}

/*
** Return true if, for the purposes of tokenization, codepoint iCode is
** considered a token character (not a separator).
*/
static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
  assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
  return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
}
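
/*
** Continuing the example above: with "tokenchars=.-" in effect,
** sqlite3FtsUnicodeIsalnum('.') returns 0 but unicodeIsException() returns
** 1, so the XOR evaluates to 1 and '.' is treated as part of a token.
** Conversely, a codepoint that is normally alphanumeric but is named in a
** "separators=" option has its result of 1 flipped to 0.
*/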

/*
** Create a new tokenizer instance.
*/
static int unicodeCreate(
  int nArg,                       /* Size of array azArg[] */
  const char * const *azArg,      /* Tokenizer creation arguments */
  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
){
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
  int rc = SQLITE_OK;

  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->bRemoveDiacritic = 1;

  for(i=0; rc==SQLITE_OK && i<nArg; i++){
    const char *z = azArg[i];
    int n = (int)strlen(z);

    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
      pNew->bRemoveDiacritic = 1;
    }
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->bRemoveDiacritic = 0;
    }
    else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
    }
    else if( n>=11 && memcmp("separators=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
    }
    else{
      /* Unrecognized argument */
      rc = SQLITE_ERROR;
    }
  }

  if( rc!=SQLITE_OK ){
    unicodeDestroy((sqlite3_tokenizer *)pNew);
    pNew = 0;
  }
  *pp = (sqlite3_tokenizer *)pNew;
  return rc;
}
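
/*
** Each entry in azArg[] arrives as a complete "option=value" string. For
** instance, a tokenizer created with the three arguments
** "remove_diacritics=0", "tokenchars=.-" and "separators=X" keeps
** diacritics intact, treats '.' and '-' as token characters, and treats
** 'X' as a separator. Any argument that does not match one of the four
** forms tested above causes SQLITE_ERROR to be returned and the
** half-constructed tokenizer to be freed.
*/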

/*
** Prepare to begin tokenizing a particular string. The input string to
** be tokenized is aInput[0..nInput-1]. A cursor used to incrementally
** tokenize this string is returned in *pp.
*/
static int unicodeOpen(
  sqlite3_tokenizer *p,           /* The tokenizer */
  const char *aInput,             /* Input string */
  int nInput,                     /* Size of aInput in bytes (or negative if nul-terminated) */
  sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
){
  unicode_cursor *pCsr;

  pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
  if( pCsr==0 ){
    return SQLITE_NOMEM;
  }
  memset(pCsr, 0, sizeof(unicode_cursor));

  pCsr->aInput = (const unsigned char *)aInput;
  if( aInput==0 ){
    pCsr->nInput = 0;
  }else if( nInput<0 ){
    pCsr->nInput = (int)strlen(aInput);
  }else{
    pCsr->nInput = nInput;
  }

  *pp = &pCsr->base;
  UNUSED_PARAMETER(p);
  return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to
** unicodeOpen() above.
*/
static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
  unicode_cursor *pCsr = (unicode_cursor *) pCursor;
  sqlite3_free(pCsr->zToken);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to unicodeOpen().
*/
static int unicodeNext(
  sqlite3_tokenizer_cursor *pC,   /* Cursor returned by unicodeOpen */
  const char **paToken,           /* OUT: Token text */
  int *pnToken,                   /* OUT: Number of bytes at *paToken */
  int *piStart,                   /* OUT: Starting offset of token */
  int *piEnd,                     /* OUT: Ending offset of token */
  int *piPos                      /* OUT: Position integer of token */
){
  unicode_cursor *pCsr = (unicode_cursor *)pC;
  unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
  int iCode = 0;
  char *zOut;
  const unsigned char *z = &pCsr->aInput[pCsr->iOff];
  const unsigned char *zStart = z;
  const unsigned char *zEnd;
  const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];

  /* Scan past any delimiter characters before the start of the next token.
  ** Return SQLITE_DONE early if this takes us all the way to the end of
  ** the input. */
  while( z<zTerm ){
    READ_UTF8(z, zTerm, iCode);
    if( unicodeIsAlnum(p, iCode) ) break;
    zStart = z;
  }
  if( zStart>=zTerm ) return SQLITE_DONE;

  zOut = pCsr->zToken;
  do {
    int iOut;

    /* Grow the output buffer if required. */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( !zNew ) return SQLITE_NOMEM;
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }

    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);
  }while( unicodeIsAlnum(p, iCode)
       || sqlite3FtsUnicodeIsdiacritic(iCode)
  );

  /* Set the output variables and return. */
  pCsr->iOff = (int)(z - pCsr->aInput);
  *paToken = pCsr->zToken;
  *pnToken = (int)(zOut - pCsr->zToken);
  *piStart = (int)(zStart - pCsr->aInput);
  *piEnd = (int)(zEnd - pCsr->aInput);
  *piPos = pCsr->iToken++;
  return SQLITE_OK;
}
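
/*
** Example: tokenizing the 11-byte input "Right--HERE" with the default
** settings produces two tokens. The first call returns "right" with
** *piStart==0, *piEnd==5 and *piPos==0. The second call skips the two '-'
** separators and returns the case-folded token "here" with *piStart==7,
** *piEnd==11 and *piPos==1. The third call returns SQLITE_DONE. With
** remove_diacritics=1 (the default) the folded output also has diacritics
** stripped, so an input such as "Häuser" typically becomes the single
** token "hauser".
*/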

/*
** Set *ppModule to a pointer to the sqlite3_tokenizer_module
** structure for the unicode tokenizer.
*/
void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
  static const sqlite3_tokenizer_module module = {
    0,
    unicodeCreate,
    unicodeDestroy,
    unicodeOpen,
    unicodeClose,
    unicodeNext,
    0,
  };
  *ppModule = &module;
}
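
/*
** fts3.c obtains this module through the function above and, in builds
** where this tokenizer is enabled, normally registers it under the name
** "unicode61". Assuming that name, a table using this tokenizer can be
** declared with, for example:
**
**   CREATE VIRTUAL TABLE txt USING fts4(
**     tokenize=unicode61 "remove_diacritics=0" "tokenchars=.-"
**   );
**
** Each double-quoted string following the tokenizer name is passed to
** unicodeCreate() as one entry of the azArg[] array.
*/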

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
#endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */