third_party/sqlite/sqlite-src-3170000/ext/fts3/fts3_icu.c - Issue 2747283002: [sql] Import reference version of SQLite 3.17..

Side by Side Diff: third_party/sqlite/sqlite-src-3170000/ext/fts3/fts3_icu.c

Issue 2747283002: [sql] Import reference version of SQLite 3.17.. (Closed)

Patch Set: Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/sqlite/sqlite-src-3170000/ext/fts3/fts3_hash.c ('k') | third_party/sqlite/sqlite-src-3170000/ext/fts3/fts3_porter.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 /*

	2 ** 2007 June 22

	3 **

	4 ** The author disclaims copyright to this source code. In place of

	5 ** a legal notice, here is a blessing:

	6 **

	7 ** May you do good and not evil.

	8 ** May you find forgiveness for yourself and forgive others.

	9 ** May you share freely, never taking more than you give.

	10 **

	11 *************************************************************************

	12 ** This file implements a tokenizer for fts3 based on the ICU library.

	13 */

	14 #include "fts3Int.h"

	15 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

	16 #ifdef SQLITE_ENABLE_ICU

	17

	18 #include <assert.h>

	19 #include <string.h>

	20 #include "fts3_tokenizer.h"

	21

	22 #include <unicode/ubrk.h>

	23 #include <unicode/ucol.h>

	24 #include <unicode/ustring.h>

	25 #include <unicode/utf16.h>

	26

	27 typedef struct IcuTokenizer IcuTokenizer;

	28 typedef struct IcuCursor IcuCursor;

	29

	30 struct IcuTokenizer {

	31 sqlite3_tokenizer base;

	32 char *zLocale;

	33 };

	34

	35 struct IcuCursor {

	36 sqlite3_tokenizer_cursor base;

	37

	38 UBreakIterator *pIter; /* ICU break-iterator object */

	39 int nChar; /* Number of UChar elements in pInput */

	40 UChar *aChar; /* Copy of input using utf-16 encoding */

	41 int *aOffset; /* Offsets of each character in utf-8 input */

	42

	43 int nBuffer;

	44 char *zBuffer;

	45

	46 int iToken;

	47 };

	48

	49 /*

	50 ** Create a new tokenizer instance.

	51 */

	52 static int icuCreate(

	53 int argc, /* Number of entries in argv[] */

	54 const char * const *argv, /* Tokenizer creation arguments */

	55 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */

	56 ){

	57 IcuTokenizer *p;

	58 int n = 0;

	59

	60 if( argc>0 ){

	61 n = strlen(argv[0])+1;

	62 }

	63 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);

	64 if( !p ){

	65 return SQLITE_NOMEM;

	66 }

	67 memset(p, 0, sizeof(IcuTokenizer));

	68

	69 if( n ){

	70 p->zLocale = (char *)&p[1];

	71 memcpy(p->zLocale, argv[0], n);

	72 }

	73

	74 *ppTokenizer = (sqlite3_tokenizer *)p;

	75

	76 return SQLITE_OK;

	77 }

	78

	79 /*

	80 ** Destroy a tokenizer

	81 */

	82 static int icuDestroy(sqlite3_tokenizer *pTokenizer){

	83 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;

	84 sqlite3_free(p);

	85 return SQLITE_OK;

	86 }

	87

	88 /*

	89 ** Prepare to begin tokenizing a particular string. The input

	90 ** string to be tokenized is pInput[0..nBytes-1]. A cursor

	91 ** used to incrementally tokenize this string is returned in

	92 ** *ppCursor.

	93 */

	94 static int icuOpen(

	95 sqlite3_tokenizer *pTokenizer, /* The tokenizer */

	96 const char *zInput, /* Input string */

	97 int nInput, /* Length of zInput in bytes */

	98 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */

	99 ){

	100 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;

	101 IcuCursor *pCsr;

	102

	103 const int32_t opt = U_FOLD_CASE_DEFAULT;

	104 UErrorCode status = U_ZERO_ERROR;

	105 int nChar;

	106

	107 UChar32 c;

	108 int iInput = 0;

	109 int iOut = 0;

	110

	111 *ppCursor = 0;

	112

	113 if( zInput==0 ){

	114 nInput = 0;

	115 zInput = "";

	116 }else if( nInput<0 ){

	117 nInput = strlen(zInput);

	118 }

	119 nChar = nInput+1;

	120 pCsr = (IcuCursor *)sqlite3_malloc(

	121 sizeof(IcuCursor) + /* IcuCursor */

	122 ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */

	123 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */

	124 );

	125 if( !pCsr ){

	126 return SQLITE_NOMEM;

	127 }

	128 memset(pCsr, 0, sizeof(IcuCursor));

	129 pCsr->aChar = (UChar *)&pCsr[1];

	130 pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];

	131

	132 pCsr->aOffset[iOut] = iInput;

	133 U8_NEXT(zInput, iInput, nInput, c);

	134 while( c>0 ){

	135 int isError = 0;

	136 c = u_foldCase(c, opt);

	137 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);

	138 if( isError ){

	139 sqlite3_free(pCsr);

	140 return SQLITE_ERROR;

	141 }

	142 pCsr->aOffset[iOut] = iInput;

	143

	144 if( iInput<nInput ){

	145 U8_NEXT(zInput, iInput, nInput, c);

	146 }else{

	147 c = 0;

	148 }

	149 }

	150

	151 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);

	152 if( !U_SUCCESS(status) ){

	153 sqlite3_free(pCsr);

	154 return SQLITE_ERROR;

	155 }

	156 pCsr->nChar = iOut;

	157

	158 ubrk_first(pCsr->pIter);

	159 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;

	160 return SQLITE_OK;

	161 }

	162

	163 /*

	164 ** Close a tokenization cursor previously opened by a call to icuOpen().

	165 */

	166 static int icuClose(sqlite3_tokenizer_cursor *pCursor){

	167 IcuCursor *pCsr = (IcuCursor *)pCursor;

	168 ubrk_close(pCsr->pIter);

	169 sqlite3_free(pCsr->zBuffer);

	170 sqlite3_free(pCsr);

	171 return SQLITE_OK;

	172 }

	173

	174 /*

	175 ** Extract the next token from a tokenization cursor.

	176 */

	177 static int icuNext(

	178 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */

	179 const char **ppToken, /* OUT: *ppToken is the token text */

	180 int *pnBytes, /* OUT: Number of bytes in token */

	181 int *piStartOffset, /* OUT: Starting offset of token */

	182 int *piEndOffset, /* OUT: Ending offset of token */

	183 int *piPosition /* OUT: Position integer of token */

	184 ){

	185 IcuCursor *pCsr = (IcuCursor *)pCursor;

	186

	187 int iStart = 0;

	188 int iEnd = 0;

	189 int nByte = 0;

	190

	191 while( iStart==iEnd ){

	192 UChar32 c;

	193

	194 iStart = ubrk_current(pCsr->pIter);

	195 iEnd = ubrk_next(pCsr->pIter);

	196 if( iEnd==UBRK_DONE ){

	197 return SQLITE_DONE;

	198 }

	199

	200 while( iStart<iEnd ){

	201 int iWhite = iStart;

	202 U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);

	203 if( u_isspace(c) ){

	204 iStart = iWhite;

	205 }else{

	206 break;

	207 }

	208 }

	209 assert(iStart<=iEnd);

	210 }

	211

	212 do {

	213 UErrorCode status = U_ZERO_ERROR;

	214 if( nByte ){

	215 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);

	216 if( !zNew ){

	217 return SQLITE_NOMEM;

	218 }

	219 pCsr->zBuffer = zNew;

	220 pCsr->nBuffer = nByte;

	221 }

	222

	223 u_strToUTF8(

	224 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */

	225 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */

	226 &status /* Output success/failure */

	227 );

	228 } while( nByte>pCsr->nBuffer );

	229

	230 *ppToken = pCsr->zBuffer;

	231 *pnBytes = nByte;

	232 *piStartOffset = pCsr->aOffset[iStart];

	233 *piEndOffset = pCsr->aOffset[iEnd];

	234 *piPosition = pCsr->iToken++;

	235

	236 return SQLITE_OK;

	237 }

	238

	239 /*

	240 ** The set of routines that implement the simple tokenizer

	241 */

	242 static const sqlite3_tokenizer_module icuTokenizerModule = {

	243 0, /* iVersion */

	244 icuCreate, /* xCreate */

	245 icuDestroy, /* xCreate */

	246 icuOpen, /* xOpen */

	247 icuClose, /* xClose */

	248 icuNext, /* xNext */

	249 0, /* xLanguageid */

	250 };

	251

	252 /*

	253 ** Set *ppModule to point at the implementation of the ICU tokenizer.

	254 */

	255 void sqlite3Fts3IcuTokenizerModule(

	256 sqlite3_tokenizer_module const**ppModule

	257 ){

	258 *ppModule = &icuTokenizerModule;

	259 }

	260

	261 #endif /* defined(SQLITE_ENABLE_ICU) */

	262 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

OLD	NEW