| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 ** 2007 June 22 | |
| 3 ** | |
| 4 ** The author disclaims copyright to this source code. In place of | |
| 5 ** a legal notice, here is a blessing: | |
| 6 ** | |
| 7 ** May you do good and not evil. | |
| 8 ** May you find forgiveness for yourself and forgive others. | |
| 9 ** May you share freely, never taking more than you give. | |
| 10 ** | |
| 11 ************************************************************************* | |
| 12 ** This file implements a tokenizer for fts3 based on the ICU library. | |
| 13 ** | |
| 14 ** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $ | |
| 15 */ | |
| 16 | |
| 17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) | |
| 18 #ifdef SQLITE_ENABLE_ICU | |
| 19 | |
| 20 #include <assert.h> | |
| 21 #include <string.h> | |
| 22 #include "fts3_tokenizer.h" | |
| 23 | |
| 24 #include <unicode/ubrk.h> | |
| 25 #include <unicode/ucol.h> | |
| 26 #include <unicode/ustring.h> | |
| 27 #include <unicode/utf16.h> | |
| 28 | |
| 29 typedef struct IcuTokenizer IcuTokenizer; | |
| 30 typedef struct IcuCursor IcuCursor; | |
| 31 | |
| 32 struct IcuTokenizer { | |
| 33 sqlite3_tokenizer base; | |
| 34 char *zLocale; | |
| 35 }; | |
| 36 | |
| 37 struct IcuCursor { | |
| 38 sqlite3_tokenizer_cursor base; | |
| 39 | |
| 40 UBreakIterator *pIter; /* ICU break-iterator object */ | |
| 41 int nChar; /* Number of UChar elements in pInput */ | |
| 42 UChar *aChar; /* Copy of input using utf-16 encoding */ | |
| 43 int *aOffset; /* Offsets of each character in utf-8 input */ | |
| 44 | |
| 45 int nBuffer; | |
| 46 char *zBuffer; | |
| 47 | |
| 48 int iToken; | |
| 49 }; | |
| 50 | |
| 51 /* | |
| 52 ** Create a new tokenizer instance. | |
| 53 */ | |
| 54 static int icuCreate( | |
| 55 int argc, /* Number of entries in argv[] */ | |
| 56 const char * const *argv, /* Tokenizer creation arguments */ | |
| 57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ | |
| 58 ){ | |
| 59 IcuTokenizer *p; | |
| 60 int n = 0; | |
| 61 | |
| 62 if( argc>0 ){ | |
| 63 n = strlen(argv[0])+1; | |
| 64 } | |
| 65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); | |
| 66 if( !p ){ | |
| 67 return SQLITE_NOMEM; | |
| 68 } | |
| 69 memset(p, 0, sizeof(IcuTokenizer)); | |
| 70 | |
| 71 if( n ){ | |
| 72 p->zLocale = (char *)&p[1]; | |
| 73 memcpy(p->zLocale, argv[0], n); | |
| 74 } | |
| 75 | |
| 76 *ppTokenizer = (sqlite3_tokenizer *)p; | |
| 77 | |
| 78 return SQLITE_OK; | |
| 79 } | |
| 80 | |
| 81 /* | |
| 82 ** Destroy a tokenizer | |
| 83 */ | |
| 84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ | |
| 85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; | |
| 86 sqlite3_free(p); | |
| 87 return SQLITE_OK; | |
| 88 } | |
| 89 | |
| 90 /* | |
| 91 ** Prepare to begin tokenizing a particular string. The input | |
| 92 ** string to be tokenized is pInput[0..nBytes-1]. A cursor | |
| 93 ** used to incrementally tokenize this string is returned in | |
| 94 ** *ppCursor. | |
| 95 */ | |
| 96 static int icuOpen( | |
| 97 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | |
| 98 const char *zInput, /* Input string */ | |
| 99 int nInput, /* Length of zInput in bytes */ | |
| 100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | |
| 101 ){ | |
| 102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; | |
| 103 IcuCursor *pCsr; | |
| 104 | |
| 105 const int32_t opt = U_FOLD_CASE_DEFAULT; | |
| 106 UErrorCode status = U_ZERO_ERROR; | |
| 107 int nChar; | |
| 108 | |
| 109 UChar32 c; | |
| 110 int iInput = 0; | |
| 111 int iOut = 0; | |
| 112 | |
| 113 *ppCursor = 0; | |
| 114 | |
| 115 if( nInput<0 ){ | |
| 116 nInput = strlen(zInput); | |
| 117 } | |
| 118 nChar = nInput+1; | |
| 119 pCsr = (IcuCursor *)sqlite3_malloc( | |
| 120 sizeof(IcuCursor) + /* IcuCursor */ | |
| 121 nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ | |
| 122 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ | |
| 123 ); | |
| 124 if( !pCsr ){ | |
| 125 return SQLITE_NOMEM; | |
| 126 } | |
| 127 memset(pCsr, 0, sizeof(IcuCursor)); | |
| 128 pCsr->aChar = (UChar *)&pCsr[1]; | |
| 129 pCsr->aOffset = (int *)&pCsr->aChar[nChar]; | |
| 130 | |
| 131 pCsr->aOffset[iOut] = iInput; | |
| 132 U8_NEXT(zInput, iInput, nInput, c); | |
| 133 while( c>0 ){ | |
| 134 int isError = 0; | |
| 135 c = u_foldCase(c, opt); | |
| 136 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); | |
| 137 if( isError ){ | |
| 138 sqlite3_free(pCsr); | |
| 139 return SQLITE_ERROR; | |
| 140 } | |
| 141 pCsr->aOffset[iOut] = iInput; | |
| 142 | |
| 143 if( iInput<nInput ){ | |
| 144 U8_NEXT(zInput, iInput, nInput, c); | |
| 145 }else{ | |
| 146 c = 0; | |
| 147 } | |
| 148 } | |
| 149 | |
| 150 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); | |
| 151 if( !U_SUCCESS(status) ){ | |
| 152 sqlite3_free(pCsr); | |
| 153 return SQLITE_ERROR; | |
| 154 } | |
| 155 pCsr->nChar = iOut; | |
| 156 | |
| 157 ubrk_first(pCsr->pIter); | |
| 158 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; | |
| 159 return SQLITE_OK; | |
| 160 } | |
| 161 | |
| 162 /* | |
| 163 ** Close a tokenization cursor previously opened by a call to icuOpen(). | |
| 164 */ | |
| 165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ | |
| 166 IcuCursor *pCsr = (IcuCursor *)pCursor; | |
| 167 ubrk_close(pCsr->pIter); | |
| 168 sqlite3_free(pCsr->zBuffer); | |
| 169 sqlite3_free(pCsr); | |
| 170 return SQLITE_OK; | |
| 171 } | |
| 172 | |
| 173 /* | |
| 174 ** Extract the next token from a tokenization cursor. | |
| 175 */ | |
| 176 static int icuNext( | |
| 177 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ | |
| 178 const char **ppToken, /* OUT: *ppToken is the token text */ | |
| 179 int *pnBytes, /* OUT: Number of bytes in token */ | |
| 180 int *piStartOffset, /* OUT: Starting offset of token */ | |
| 181 int *piEndOffset, /* OUT: Ending offset of token */ | |
| 182 int *piPosition /* OUT: Position integer of token */ | |
| 183 ){ | |
| 184 IcuCursor *pCsr = (IcuCursor *)pCursor; | |
| 185 | |
| 186 int iStart = 0; | |
| 187 int iEnd = 0; | |
| 188 int nByte = 0; | |
| 189 | |
| 190 while( iStart==iEnd ){ | |
| 191 UChar32 c; | |
| 192 | |
| 193 iStart = ubrk_current(pCsr->pIter); | |
| 194 iEnd = ubrk_next(pCsr->pIter); | |
| 195 if( iEnd==UBRK_DONE ){ | |
| 196 return SQLITE_DONE; | |
| 197 } | |
| 198 | |
| 199 while( iStart<iEnd ){ | |
| 200 int iWhite = iStart; | |
| 201 U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); | |
| 202 if( u_isspace(c) ){ | |
| 203 iStart = iWhite; | |
| 204 }else{ | |
| 205 break; | |
| 206 } | |
| 207 } | |
| 208 assert(iStart<=iEnd); | |
| 209 } | |
| 210 | |
| 211 do { | |
| 212 UErrorCode status = U_ZERO_ERROR; | |
| 213 if( nByte ){ | |
| 214 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); | |
| 215 if( !zNew ){ | |
| 216 return SQLITE_NOMEM; | |
| 217 } | |
| 218 pCsr->zBuffer = zNew; | |
| 219 pCsr->nBuffer = nByte; | |
| 220 } | |
| 221 | |
| 222 u_strToUTF8( | |
| 223 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ | |
| 224 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ | |
| 225 &status /* Output success/failure */ | |
| 226 ); | |
| 227 } while( nByte>pCsr->nBuffer ); | |
| 228 | |
| 229 *ppToken = pCsr->zBuffer; | |
| 230 *pnBytes = nByte; | |
| 231 *piStartOffset = pCsr->aOffset[iStart]; | |
| 232 *piEndOffset = pCsr->aOffset[iEnd]; | |
| 233 *piPosition = pCsr->iToken++; | |
| 234 | |
| 235 return SQLITE_OK; | |
| 236 } | |
| 237 | |
| 238 /* | |
| 239 ** The set of routines that implement the simple tokenizer | |
| 240 */ | |
| 241 static const sqlite3_tokenizer_module icuTokenizerModule = { | |
| 242 0, /* iVersion */ | |
| 243 icuCreate, /* xCreate */ | |
| 244 icuDestroy, /* xCreate */ | |
| 245 icuOpen, /* xOpen */ | |
| 246 icuClose, /* xClose */ | |
| 247 icuNext, /* xNext */ | |
| 248 }; | |
| 249 | |
| 250 /* | |
| 251 ** Set *ppModule to point at the implementation of the ICU tokenizer. | |
| 252 */ | |
| 253 void sqlite3Fts3IcuTokenizerModule( | |
| 254 sqlite3_tokenizer_module const**ppModule | |
| 255 ){ | |
| 256 *ppModule = &icuTokenizerModule; | |
| 257 } | |
| 258 | |
| 259 #endif /* defined(SQLITE_ENABLE_ICU) */ | |
| 260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ | |
| OLD | NEW |