OLD | NEW |
(Empty) | |
| 1 /* |
| 2 ** 2007 June 22 |
| 3 ** |
| 4 ** The author disclaims copyright to this source code. In place of |
| 5 ** a legal notice, here is a blessing: |
| 6 ** |
| 7 ** May you do good and not evil. |
| 8 ** May you find forgiveness for yourself and forgive others. |
| 9 ** May you share freely, never taking more than you give. |
| 10 ** |
| 11 ************************************************************************* |
| 12 ** This file implements a tokenizer for fts3 based on the ICU library. |
| 13 */ |
| 14 #include "fts3Int.h" |
| 15 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) |
| 16 #ifdef SQLITE_ENABLE_ICU |
| 17 |
| 18 #include <assert.h> |
| 19 #include <string.h> |
| 20 #include "fts3_tokenizer.h" |
| 21 |
| 22 #include <unicode/ubrk.h> |
| 23 #include <unicode/ucol.h> |
| 24 #include <unicode/ustring.h> |
| 25 #include <unicode/utf16.h> |
| 26 |
| 27 typedef struct IcuTokenizer IcuTokenizer; |
| 28 typedef struct IcuCursor IcuCursor; |
| 29 |
| 30 struct IcuTokenizer { |
| 31 sqlite3_tokenizer base; |
| 32 char *zLocale; |
| 33 }; |
| 34 |
| 35 struct IcuCursor { |
| 36 sqlite3_tokenizer_cursor base; |
| 37 |
| 38 UBreakIterator *pIter; /* ICU break-iterator object */ |
| 39 int nChar; /* Number of UChar elements in pInput */ |
| 40 UChar *aChar; /* Copy of input using utf-16 encoding */ |
| 41 int *aOffset; /* Offsets of each character in utf-8 input */ |
| 42 |
| 43 int nBuffer; |
| 44 char *zBuffer; |
| 45 |
| 46 int iToken; |
| 47 }; |
| 48 |
| 49 /* |
| 50 ** Create a new tokenizer instance. |
| 51 */ |
| 52 static int icuCreate( |
| 53 int argc, /* Number of entries in argv[] */ |
| 54 const char * const *argv, /* Tokenizer creation arguments */ |
| 55 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ |
| 56 ){ |
| 57 IcuTokenizer *p; |
| 58 int n = 0; |
| 59 |
| 60 if( argc>0 ){ |
| 61 n = strlen(argv[0])+1; |
| 62 } |
| 63 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); |
| 64 if( !p ){ |
| 65 return SQLITE_NOMEM; |
| 66 } |
| 67 memset(p, 0, sizeof(IcuTokenizer)); |
| 68 |
| 69 if( n ){ |
| 70 p->zLocale = (char *)&p[1]; |
| 71 memcpy(p->zLocale, argv[0], n); |
| 72 } |
| 73 |
| 74 *ppTokenizer = (sqlite3_tokenizer *)p; |
| 75 |
| 76 return SQLITE_OK; |
| 77 } |
| 78 |
| 79 /* |
| 80 ** Destroy a tokenizer |
| 81 */ |
| 82 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ |
| 83 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
| 84 sqlite3_free(p); |
| 85 return SQLITE_OK; |
| 86 } |
| 87 |
| 88 /* |
| 89 ** Prepare to begin tokenizing a particular string. The input |
| 90 ** string to be tokenized is pInput[0..nBytes-1]. A cursor |
| 91 ** used to incrementally tokenize this string is returned in |
| 92 ** *ppCursor. |
| 93 */ |
| 94 static int icuOpen( |
| 95 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ |
| 96 const char *zInput, /* Input string */ |
| 97 int nInput, /* Length of zInput in bytes */ |
| 98 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ |
| 99 ){ |
| 100 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; |
| 101 IcuCursor *pCsr; |
| 102 |
| 103 const int32_t opt = U_FOLD_CASE_DEFAULT; |
| 104 UErrorCode status = U_ZERO_ERROR; |
| 105 int nChar; |
| 106 |
| 107 UChar32 c; |
| 108 int iInput = 0; |
| 109 int iOut = 0; |
| 110 |
| 111 *ppCursor = 0; |
| 112 |
| 113 if( zInput==0 ){ |
| 114 nInput = 0; |
| 115 zInput = ""; |
| 116 }else if( nInput<0 ){ |
| 117 nInput = strlen(zInput); |
| 118 } |
| 119 nChar = nInput+1; |
| 120 pCsr = (IcuCursor *)sqlite3_malloc( |
| 121 sizeof(IcuCursor) + /* IcuCursor */ |
| 122 ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */ |
| 123 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ |
| 124 ); |
| 125 if( !pCsr ){ |
| 126 return SQLITE_NOMEM; |
| 127 } |
| 128 memset(pCsr, 0, sizeof(IcuCursor)); |
| 129 pCsr->aChar = (UChar *)&pCsr[1]; |
| 130 pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3]; |
| 131 |
| 132 pCsr->aOffset[iOut] = iInput; |
| 133 U8_NEXT(zInput, iInput, nInput, c); |
| 134 while( c>0 ){ |
| 135 int isError = 0; |
| 136 c = u_foldCase(c, opt); |
| 137 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); |
| 138 if( isError ){ |
| 139 sqlite3_free(pCsr); |
| 140 return SQLITE_ERROR; |
| 141 } |
| 142 pCsr->aOffset[iOut] = iInput; |
| 143 |
| 144 if( iInput<nInput ){ |
| 145 U8_NEXT(zInput, iInput, nInput, c); |
| 146 }else{ |
| 147 c = 0; |
| 148 } |
| 149 } |
| 150 |
| 151 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); |
| 152 if( !U_SUCCESS(status) ){ |
| 153 sqlite3_free(pCsr); |
| 154 return SQLITE_ERROR; |
| 155 } |
| 156 pCsr->nChar = iOut; |
| 157 |
| 158 ubrk_first(pCsr->pIter); |
| 159 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; |
| 160 return SQLITE_OK; |
| 161 } |
| 162 |
| 163 /* |
| 164 ** Close a tokenization cursor previously opened by a call to icuOpen(). |
| 165 */ |
| 166 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ |
| 167 IcuCursor *pCsr = (IcuCursor *)pCursor; |
| 168 ubrk_close(pCsr->pIter); |
| 169 sqlite3_free(pCsr->zBuffer); |
| 170 sqlite3_free(pCsr); |
| 171 return SQLITE_OK; |
| 172 } |
| 173 |
| 174 /* |
| 175 ** Extract the next token from a tokenization cursor. |
| 176 */ |
| 177 static int icuNext( |
| 178 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ |
| 179 const char **ppToken, /* OUT: *ppToken is the token text */ |
| 180 int *pnBytes, /* OUT: Number of bytes in token */ |
| 181 int *piStartOffset, /* OUT: Starting offset of token */ |
| 182 int *piEndOffset, /* OUT: Ending offset of token */ |
| 183 int *piPosition /* OUT: Position integer of token */ |
| 184 ){ |
| 185 IcuCursor *pCsr = (IcuCursor *)pCursor; |
| 186 |
| 187 int iStart = 0; |
| 188 int iEnd = 0; |
| 189 int nByte = 0; |
| 190 |
| 191 while( iStart==iEnd ){ |
| 192 UChar32 c; |
| 193 |
| 194 iStart = ubrk_current(pCsr->pIter); |
| 195 iEnd = ubrk_next(pCsr->pIter); |
| 196 if( iEnd==UBRK_DONE ){ |
| 197 return SQLITE_DONE; |
| 198 } |
| 199 |
| 200 while( iStart<iEnd ){ |
| 201 int iWhite = iStart; |
| 202 U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); |
| 203 if( u_isspace(c) ){ |
| 204 iStart = iWhite; |
| 205 }else{ |
| 206 break; |
| 207 } |
| 208 } |
| 209 assert(iStart<=iEnd); |
| 210 } |
| 211 |
| 212 do { |
| 213 UErrorCode status = U_ZERO_ERROR; |
| 214 if( nByte ){ |
| 215 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); |
| 216 if( !zNew ){ |
| 217 return SQLITE_NOMEM; |
| 218 } |
| 219 pCsr->zBuffer = zNew; |
| 220 pCsr->nBuffer = nByte; |
| 221 } |
| 222 |
| 223 u_strToUTF8( |
| 224 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ |
| 225 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ |
| 226 &status /* Output success/failure */ |
| 227 ); |
| 228 } while( nByte>pCsr->nBuffer ); |
| 229 |
| 230 *ppToken = pCsr->zBuffer; |
| 231 *pnBytes = nByte; |
| 232 *piStartOffset = pCsr->aOffset[iStart]; |
| 233 *piEndOffset = pCsr->aOffset[iEnd]; |
| 234 *piPosition = pCsr->iToken++; |
| 235 |
| 236 return SQLITE_OK; |
| 237 } |
| 238 |
| 239 /* |
| 240 ** The set of routines that implement the simple tokenizer |
| 241 */ |
| 242 static const sqlite3_tokenizer_module icuTokenizerModule = { |
| 243 0, /* iVersion */ |
| 244 icuCreate, /* xCreate */ |
| 245 icuDestroy, /* xCreate */ |
| 246 icuOpen, /* xOpen */ |
| 247 icuClose, /* xClose */ |
| 248 icuNext, /* xNext */ |
| 249 0, /* xLanguageid */ |
| 250 }; |
| 251 |
| 252 /* |
| 253 ** Set *ppModule to point at the implementation of the ICU tokenizer. |
| 254 */ |
| 255 void sqlite3Fts3IcuTokenizerModule( |
| 256 sqlite3_tokenizer_module const**ppModule |
| 257 ){ |
| 258 *ppModule = &icuTokenizerModule; |
| 259 } |
| 260 |
| 261 #endif /* defined(SQLITE_ENABLE_ICU) */ |
| 262 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */ |
OLD | NEW |