OLD | NEW |
| (Empty) |
/*
** 2007 June 22
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements a tokenizer for fts2 based on the ICU library.
**
** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
*/
16 | |
17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) | |
18 #ifdef SQLITE_ENABLE_ICU | |
19 | |
20 #include <assert.h> | |
21 #include <string.h> | |
22 #include "fts2_tokenizer.h" | |
23 | |
24 #include <unicode/ubrk.h> | |
25 #include <unicode/ucol.h> | |
26 #include <unicode/ustring.h> | |
27 #include <unicode/utf16.h> | |
28 | |
29 typedef struct IcuTokenizer IcuTokenizer; | |
30 typedef struct IcuCursor IcuCursor; | |
31 | |
32 struct IcuTokenizer { | |
33 sqlite3_tokenizer base; | |
34 char *zLocale; | |
35 }; | |
36 | |
37 struct IcuCursor { | |
38 sqlite3_tokenizer_cursor base; | |
39 | |
40 UBreakIterator *pIter; /* ICU break-iterator object */ | |
41 int nChar; /* Number of UChar elements in pInput */ | |
42 UChar *aChar; /* Copy of input using utf-16 encoding */ | |
43 int *aOffset; /* Offsets of each character in utf-8 input */ | |
44 | |
45 int nBuffer; | |
46 char *zBuffer; | |
47 | |
48 int iToken; | |
49 }; | |
50 | |
51 /* | |
52 ** Create a new tokenizer instance. | |
53 */ | |
54 static int icuCreate( | |
55 int argc, /* Number of entries in argv[] */ | |
56 const char * const *argv, /* Tokenizer creation arguments */ | |
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ | |
58 ){ | |
59 IcuTokenizer *p; | |
60 int n = 0; | |
61 | |
62 if( argc>0 ){ | |
63 n = strlen(argv[0])+1; | |
64 } | |
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); | |
66 if( !p ){ | |
67 return SQLITE_NOMEM; | |
68 } | |
69 memset(p, 0, sizeof(IcuTokenizer)); | |
70 | |
71 if( n ){ | |
72 p->zLocale = (char *)&p[1]; | |
73 memcpy(p->zLocale, argv[0], n); | |
74 } | |
75 | |
76 *ppTokenizer = (sqlite3_tokenizer *)p; | |
77 | |
78 return SQLITE_OK; | |
79 } | |
80 | |
81 /* | |
82 ** Destroy a tokenizer | |
83 */ | |
84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){ | |
85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; | |
86 sqlite3_free(p); | |
87 return SQLITE_OK; | |
88 } | |
89 | |
90 /* | |
91 ** Prepare to begin tokenizing a particular string. The input | |
92 ** string to be tokenized is pInput[0..nBytes-1]. A cursor | |
93 ** used to incrementally tokenize this string is returned in | |
94 ** *ppCursor. | |
95 */ | |
96 static int icuOpen( | |
97 sqlite3_tokenizer *pTokenizer, /* The tokenizer */ | |
98 const char *zInput, /* Input string */ | |
99 int nInput, /* Length of zInput in bytes */ | |
100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ | |
101 ){ | |
102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer; | |
103 IcuCursor *pCsr; | |
104 | |
105 const int32_t opt = U_FOLD_CASE_DEFAULT; | |
106 UErrorCode status = U_ZERO_ERROR; | |
107 int nChar; | |
108 | |
109 UChar32 c; | |
110 int iInput = 0; | |
111 int iOut = 0; | |
112 | |
113 *ppCursor = 0; | |
114 | |
115 if( nInput<0 ){ | |
116 nInput = strlen(zInput); | |
117 } | |
118 nChar = nInput+1; | |
119 pCsr = (IcuCursor *)sqlite3_malloc( | |
120 sizeof(IcuCursor) + /* IcuCursor */ | |
121 ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */ | |
122 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ | |
123 ); | |
124 if( !pCsr ){ | |
125 return SQLITE_NOMEM; | |
126 } | |
127 memset(pCsr, 0, sizeof(IcuCursor)); | |
128 pCsr->aChar = (UChar *)&pCsr[1]; | |
129 pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3]; | |
130 | |
131 pCsr->aOffset[iOut] = iInput; | |
132 U8_NEXT(zInput, iInput, nInput, c); | |
133 while( c>0 ){ | |
134 int isError = 0; | |
135 c = u_foldCase(c, opt); | |
136 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); | |
137 if( isError ){ | |
138 sqlite3_free(pCsr); | |
139 return SQLITE_ERROR; | |
140 } | |
141 pCsr->aOffset[iOut] = iInput; | |
142 | |
143 if( iInput<nInput ){ | |
144 U8_NEXT(zInput, iInput, nInput, c); | |
145 }else{ | |
146 c = 0; | |
147 } | |
148 } | |
149 | |
150 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); | |
151 if( !U_SUCCESS(status) ){ | |
152 sqlite3_free(pCsr); | |
153 return SQLITE_ERROR; | |
154 } | |
155 pCsr->nChar = iOut; | |
156 | |
157 ubrk_first(pCsr->pIter); | |
158 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; | |
159 return SQLITE_OK; | |
160 } | |
161 | |
162 /* | |
163 ** Close a tokenization cursor previously opened by a call to icuOpen(). | |
164 */ | |
165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){ | |
166 IcuCursor *pCsr = (IcuCursor *)pCursor; | |
167 ubrk_close(pCsr->pIter); | |
168 sqlite3_free(pCsr->zBuffer); | |
169 sqlite3_free(pCsr); | |
170 return SQLITE_OK; | |
171 } | |
172 | |
173 /* | |
174 ** Extract the next token from a tokenization cursor. | |
175 */ | |
176 static int icuNext( | |
177 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ | |
178 const char **ppToken, /* OUT: *ppToken is the token text */ | |
179 int *pnBytes, /* OUT: Number of bytes in token */ | |
180 int *piStartOffset, /* OUT: Starting offset of token */ | |
181 int *piEndOffset, /* OUT: Ending offset of token */ | |
182 int *piPosition /* OUT: Position integer of token */ | |
183 ){ | |
184 IcuCursor *pCsr = (IcuCursor *)pCursor; | |
185 | |
186 int iStart = 0; | |
187 int iEnd = 0; | |
188 int nByte = 0; | |
189 | |
190 while( iStart==iEnd ){ | |
191 UChar32 c; | |
192 | |
193 iStart = ubrk_current(pCsr->pIter); | |
194 iEnd = ubrk_next(pCsr->pIter); | |
195 if( iEnd==UBRK_DONE ){ | |
196 return SQLITE_DONE; | |
197 } | |
198 | |
199 while( iStart<iEnd ){ | |
200 int iWhite = iStart; | |
201 U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); | |
202 if( u_isspace(c) ){ | |
203 iStart = iWhite; | |
204 }else{ | |
205 break; | |
206 } | |
207 } | |
208 assert(iStart<=iEnd); | |
209 } | |
210 | |
211 do { | |
212 UErrorCode status = U_ZERO_ERROR; | |
213 if( nByte ){ | |
214 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); | |
215 if( !zNew ){ | |
216 return SQLITE_NOMEM; | |
217 } | |
218 pCsr->zBuffer = zNew; | |
219 pCsr->nBuffer = nByte; | |
220 } | |
221 | |
222 u_strToUTF8( | |
223 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ | |
224 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ | |
225 &status /* Output success/failure */ | |
226 ); | |
227 } while( nByte>pCsr->nBuffer ); | |
228 | |
229 *ppToken = pCsr->zBuffer; | |
230 *pnBytes = nByte; | |
231 *piStartOffset = pCsr->aOffset[iStart]; | |
232 *piEndOffset = pCsr->aOffset[iEnd]; | |
233 *piPosition = pCsr->iToken++; | |
234 | |
235 return SQLITE_OK; | |
236 } | |
237 | |
238 /* | |
239 ** The set of routines that implement the simple tokenizer | |
240 */ | |
241 static const sqlite3_tokenizer_module icuTokenizerModule = { | |
242 0, /* iVersion */ | |
243 icuCreate, /* xCreate */ | |
244 icuDestroy, /* xCreate */ | |
245 icuOpen, /* xOpen */ | |
246 icuClose, /* xClose */ | |
247 icuNext, /* xNext */ | |
248 }; | |
249 | |
250 /* | |
251 ** Set *ppModule to point at the implementation of the ICU tokenizer. | |
252 */ | |
253 void sqlite3Fts2IcuTokenizerModule( | |
254 sqlite3_tokenizer_module const**ppModule | |
255 ){ | |
256 *ppModule = &icuTokenizerModule; | |
257 } | |
258 | |
259 #endif /* defined(SQLITE_ENABLE_ICU) */ | |
260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */ | |
OLD | NEW |