Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(68)

Side by Side Diff: third_party/sqlite/sqlite-src-3080704/ext/fts3/fts3_icu.c

Issue 2363173002: [sqlite] Remove obsolete reference version 3.8.7.4. (Closed)
Patch Set: Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 ** 2007 June 22
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements a tokenizer for fts3 based on the ICU library.
13 */
14 #include "fts3Int.h"
15 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
16 #ifdef SQLITE_ENABLE_ICU
17
18 #include <assert.h>
19 #include <string.h>
20 #include "fts3_tokenizer.h"
21
22 #include <unicode/ubrk.h>
23 #include <unicode/ucol.h>
24 #include <unicode/ustring.h>
25 #include <unicode/utf16.h>
26
27 typedef struct IcuTokenizer IcuTokenizer;
28 typedef struct IcuCursor IcuCursor;
29
/*
** An ICU tokenizer instance, as allocated by icuCreate() and released by
** icuDestroy().
*/
30 struct IcuTokenizer {
31 sqlite3_tokenizer base; /* Base class. Must be first: objects are cast to/from sqlite3_tokenizer* */
32 char *zLocale; /* Copy of the first creation argument, handed to ubrk_open() as the locale; NULL if none */
33 };
34
/*
** State of one tokenization pass over an input string. Created by
** icuOpen(), advanced by icuNext() and released by icuClose().
*/
35 struct IcuCursor {
36 sqlite3_tokenizer_cursor base; /* Base class. Must be first: cursors are cast to/from this type */
37
38 UBreakIterator *pIter; /* ICU break-iterator object */
39 int nChar; /* Number of UChar elements stored in aChar[] */
40 UChar *aChar; /* Case-folded copy of input using utf-16 encoding */
41 int *aOffset; /* aOffset[i] is the utf-8 byte offset of the character starting at aChar[i] */
42
43 int nBuffer; /* Size in bytes of the allocation at zBuffer */
44 char *zBuffer; /* Buffer holding the utf-8 text of the most recent token */
45
46 int iToken; /* Position value to report for the next token */
47 };
48
49 /*
50 ** Create a new tokenizer instance.
51 */
52 static int icuCreate(
53 int argc, /* Number of entries in argv[] */
54 const char * const *argv, /* Tokenizer creation arguments */
55 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
56 ){
57 IcuTokenizer *p;
58 int n = 0;
59
60 if( argc>0 ){
61 n = strlen(argv[0])+1;
62 }
63 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
64 if( !p ){
65 return SQLITE_NOMEM;
66 }
67 memset(p, 0, sizeof(IcuTokenizer));
68
69 if( n ){
70 p->zLocale = (char *)&p[1];
71 memcpy(p->zLocale, argv[0], n);
72 }
73
74 *ppTokenizer = (sqlite3_tokenizer *)p;
75
76 return SQLITE_OK;
77 }
78
79 /*
80 ** Destroy a tokenizer
81 */
82 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
83 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
84 sqlite3_free(p);
85 return SQLITE_OK;
86 }
87
88 /*
89 ** Prepare to begin tokenizing a particular string. The input
90 ** string to be tokenized is pInput[0..nBytes-1]. A cursor
91 ** used to incrementally tokenize this string is returned in
92 ** *ppCursor.
93 */
94 static int icuOpen(
95 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
96 const char *zInput, /* Input string */
97 int nInput, /* Length of zInput in bytes */
98 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
99 ){
100 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
101 IcuCursor *pCsr;
102
103 const int32_t opt = U_FOLD_CASE_DEFAULT;
104 UErrorCode status = U_ZERO_ERROR;
105 int nChar;
106
107 UChar32 c;
108 int iInput = 0;
109 int iOut = 0;
110
111 *ppCursor = 0;
112
113 if( zInput==0 ){
114 nInput = 0;
115 zInput = "";
116 }else if( nInput<0 ){
117 nInput = strlen(zInput);
118 }
119 nChar = nInput+1;
120 pCsr = (IcuCursor *)sqlite3_malloc(
121 sizeof(IcuCursor) + /* IcuCursor */
122 ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */
123 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
124 );
125 if( !pCsr ){
126 return SQLITE_NOMEM;
127 }
128 memset(pCsr, 0, sizeof(IcuCursor));
129 pCsr->aChar = (UChar *)&pCsr[1];
130 pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
131
132 pCsr->aOffset[iOut] = iInput;
133 U8_NEXT(zInput, iInput, nInput, c);
134 while( c>0 ){
135 int isError = 0;
136 c = u_foldCase(c, opt);
137 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
138 if( isError ){
139 sqlite3_free(pCsr);
140 return SQLITE_ERROR;
141 }
142 pCsr->aOffset[iOut] = iInput;
143
144 if( iInput<nInput ){
145 U8_NEXT(zInput, iInput, nInput, c);
146 }else{
147 c = 0;
148 }
149 }
150
151 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
152 if( !U_SUCCESS(status) ){
153 sqlite3_free(pCsr);
154 return SQLITE_ERROR;
155 }
156 pCsr->nChar = iOut;
157
158 ubrk_first(pCsr->pIter);
159 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
160 return SQLITE_OK;
161 }
162
163 /*
164 ** Close a tokenization cursor previously opened by a call to icuOpen().
165 */
166 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
167 IcuCursor *pCsr = (IcuCursor *)pCursor;
168 ubrk_close(pCsr->pIter);
169 sqlite3_free(pCsr->zBuffer);
170 sqlite3_free(pCsr);
171 return SQLITE_OK;
172 }
173
174 /*
175 ** Extract the next token from a tokenization cursor.
176 */
177 static int icuNext(
178 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
179 const char **ppToken, /* OUT: *ppToken is the token text */
180 int *pnBytes, /* OUT: Number of bytes in token */
181 int *piStartOffset, /* OUT: Starting offset of token */
182 int *piEndOffset, /* OUT: Ending offset of token */
183 int *piPosition /* OUT: Position integer of token */
184 ){
185 IcuCursor *pCsr = (IcuCursor *)pCursor;
186
187 int iStart = 0;
188 int iEnd = 0;
189 int nByte = 0;
190
191 while( iStart==iEnd ){
192 UChar32 c;
193
194 iStart = ubrk_current(pCsr->pIter);
195 iEnd = ubrk_next(pCsr->pIter);
196 if( iEnd==UBRK_DONE ){
197 return SQLITE_DONE;
198 }
199
200 while( iStart<iEnd ){
201 int iWhite = iStart;
202 U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
203 if( u_isspace(c) ){
204 iStart = iWhite;
205 }else{
206 break;
207 }
208 }
209 assert(iStart<=iEnd);
210 }
211
212 do {
213 UErrorCode status = U_ZERO_ERROR;
214 if( nByte ){
215 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
216 if( !zNew ){
217 return SQLITE_NOMEM;
218 }
219 pCsr->zBuffer = zNew;
220 pCsr->nBuffer = nByte;
221 }
222
223 u_strToUTF8(
224 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
225 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
226 &status /* Output success/failure */
227 );
228 } while( nByte>pCsr->nBuffer );
229
230 *ppToken = pCsr->zBuffer;
231 *pnBytes = nByte;
232 *piStartOffset = pCsr->aOffset[iStart];
233 *piEndOffset = pCsr->aOffset[iEnd];
234 *piPosition = pCsr->iToken++;
235
236 return SQLITE_OK;
237 }
238
239 /*
240 ** The set of routines that implement the ICU tokenizer
241 */
242 static const sqlite3_tokenizer_module icuTokenizerModule = {
243 0, /* iVersion */
244 icuCreate, /* xCreate */
245 icuDestroy, /* xDestroy */
246 icuOpen, /* xOpen */
247 icuClose, /* xClose */
248 icuNext, /* xNext */
249 };
250
251 /*
252 ** Set *ppModule to point at the implementation of the ICU tokenizer.
253 */
254 void sqlite3Fts3IcuTokenizerModule(
255 sqlite3_tokenizer_module const**ppModule
256 ){
257 *ppModule = &icuTokenizerModule;
258 }
259
260 #endif /* defined(SQLITE_ENABLE_ICU) */
261 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698