third_party/sqlite/sqlite-src-3080704/ext/fts3/fts3_icu.c - Issue 2363173002: [sqlite] Remove obsolete reference version 3.8.7.4.

Side by Side Diff: third_party/sqlite/sqlite-src-3080704/ext/fts3/fts3_icu.c

Issue 2363173002: [sqlite] Remove obsolete reference version 3.8.7.4. (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/sqlite/sqlite-src-3080704/ext/fts3/fts3_hash.c ('k') | third_party/sqlite/sqlite-src-3080704/ext/fts3/fts3_porter.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 /*

2 ** 2007 June 22

3 **

4 ** The author disclaims copyright to this source code. In place of

5 ** a legal notice, here is a blessing:

6 **

7 ** May you do good and not evil.

8 ** May you find forgiveness for yourself and forgive others.

9 ** May you share freely, never taking more than you give.

10 **

11 *************************************************************************

12 ** This file implements a tokenizer for fts3 based on the ICU library.

13 */

14 #include "fts3Int.h"

15 #if !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3)

16 #ifdef SQLITE_ENABLE_ICU

17

18 #include <assert.h>

19 #include <string.h>

20 #include "fts3_tokenizer.h"

21

22 #include <unicode/ubrk.h>

23 #include <unicode/ucol.h>

24 #include <unicode/ustring.h>

25 #include <unicode/utf16.h>

26

27 typedef struct IcuTokenizer IcuTokenizer;

28 typedef struct IcuCursor IcuCursor;

29

30 struct IcuTokenizer {

31 sqlite3_tokenizer base;

32 char *zLocale;

33 };

34

35 struct IcuCursor {

36 sqlite3_tokenizer_cursor base;

37

38 UBreakIterator pIter; / ICU break-iterator object */

39 int nChar; /* Number of UChar elements in pInput */

40 UChar aChar; / Copy of input using utf-16 encoding */

41 int aOffset; / Offsets of each character in utf-8 input */

42

43 int nBuffer;

44 char *zBuffer;

45

46 int iToken;

47 };

48

49 /*

50 ** Create a new tokenizer instance.

51 */

52 static int icuCreate(

53 int argc, /* Number of entries in argv[] */

54 const char * const argv, / Tokenizer creation arguments */

55 sqlite3_tokenizer *ppTokenizer / OUT: Created tokenizer */

56 ){

57 IcuTokenizer *p;

58 int n = 0;

59

60 if( argc>0 ){

61 n = strlen(argv[0])+1;

62 }

63 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);

64 if( !p ){

65 return SQLITE_NOMEM;

66 }

67 memset(p, 0, sizeof(IcuTokenizer));

68

69 if( n ){

70 p->zLocale = (char *)&p[1];

71 memcpy(p->zLocale, argv[0], n);

72 }

73

74 ppTokenizer = (sqlite3_tokenizer )p;

75

76 return SQLITE_OK;

77 }

78

79 /*

80 ** Destroy a tokenizer

81 */

82 static int icuDestroy(sqlite3_tokenizer *pTokenizer){

83 IcuTokenizer p = (IcuTokenizer )pTokenizer;

84 sqlite3_free(p);

85 return SQLITE_OK;

86 }

87

88 /*

89 ** Prepare to begin tokenizing a particular string. The input

90 ** string to be tokenized is pInput[0..nBytes-1]. A cursor

91 ** used to incrementally tokenize this string is returned in

92 ** *ppCursor.

93 */

94 static int icuOpen(

95 sqlite3_tokenizer pTokenizer, / The tokenizer */

96 const char zInput, / Input string */

97 int nInput, /* Length of zInput in bytes */

98 sqlite3_tokenizer_cursor *ppCursor / OUT: Tokenization cursor */

99 ){

100 IcuTokenizer p = (IcuTokenizer )pTokenizer;

101 IcuCursor *pCsr;

102

103 const int32_t opt = U_FOLD_CASE_DEFAULT;

104 UErrorCode status = U_ZERO_ERROR;

105 int nChar;

106

107 UChar32 c;

108 int iInput = 0;

109 int iOut = 0;

110

111 *ppCursor = 0;

112

113 if( zInput==0 ){

114 nInput = 0;

115 zInput = "";

116 }else if( nInput<0 ){

117 nInput = strlen(zInput);

118 }

119 nChar = nInput+1;

120 pCsr = (IcuCursor *)sqlite3_malloc(

121 sizeof(IcuCursor) + /* IcuCursor */

122 ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */

123 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */

124 );

125 if( !pCsr ){

126 return SQLITE_NOMEM;

127 }

128 memset(pCsr, 0, sizeof(IcuCursor));

129 pCsr->aChar = (UChar *)&pCsr[1];

130 pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];

131

132 pCsr->aOffset[iOut] = iInput;

133 U8_NEXT(zInput, iInput, nInput, c);

134 while( c>0 ){

135 int isError = 0;

136 c = u_foldCase(c, opt);

137 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);

138 if( isError ){

139 sqlite3_free(pCsr);

140 return SQLITE_ERROR;

141 }

142 pCsr->aOffset[iOut] = iInput;

143

144 if( iInput<nInput ){

145 U8_NEXT(zInput, iInput, nInput, c);

146 }else{

147 c = 0;

148 }

149 }

150

151 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);

152 if( !U_SUCCESS(status) ){

153 sqlite3_free(pCsr);

154 return SQLITE_ERROR;

155 }

156 pCsr->nChar = iOut;

157

158 ubrk_first(pCsr->pIter);

159 ppCursor = (sqlite3_tokenizer_cursor )pCsr;

160 return SQLITE_OK;

161 }

162

163 /*

164 ** Close a tokenization cursor previously opened by a call to icuOpen().

165 */

166 static int icuClose(sqlite3_tokenizer_cursor *pCursor){

167 IcuCursor pCsr = (IcuCursor )pCursor;

168 ubrk_close(pCsr->pIter);

169 sqlite3_free(pCsr->zBuffer);

170 sqlite3_free(pCsr);

171 return SQLITE_OK;

172 }

173

174 /*

175 ** Extract the next token from a tokenization cursor.

176 */

177 static int icuNext(

178 sqlite3_tokenizer_cursor pCursor, / Cursor returned by simpleOpen */

179 const char *ppToken, / OUT: ppToken is the token text /

180 int pnBytes, / OUT: Number of bytes in token */

181 int piStartOffset, / OUT: Starting offset of token */

182 int piEndOffset, / OUT: Ending offset of token */

183 int piPosition / OUT: Position integer of token */

184 ){

185 IcuCursor pCsr = (IcuCursor )pCursor;

186

187 int iStart = 0;

188 int iEnd = 0;

189 int nByte = 0;

190

191 while( iStart==iEnd ){

192 UChar32 c;

193

194 iStart = ubrk_current(pCsr->pIter);

195 iEnd = ubrk_next(pCsr->pIter);

196 if( iEnd==UBRK_DONE ){

197 return SQLITE_DONE;

198 }

199

200 while( iStart<iEnd ){

201 int iWhite = iStart;

202 U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);

203 if( u_isspace(c) ){

204 iStart = iWhite;

205 }else{

206 break;

207 }

208 }

209 assert(iStart<=iEnd);

210 }

211

212 do {

213 UErrorCode status = U_ZERO_ERROR;

214 if( nByte ){

215 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);

216 if( !zNew ){

217 return SQLITE_NOMEM;

218 }

219 pCsr->zBuffer = zNew;

220 pCsr->nBuffer = nByte;

221 }

222

223 u_strToUTF8(

224 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */

225 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */

226 &status /* Output success/failure */

227 );

228 } while( nByte>pCsr->nBuffer );

229

230 *ppToken = pCsr->zBuffer;

231 *pnBytes = nByte;

232 *piStartOffset = pCsr->aOffset[iStart];

233 *piEndOffset = pCsr->aOffset[iEnd];

234 *piPosition = pCsr->iToken++;

235

236 return SQLITE_OK;

237 }

238

239 /*

240 ** The set of routines that implement the simple tokenizer

241 */

242 static const sqlite3_tokenizer_module icuTokenizerModule = {

243 0, /* iVersion */

244 icuCreate, /* xCreate */

245 icuDestroy, /* xCreate */

246 icuOpen, /* xOpen */

247 icuClose, /* xClose */

248 icuNext, /* xNext */

249 };

250

251 /*

252 ** Set *ppModule to point at the implementation of the ICU tokenizer.

253 */

254 void sqlite3Fts3IcuTokenizerModule(

255 sqlite3_tokenizer_module const**ppModule

256 ){

257 *ppModule = &icuTokenizerModule;

258 }

259

260 #endif /* defined(SQLITE_ENABLE_ICU) */

261 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

OLD	NEW