third_party/sqlite/sqlite-src-3080704/ext/fts2/fts2_icu.c - Issue 2363173002: [sqlite] Remove obsolete reference version 3.8.7.4.

Side by Side Diff: third_party/sqlite/sqlite-src-3080704/ext/fts2/fts2_icu.c

Issue 2363173002: [sqlite] Remove obsolete reference version 3.8.7.4. (Closed)

Patch Set: Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « third_party/sqlite/sqlite-src-3080704/ext/fts2/fts2_hash.c ('k') | third_party/sqlite/sqlite-src-3080704/ext/fts2/fts2_porter.c » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 /*

2 ** 2007 June 22

3 **

4 ** The author disclaims copyright to this source code. In place of

5 ** a legal notice, here is a blessing:

6 **

7 ** May you do good and not evil.

8 ** May you find forgiveness for yourself and forgive others.

9 ** May you share freely, never taking more than you give.

10 **

11 *************************************************************************

12 ** This file implements a tokenizer for fts2 based on the ICU library.

13 **

14 ** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $

15 */

16

17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)

18 #ifdef SQLITE_ENABLE_ICU

19

20 #include <assert.h>

21 #include <string.h>

22 #include "fts2_tokenizer.h"

23

24 #include <unicode/ubrk.h>

25 #include <unicode/ucol.h>

26 #include <unicode/ustring.h>

27 #include <unicode/utf16.h>

28

29 typedef struct IcuTokenizer IcuTokenizer;

30 typedef struct IcuCursor IcuCursor;

31

32 struct IcuTokenizer {

33 sqlite3_tokenizer base;

34 char *zLocale;

35 };

36

37 struct IcuCursor {

38 sqlite3_tokenizer_cursor base;

39

40 UBreakIterator pIter; / ICU break-iterator object */

41 int nChar; /* Number of UChar elements in pInput */

42 UChar aChar; / Copy of input using utf-16 encoding */

43 int aOffset; / Offsets of each character in utf-8 input */

44

45 int nBuffer;

46 char *zBuffer;

47

48 int iToken;

49 };

50

51 /*

52 ** Create a new tokenizer instance.

53 */

54 static int icuCreate(

55 int argc, /* Number of entries in argv[] */

56 const char * const argv, / Tokenizer creation arguments */

57 sqlite3_tokenizer *ppTokenizer / OUT: Created tokenizer */

58 ){

59 IcuTokenizer *p;

60 int n = 0;

61

62 if( argc>0 ){

63 n = strlen(argv[0])+1;

64 }

65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);

66 if( !p ){

67 return SQLITE_NOMEM;

68 }

69 memset(p, 0, sizeof(IcuTokenizer));

70

71 if( n ){

72 p->zLocale = (char *)&p[1];

73 memcpy(p->zLocale, argv[0], n);

74 }

75

76 ppTokenizer = (sqlite3_tokenizer )p;

77

78 return SQLITE_OK;

79 }

80

81 /*

82 ** Destroy a tokenizer

83 */

84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){

85 IcuTokenizer p = (IcuTokenizer )pTokenizer;

86 sqlite3_free(p);

87 return SQLITE_OK;

88 }

89

90 /*

91 ** Prepare to begin tokenizing a particular string. The input

92 ** string to be tokenized is pInput[0..nBytes-1]. A cursor

93 ** used to incrementally tokenize this string is returned in

94 ** *ppCursor.

95 */

96 static int icuOpen(

97 sqlite3_tokenizer pTokenizer, / The tokenizer */

98 const char zInput, / Input string */

99 int nInput, /* Length of zInput in bytes */

100 sqlite3_tokenizer_cursor *ppCursor / OUT: Tokenization cursor */

101 ){

102 IcuTokenizer p = (IcuTokenizer )pTokenizer;

103 IcuCursor *pCsr;

104

105 const int32_t opt = U_FOLD_CASE_DEFAULT;

106 UErrorCode status = U_ZERO_ERROR;

107 int nChar;

108

109 UChar32 c;

110 int iInput = 0;

111 int iOut = 0;

112

113 *ppCursor = 0;

114

115 if( nInput<0 ){

116 nInput = strlen(zInput);

117 }

118 nChar = nInput+1;

119 pCsr = (IcuCursor *)sqlite3_malloc(

120 sizeof(IcuCursor) + /* IcuCursor */

121 ((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */

122 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */

123 );

124 if( !pCsr ){

125 return SQLITE_NOMEM;

126 }

127 memset(pCsr, 0, sizeof(IcuCursor));

128 pCsr->aChar = (UChar *)&pCsr[1];

129 pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];

130

131 pCsr->aOffset[iOut] = iInput;

132 U8_NEXT(zInput, iInput, nInput, c);

133 while( c>0 ){

134 int isError = 0;

135 c = u_foldCase(c, opt);

136 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);

137 if( isError ){

138 sqlite3_free(pCsr);

139 return SQLITE_ERROR;

140 }

141 pCsr->aOffset[iOut] = iInput;

142

143 if( iInput<nInput ){

144 U8_NEXT(zInput, iInput, nInput, c);

145 }else{

146 c = 0;

147 }

148 }

149

150 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);

151 if( !U_SUCCESS(status) ){

152 sqlite3_free(pCsr);

153 return SQLITE_ERROR;

154 }

155 pCsr->nChar = iOut;

156

157 ubrk_first(pCsr->pIter);

158 ppCursor = (sqlite3_tokenizer_cursor )pCsr;

159 return SQLITE_OK;

160 }

161

162 /*

163 ** Close a tokenization cursor previously opened by a call to icuOpen().

164 */

165 static int icuClose(sqlite3_tokenizer_cursor *pCursor){

166 IcuCursor pCsr = (IcuCursor )pCursor;

167 ubrk_close(pCsr->pIter);

168 sqlite3_free(pCsr->zBuffer);

169 sqlite3_free(pCsr);

170 return SQLITE_OK;

171 }

172

173 /*

174 ** Extract the next token from a tokenization cursor.

175 */

176 static int icuNext(

177 sqlite3_tokenizer_cursor pCursor, / Cursor returned by simpleOpen */

178 const char *ppToken, / OUT: ppToken is the token text /

179 int pnBytes, / OUT: Number of bytes in token */

180 int piStartOffset, / OUT: Starting offset of token */

181 int piEndOffset, / OUT: Ending offset of token */

182 int piPosition / OUT: Position integer of token */

183 ){

184 IcuCursor pCsr = (IcuCursor )pCursor;

185

186 int iStart = 0;

187 int iEnd = 0;

188 int nByte = 0;

189

190 while( iStart==iEnd ){

191 UChar32 c;

192

193 iStart = ubrk_current(pCsr->pIter);

194 iEnd = ubrk_next(pCsr->pIter);

195 if( iEnd==UBRK_DONE ){

196 return SQLITE_DONE;

197 }

198

199 while( iStart<iEnd ){

200 int iWhite = iStart;

201 U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);

202 if( u_isspace(c) ){

203 iStart = iWhite;

204 }else{

205 break;

206 }

207 }

208 assert(iStart<=iEnd);

209 }

210

211 do {

212 UErrorCode status = U_ZERO_ERROR;

213 if( nByte ){

214 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);

215 if( !zNew ){

216 return SQLITE_NOMEM;

217 }

218 pCsr->zBuffer = zNew;

219 pCsr->nBuffer = nByte;

220 }

221

222 u_strToUTF8(

223 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */

224 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */

225 &status /* Output success/failure */

226 );

227 } while( nByte>pCsr->nBuffer );

228

229 *ppToken = pCsr->zBuffer;

230 *pnBytes = nByte;

231 *piStartOffset = pCsr->aOffset[iStart];

232 *piEndOffset = pCsr->aOffset[iEnd];

233 *piPosition = pCsr->iToken++;

234

235 return SQLITE_OK;

236 }

237

238 /*

239 ** The set of routines that implement the simple tokenizer

240 */

241 static const sqlite3_tokenizer_module icuTokenizerModule = {

242 0, /* iVersion */

243 icuCreate, /* xCreate */

244 icuDestroy, /* xCreate */

245 icuOpen, /* xOpen */

246 icuClose, /* xClose */

247 icuNext, /* xNext */

248 };

249

250 /*

251 ** Set *ppModule to point at the implementation of the ICU tokenizer.

252 */

253 void sqlite3Fts2IcuTokenizerModule(

254 sqlite3_tokenizer_module const**ppModule

255 ){

256 *ppModule = &icuTokenizerModule;

257 }

258

259 #endif /* defined(SQLITE_ENABLE_ICU) */

260 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

OLD	NEW