third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test - Issue 1610543003: [sql] Import reference version of SQLite 3.10.2.

Side by Side Diff: third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test

Issue 1610543003: [sql] Import reference version of SQLite 3.10.2. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode.test ('k') | third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode3.test » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 # 2012 May 25	1 # 2012 May 25

2 #	2 #

3 # The author disclaims copyright to this source code. In place of	3 # The author disclaims copyright to this source code. In place of

4 # a legal notice, here is a blessing:	4 # a legal notice, here is a blessing:

5 #	5 #

6 # May you do good and not evil.	6 # May you do good and not evil.

7 # May you find forgiveness for yourself and forgive others.	7 # May you find forgiveness for yourself and forgive others.

8 # May you share freely, never taking more than you give.	8 # May you share freely, never taking more than you give.

9 #	9 #

10 #*************************************************************************	10 #*************************************************************************

11 #	11 #

12 # The tests in this file focus on testing the "unicode" FTS tokenizer.	12 # The tests in this file focus on testing the "unicode" FTS tokenizer.

13 #	13 #

	14 # This is a modified copy of FTS4 test file "fts4_unicode.test".

	15 #

14	16

15 set testdir [file dirname $argv0]	17 source [file join [file dirname [info script]] fts5_common.tcl]

16 source $testdir/tester.tcl	18 set testprefix fts5unicode2

17 ifcapable !fts3_unicode { finish_test ; return }	19

18 set ::testprefix fts4unicode	20 # If SQLITE_ENABLE_FTS5 is not defined, omit this file.

	21 ifcapable !fts5 {

	22 finish_test

	23 return

	24 }

19	25

20 proc do_unicode_token_test {tn input res} {	26 proc do_unicode_token_test {tn input res} {

21 set input [string map {' ''} $input]	27 uplevel [list do_test $tn [list \

22 uplevel [list do_execsql_test $tn "	28 sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input

23 SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');	29 ] [list {*}$res]]

24 " [list [list {*}$res]]]

25 }	30 }

26	31

27 proc do_unicode_token_test2 {tn input res} {	32 proc do_unicode_token_test2 {tn input res} {

28 set input [string map {' ''} $input]	33 uplevel [list do_test $tn [list \

29 uplevel [list do_execsql_test $tn "	34 sqlite3_fts5_tokenize -subst db "unicode61" $input

30 SELECT fts3_tokenizer_test('unicode61', '$input');	35 ] [list {*}$res]]

31 " [list [list {*}$res]]]

32 }	36 }

33	37

34 proc do_unicode_token_test3 {tn args} {	38 proc do_unicode_token_test3 {tn args} {

35 set res [lindex $args end]	39 set tokenizer [concat unicode61 {*}[lrange $args 0 end-2]]

36 set sql "SELECT fts3_tokenizer_test('unicode61'"	40 set input [lindex $args end-1]

37 foreach a [lrange $args 0 end-1] {	41 set res [lindex $args end]

38 append sql ", '"	42 uplevel [list do_test $tn [list \

39 append sql [string map {' ''} $a]	43 sqlite3_fts5_tokenize -subst db $tokenizer $input

40 append sql "'"	44 ] [list {*}$res]]

41 }

42 append sql ")"

43 uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]

44 }	45 }

45	46

46 do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}	47 do_unicode_token_test 1.0 {a B c D} {a a b B c c d D}

47	48

48 do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \	49 do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \

49 "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"	50 "\uE4 \uC4 \uF6 \uD6 \uFC \uDC"

50	51

51 do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \	52 do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \

52 "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"	53 "x\uE4x x\uC4x x\uF6x x\uD6x x\uFCx x\uDCx"

53	54

54 # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.	55 # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.

55 do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"	56 do_unicode_token_test 1.3 "\uDF" "\uDF \uDF"

56 do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"	57 do_unicode_token_test 1.4 "\u1E9E" "\uDF \u1E9E"

57	58

58 do_unicode_token_test 1.5 "The quick brown fox" {	59 do_unicode_token_test 1.5 "The quick brown fox" {

59 0 the The 1 quick quick 2 brown brown 3 fox fox	60 the The quick quick brown brown fox fox

60 }	61 }

61 do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {	62 do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {

62 0 the The 1 quick quick 2 brown brown 3 fox fox	63 the The quick quick brown brown fox fox

63 }	64 }

64	65

65 do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}	66 do_unicode_token_test2 1.7 {a B c D} {a a b B c c d D}

66 do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"	67 do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "a \uC4 o \uD6 u \uDC"

67	68

68 do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \	69 do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \

69 "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"	70 "xax x\uC4x xox x\uD6x xux x\uDCx"

70	71

71 # Check that diacritics are removed if remove_diacritics=1 is specified.	72 # Check that diacritics are removed if remove_diacritics=1 is specified.

72 # And that they do not break tokens.	73 # And that they do not break tokens.

73 do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"	74 do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u301xx"

74	75

75 # Title-case mappings work	76 # Title-case mappings work

76 do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"	77 do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"

	78

	79 do_unicode_token_test 1.12 "\u00C1abc\u00C2 \u00D1def\u00C3" \

	80 "\u00E1abc\u00E2 \u00C1abc\u00C2 \u00F1def\u00E3 \u00D1def\u00C3"

	81

	82 do_unicode_token_test 1.13 "\u00A2abc\u00A3 \u00A4def\u00A5" \

	83 "abc abc def def"

77	84

78 #-------------------------------------------------------------------------	85 #-------------------------------------------------------------------------

79 #	86 #

80 set docs [list {	87 set docs [list {

81 Enhance the INSERT syntax to allow multiple rows to be inserted via the	88 Enhance the INSERT syntax to allow multiple rows to be inserted via the

82 VALUES clause.	89 VALUES clause.

83 } {	90 } {

84 Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.	91 Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.

85 } {	92 } {

86 Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().	93 Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
121 foreach k [array names map] {	128 foreach k [array names map] {

122 lappend mappings [string toupper $k] [lindex $map($k) 0]	129 lappend mappings [string toupper $k] [lindex $map($k) 0]

123 lappend mappings $k [lindex $map($k) 1]	130 lappend mappings $k [lindex $map($k) 1]

124 }	131 }

125 proc mapdoc {doc} {	132 proc mapdoc {doc} {

126 set doc [regsub -all {[[:space:]]+} $doc " "]	133 set doc [regsub -all {[[:space:]]+} $doc " "]

127 string map $::mappings [string trim $doc]	134 string map $::mappings [string trim $doc]

128 }	135 }

129	136

130 do_test 2.0 {	137 do_test 2.0 {

131 execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }	138 execsql { CREATE VIRTUAL TABLE t2 USING fts5(tokenize=unicode61, x); }

132 foreach doc $docs {	139 foreach doc $docs {

133 set d [mapdoc $doc]	140 set d [mapdoc $doc]

134 execsql { INSERT INTO t2 VALUES($d) }	141 execsql { INSERT INTO t2 VALUES($d) }

135 }	142 }

136 } {}	143 } {}

137	144

138 do_test 2.1 {	145 do_test 2.1 {

139 set q [mapdoc "row"]	146 set q [mapdoc "row"]

140 execsql { SELECT * FROM t2 WHERE t2 MATCH $q }	147 execsql { SELECT * FROM t2 WHERE t2 MATCH $q }

141 } [list [mapdoc {	148 } [list [mapdoc {

(...skipping 17 matching lines...) Expand all Loading...
159 5 "rOllback" {	166 5 "rOllback" {

160 ...[ROLLBACK]. Instead, the pending statement	167 ...[ROLLBACK]. Instead, the pending statement

161 will return SQLITE_ABORT upon next access after the [ROLLBACK].	168 will return SQLITE_ABORT upon next access after the [ROLLBACK].

162 }	169 }

163 6 "lang*" {	170 6 "lang*" {

164 Added support for the FTS4 [languageid] option.	171 Added support for the FTS4 [languageid] option.

165 }	172 }

166 } {	173 } {

167 do_test 2.$tn {	174 do_test 2.$tn {

168 set q [mapdoc $query]	175 set q [mapdoc $query]

169 execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }	176 execsql {

	177 SELECT snippet(t2, -1, '[', ']', '...', 15) FROM t2 WHERE t2 MATCH $q

	178 }

170 } [list [mapdoc $snippet]]	179 } [list [mapdoc $snippet]]

171 }	180 }

172	181

173 #-------------------------------------------------------------------------	182 #-------------------------------------------------------------------------

174 # Make sure the unicode61 tokenizer does not crash if it is passed a	183 # Make sure the unicode61 tokenizer does not crash if it is passed a

175 # NULL pointer.	184 # NULL pointer.

176 reset_db	185 reset_db

177 do_execsql_test 3.1 {	186 do_execsql_test 3.1 {

178 CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);	187 CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x, y);

179 INSERT INTO t1 VALUES(NULL, 'a b c');	188 INSERT INTO t1 VALUES(NULL, 'a b c');

180 }	189 }

181	190

182 do_execsql_test 3.2 {	191 do_execsql_test 3.2 {

183 SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'	192 SELECT snippet(t1, -1, '[', ']', '...', 15) FROM t1 WHERE t1 MATCH 'b'

184 } {{a [b] c}}	193 } {{a [b] c}}

185	194

186 do_execsql_test 3.3 {	195 do_execsql_test 3.3 {

187 BEGIN;	196 BEGIN;

188 DELETE FROM t1;	197 DELETE FROM t1;

189 INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');	198 INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');

190 INSERT INTO t1 SELECT * FROM t1;	199 INSERT INTO t1 SELECT * FROM t1;

191 INSERT INTO t1 SELECT * FROM t1;	200 INSERT INTO t1 SELECT * FROM t1;

192 INSERT INTO t1 SELECT * FROM t1;	201 INSERT INTO t1 SELECT * FROM t1;

193 INSERT INTO t1 SELECT * FROM t1;	202 INSERT INTO t1 SELECT * FROM t1;

(...skipping 21 matching lines...) Expand all Loading...
215 #-------------------------------------------------------------------------	224 #-------------------------------------------------------------------------

216 #	225 #

217 reset_db	226 reset_db

218	227

219 do_test 4.1 {	228 do_test 4.1 {

220 set a "abc\uFFFEdef"	229 set a "abc\uFFFEdef"

221 set b "abc\uD800def"	230 set b "abc\uD800def"

222 set c "\uFFFEdef"	231 set c "\uFFFEdef"

223 set d "\uD800def"	232 set d "\uD800def"

224 execsql {	233 execsql {

225 CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);	234 CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);

226 INSERT INTO t1 VALUES($a);	235 INSERT INTO t1 VALUES($a);

227 INSERT INTO t1 VALUES($b);	236 INSERT INTO t1 VALUES($b);

228 INSERT INTO t1 VALUES($c);	237 INSERT INTO t1 VALUES($c);

229 INSERT INTO t1 VALUES($d);	238 INSERT INTO t1 VALUES($d);

230 }	239 }

	240

	241 execsql "CREATE VIRTUAL TABLE t8 USING fts5(

	242 a, b, tokenize=\"unicode61 separators '\uFFFE\uD800\u00BF'\"

	243 )"

231 } {}	244 } {}

232	245

233 do_test 4.2 {	246 do_test 4.2 {

234 set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]	247 set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]

235 set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]	248 set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]

236 set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]	249 set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]

237 set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]	250 set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]

238 execsql {	251 execsql {

239 INSERT INTO t1 VALUES($a);	252 INSERT INTO t1 VALUES($a);

240 INSERT INTO t1 VALUES($b);	253 INSERT INTO t1 VALUES($b);

241 INSERT INTO t1 VALUES($c);	254 INSERT INTO t1 VALUES($c);

242 INSERT INTO t1 VALUES($d);	255 INSERT INTO t1 VALUES($d);

243 }	256 }

244 } {}	257 } {}

245	258

246 do_test 4.3 {	259 do_test 4.3 {

247 set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]	260 set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]

248 set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]	261 set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]

249 set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]	262 set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]

250 set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]	263 set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]

251 execsql {	264 execsql {

252 INSERT INTO t1 VALUES($a);	265 INSERT INTO t1 VALUES($a);

253 INSERT INTO t1 VALUES($b);	266 INSERT INTO t1 VALUES($b);

254 INSERT INTO t1 VALUES($c);	267 INSERT INTO t1 VALUES($c);

255 INSERT INTO t1 VALUES($d);	268 INSERT INTO t1 VALUES($d);

256 }	269 }

257 } {}	270 } {}

258	271

	272 do_test 4.4 {

	273 sqlite3_exec_hex db {

	274 CREATE VIRTUAL TABLE t9 USING fts5(a, b,

	275 tokenize="unicode61 separators '%C09004'"

	276 );

	277 INSERT INTO t9(a) VALUES('abc%88def %89ghi%90');

	278 }

	279 } {0 {}}

	280

	281

259 #-------------------------------------------------------------------------	282 #-------------------------------------------------------------------------

260	283

261 do_unicode_token_test3 5.1 {tokenchars=} {	284 breakpoint

	285 do_unicode_token_test3 5.1 {tokenchars {}} {

262 sqlite3_reset sqlite3_column_int	286 sqlite3_reset sqlite3_column_int

263 } {	287 } {

264 0 sqlite3 sqlite3	288 sqlite3 sqlite3

265 1 reset reset	289 reset reset

266 2 sqlite3 sqlite3	290 sqlite3 sqlite3

267 3 column column	291 column column

268 4 int int	292 int int

269 }	293 }

270	294

271 do_unicode_token_test3 5.2 {tokenchars=_} {	295 do_unicode_token_test3 5.2 {tokenchars _} {

272 sqlite3_reset sqlite3_column_int	296 sqlite3_reset sqlite3_column_int

273 } {	297 } {

274 0 sqlite3_reset sqlite3_reset	298 sqlite3_reset sqlite3_reset

275 1 sqlite3_column_int sqlite3_column_int	299 sqlite3_column_int sqlite3_column_int

276 }	300 }

277	301

278 do_unicode_token_test3 5.3 {separators=xyz} {	302 do_unicode_token_test3 5.3 {separators xyz} {

279 Laotianxhorseyrunszfast	303 Laotianxhorseyrunszfast

280 } {	304 } {

281 0 laotian Laotian	305 laotian Laotian

282 1 horse horse	306 horse horse

283 2 runs runs	307 runs runs

284 3 fast fast	308 fast fast

285 }	309 }

286	310

287 do_unicode_token_test3 5.4 {tokenchars=xyz} {	311 do_unicode_token_test3 5.4 {tokenchars xyz} {

288 Laotianxhorseyrunszfast	312 Laotianxhorseyrunszfast

289 } {	313 } {

290 0 laotianxhorseyrunszfast Laotianxhorseyrunszfast	314 laotianxhorseyrunszfast Laotianxhorseyrunszfast

291 }	315 }

292	316

293 do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {	317 do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {

294 sqlite3_resetxsqlite3_column_intyhonda_phantom	318 sqlite3_resetxsqlite3_column_intyhonda_phantom

295 } {	319 } {

296 0 sqlite3_reset sqlite3_reset	320 sqlite3_reset sqlite3_reset

297 1 sqlite3_column_int sqlite3_column_int	321 sqlite3_column_int sqlite3_column_int

298 2 honda_phantom honda_phantom	322 honda_phantom honda_phantom

299 }	323 }

300	324

301 do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {	325 do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {

302 0 abc abc 1 def def	326 abc abc def def

303 }	327 }

304	328

305 do_unicode_token_test3 5.7 \	329 do_unicode_token_test3 5.7 \

306 "tokenchars=\u2444\u2445" \	330 "tokenchars \u2444\u2445" \

307 "separators=\u05D0\u05D1\u05D2" \	331 "separators \u05D0\u05D1\u05D2" \

308 "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \	332 "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \

309 [list \	333 [list \

310 0 \u2444fre\u2445sh \u2444fre\u2445sh \	334 \u2444fre\u2445sh \u2444fre\u2445sh \

311 1 water water \	335 water water \

312 2 fish fish \	336 fish fish \

313 3 \u2445timer \u2445timer \	337 \u2445timer \u2445timer \

314 ]	338 ]

315	339

316 # Check that it is not possible to add a standalone diacritic codepoint	340 # Check that it is not possible to add a standalone diacritic codepoint

317 # to either separators or tokenchars.	341 # to either separators or tokenchars.

318 do_unicode_token_test3 5.8 "separators=\u0301" \	342 do_unicode_token_test3 5.8 "separators \u0301" \

319 "hello\u0301world \u0301helloworld" \	343 "hello\u0301world \u0301helloworld" \

320 "0 helloworld hello\u0301world 1 helloworld helloworld"	344 "helloworld hello\u0301world helloworld helloworld"

321	345

322 do_unicode_token_test3 5.9 "tokenchars=\u0301" \	346 do_unicode_token_test3 5.9 "tokenchars \u0301" \

323 "hello\u0301world \u0301helloworld" \	347 "hello\u0301world \u0301helloworld" \

324 "0 helloworld hello\u0301world 1 helloworld helloworld"	348 "helloworld hello\u0301world helloworld helloworld"

325	349

326 do_unicode_token_test3 5.10 "separators=\u0301" \	350 do_unicode_token_test3 5.10 "separators \u0301" \

327 "remove_diacritics=0" \	351 "remove_diacritics 0" \

328 "hello\u0301world \u0301helloworld" \	352 "hello\u0301world \u0301helloworld" \

329 "0 hello\u0301world hello\u0301world 1 helloworld helloworld"	353 "hello\u0301world hello\u0301world helloworld helloworld"

330	354

331 do_unicode_token_test3 5.11 "tokenchars=\u0301" \	355 do_unicode_token_test3 5.11 "tokenchars \u0301" \

332 "remove_diacritics=0" \	356 "remove_diacritics 0" \

333 "hello\u0301world \u0301helloworld" \	357 "hello\u0301world \u0301helloworld" \

334 "0 hello\u0301world hello\u0301world 1 helloworld helloworld"	358 "hello\u0301world hello\u0301world helloworld helloworld"

335

336	359

337 #-------------------------------------------------------------------------	360 #-------------------------------------------------------------------------

338	361

339 proc do_tokenize {tokenizer txt} {	362 proc do_tokenize {tokenizer txt} {

340 set res [list]	363 set res [list]

341 foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {	364 foreach {b c} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {

342 lappend res $b	365 lappend res $b

343 }	366 }

344 set res	367 set res

345 }	368 }

346	369

347 # Argument $lCp must be a list of codepoints (integers) that	370 # Argument $lCp must be a list of codepoints (integers) that

348 # correspond to whitespace characters. This command creates a string	371 # correspond to whitespace characters. This command creates a string

349 # $W from the codepoints, then tokenizes "${W}hello${W}world${W}"	372 # $W from the codepoints, then tokenizes "${W}hello${W}world${W}"

350 # using tokenizer $tokenizer. The test passes if the tokenizer successfully	373 # using tokenizer $tokenizer. The test passes if the tokenizer successfully

351 # extracts the two 5 character tokens.	374 # extracts the two 5 character tokens.

352 #	375 #

353 proc do_isspace_test {tn tokenizer lCp} {	376 proc do_isspace_test {tn tokenizer lCp} {

354 set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]	377 set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]

355 set txt "${whitespace}hello${whitespace}world${whitespace}"	378 set txt "${whitespace}hello${whitespace}world${whitespace}"

356 uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]	379 uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]

357 }	380 }

358	381

359 set tokenizers [list unicode61]	382 set tokenizers [list unicode61]

360 ifcapable icu { lappend tokenizers icu }	383 #ifcapable icu { lappend tokenizers icu }

361	384

362 # Some tests to check that the tokenizers can both identify white-space	385 # Some tests to check that the tokenizers can both identify white-space

363 # codepoints. All codepoints tested below are of type "Zs" in the	386 # codepoints. All codepoints tested below are of type "Zs" in the

364 # UnicodeData.txt file.	387 # UnicodeData.txt file.

365 foreach T $tokenizers {	388 foreach T $tokenizers {

366 do_isspace_test 6.$T.1 $T 32	389 do_isspace_test 6.$T.1 $T 32

367 do_isspace_test 6.$T.2 $T 160	390 do_isspace_test 6.$T.2 $T 160

368 do_isspace_test 6.$T.3 $T 5760	391 do_isspace_test 6.$T.3 $T 5760

369 do_isspace_test 6.$T.4 $T 6158	392 do_isspace_test 6.$T.4 $T 6158

370 do_isspace_test 6.$T.5 $T 8192	393 do_isspace_test 6.$T.5 $T 8192

(...skipping 11 matching lines...) Expand all Loading...
382 do_isspace_test 6.$T.17 $T 8287	405 do_isspace_test 6.$T.17 $T 8287

383 do_isspace_test 6.$T.18 $T 12288	406 do_isspace_test 6.$T.18 $T 12288

384	407

385 do_isspace_test 6.$T.19 $T {32 160 5760 6158}	408 do_isspace_test 6.$T.19 $T {32 160 5760 6158}

386 do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}	409 do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}

387 do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}	410 do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}

388 do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}	411 do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}

389 do_isspace_test 6.$T.23 $T {8287 12288}	412 do_isspace_test 6.$T.23 $T {8287 12288}

390 }	413 }

391	414

	415

392 #-------------------------------------------------------------------------	416 #-------------------------------------------------------------------------

393 # Test that the private use ranges are treated as alphanumeric.	417 # Test that the private use ranges are treated as alphanumeric.

394 #	418 #

395 foreach {tn1 c} {	419 foreach {tn1 c} {

396 1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff	420 1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff

397 } {	421 } {

398 foreach {tn2 config res} {	422 foreach {tn2 config res} {

399 1 "" "0 helloworld helloworld"	423 1 "" "helloworld helloworld"

400 2 "separators=*" "0 hello hello 1 world world"	424 2 "separators *" "hello hello world world"

401 } {	425 } {

402 set config [string map [list * $c] $config]	426 set config [string map [list * $c] $config]

403 set input [string map [list * $c] "hello*world"]	427 set input [string map [list * $c] "hello*world"]

404 set output [string map [list * $c] $res]	428 set output [string map [list * $c] $res]

405 do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output	429 do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output

406 }	430 }

407 }	431 }

408	432

409 #-------------------------------------------------------------------------	433 #-------------------------------------------------------------------------

410 # Cursory test of remove_diacritics=0.	434 # Cursory test of remove_diacritics=0.

411 #	435 #

412 # 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS	436 # 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS

413 # 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS	437 # 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS

414 # 00E4;LATIN SMALL LETTER A WITH DIAERESIS	438 # 00E4;LATIN SMALL LETTER A WITH DIAERESIS

415 # 00F6;LATIN SMALL LETTER O WITH DIAERESIS	439 # 00F6;LATIN SMALL LETTER O WITH DIAERESIS

416 #	440 #

417 do_execsql_test 8.1.1 "	441 do_execsql_test 8.1.1 "

418 CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');	442 CREATE VIRTUAL TABLE t3 USING fts5(

	443 content, tokenize='unicode61 remove_diacritics 1'

	444 );

419 INSERT INTO t3 VALUES('o');	445 INSERT INTO t3 VALUES('o');

420 INSERT INTO t3 VALUES('a');	446 INSERT INTO t3 VALUES('a');

421 INSERT INTO t3 VALUES('O');	447 INSERT INTO t3 VALUES('O');

422 INSERT INTO t3 VALUES('A');	448 INSERT INTO t3 VALUES('A');

423 INSERT INTO t3 VALUES('\xD6');	449 INSERT INTO t3 VALUES('\xD6');

424 INSERT INTO t3 VALUES('\xC4');	450 INSERT INTO t3 VALUES('\xC4');

425 INSERT INTO t3 VALUES('\xF6');	451 INSERT INTO t3 VALUES('\xF6');

426 INSERT INTO t3 VALUES('\xE4');	452 INSERT INTO t3 VALUES('\xE4');

427 "	453 "

428 do_execsql_test 8.1.2 {	454 do_execsql_test 8.1.2 {

429 SELECT rowid FROM t3 WHERE t3 MATCH 'o';	455 SELECT rowid FROM t3 WHERE t3 MATCH 'o' ORDER BY rowid ASC;

430 } {1 3 5 7}	456 } {1 3 5 7}

431 do_execsql_test 8.1.3 {	457 do_execsql_test 8.1.3 {

432 SELECT rowid FROM t3 WHERE t3 MATCH 'a';	458 SELECT rowid FROM t3 WHERE t3 MATCH 'a' ORDER BY rowid ASC;

433 } {2 4 6 8}	459 } {2 4 6 8}

434 do_execsql_test 8.2.1 {	460 do_execsql_test 8.2.1 {

435 CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");	461 CREATE VIRTUAL TABLE t4 USING fts5(

436 INSERT INTO t4 SELECT * FROM t3;	462 content, tokenize='unicode61 remove_diacritics 0'

	463 );

	464 INSERT INTO t4 SELECT * FROM t3 ORDER BY rowid ASC;

437 }	465 }

438 do_execsql_test 8.2.2 {	466 do_execsql_test 8.2.2 {

439 SELECT rowid FROM t4 WHERE t4 MATCH 'o';	467 SELECT rowid FROM t4 WHERE t4 MATCH 'o' ORDER BY rowid ASC;

440 } {1 3}	468 } {1 3}

441 do_execsql_test 8.2.3 {	469 do_execsql_test 8.2.3 {

442 SELECT rowid FROM t4 WHERE t4 MATCH 'a';	470 SELECT rowid FROM t4 WHERE t4 MATCH 'a' ORDER BY rowid ASC;

443 } {2 4}	471 } {2 4}

444	472

445 #-------------------------------------------------------------------------	473 #-------------------------------------------------------------------------

446 #	474 #

	475 if 0 {

447 foreach {tn sql} {	476 foreach {tn sql} {

448 1 {	477 1 {

449 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);	478 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);

450 CREATE VIRTUAL TABLE t6 USING fts4(	479 CREATE VIRTUAL TABLE t6 USING fts4(

451 tokenize=unicode61 [tokenchars=="] "tokenchars=[]");	480 tokenize=unicode61 [tokenchars=="] "tokenchars=[]");

452 CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);	481 CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);

453 }	482 }

454 2 {	483 2 {

455 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");	484 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");

456 CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");	485 CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");

(...skipping 91 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
548 #	577 #

549 do_execsql_test 11.1 {	578 do_execsql_test 11.1 {

550 CREATE VIRTUAL TABLE ft1 USING fts3tokenize(	579 CREATE VIRTUAL TABLE ft1 USING fts3tokenize(

551 "unicode61", "tokenchars=@.", "separators=1234567890"	580 "unicode61", "tokenchars=@.", "separators=1234567890"

552 );	581 );

553 SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';	582 SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';

554 } {	583 } {

555 berlin@street sydney.road	584 berlin@street sydney.road

556 }	585 }

557	586

	587 }

	588

558 finish_test	589 finish_test

OLD	NEW