Chromium Code Reviews

Side by Side Diff: third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test

Issue 1610543003: [sql] Import reference version of SQLite 3.10.2. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 11 months ago
OLD | NEW
1 # 2012 May 25 1 # 2012 May 25
2 # 2 #
3 # The author disclaims copyright to this source code. In place of 3 # The author disclaims copyright to this source code. In place of
4 # a legal notice, here is a blessing: 4 # a legal notice, here is a blessing:
5 # 5 #
6 # May you do good and not evil. 6 # May you do good and not evil.
7 # May you find forgiveness for yourself and forgive others. 7 # May you find forgiveness for yourself and forgive others.
8 # May you share freely, never taking more than you give. 8 # May you share freely, never taking more than you give.
9 # 9 #
10 #************************************************************************* 10 #*************************************************************************
11 # 11 #
12 # The tests in this file focus on testing the "unicode" FTS tokenizer. 12 # The tests in this file focus on testing the "unicode" FTS tokenizer.
13 # 13 #
14 # This is a modified copy of FTS4 test file "fts4_unicode.test".
15 #
14 16
15 set testdir [file dirname $argv0] 17 source [file join [file dirname [info script]] fts5_common.tcl]
16 source $testdir/tester.tcl 18 set testprefix fts5unicode2
17 ifcapable !fts3_unicode { finish_test ; return } 19
18 set ::testprefix fts4unicode 20 # If SQLITE_ENABLE_FTS5 is not defined, omit this file.
21 ifcapable !fts5 {
22 finish_test
23 return
24 }
19 25
20 proc do_unicode_token_test {tn input res} { 26 proc do_unicode_token_test {tn input res} {
21 set input [string map {' ''} $input] 27 uplevel [list do_test $tn [list \
22 uplevel [list do_execsql_test $tn " 28 sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input
23 SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input'); 29 ] [list {*}$res]]
24 " [list [list {*}$res]]]
25 } 30 }
26 31
27 proc do_unicode_token_test2 {tn input res} { 32 proc do_unicode_token_test2 {tn input res} {
28 set input [string map {' ''} $input] 33 uplevel [list do_test $tn [list \
29 uplevel [list do_execsql_test $tn " 34 sqlite3_fts5_tokenize -subst db "unicode61" $input
30 SELECT fts3_tokenizer_test('unicode61', '$input'); 35 ] [list {*}$res]]
31 " [list [list {*}$res]]]
32 } 36 }
33 37
34 proc do_unicode_token_test3 {tn args} { 38 proc do_unicode_token_test3 {tn args} {
35 set res [lindex $args end] 39 set tokenizer [concat unicode61 {*}[lrange $args 0 end-2]]
36 set sql "SELECT fts3_tokenizer_test('unicode61'" 40 set input [lindex $args end-1]
37 foreach a [lrange $args 0 end-1] { 41 set res [lindex $args end]
38 append sql ", '" 42 uplevel [list do_test $tn [list \
39 append sql [string map {' ''} $a] 43 sqlite3_fts5_tokenize -subst db $tokenizer $input
40 append sql "'" 44 ] [list {*}$res]]
41 }
42 append sql ")"
43 uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
44 } 45 }
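
Note: the three helpers above drive unicode61 through the test harness's private sqlite3_fts5_tokenize Tcl command. Outside the harness, the same token stream can be observed from plain SQL via an fts5vocab table; a minimal sketch, assuming hypothetical table names tok and tok_terms:

  CREATE VIRTUAL TABLE tok USING fts5(x, tokenize='unicode61 remove_diacritics 0');
  CREATE VIRTUAL TABLE tok_terms USING fts5vocab(tok, 'row');
  INSERT INTO tok VALUES('a B c D');
  SELECT term FROM tok_terms;  -- a b c d: one row per distinct case-folded term
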
45 46
46 do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D} 47 do_unicode_token_test 1.0 {a B c D} {a a b B c c d D}
47 48
48 do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \ 49 do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
49 "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC" 50 "\uE4 \uC4 \uF6 \uD6 \uFC \uDC"
50 51
51 do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \ 52 do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
52 "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx" 53 "x\uE4x x\uC4x x\uF6x x\uD6x x\uFCx x\uDCx"
53 54
54 # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s. 55 # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
55 do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF" 56 do_unicode_token_test 1.3 "\uDF" "\uDF \uDF"
56 do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E" 57 do_unicode_token_test 1.4 "\u1E9E" "\uDF \u1E9E"
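
Note: test 1.4 relies on unicode61 case-folding U+1E9E (capital sharp s) to U+00DF, so the two forms match each other. A plain-SQL sketch, with hypothetical table ss:

  CREATE VIRTUAL TABLE ss USING fts5(x);              -- default unicode61 tokenizer
  INSERT INTO ss VALUES(char(0x1E9E));                -- capital sharp s
  SELECT count(*) FROM ss WHERE ss MATCH char(0xDF);  -- 1: both forms fold to U+00DF
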
57 58
58 do_unicode_token_test 1.5 "The quick brown fox" { 59 do_unicode_token_test 1.5 "The quick brown fox" {
59 0 the The 1 quick quick 2 brown brown 3 fox fox 60 the The quick quick brown brown fox fox
60 } 61 }
61 do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" { 62 do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
62 0 the The 1 quick quick 2 brown brown 3 fox fox 63 the The quick quick brown brown fox fox
63 } 64 }
64 65
65 do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D} 66 do_unicode_token_test2 1.7 {a B c D} {a a b B c c d D}
66 do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC" 67 do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "a \uC4 o \uD6 u \uDC"
67 68
68 do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \ 69 do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
69 "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx" 70 "xax x\uC4x xox x\uD6x xux x\uDCx"
70 71
71 # Check that diacritics are removed if remove_diacritics=1 is specified. 72 # Check that diacritics are removed if remove_diacritics=1 is specified.
72 # And that they do not break tokens. 73 # And that they do not break tokens.
73 do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx" 74 do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u301xx"
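
Note: with the default remove_diacritics=1 the combining acute accent (U+0301) is stripped during folding but never splits the token, so the input above indexes as the single term "xxxx". A plain-SQL sketch, with hypothetical table d1:

  CREATE VIRTUAL TABLE d1 USING fts5(x);              -- remove_diacritics defaults to 1
  INSERT INTO d1 VALUES('xx' || char(0x0301) || 'xx');
  SELECT count(*) FROM d1 WHERE d1 MATCH 'xxxx';      -- 1: accent removed, token intact
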
74 75
75 # Title-case mappings work 76 # Title-case mappings work
76 do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5" 77 do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"
78
79 do_unicode_token_test 1.12 "\u00C1abc\u00C2 \u00D1def\u00C3" \
80 "\u00E1abc\u00E2 \u00C1abc\u00C2 \u00F1def\u00E3 \u00D1def\u00C3"
81
82 do_unicode_token_test 1.13 "\u00A2abc\u00A3 \u00A4def\u00A5" \
83 "abc abc def def"
77 84
78 #------------------------------------------------------------------------- 85 #-------------------------------------------------------------------------
79 # 86 #
80 set docs [list { 87 set docs [list {
81 Enhance the INSERT syntax to allow multiple rows to be inserted via the 88 Enhance the INSERT syntax to allow multiple rows to be inserted via the
82 VALUES clause. 89 VALUES clause.
83 } { 90 } {
84 Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause. 91 Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
85 } { 92 } {
86 Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp(). 93 Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
(...skipping 34 matching lines...)
121 foreach k [array names map] { 128 foreach k [array names map] {
122 lappend mappings [string toupper $k] [lindex $map($k) 0] 129 lappend mappings [string toupper $k] [lindex $map($k) 0]
123 lappend mappings $k [lindex $map($k) 1] 130 lappend mappings $k [lindex $map($k) 1]
124 } 131 }
125 proc mapdoc {doc} { 132 proc mapdoc {doc} {
126 set doc [regsub -all {[[:space:]]+} $doc " "] 133 set doc [regsub -all {[[:space:]]+} $doc " "]
127 string map $::mappings [string trim $doc] 134 string map $::mappings [string trim $doc]
128 } 135 }
129 136
130 do_test 2.0 { 137 do_test 2.0 {
131 execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); } 138 execsql { CREATE VIRTUAL TABLE t2 USING fts5(tokenize=unicode61, x); }
132 foreach doc $docs { 139 foreach doc $docs {
133 set d [mapdoc $doc] 140 set d [mapdoc $doc]
134 execsql { INSERT INTO t2 VALUES($d) } 141 execsql { INSERT INTO t2 VALUES($d) }
135 } 142 }
136 } {} 143 } {}
137 144
138 do_test 2.1 { 145 do_test 2.1 {
139 set q [mapdoc "row"] 146 set q [mapdoc "row"]
140 execsql { SELECT * FROM t2 WHERE t2 MATCH $q } 147 execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
141 } [list [mapdoc { 148 } [list [mapdoc {
(...skipping 17 matching lines...)
159 5 "rOllback" { 166 5 "rOllback" {
160 ...[ROLLBACK]. Instead, the pending statement 167 ...[ROLLBACK]. Instead, the pending statement
161 will return SQLITE_ABORT upon next access after the [ROLLBACK]. 168 will return SQLITE_ABORT upon next access after the [ROLLBACK].
162 } 169 }
163 6 "lang*" { 170 6 "lang*" {
164 Added support for the FTS4 [languageid] option. 171 Added support for the FTS4 [languageid] option.
165 } 172 }
166 } { 173 } {
167 do_test 2.$tn { 174 do_test 2.$tn {
168 set q [mapdoc $query] 175 set q [mapdoc $query]
169 execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q } 176 execsql {
177 SELECT snippet(t2, -1, '[', ']', '...', 15) FROM t2 WHERE t2 MATCH $q
178 }
170 } [list [mapdoc $snippet]] 179 } [list [mapdoc $snippet]]
171 } 180 }
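
Note: the reworked queries reflect FTS5's six-argument snippet(): snippet(table, column, open, close, ellipsis, max-tokens), where column -1 means "pick the best column" and max-tokens (15 above) caps the snippet length. A sketch against a hypothetical docs table:

  CREATE VIRTUAL TABLE docs USING fts5(body);
  INSERT INTO docs VALUES('the quick brown fox jumps over the lazy dog');
  SELECT snippet(docs, -1, '[', ']', '...', 4) FROM docs WHERE docs MATCH 'fox';
  -- returns roughly '...brown [fox] jumps...'
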
172 181
173 #------------------------------------------------------------------------- 182 #-------------------------------------------------------------------------
174 # Make sure the unicode61 tokenizer does not crash if it is passed a 183 # Make sure the unicode61 tokenizer does not crash if it is passed a
175 # NULL pointer. 184 # NULL pointer.
176 reset_db 185 reset_db
177 do_execsql_test 3.1 { 186 do_execsql_test 3.1 {
178 CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y); 187 CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x, y);
179 INSERT INTO t1 VALUES(NULL, 'a b c'); 188 INSERT INTO t1 VALUES(NULL, 'a b c');
180 } 189 }
181 190
182 do_execsql_test 3.2 { 191 do_execsql_test 3.2 {
183 SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b' 192 SELECT snippet(t1, -1, '[', ']', '...', 15) FROM t1 WHERE t1 MATCH 'b'
184 } {{a [b] c}} 193 } {{a [b] c}}
185 194
186 do_execsql_test 3.3 { 195 do_execsql_test 3.3 {
187 BEGIN; 196 BEGIN;
188 DELETE FROM t1; 197 DELETE FROM t1;
189 INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b'); 198 INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
190 INSERT INTO t1 SELECT * FROM t1; 199 INSERT INTO t1 SELECT * FROM t1;
191 INSERT INTO t1 SELECT * FROM t1; 200 INSERT INTO t1 SELECT * FROM t1;
192 INSERT INTO t1 SELECT * FROM t1; 201 INSERT INTO t1 SELECT * FROM t1;
193 INSERT INTO t1 SELECT * FROM t1; 202 INSERT INTO t1 SELECT * FROM t1;
(...skipping 21 matching lines...)
215 #------------------------------------------------------------------------- 224 #-------------------------------------------------------------------------
216 # 225 #
217 reset_db 226 reset_db
218 227
219 do_test 4.1 { 228 do_test 4.1 {
220 set a "abc\uFFFEdef" 229 set a "abc\uFFFEdef"
221 set b "abc\uD800def" 230 set b "abc\uD800def"
222 set c "\uFFFEdef" 231 set c "\uFFFEdef"
223 set d "\uD800def" 232 set d "\uD800def"
224 execsql { 233 execsql {
225 CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x); 234 CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
226 INSERT INTO t1 VALUES($a); 235 INSERT INTO t1 VALUES($a);
227 INSERT INTO t1 VALUES($b); 236 INSERT INTO t1 VALUES($b);
228 INSERT INTO t1 VALUES($c); 237 INSERT INTO t1 VALUES($c);
229 INSERT INTO t1 VALUES($d); 238 INSERT INTO t1 VALUES($d);
230 } 239 }
240
241 execsql "CREATE VIRTUAL TABLE t8 USING fts5(
242 a, b, tokenize=\"unicode61 separators '\uFFFE\uD800\u00BF'\"
243 )"
231 } {} 244 } {}
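
Note: the new t8 table passes extra codepoints (including a noncharacter and a surrogate) to unicode61's separators option inside the tokenize= argument. The same syntax with an ordinary letter, a sketch with hypothetical table sep:

  CREATE VIRTUAL TABLE sep USING fts5(
    a, tokenize="unicode61 separators 'x'"
  );
  INSERT INTO sep VALUES('helloxworld');
  SELECT count(*) FROM sep WHERE sep MATCH 'hello AND world';  -- 1: 'x' now splits tokens
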
232 245
233 do_test 4.2 { 246 do_test 4.2 {
234 set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}] 247 set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
235 set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}] 248 set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
236 set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}] 249 set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
237 set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}] 250 set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
238 execsql { 251 execsql {
239 INSERT INTO t1 VALUES($a); 252 INSERT INTO t1 VALUES($a);
240 INSERT INTO t1 VALUES($b); 253 INSERT INTO t1 VALUES($b);
241 INSERT INTO t1 VALUES($c); 254 INSERT INTO t1 VALUES($c);
242 INSERT INTO t1 VALUES($d); 255 INSERT INTO t1 VALUES($d);
243 } 256 }
244 } {} 257 } {}
245 258
246 do_test 4.3 { 259 do_test 4.3 {
247 set a [binary format c* {0xF7 0xBF 0xBF 0xBF}] 260 set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
248 set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}] 261 set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
249 set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}] 262 set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
250 set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}] 263 set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
251 execsql { 264 execsql {
252 INSERT INTO t1 VALUES($a); 265 INSERT INTO t1 VALUES($a);
253 INSERT INTO t1 VALUES($b); 266 INSERT INTO t1 VALUES($b);
254 INSERT INTO t1 VALUES($c); 267 INSERT INTO t1 VALUES($c);
255 INSERT INTO t1 VALUES($d); 268 INSERT INTO t1 VALUES($d);
256 } 269 }
257 } {} 270 } {}
258 271
272 do_test 4.4 {
273 sqlite3_exec_hex db {
274 CREATE VIRTUAL TABLE t9 USING fts5(a, b,
275 tokenize="unicode61 separators '%C09004'"
276 );
277 INSERT INTO t9(a) VALUES('abc%88def %89ghi%90');
278 }
279 } {0 {}}
280
281
259 #------------------------------------------------------------------------- 282 #-------------------------------------------------------------------------
260 283
261 do_unicode_token_test3 5.1 {tokenchars=} { 284 breakpoint
285 do_unicode_token_test3 5.1 {tokenchars {}} {
262 sqlite3_reset sqlite3_column_int 286 sqlite3_reset sqlite3_column_int
263 } { 287 } {
264 0 sqlite3 sqlite3 288 sqlite3 sqlite3
265 1 reset reset 289 reset reset
266 2 sqlite3 sqlite3 290 sqlite3 sqlite3
267 3 column column 291 column column
268 4 int int 292 int int
269 } 293 }
270 294
271 do_unicode_token_test3 5.2 {tokenchars=_} { 295 do_unicode_token_test3 5.2 {tokenchars _} {
272 sqlite3_reset sqlite3_column_int 296 sqlite3_reset sqlite3_column_int
273 } { 297 } {
274 0 sqlite3_reset sqlite3_reset 298 sqlite3_reset sqlite3_reset
275 1 sqlite3_column_int sqlite3_column_int 299 sqlite3_column_int sqlite3_column_int
276 } 300 }
277 301
278 do_unicode_token_test3 5.3 {separators=xyz} { 302 do_unicode_token_test3 5.3 {separators xyz} {
279 Laotianxhorseyrunszfast 303 Laotianxhorseyrunszfast
280 } { 304 } {
281 0 laotian Laotian 305 laotian Laotian
282 1 horse horse 306 horse horse
283 2 runs runs 307 runs runs
284 3 fast fast 308 fast fast
285 } 309 }
286 310
287 do_unicode_token_test3 5.4 {tokenchars=xyz} { 311 do_unicode_token_test3 5.4 {tokenchars xyz} {
288 Laotianxhorseyrunszfast 312 Laotianxhorseyrunszfast
289 } { 313 } {
290 0 laotianxhorseyrunszfast Laotianxhorseyrunszfast 314 laotianxhorseyrunszfast Laotianxhorseyrunszfast
291 } 315 }
292 316
293 do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} { 317 do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
294 sqlite3_resetxsqlite3_column_intyhonda_phantom 318 sqlite3_resetxsqlite3_column_intyhonda_phantom
295 } { 319 } {
296 0 sqlite3_reset sqlite3_reset 320 sqlite3_reset sqlite3_reset
297 1 sqlite3_column_int sqlite3_column_int 321 sqlite3_column_int sqlite3_column_int
298 2 honda_phantom honda_phantom 322 honda_phantom honda_phantom
299 } 323 }
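
Note: FTS5 passes tokenizer options as space-separated name/value pairs ({tokenchars _}) where FTS4 used key=value strings (tokenchars=_). tokenchars moves codepoints into the token class, separators moves them out, and test 5.5 exercises both at once. The equivalent table definition, a sketch with hypothetical table ident:

  CREATE VIRTUAL TABLE ident USING fts5(
    a, tokenize="unicode61 tokenchars '_' separators 'zyx'"
  );
  INSERT INTO ident VALUES('sqlite3_resetxsqlite3_column_int');
  SELECT count(*) FROM ident WHERE ident MATCH '"sqlite3_reset"';  -- 1: '_' kept, 'x' splits
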
300 324
301 do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" { 325 do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
302 0 abc abc 1 def def 326 abc abc def def
303 } 327 }
304 328
305 do_unicode_token_test3 5.7 \ 329 do_unicode_token_test3 5.7 \
306 "tokenchars=\u2444\u2445" \ 330 "tokenchars \u2444\u2445" \
307 "separators=\u05D0\u05D1\u05D2" \ 331 "separators \u05D0\u05D1\u05D2" \
308 "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \ 332 "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
309 [list \ 333 [list \
310 0 \u2444fre\u2445sh \u2444fre\u2445sh \ 334 \u2444fre\u2445sh \u2444fre\u2445sh \
311 1 water water \ 335 water water \
312 2 fish fish \ 336 fish fish \
313 3 \u2445timer \u2445timer \ 337 \u2445timer \u2445timer \
314 ] 338 ]
315 339
316 # Check that it is not possible to add a standalone diacritic codepoint 340 # Check that it is not possible to add a standalone diacritic codepoint
317 # to either separators or tokenchars. 341 # to either separators or tokenchars.
318 do_unicode_token_test3 5.8 "separators=\u0301" \ 342 do_unicode_token_test3 5.8 "separators \u0301" \
319 "hello\u0301world \u0301helloworld" \ 343 "hello\u0301world \u0301helloworld" \
320 "0 helloworld hello\u0301world 1 helloworld helloworld" 344 "helloworld hello\u0301world helloworld helloworld"
321 345
322 do_unicode_token_test3 5.9 "tokenchars=\u0301" \ 346 do_unicode_token_test3 5.9 "tokenchars \u0301" \
323 "hello\u0301world \u0301helloworld" \ 347 "hello\u0301world \u0301helloworld" \
324 "0 helloworld hello\u0301world 1 helloworld helloworld" 348 "helloworld hello\u0301world helloworld helloworld"
325 349
326 do_unicode_token_test3 5.10 "separators=\u0301" \ 350 do_unicode_token_test3 5.10 "separators \u0301" \
327 "remove_diacritics=0" \ 351 "remove_diacritics 0" \
328 "hello\u0301world \u0301helloworld" \ 352 "hello\u0301world \u0301helloworld" \
329 "0 hello\u0301world hello\u0301world 1 helloworld helloworld" 353 "hello\u0301world hello\u0301world helloworld helloworld"
330 354
331 do_unicode_token_test3 5.11 "tokenchars=\u0301" \ 355 do_unicode_token_test3 5.11 "tokenchars \u0301" \
332 "remove_diacritics=0" \ 356 "remove_diacritics 0" \
333 "hello\u0301world \u0301helloworld" \ 357 "hello\u0301world \u0301helloworld" \
334 "0 hello\u0301world hello\u0301world 1 helloworld helloworld" 358 "hello\u0301world hello\u0301world helloworld helloworld"
335
336 359
337 #------------------------------------------------------------------------- 360 #-------------------------------------------------------------------------
338 361
339 proc do_tokenize {tokenizer txt} { 362 proc do_tokenize {tokenizer txt} {
340 set res [list] 363 set res [list]
341 foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] { 364 foreach {b c} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
342 lappend res $b 365 lappend res $b
343 } 366 }
344 set res 367 set res
345 } 368 }
346 369
347 # Argument $lCp must be a list of codepoints (integers) that 370 # Argument $lCp must be a list of codepoints (integers) that
348 # correspond to whitespace characters. This command creates a string 371 # correspond to whitespace characters. This command creates a string
349 # $W from the codepoints, then tokenizes "${W}hello${W}world${W}" 372 # $W from the codepoints, then tokenizes "${W}hello${W}world${W}"
350 # using tokenizer $tokenizer. The test passes if the tokenizer successfully 373 # using tokenizer $tokenizer. The test passes if the tokenizer successfully
351 # extracts the two 5-character tokens. 374 # extracts the two 5-character tokens.
352 # 375 #
353 proc do_isspace_test {tn tokenizer lCp} { 376 proc do_isspace_test {tn tokenizer lCp} {
354 set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp] 377 set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
355 set txt "${whitespace}hello${whitespace}world${whitespace}" 378 set txt "${whitespace}hello${whitespace}world${whitespace}"
356 uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}] 379 uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
357 } 380 }
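
Note: do_isspace_test builds its separator string from raw codepoints; any character of Unicode category Zs should behave exactly like an ASCII space. A plain-SQL spot check, with hypothetical table ws and U+00A0 (NO-BREAK SPACE):

  CREATE VIRTUAL TABLE ws USING fts5(x);
  INSERT INTO ws VALUES('hello' || char(160) || 'world');
  SELECT count(*) FROM ws WHERE ws MATCH 'hello AND world';  -- 1: U+00A0 separates the tokens
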
358 381
359 set tokenizers [list unicode61] 382 set tokenizers [list unicode61]
360 ifcapable icu { lappend tokenizers icu } 383 #ifcapable icu { lappend tokenizers icu }
361 384
362 # Some tests to check that the tokenizers correctly identify white-space 385 # Some tests to check that the tokenizers correctly identify white-space
363 # codepoints. All codepoints tested below are of type "Zs" in the 386 # codepoints. All codepoints tested below are of type "Zs" in the
364 # UnicodeData.txt file. 387 # UnicodeData.txt file.
365 foreach T $tokenizers { 388 foreach T $tokenizers {
366 do_isspace_test 6.$T.1 $T 32 389 do_isspace_test 6.$T.1 $T 32
367 do_isspace_test 6.$T.2 $T 160 390 do_isspace_test 6.$T.2 $T 160
368 do_isspace_test 6.$T.3 $T 5760 391 do_isspace_test 6.$T.3 $T 5760
369 do_isspace_test 6.$T.4 $T 6158 392 do_isspace_test 6.$T.4 $T 6158
370 do_isspace_test 6.$T.5 $T 8192 393 do_isspace_test 6.$T.5 $T 8192
(...skipping 11 matching lines...)
382 do_isspace_test 6.$T.17 $T 8287 405 do_isspace_test 6.$T.17 $T 8287
383 do_isspace_test 6.$T.18 $T 12288 406 do_isspace_test 6.$T.18 $T 12288
384 407
385 do_isspace_test 6.$T.19 $T {32 160 5760 6158} 408 do_isspace_test 6.$T.19 $T {32 160 5760 6158}
386 do_isspace_test 6.$T.20 $T {8192 8193 8194 8195} 409 do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
387 do_isspace_test 6.$T.21 $T {8196 8197 8198 8199} 410 do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
388 do_isspace_test 6.$T.22 $T {8200 8201 8202 8239} 411 do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
389 do_isspace_test 6.$T.23 $T {8287 12288} 412 do_isspace_test 6.$T.23 $T {8287 12288}
390 } 413 }
391 414
415
392 #------------------------------------------------------------------------- 416 #-------------------------------------------------------------------------
393 # Test that the private use ranges are treated as alphanumeric. 417 # Test that the private use ranges are treated as alphanumeric.
394 # 418 #
395 foreach {tn1 c} { 419 foreach {tn1 c} {
396 1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff 420 1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
397 } { 421 } {
398 foreach {tn2 config res} { 422 foreach {tn2 config res} {
399 1 "" "0 hello*world hello*world" 423 1 "" "hello*world hello*world"
400 2 "separators=*" "0 hello hello 1 world world" 424 2 "separators *" "hello hello world world"
401 } { 425 } {
402 set config [string map [list * $c] $config] 426 set config [string map [list * $c] $config]
403 set input [string map [list * $c] "hello*world"] 427 set input [string map [list * $c] "hello*world"]
404 set output [string map [list * $c] $res] 428 set output [string map [list * $c] $res]
405 do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output 429 do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
406 } 430 }
407 } 431 }
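
Note: because the private use ranges count as alphanumeric, a PUA codepoint embedded in text extends the surrounding token rather than splitting it, unless it is listed in separators as in case 2 above. A sketch, with hypothetical table pua:

  CREATE VIRTUAL TABLE pua USING fts5(x);
  INSERT INTO pua VALUES('hello' || char(0xE000) || 'world');
  SELECT count(*) FROM pua WHERE pua MATCH 'hello';  -- 0: indexed as one token, not two
  SELECT count(*) FROM pua WHERE pua MATCH ('hello' || char(0xE000) || 'world');  -- 1
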
408 432
409 #------------------------------------------------------------------------- 433 #-------------------------------------------------------------------------
410 # Cursory test of remove_diacritics=0. 434 # Cursory test of remove_diacritics=0.
411 # 435 #
412 # 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS 436 # 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
413 # 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS 437 # 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
414 # 00E4;LATIN SMALL LETTER A WITH DIAERESIS 438 # 00E4;LATIN SMALL LETTER A WITH DIAERESIS
415 # 00F6;LATIN SMALL LETTER O WITH DIAERESIS 439 # 00F6;LATIN SMALL LETTER O WITH DIAERESIS
416 # 440 #
417 do_execsql_test 8.1.1 " 441 do_execsql_test 8.1.1 "
418 CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1'); 442 CREATE VIRTUAL TABLE t3 USING fts5(
443 content, tokenize='unicode61 remove_diacritics 1'
444 );
419 INSERT INTO t3 VALUES('o'); 445 INSERT INTO t3 VALUES('o');
420 INSERT INTO t3 VALUES('a'); 446 INSERT INTO t3 VALUES('a');
421 INSERT INTO t3 VALUES('O'); 447 INSERT INTO t3 VALUES('O');
422 INSERT INTO t3 VALUES('A'); 448 INSERT INTO t3 VALUES('A');
423 INSERT INTO t3 VALUES('\xD6'); 449 INSERT INTO t3 VALUES('\xD6');
424 INSERT INTO t3 VALUES('\xC4'); 450 INSERT INTO t3 VALUES('\xC4');
425 INSERT INTO t3 VALUES('\xF6'); 451 INSERT INTO t3 VALUES('\xF6');
426 INSERT INTO t3 VALUES('\xE4'); 452 INSERT INTO t3 VALUES('\xE4');
427 " 453 "
428 do_execsql_test 8.1.2 { 454 do_execsql_test 8.1.2 {
429 SELECT rowid FROM t3 WHERE t3 MATCH 'o'; 455 SELECT rowid FROM t3 WHERE t3 MATCH 'o' ORDER BY rowid ASC;
430 } {1 3 5 7} 456 } {1 3 5 7}
431 do_execsql_test 8.1.3 { 457 do_execsql_test 8.1.3 {
432 SELECT rowid FROM t3 WHERE t3 MATCH 'a'; 458 SELECT rowid FROM t3 WHERE t3 MATCH 'a' ORDER BY rowid ASC;
433 } {2 4 6 8} 459 } {2 4 6 8}
434 do_execsql_test 8.2.1 { 460 do_execsql_test 8.2.1 {
435 CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0"); 461 CREATE VIRTUAL TABLE t4 USING fts5(
436 INSERT INTO t4 SELECT * FROM t3; 462 content, tokenize='unicode61 remove_diacritics 0'
463 );
464 INSERT INTO t4 SELECT * FROM t3 ORDER BY rowid ASC;
437 } 465 }
438 do_execsql_test 8.2.2 { 466 do_execsql_test 8.2.2 {
439 SELECT rowid FROM t4 WHERE t4 MATCH 'o'; 467 SELECT rowid FROM t4 WHERE t4 MATCH 'o' ORDER BY rowid ASC;
440 } {1 3} 468 } {1 3}
441 do_execsql_test 8.2.3 { 469 do_execsql_test 8.2.3 {
442 SELECT rowid FROM t4 WHERE t4 MATCH 'a'; 470 SELECT rowid FROM t4 WHERE t4 MATCH 'a' ORDER BY rowid ASC;
443 } {2 4} 471 } {2 4}
444 472
445 #------------------------------------------------------------------------- 473 #-------------------------------------------------------------------------
446 # 474 #
475 if 0 {
447 foreach {tn sql} { 476 foreach {tn sql} {
448 1 { 477 1 {
449 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]); 478 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
450 CREATE VIRTUAL TABLE t6 USING fts4( 479 CREATE VIRTUAL TABLE t6 USING fts4(
451 tokenize=unicode61 [tokenchars=="] "tokenchars=[]"); 480 tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
452 CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]); 481 CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
453 } 482 }
454 2 { 483 2 {
455 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= ."); 484 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
456 CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]"); 485 CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
(...skipping 91 matching lines...)
548 # 577 #
549 do_execsql_test 11.1 { 578 do_execsql_test 11.1 {
550 CREATE VIRTUAL TABLE ft1 USING fts3tokenize( 579 CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
551 "unicode61", "tokenchars=@.", "separators=1234567890" 580 "unicode61", "tokenchars=@.", "separators=1234567890"
552 ); 581 );
553 SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road'; 582 SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
554 } { 583 } {
555 berlin@street sydney.road 584 berlin@street sydney.road
556 } 585 }
557 586
587 }
588
558 finish_test 589 finish_test