# 2012 May 25
#
# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#
# This is a modified copy of FTS4 test file "fts4_unicode.test".
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5unicode2

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}

proc do_unicode_token_test {tn input res} {
  uplevel [list do_test $tn [list \
    sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input
  ] [list {*}$res]]
}

proc do_unicode_token_test2 {tn input res} {
  uplevel [list do_test $tn [list \
    sqlite3_fts5_tokenize -subst db "unicode61" $input
  ] [list {*}$res]]
}

proc do_unicode_token_test3 {tn args} {
  set tokenizer [concat unicode61 {*}[lrange $args 0 end-2]]
  set input [lindex $args end-1]
  set res [lindex $args end]
  uplevel [list do_test $tn [list \
    sqlite3_fts5_tokenize -subst db $tokenizer $input
  ] [list {*}$res]]
}
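
# The expected-result lists used by these helpers interleave each folded
# token with the original input text it was derived from. As an
# illustrative sketch (hypothetical test number, not part of the suite):
#
#   do_unicode_token_test 0.0 {Abc Def} {abc Abc def Def}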

do_unicode_token_test 1.0 {a B c D} {a a b B c c d D}

do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "\uE4 \uC4 \uF6 \uD6 \uFC \uDC"

do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "x\uE4x x\uC4x x\uF6x x\uD6x x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "\uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "\uDF \u1E9E"

do_unicode_token_test 1.5 "The quick brown fox" {
  the The quick quick brown brown fox fox
}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
  the The quick quick brown brown fox fox
}

do_unicode_token_test2 1.7 {a B c D} {a a b B c c d D}
do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "a \uC4 o \uD6 u \uDC"

do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
    "xax x\uC4x xox x\uD6x xux x\uDCx"

# Check that diacritics are removed by default (remove_diacritics=1),
# and that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u0301xx"

# Title-case mappings work
do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"

do_unicode_token_test 1.12 "\u00C1abc\u00C2 \u00D1def\u00C3" \
    "\u00E1abc\u00E2 \u00C1abc\u00C2 \u00F1def\u00E3 \u00D1def\u00C3"

do_unicode_token_test 1.13 "\u00A2abc\u00A3 \u00A4def\u00A5" \
    "abc abc def def"

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
} {
  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
} {
  Added the sqlite3_db_readonly() interface.
} {
  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  ability to add new PRAGMA statements or to override built-in PRAGMAs.
} {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
} {
  Added support for the FTS4 languageid option.
} {
  Documented support for the FTS4 content option. This feature has actually
  been in the code since version 3.7.9 but is only now considered to be
  officially supported.
} {
  Pending statements no longer block ROLLBACK. Instead, the pending statement
  will return SQLITE_ABORT upon next access after the ROLLBACK.
} {
  Improvements to the handling of CSV inputs in the command-line shell
} {
  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
  incorrectly converted into an INNER JOIN if the WHERE clause contained
  indexable terms connected by OR.
}]

set map(a) [list "\u00C4" "\u00E4"] ; # LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"] ; # LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"] ; # LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"] ; # LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"] ; # LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"] ; # LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"] ; # LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"] ; # LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"] ; # LATIN LETTER X WITH DIAERESIS
foreach k [array names map] {
  lappend mappings [string toupper $k] [lindex $map($k) 0]
  lappend mappings $k [lindex $map($k) 1]
}
proc mapdoc {doc} {
  set doc [regsub -all {[[:space:]]+} $doc " "]
  string map $::mappings [string trim $doc]
}
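
# An illustrative sketch of mapdoc (not executed as a test): whitespace
# runs collapse to a single space and each character in the map above is
# replaced by its diaeresis form, so [mapdoc "a  row"] returns the string
# "\u00E4 r\u00F6\u1E85".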

do_test 2.0 {
  execsql { CREATE VIRTUAL TABLE t2 USING fts5(tokenize=unicode61, x); }
  foreach doc $docs {
    set d [mapdoc $doc]
    execsql { INSERT INTO t2 VALUES($d) }
  }
} {}

do_test 2.1 {
  set q [mapdoc "row"]
  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
} [list [mapdoc {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
}]]

foreach {tn query snippet} {
  2 "row" {
    ...returns the value of y on the same [row] that contains
    the maximum x value.
  }
  3 "ROW" {
    ...returns the value of y on the same [row] that contains
    the maximum x value.
  }
  4 "rollback" {
    ...[ROLLBACK]. Instead, the pending statement
    will return SQLITE_ABORT upon next access after the [ROLLBACK].
  }
  5 "rOllback" {
    ...[ROLLBACK]. Instead, the pending statement
    will return SQLITE_ABORT upon next access after the [ROLLBACK].
  }
  6 "lang*" {
    Added support for the FTS4 [languageid] option.
  }
} {
  do_test 2.$tn {
    set q [mapdoc $query]
    execsql {
      SELECT snippet(t2, -1, '[', ']', '...', 15) FROM t2 WHERE t2 MATCH $q
    }
  } [list [mapdoc $snippet]]
}

#-------------------------------------------------------------------------
# Make sure the unicode61 tokenizer does not crash if it is passed a
# NULL pointer.
reset_db
do_execsql_test 3.1 {
  CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x, y);
  INSERT INTO t1 VALUES(NULL, 'a b c');
}

do_execsql_test 3.2 {
  SELECT snippet(t1, -1, '[', ']', '...', 15) FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}

do_execsql_test 3.3 {
  BEGIN;
  DELETE FROM t1;
  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 VALUES('a b c', NULL);
  INSERT INTO t1 VALUES('a x c', NULL);
  COMMIT;
}

do_execsql_test 3.4 {
  SELECT * FROM t1 WHERE t1 MATCH 'a b';
} {{a b c} {}}

#-------------------------------------------------------------------------
#
reset_db

do_test 4.1 {
  set a "abc\uFFFEdef"
  set b "abc\uD800def"
  set c "\uFFFEdef"
  set d "\uD800def"
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }

  execsql "CREATE VIRTUAL TABLE t8 USING fts5(
    a, b, tokenize=\"unicode61 separators '\uFFFE\uD800\u00BF'\"
  )"
} {}

do_test 4.2 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.3 {
  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.4 {
  sqlite3_exec_hex db {
    CREATE VIRTUAL TABLE t9 USING fts5(a, b,
      tokenize="unicode61 separators '%C09004'"
    );
    INSERT INTO t9(a) VALUES('abc%88def %89ghi%90');
  }
} {0 {}}


#-------------------------------------------------------------------------

do_unicode_token_test3 5.1 {tokenchars {}} {
  sqlite3_reset sqlite3_column_int
} {
  sqlite3 sqlite3
  reset reset
  sqlite3 sqlite3
  column column
  int int
}

do_unicode_token_test3 5.2 {tokenchars _} {
  sqlite3_reset sqlite3_column_int
} {
  sqlite3_reset sqlite3_reset
  sqlite3_column_int sqlite3_column_int
}

do_unicode_token_test3 5.3 {separators xyz} {
  Laotianxhorseyrunszfast
} {
  laotian Laotian
  horse horse
  runs runs
  fast fast
}

do_unicode_token_test3 5.4 {tokenchars xyz} {
  Laotianxhorseyrunszfast
} {
  laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  sqlite3_reset sqlite3_reset
  sqlite3_column_int sqlite3_column_int
  honda_phantom honda_phantom
}

do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
  abc abc def def
}

do_unicode_token_test3 5.7 \
  "tokenchars \u2444\u2445" \
  "separators \u05D0\u05D1\u05D2" \
  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
  [list \
    \u2444fre\u2445sh \u2444fre\u2445sh \
    water water \
    fish fish \
    \u2445timer \u2445timer \
  ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
do_unicode_token_test3 5.8 "separators \u0301" \
  "hello\u0301world \u0301helloworld" \
  "helloworld hello\u0301world helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars \u0301" \
  "hello\u0301world \u0301helloworld" \
  "helloworld hello\u0301world helloworld helloworld"

do_unicode_token_test3 5.10 "separators \u0301" \
  "remove_diacritics 0" \
  "hello\u0301world \u0301helloworld" \
  "hello\u0301world hello\u0301world helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars \u0301" \
  "remove_diacritics 0" \
  "hello\u0301world \u0301helloworld" \
  "hello\u0301world hello\u0301world helloworld helloworld"

#-------------------------------------------------------------------------

proc do_tokenize {tokenizer txt} {
  set res [list]
  foreach {b c} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
    lappend res $b
  }
  set res
}

# Argument $lCp must be a list of codepoints (integers) that correspond
# to whitespace characters. This command creates a string $W from the
# codepoints, then tokenizes "${W}hello${W}world${W}" using tokenizer
# $tokenizer. The test passes if the tokenizer successfully extracts the
# two 5-character tokens "hello" and "world".
#
proc do_isspace_test {tn tokenizer lCp} {
  set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
  set txt "${whitespace}hello${whitespace}world${whitespace}"
  uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
}
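
# For example (an illustrative sketch, not one of the numbered tests):
#
#   do_isspace_test 6.x unicode61 {32 160}
#
# builds the two-character whitespace string "\u0020\u00A0" and checks
# that "\u0020\u00A0hello\u0020\u00A0world\u0020\u00A0" tokenizes to
# {hello world}.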

set tokenizers [list unicode61]
#ifcapable icu { lappend tokenizers icu }

# Some tests to check that the tokenizers can identify white-space
# codepoints. All codepoints tested below are of type "Zs" in the
# UnicodeData.txt file.
foreach T $tokenizers {
  do_isspace_test 6.$T.1  $T 32
  do_isspace_test 6.$T.2  $T 160
  do_isspace_test 6.$T.3  $T 5760
  do_isspace_test 6.$T.4  $T 6158
  do_isspace_test 6.$T.5  $T 8192
  do_isspace_test 6.$T.6  $T 8193
  do_isspace_test 6.$T.7  $T 8194
  do_isspace_test 6.$T.8  $T 8195
  do_isspace_test 6.$T.9  $T 8196
  do_isspace_test 6.$T.10 $T 8197
  do_isspace_test 6.$T.11 $T 8198
  do_isspace_test 6.$T.12 $T 8199
  do_isspace_test 6.$T.13 $T 8200
  do_isspace_test 6.$T.14 $T 8201
  do_isspace_test 6.$T.15 $T 8202
  do_isspace_test 6.$T.16 $T 8239
  do_isspace_test 6.$T.17 $T 8287
  do_isspace_test 6.$T.18 $T 12288

  do_isspace_test 6.$T.19 $T {32 160 5760 6158}
  do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T {8287 12288}
}


#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
foreach {tn1 c} {
  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
  foreach {tn2 config res} {
    1 ""             "hello*world hello*world"
    2 "separators *" "hello hello world world"
  } {
    set config [string map [list * $c] $config]
    set input [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}

#-------------------------------------------------------------------------
# Cursory test of remove_diacritics=0.
#
# 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
# 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
# 00E4;LATIN SMALL LETTER A WITH DIAERESIS
# 00F6;LATIN SMALL LETTER O WITH DIAERESIS
#
do_execsql_test 8.1.1 "
  CREATE VIRTUAL TABLE t3 USING fts5(
    content, tokenize='unicode61 remove_diacritics 1'
  );
  INSERT INTO t3 VALUES('o');
  INSERT INTO t3 VALUES('a');
  INSERT INTO t3 VALUES('O');
  INSERT INTO t3 VALUES('A');
  INSERT INTO t3 VALUES('\xD6');
  INSERT INTO t3 VALUES('\xC4');
  INSERT INTO t3 VALUES('\xF6');
  INSERT INTO t3 VALUES('\xE4');
"
do_execsql_test 8.1.2 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'o' ORDER BY rowid ASC;
} {1 3 5 7}
do_execsql_test 8.1.3 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'a' ORDER BY rowid ASC;
} {2 4 6 8}
do_execsql_test 8.2.1 {
  CREATE VIRTUAL TABLE t4 USING fts5(
    content, tokenize='unicode61 remove_diacritics 0'
  );
  INSERT INTO t4 SELECT * FROM t3 ORDER BY rowid ASC;
}
do_execsql_test 8.2.2 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'o' ORDER BY rowid ASC;
} {1 3}
do_execsql_test 8.2.3 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'a' ORDER BY rowid ASC;
} {2 4}

#-------------------------------------------------------------------------
# The remaining tests exercise fts4, fts4aux and fts3tokenize tables and
# have not been ported to fts5; the "if 0" below disables them.
#
if 0 {
foreach {tn sql} {
  1 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
    CREATE VIRTUAL TABLE t6 USING fts4(
        tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
  }
  2 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
  }
  3 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
  }
  4 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
  }
} {
  do_execsql_test 9.$tn.0 {
    DROP TABLE IF EXISTS t5;
    DROP TABLE IF EXISTS t5aux;
    DROP TABLE IF EXISTS t6;
    DROP TABLE IF EXISTS t6aux;
    DROP TABLE IF EXISTS t7;
    DROP TABLE IF EXISTS t7aux;
  }
  do_execsql_test 9.$tn.1 $sql

  do_execsql_test 9.$tn.2 {
    CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
    INSERT INTO t5 VALUES('one two three/four.five.six');
    SELECT * FROM t5aux;
  } {
    four.five.six * 1 1 four.five.six 0 1 1
    {one two three} * 1 1 {one two three} 0 1 1
  }

  do_execsql_test 9.$tn.3 {
    CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
    INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
    SELECT * FROM t6aux;
  } {
    {alpha=beta"gamma} * 1 1 {alpha=beta"gamma} 0 1 1
    {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
  }

  do_execsql_test 9.$tn.4 {
    CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
    INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
    SELECT * FROM t7aux;
  } {
    aleph * 1 1 aleph 0 1 1
    beth * 1 1 beth 0 1 1
    gimel * 1 1 gimel 0 1 1
  }
}

# Check that multiple options are handled correctly.
#
do_execsql_test 10.1 {
  DROP TABLE IF EXISTS t1;
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
    "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
    "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
  );

  INSERT INTO t1 VALUES('oneatwoxthreeyfour');
  INSERT INTO t1 VALUES('a.single=word');
  CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
  SELECT * FROM t1aux;
} {
  .single=word * 1 1 .single=word 0 1 1
  four * 1 1 four 0 1 1
  one * 1 1 one 0 1 1
  three * 1 1 three 0 1 1
  two * 1 1 two 0 1 1
}

# Test that case folding happens after tokenization, not before.
#
do_execsql_test 10.2 {
  DROP TABLE IF EXISTS t2;
  CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
  INSERT INTO t2 VALUES('oneatwoBthree');
  INSERT INTO t2 VALUES('onebtwoAthree');
  CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
  SELECT * FROM t2aux;
} {
  one * 1 1 one 0 1 1
  onebtwoathree * 1 1 onebtwoathree 0 1 1
  three * 1 1 three 0 1 1
  two * 1 1 two 0 1 1
}

# Test that the tokenchars and separators options work with the
# fts3tokenize table.
#
do_execsql_test 11.1 {
  CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
    "unicode61", "tokenchars=@.", "separators=1234567890"
  );
  SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
} {
  berlin@street sydney.road
}

}

finish_test