Index: third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
diff --git a/third_party/sqlite/src/test/fts4unicode.test b/third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
similarity index 75%
copy from third_party/sqlite/src/test/fts4unicode.test
copy to third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
index f237119a1871b250da71515ad124aa2de706f19d..d3ff5128dafc0def5089a0dac74aa7c8c50a73d9 100644
--- a/third_party/sqlite/src/test/fts4unicode.test
+++ b/third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
@@ -11,69 +11,76 @@
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#
+# This is a modified copy of FTS4 test file "fts4unicode.test".
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5unicode2
-set testdir [file dirname $argv0]
-source $testdir/tester.tcl
-ifcapable !fts3_unicode { finish_test ; return }
-set ::testprefix fts4unicode
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+ finish_test
+ return
+}
proc do_unicode_token_test {tn input res} {
- set input [string map {' ''} $input]
- uplevel [list do_execsql_test $tn "
- SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
- " [list [list {*}$res]]]
+ uplevel [list do_test $tn [list \
+ sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input
+ ] [list {*}$res]]
}
proc do_unicode_token_test2 {tn input res} {
- set input [string map {' ''} $input]
- uplevel [list do_execsql_test $tn "
- SELECT fts3_tokenizer_test('unicode61', '$input');
- " [list [list {*}$res]]]
+ uplevel [list do_test $tn [list \
+ sqlite3_fts5_tokenize -subst db "unicode61" $input
+ ] [list {*}$res]]
}
proc do_unicode_token_test3 {tn args} {
- set res [lindex $args end]
- set sql "SELECT fts3_tokenizer_test('unicode61'"
- foreach a [lrange $args 0 end-1] {
- append sql ", '"
- append sql [string map {' ''} $a]
- append sql "'"
- }
- append sql ")"
- uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
+ set tokenizer [concat unicode61 {*}[lrange $args 0 end-2]]
+ set input [lindex $args end-1]
+ set res [lindex $args end]
+ uplevel [list do_test $tn [list \
+ sqlite3_fts5_tokenize -subst db $tokenizer $input
+ ] [list {*}$res]]
}
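+
+# Editorial illustration (not part of the upstream patch): the helpers
+# above wrap the sqlite3_fts5_tokenize test command, which with -subst
+# returns a flat list of {folded original} pairs instead of the
+# {index folded original} triples that fts3_tokenizer_test produced.
+# That is why every expected-result list in this file drops its
+# leading 0, 1, 2... indices. A hypothetical call:
+#
+#   do_unicode_token_test3 X.Y {tokenchars _} {abc_def GHI} {
+#     abc_def abc_def ghi GHI
+#   }
+#
+# treats "_" as a token character and reports each folded token
+# followed by the original text it was derived from.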
-do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
+do_unicode_token_test 1.0 {a B c D} {a a b B c c d D}
do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
- "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
+ "\uE4 \uC4 \uF6 \uD6 \uFC \uDC"
do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
- "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"
+ "x\uE4x x\uC4x x\uF6x x\uD6x x\uFCx x\uDCx"
# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
-do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
-do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"
+do_unicode_token_test 1.3 "\uDF" "\uDF \uDF"
+do_unicode_token_test 1.4 "\u1E9E" "\uDF \u1E9E"
do_unicode_token_test 1.5 "The quick brown fox" {
- 0 the The 1 quick quick 2 brown brown 3 fox fox
+ the The quick quick brown brown fox fox
}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
- 0 the The 1 quick quick 2 brown brown 3 fox fox
+ the The quick quick brown brown fox fox
}
-do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}
-do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"
+do_unicode_token_test2 1.7 {a B c D} {a a b B c c d D}
+do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "a \uC4 o \uD6 u \uDC"
do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
- "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"
+ "xax x\uC4x xox x\uD6x xux x\uDCx"
# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens.
-do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"
+do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u301xx"
# Title-case mappings work
-do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"
+do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"
+
+do_unicode_token_test 1.12 "\u00C1abc\u00C2 \u00D1def\u00C3" \
+ "\u00E1abc\u00E2 \u00C1abc\u00C2 \u00F1def\u00E3 \u00D1def\u00C3"
+
+do_unicode_token_test 1.13 "\u00A2abc\u00A3 \u00A4def\u00A5" \
+ "abc abc def def"
#-------------------------------------------------------------------------
#
@@ -128,7 +135,7 @@ proc mapdoc {doc} {
}
do_test 2.0 {
- execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
+ execsql { CREATE VIRTUAL TABLE t2 USING fts5(tokenize=unicode61, x); }
foreach doc $docs {
set d [mapdoc $doc]
execsql { INSERT INTO t2 VALUES($d) }
@@ -166,7 +173,9 @@ foreach {tn query snippet} {
} {
do_test 2.$tn {
set q [mapdoc $query]
- execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
+ execsql {
+ SELECT snippet(t2, -1, '[', ']', '...', 15) FROM t2 WHERE t2 MATCH $q
+ }
} [list [mapdoc $snippet]]
}
@@ -175,12 +184,12 @@ foreach {tn query snippet} {
# NULL pointer.
reset_db
do_execsql_test 3.1 {
- CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
+ CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x, y);
INSERT INTO t1 VALUES(NULL, 'a b c');
}
do_execsql_test 3.2 {
- SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
+ SELECT snippet(t1, -1, '[', ']', '...', 15) FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}
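+
+# Editorial note: FTS5 snippet() takes six arguments, unlike its FTS4
+# counterpart: the table, a column index (a negative value lets FTS5
+# pick the column), the open and close highlight strings, the ellipsis
+# text, and the maximum number of tokens in the snippet (here 15).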
do_execsql_test 3.3 {
@@ -222,12 +231,16 @@ do_test 4.1 {
set c "\uFFFEdef"
set d "\uD800def"
execsql {
- CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
+ CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
INSERT INTO t1 VALUES($a);
INSERT INTO t1 VALUES($b);
INSERT INTO t1 VALUES($c);
INSERT INTO t1 VALUES($d);
}
+
+ execsql "CREATE VIRTUAL TABLE t8 USING fts5(
+ a, b, tokenize=\"unicode61 separators '\uFFFE\uD800\u00BF'\"
+ )"
} {}
do_test 4.2 {
@@ -256,89 +269,99 @@ do_test 4.3 {
}
} {}
+do_test 4.4 {
+ sqlite3_exec_hex db {
+ CREATE VIRTUAL TABLE t9 USING fts5(a, b,
+ tokenize="unicode61 separators '%C09004'"
+ );
+ INSERT INTO t9(a) VALUES('abc%88def %89ghi%90');
+ }
+} {0 {}}
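+
+# Editorial note: sqlite3_exec_hex is a test-harness command that
+# decodes %HH escapes into raw bytes before executing the SQL, so
+# test 4.4 drives the tokenizer with byte sequences (0xC0 0x90 0x04,
+# 0x88, 0x89, 0x90) that are not valid UTF-8 and checks that both
+# statements still succeed.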
+
+
#-------------------------------------------------------------------------
-do_unicode_token_test3 5.1 {tokenchars=} {
+do_unicode_token_test3 5.1 {tokenchars {}} {
sqlite3_reset sqlite3_column_int
} {
- 0 sqlite3 sqlite3
- 1 reset reset
- 2 sqlite3 sqlite3
- 3 column column
- 4 int int
+ sqlite3 sqlite3
+ reset reset
+ sqlite3 sqlite3
+ column column
+ int int
}
-do_unicode_token_test3 5.2 {tokenchars=_} {
+do_unicode_token_test3 5.2 {tokenchars _} {
sqlite3_reset sqlite3_column_int
} {
- 0 sqlite3_reset sqlite3_reset
- 1 sqlite3_column_int sqlite3_column_int
+ sqlite3_reset sqlite3_reset
+ sqlite3_column_int sqlite3_column_int
}
-do_unicode_token_test3 5.3 {separators=xyz} {
+do_unicode_token_test3 5.3 {separators xyz} {
Laotianxhorseyrunszfast
} {
- 0 laotian Laotian
- 1 horse horse
- 2 runs runs
- 3 fast fast
+ laotian Laotian
+ horse horse
+ runs runs
+ fast fast
}
-do_unicode_token_test3 5.4 {tokenchars=xyz} {
+do_unicode_token_test3 5.4 {tokenchars xyz} {
Laotianxhorseyrunszfast
} {
- 0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
+ laotianxhorseyrunszfast Laotianxhorseyrunszfast
}
-do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
+do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
- 0 sqlite3_reset sqlite3_reset
- 1 sqlite3_column_int sqlite3_column_int
- 2 honda_phantom honda_phantom
+ sqlite3_reset sqlite3_reset
+ sqlite3_column_int sqlite3_column_int
+ honda_phantom honda_phantom
}
-do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
- 0 abc abc 1 def def
+do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
+ abc abc def def
}
do_unicode_token_test3 5.7 \
- "tokenchars=\u2444\u2445" \
- "separators=\u05D0\u05D1\u05D2" \
+ "tokenchars \u2444\u2445" \
+ "separators \u05D0\u05D1\u05D2" \
"\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
[list \
- 0 \u2444fre\u2445sh \u2444fre\u2445sh \
- 1 water water \
- 2 fish fish \
- 3 \u2445timer \u2445timer \
+ \u2444fre\u2445sh \u2444fre\u2445sh \
+ water water \
+ fish fish \
+ \u2445timer \u2445timer \
]
# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
-do_unicode_token_test3 5.8 "separators=\u0301" \
+do_unicode_token_test3 5.8 "separators \u0301" \
"hello\u0301world \u0301helloworld" \
- "0 helloworld hello\u0301world 1 helloworld helloworld"
+ "helloworld hello\u0301world helloworld helloworld"
-do_unicode_token_test3 5.9 "tokenchars=\u0301" \
+do_unicode_token_test3 5.9 "tokenchars \u0301" \
"hello\u0301world \u0301helloworld" \
- "0 helloworld hello\u0301world 1 helloworld helloworld"
+ "helloworld hello\u0301world helloworld helloworld"
-do_unicode_token_test3 5.10 "separators=\u0301" \
- "remove_diacritics=0" \
+do_unicode_token_test3 5.10 "separators \u0301" \
+ "remove_diacritics 0" \
"hello\u0301world \u0301helloworld" \
- "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
+ "hello\u0301world hello\u0301world helloworld helloworld"
-do_unicode_token_test3 5.11 "tokenchars=\u0301" \
- "remove_diacritics=0" \
+do_unicode_token_test3 5.11 "tokenchars \u0301" \
+ "remove_diacritics 0" \
"hello\u0301world \u0301helloworld" \
- "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
-
+ "hello\u0301world hello\u0301world helloworld helloworld"
#-------------------------------------------------------------------------
proc do_tokenize {tokenizer txt} {
set res [list]
- foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
+ foreach {b c} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
lappend res $b
}
set res
@@ -357,7 +380,7 @@ proc do_isspace_test {tn tokenizer lCp} {
}
set tokenizers [list unicode61]
-ifcapable icu { lappend tokenizers icu }
+#ifcapable icu { lappend tokenizers icu }
# Some tests to check that the tokenizers can both identify white-space
# codepoints. All codepoints tested below are of type "Zs" in the
@@ -389,6 +412,7 @@ foreach T $tokenizers {
do_isspace_test 6.$T.23 $T {8287 12288}
}
+
#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
@@ -396,8 +420,8 @@ foreach {tn1 c} {
1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
foreach {tn2 config res} {
- 1 "" "0 hello*world hello*world"
- 2 "separators=*" "0 hello hello 1 world world"
+ 1 "" "hello*world hello*world"
+ 2 "separators *" "hello hello world world"
} {
set config [string map [list * $c] $config]
set input [string map [list * $c] "hello*world"]
@@ -415,7 +439,9 @@ foreach {tn1 c} {
# 00F6;LATIN SMALL LETTER O WITH DIAERESIS
#
do_execsql_test 8.1.1 "
- CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
+ CREATE VIRTUAL TABLE t3 USING fts5(
+ content, tokenize='unicode61 remove_diacritics 1'
+ );
INSERT INTO t3 VALUES('o');
INSERT INTO t3 VALUES('a');
INSERT INTO t3 VALUES('O');
@@ -426,24 +452,27 @@ do_execsql_test 8.1.1 "
INSERT INTO t3 VALUES('\xE4');
"
do_execsql_test 8.1.2 {
- SELECT rowid FROM t3 WHERE t3 MATCH 'o';
+ SELECT rowid FROM t3 WHERE t3 MATCH 'o' ORDER BY rowid ASC;
} {1 3 5 7}
do_execsql_test 8.1.3 {
- SELECT rowid FROM t3 WHERE t3 MATCH 'a';
+ SELECT rowid FROM t3 WHERE t3 MATCH 'a' ORDER BY rowid ASC;
} {2 4 6 8}
do_execsql_test 8.2.1 {
- CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
- INSERT INTO t4 SELECT * FROM t3;
+ CREATE VIRTUAL TABLE t4 USING fts5(
+ content, tokenize='unicode61 remove_diacritics 0'
+ );
+ INSERT INTO t4 SELECT * FROM t3 ORDER BY rowid ASC;
}
do_execsql_test 8.2.2 {
- SELECT rowid FROM t4 WHERE t4 MATCH 'o';
+ SELECT rowid FROM t4 WHERE t4 MATCH 'o' ORDER BY rowid ASC;
} {1 3}
do_execsql_test 8.2.3 {
- SELECT rowid FROM t4 WHERE t4 MATCH 'a';
+ SELECT rowid FROM t4 WHERE t4 MATCH 'a' ORDER BY rowid ASC;
} {2 4}
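+
+# Editorial note: rows 5-8 of t3 hold the diaeresis variants of the
+# plain vowels inserted above (the visible INSERT of '\xE4' is one of
+# them). With remove_diacritics 1 (t3) a query for 'o' matches the
+# accented rows as well ({1 3 5 7}); with remove_diacritics 0 (t4) it
+# matches only the unaccented rows ({1 3}).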
#-------------------------------------------------------------------------
#
+if 0 {
foreach {tn sql} {
1 {
CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
@@ -555,4 +584,6 @@ do_execsql_test 11.1 {
berlin@street sydney.road
}
+}
+
finish_test