| Index: third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
|
| diff --git a/third_party/sqlite/src/test/fts4unicode.test b/third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
|
| similarity index 75%
|
| copy from third_party/sqlite/src/test/fts4unicode.test
|
| copy to third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
|
| index f237119a1871b250da71515ad124aa2de706f19d..d3ff5128dafc0def5089a0dac74aa7c8c50a73d9 100644
|
| --- a/third_party/sqlite/src/test/fts4unicode.test
|
| +++ b/third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
|
| @@ -11,69 +11,76 @@
|
| #
|
| # The tests in this file focus on testing the "unicode" FTS tokenizer.
|
| #
|
| +# This is a modified copy of FTS4 test file "fts4unicode.test".
|
| +#
|
| +
|
| +source [file join [file dirname [info script]] fts5_common.tcl]
|
| +set testprefix fts5unicode2
|
|
|
| -set testdir [file dirname $argv0]
|
| -source $testdir/tester.tcl
|
| -ifcapable !fts3_unicode { finish_test ; return }
|
| -set ::testprefix fts4unicode
|
| +# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
|
| +ifcapable !fts5 {
|
| + finish_test
|
| + return
|
| +}
|
|
|
| proc do_unicode_token_test {tn input res} {
|
| - set input [string map {' ''} $input]
|
| - uplevel [list do_execsql_test $tn "
|
| - SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
|
| - " [list [list {*}$res]]]
|
| + uplevel [list do_test $tn [list \
|
| + sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input
|
| + ] [list {*}$res]]
|
| }
|
|
|
| proc do_unicode_token_test2 {tn input res} {
|
| - set input [string map {' ''} $input]
|
| - uplevel [list do_execsql_test $tn "
|
| - SELECT fts3_tokenizer_test('unicode61', '$input');
|
| - " [list [list {*}$res]]]
|
| + uplevel [list do_test $tn [list \
|
| + sqlite3_fts5_tokenize -subst db "unicode61" $input
|
| + ] [list {*}$res]]
|
| }
|
|
|
| proc do_unicode_token_test3 {tn args} {
|
| - set res [lindex $args end]
|
| - set sql "SELECT fts3_tokenizer_test('unicode61'"
|
| - foreach a [lrange $args 0 end-1] {
|
| - append sql ", '"
|
| - append sql [string map {' ''} $a]
|
| - append sql "'"
|
| - }
|
| - append sql ")"
|
| - uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
|
| + set tokenizer [concat unicode61 {*}[lrange $args 0 end-2]]
|
| + set input [lindex $args end-1]
|
| + set res [lindex $args end]
|
| + uplevel [list do_test $tn [list \
|
| + sqlite3_fts5_tokenize -subst db $tokenizer $input
|
| + ] [list {*}$res]]
|
| }
|
|
|
| -do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
|
| +do_unicode_token_test 1.0 {a B c D} {a a b B c c d D}
|
|
|
| do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
|
| - "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
|
| + "\uE4 \uC4 \uF6 \uD6 \uFC \uDC"
|
|
|
| do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
|
| - "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"
|
| + "x\uE4x x\uC4x x\uF6x x\uD6x x\uFCx x\uDCx"
|
|
|
| # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
|
| -do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
|
| -do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"
|
| +do_unicode_token_test 1.3 "\uDF" "\uDF \uDF"
|
| +do_unicode_token_test 1.4 "\u1E9E" "\uDF \u1E9E"
|
|
|
| do_unicode_token_test 1.5 "The quick brown fox" {
|
| - 0 the The 1 quick quick 2 brown brown 3 fox fox
|
| + the The quick quick brown brown fox fox
|
| }
|
| do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
|
| - 0 the The 1 quick quick 2 brown brown 3 fox fox
|
| + the The quick quick brown brown fox fox
|
| }
|
|
|
| -do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}
|
| -do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"
|
| +do_unicode_token_test2 1.7 {a B c D} {a a b B c c d D}
|
| +do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "a \uC4 o \uD6 u \uDC"
|
|
|
| do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
|
| - "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"
|
| + "xax x\uC4x xox x\uD6x xux x\uDCx"
|
|
|
| # Check that diacritics are removed if remove_diacritics=1 is specified.
|
| # And that they do not break tokens.
|
| -do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"
|
| +do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u301xx"
|
|
|
| # Title-case mappings work
|
| -do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"
|
| +do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"
|
| +
|
| +do_unicode_token_test 1.12 "\u00C1abc\u00C2 \u00D1def\u00C3" \
|
| + "\u00E1abc\u00E2 \u00C1abc\u00C2 \u00F1def\u00E3 \u00D1def\u00C3"
|
| +
|
| +do_unicode_token_test 1.13 "\u00A2abc\u00A3 \u00A4def\u00A5" \
|
| + "abc abc def def"
|
|
|
| #-------------------------------------------------------------------------
|
| #
|
| @@ -128,7 +135,7 @@ proc mapdoc {doc} {
|
| }
|
|
|
| do_test 2.0 {
|
| - execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
|
| + execsql { CREATE VIRTUAL TABLE t2 USING fts5(tokenize=unicode61, x); }
|
| foreach doc $docs {
|
| set d [mapdoc $doc]
|
| execsql { INSERT INTO t2 VALUES($d) }
|
| @@ -166,7 +173,9 @@ foreach {tn query snippet} {
|
| } {
|
| do_test 2.$tn {
|
| set q [mapdoc $query]
|
| - execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
|
| + execsql {
|
| + SELECT snippet(t2, -1, '[', ']', '...', 15) FROM t2 WHERE t2 MATCH $q
|
| + }
|
| } [list [mapdoc $snippet]]
|
| }
|
|
|
| @@ -175,12 +184,12 @@ foreach {tn query snippet} {
|
| # NULL pointer.
|
| reset_db
|
| do_execsql_test 3.1 {
|
| - CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
|
| + CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x, y);
|
| INSERT INTO t1 VALUES(NULL, 'a b c');
|
| }
|
|
|
| do_execsql_test 3.2 {
|
| - SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
|
| + SELECT snippet(t1, -1, '[', ']', '...', 15) FROM t1 WHERE t1 MATCH 'b'
|
| } {{a [b] c}}
|
|
|
| do_execsql_test 3.3 {
|
| @@ -222,12 +231,16 @@ do_test 4.1 {
|
| set c "\uFFFEdef"
|
| set d "\uD800def"
|
| execsql {
|
| - CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
|
| + CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
|
| INSERT INTO t1 VALUES($a);
|
| INSERT INTO t1 VALUES($b);
|
| INSERT INTO t1 VALUES($c);
|
| INSERT INTO t1 VALUES($d);
|
| }
|
| +
|
| + execsql "CREATE VIRTUAL TABLE t8 USING fts5(
|
| + a, b, tokenize=\"unicode61 separators '\uFFFE\uD800\u00BF'\"
|
| + )"
|
| } {}
|
|
|
| do_test 4.2 {
|
| @@ -256,89 +269,99 @@ do_test 4.3 {
|
| }
|
| } {}
|
|
|
| +do_test 4.4 {
|
| + sqlite3_exec_hex db {
|
| + CREATE VIRTUAL TABLE t9 USING fts5(a, b,
|
| + tokenize="unicode61 separators '%C09004'"
|
| + );
|
| + INSERT INTO t9(a) VALUES('abc%88def %89ghi%90');
|
| + }
|
| +} {0 {}}
|
| +
|
| +
|
| #-------------------------------------------------------------------------
|
|
|
| -do_unicode_token_test3 5.1 {tokenchars=} {
|
| +breakpoint
|
| +do_unicode_token_test3 5.1 {tokenchars {}} {
|
| sqlite3_reset sqlite3_column_int
|
| } {
|
| - 0 sqlite3 sqlite3
|
| - 1 reset reset
|
| - 2 sqlite3 sqlite3
|
| - 3 column column
|
| - 4 int int
|
| + sqlite3 sqlite3
|
| + reset reset
|
| + sqlite3 sqlite3
|
| + column column
|
| + int int
|
| }
|
|
|
| -do_unicode_token_test3 5.2 {tokenchars=_} {
|
| +do_unicode_token_test3 5.2 {tokenchars _} {
|
| sqlite3_reset sqlite3_column_int
|
| } {
|
| - 0 sqlite3_reset sqlite3_reset
|
| - 1 sqlite3_column_int sqlite3_column_int
|
| + sqlite3_reset sqlite3_reset
|
| + sqlite3_column_int sqlite3_column_int
|
| }
|
|
|
| -do_unicode_token_test3 5.3 {separators=xyz} {
|
| +do_unicode_token_test3 5.3 {separators xyz} {
|
| Laotianxhorseyrunszfast
|
| } {
|
| - 0 laotian Laotian
|
| - 1 horse horse
|
| - 2 runs runs
|
| - 3 fast fast
|
| + laotian Laotian
|
| + horse horse
|
| + runs runs
|
| + fast fast
|
| }
|
|
|
| -do_unicode_token_test3 5.4 {tokenchars=xyz} {
|
| +do_unicode_token_test3 5.4 {tokenchars xyz} {
|
| Laotianxhorseyrunszfast
|
| } {
|
| - 0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
|
| + laotianxhorseyrunszfast Laotianxhorseyrunszfast
|
| }
|
|
|
| -do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
|
| +do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
|
| sqlite3_resetxsqlite3_column_intyhonda_phantom
|
| } {
|
| - 0 sqlite3_reset sqlite3_reset
|
| - 1 sqlite3_column_int sqlite3_column_int
|
| - 2 honda_phantom honda_phantom
|
| + sqlite3_reset sqlite3_reset
|
| + sqlite3_column_int sqlite3_column_int
|
| + honda_phantom honda_phantom
|
| }
|
|
|
| -do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
|
| - 0 abc abc 1 def def
|
| +do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
|
| + abc abc def def
|
| }
|
|
|
| do_unicode_token_test3 5.7 \
|
| - "tokenchars=\u2444\u2445" \
|
| - "separators=\u05D0\u05D1\u05D2" \
|
| + "tokenchars \u2444\u2445" \
|
| + "separators \u05D0\u05D1\u05D2" \
|
| "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
|
| [list \
|
| - 0 \u2444fre\u2445sh \u2444fre\u2445sh \
|
| - 1 water water \
|
| - 2 fish fish \
|
| - 3 \u2445timer \u2445timer \
|
| + \u2444fre\u2445sh \u2444fre\u2445sh \
|
| + water water \
|
| + fish fish \
|
| + \u2445timer \u2445timer \
|
| ]
|
|
|
| # Check that it is not possible to add a standalone diacritic codepoint
|
| # to either separators or tokenchars.
|
| -do_unicode_token_test3 5.8 "separators=\u0301" \
|
| +do_unicode_token_test3 5.8 "separators \u0301" \
|
| "hello\u0301world \u0301helloworld" \
|
| - "0 helloworld hello\u0301world 1 helloworld helloworld"
|
| + "helloworld hello\u0301world helloworld helloworld"
|
|
|
| -do_unicode_token_test3 5.9 "tokenchars=\u0301" \
|
| +do_unicode_token_test3 5.9 "tokenchars \u0301" \
|
| "hello\u0301world \u0301helloworld" \
|
| - "0 helloworld hello\u0301world 1 helloworld helloworld"
|
| + "helloworld hello\u0301world helloworld helloworld"
|
|
|
| -do_unicode_token_test3 5.10 "separators=\u0301" \
|
| - "remove_diacritics=0" \
|
| +do_unicode_token_test3 5.10 "separators \u0301" \
|
| + "remove_diacritics 0" \
|
| "hello\u0301world \u0301helloworld" \
|
| - "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
|
| + "hello\u0301world hello\u0301world helloworld helloworld"
|
|
|
| -do_unicode_token_test3 5.11 "tokenchars=\u0301" \
|
| - "remove_diacritics=0" \
|
| +do_unicode_token_test3 5.11 "tokenchars \u0301" \
|
| + "remove_diacritics 0" \
|
| "hello\u0301world \u0301helloworld" \
|
| - "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
|
| -
|
| + "hello\u0301world hello\u0301world helloworld helloworld"
|
|
|
| #-------------------------------------------------------------------------
|
|
|
| proc do_tokenize {tokenizer txt} {
|
| set res [list]
|
| - foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
|
| + foreach {b c} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
|
| lappend res $b
|
| }
|
| set res
|
| @@ -357,7 +380,7 @@ proc do_isspace_test {tn tokenizer lCp} {
|
| }
|
|
|
| set tokenizers [list unicode61]
|
| -ifcapable icu { lappend tokenizers icu }
|
| +#ifcapable icu { lappend tokenizers icu }
|
|
|
| # Some tests to check that the tokenizers can both identify white-space
|
| # codepoints. All codepoints tested below are of type "Zs" in the
|
| @@ -389,6 +412,7 @@ foreach T $tokenizers {
|
| do_isspace_test 6.$T.23 $T {8287 12288}
|
| }
|
|
|
| +
|
| #-------------------------------------------------------------------------
|
| # Test that the private use ranges are treated as alphanumeric.
|
| #
|
| @@ -396,8 +420,8 @@ foreach {tn1 c} {
|
| 1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
|
| } {
|
| foreach {tn2 config res} {
|
| - 1 "" "0 hello*world hello*world"
|
| - 2 "separators=*" "0 hello hello 1 world world"
|
| + 1 "" "hello*world hello*world"
|
| + 2 "separators *" "hello hello world world"
|
| } {
|
| set config [string map [list * $c] $config]
|
| set input [string map [list * $c] "hello*world"]
|
| @@ -415,7 +439,9 @@ foreach {tn1 c} {
|
| # 00F6;LATIN SMALL LETTER O WITH DIAERESIS
|
| #
|
| do_execsql_test 8.1.1 "
|
| - CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
|
| + CREATE VIRTUAL TABLE t3 USING fts5(
|
| + content, tokenize='unicode61 remove_diacritics 1'
|
| + );
|
| INSERT INTO t3 VALUES('o');
|
| INSERT INTO t3 VALUES('a');
|
| INSERT INTO t3 VALUES('O');
|
| @@ -426,24 +452,27 @@ do_execsql_test 8.1.1 "
|
| INSERT INTO t3 VALUES('\xE4');
|
| "
|
| do_execsql_test 8.1.2 {
|
| - SELECT rowid FROM t3 WHERE t3 MATCH 'o';
|
| + SELECT rowid FROM t3 WHERE t3 MATCH 'o' ORDER BY rowid ASC;
|
| } {1 3 5 7}
|
| do_execsql_test 8.1.3 {
|
| - SELECT rowid FROM t3 WHERE t3 MATCH 'a';
|
| + SELECT rowid FROM t3 WHERE t3 MATCH 'a' ORDER BY rowid ASC;
|
| } {2 4 6 8}
|
| do_execsql_test 8.2.1 {
|
| - CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
|
| - INSERT INTO t4 SELECT * FROM t3;
|
| + CREATE VIRTUAL TABLE t4 USING fts5(
|
| + content, tokenize='unicode61 remove_diacritics 0'
|
| + );
|
| + INSERT INTO t4 SELECT * FROM t3 ORDER BY rowid ASC;
|
| }
|
| do_execsql_test 8.2.2 {
|
| - SELECT rowid FROM t4 WHERE t4 MATCH 'o';
|
| + SELECT rowid FROM t4 WHERE t4 MATCH 'o' ORDER BY rowid ASC;
|
| } {1 3}
|
| do_execsql_test 8.2.3 {
|
| - SELECT rowid FROM t4 WHERE t4 MATCH 'a';
|
| + SELECT rowid FROM t4 WHERE t4 MATCH 'a' ORDER BY rowid ASC;
|
| } {2 4}
|
|
|
| #-------------------------------------------------------------------------
|
| #
|
| +if 0 {
|
| foreach {tn sql} {
|
| 1 {
|
| CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
|
| @@ -555,4 +584,6 @@ do_execsql_test 11.1 {
|
| berlin@street sydney.road
|
| }
|
|
|
| +}
|
| +
|
| finish_test
|
|
|