| Index: third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
|
| diff --git a/third_party/sqlite/src/test/fts4unicode.test b/third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
|
| similarity index 75%
|
| copy from third_party/sqlite/src/test/fts4unicode.test
|
| copy to third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
|
| index f237119a1871b250da71515ad124aa2de706f19d..d3ff5128dafc0def5089a0dac74aa7c8c50a73d9 100644
|
| --- a/third_party/sqlite/src/test/fts4unicode.test
|
| +++ b/third_party/sqlite/sqlite-src-3100200/ext/fts5/test/fts5unicode2.test
|
| @@ -11,69 +11,76 @@
|
| #
|
| # The tests in this file focus on testing the "unicode" FTS tokenizer.
|
| #
|
| +# This is a modified copy of FTS4 test file "fts4unicode.test".
|
| +#
|
| +
|
| +source [file join [file dirname [info script]] fts5_common.tcl]
|
| +set testprefix fts5unicode2
|
|
|
| -set testdir [file dirname $argv0]
|
| -source $testdir/tester.tcl
|
| -ifcapable !fts3_unicode { finish_test ; return }
|
| -set ::testprefix fts4unicode
|
| +# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
|
| +ifcapable !fts5 {
|
| + finish_test
|
| + return
|
| +}
|
|
|
| proc do_unicode_token_test {tn input res} {
|
| - set input [string map {' ''} $input]
|
| - uplevel [list do_execsql_test $tn "
|
| - SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
|
| - " [list [list {*}$res]]]
|
| + uplevel [list do_test $tn [list \
|
| + sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input
|
| + ] [list {*}$res]]
|
| }
|
|
|
| proc do_unicode_token_test2 {tn input res} {
|
| - set input [string map {' ''} $input]
|
| - uplevel [list do_execsql_test $tn "
|
| - SELECT fts3_tokenizer_test('unicode61', '$input');
|
| - " [list [list {*}$res]]]
|
| + uplevel [list do_test $tn [list \
|
| + sqlite3_fts5_tokenize -subst db "unicode61" $input
|
| + ] [list {*}$res]]
|
| }
|
|
|
| proc do_unicode_token_test3 {tn args} {
|
| - set res [lindex $args end]
|
| - set sql "SELECT fts3_tokenizer_test('unicode61'"
|
| - foreach a [lrange $args 0 end-1] {
|
| - append sql ", '"
|
| - append sql [string map {' ''} $a]
|
| - append sql "'"
|
| - }
|
| - append sql ")"
|
| - uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
|
| + set tokenizer [concat unicode61 {*}[lrange $args 0 end-2]]
|
| + set input [lindex $args end-1]
|
| + set res [lindex $args end]
|
| + uplevel [list do_test $tn [list \
|
| + sqlite3_fts5_tokenize -subst db $tokenizer $input
|
| + ] [list {*}$res]]
|
| }
|
|
|
| -do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
|
| +do_unicode_token_test 1.0 {a B c D} {a a b B c c d D}
|
|
|
| do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
|
| - "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
|
| + "\uE4 \uC4 \uF6 \uD6 \uFC \uDC"
|
|
|
| do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
|
| - "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"
|
| + "x\uE4x x\uC4x x\uF6x x\uD6x x\uFCx x\uDCx"
|
|
|
| # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
|
| -do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
|
| -do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"
|
| +do_unicode_token_test 1.3 "\uDF" "\uDF \uDF"
|
| +do_unicode_token_test 1.4 "\u1E9E" "\uDF \u1E9E"
|
|
|
| do_unicode_token_test 1.5 "The quick brown fox" {
|
| - 0 the The 1 quick quick 2 brown brown 3 fox fox
|
| + the The quick quick brown brown fox fox
|
| }
|
| do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
|
| - 0 the The 1 quick quick 2 brown brown 3 fox fox
|
| + the The quick quick brown brown fox fox
|
| }
|
|
|
| -do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}
|
| -do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"
|
| +do_unicode_token_test2 1.7 {a B c D} {a a b B c c d D}
|
| +do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "a \uC4 o \uD6 u \uDC"
|
|
|
| do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
|
| - "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"
|
| + "xax x\uC4x xox x\uD6x xux x\uDCx"
|
|
|
| # Check that diacritics are removed if remove_diacritics=1 is specified.
|
| # And that they do not break tokens.
|
| -do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"
|
| +do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u301xx"
|
|
|
| # Title-case mappings work
|
| -do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"
|
| +do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"
|
| +
|
| +do_unicode_token_test 1.12 "\u00C1abc\u00C2 \u00D1def\u00C3" \
|
| + "\u00E1abc\u00E2 \u00C1abc\u00C2 \u00F1def\u00E3 \u00D1def\u00C3"
|
| +
|
| +do_unicode_token_test 1.13 "\u00A2abc\u00A3 \u00A4def\u00A5" \
|
| + "abc abc def def"
|
|
|
| #-------------------------------------------------------------------------
|
| #
|
| @@ -128,7 +135,7 @@ proc mapdoc {doc} {
|
| }
|
|
|
| do_test 2.0 {
|
| - execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
|
| + execsql { CREATE VIRTUAL TABLE t2 USING fts5(tokenize=unicode61, x); }
|
| foreach doc $docs {
|
| set d [mapdoc $doc]
|
| execsql { INSERT INTO t2 VALUES($d) }
|
| @@ -166,7 +173,9 @@ foreach {tn query snippet} {
|
| } {
|
| do_test 2.$tn {
|
| set q [mapdoc $query]
|
| - execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
|
| + execsql {
|
| + SELECT snippet(t2, -1, '[', ']', '...', 15) FROM t2 WHERE t2 MATCH $q
|
| + }
|
| } [list [mapdoc $snippet]]
|
| }
|
|
|
| @@ -175,12 +184,12 @@ foreach {tn query snippet} {
|
| # NULL pointer.
|
| reset_db
|
| do_execsql_test 3.1 {
|
| - CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
|
| + CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x, y);
|
| INSERT INTO t1 VALUES(NULL, 'a b c');
|
| }
|
|
|
| do_execsql_test 3.2 {
|
| - SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
|
| + SELECT snippet(t1, -1, '[', ']', '...', 15) FROM t1 WHERE t1 MATCH 'b'
|
| } {{a [b] c}}
|
|
|
| do_execsql_test 3.3 {
|
| @@ -222,12 +231,16 @@ do_test 4.1 {
|
| set c "\uFFFEdef"
|
| set d "\uD800def"
|
| execsql {
|
| - CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
|
| + CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
|
| INSERT INTO t1 VALUES($a);
|
| INSERT INTO t1 VALUES($b);
|
| INSERT INTO t1 VALUES($c);
|
| INSERT INTO t1 VALUES($d);
|
| }
|
| +
|
| + execsql "CREATE VIRTUAL TABLE t8 USING fts5(
|
| + a, b, tokenize=\"unicode61 separators '\uFFFE\uD800\u00BF'\"
|
| + )"
|
| } {}
|
|
|
| do_test 4.2 {
|
| @@ -256,89 +269,99 @@ do_test 4.3 {
|
| }
|
| } {}
|
|
|
| +do_test 4.4 {
|
| + sqlite3_exec_hex db {
|
| + CREATE VIRTUAL TABLE t9 USING fts5(a, b,
|
| + tokenize="unicode61 separators '%C09004'"
|
| + );
|
| + INSERT INTO t9(a) VALUES('abc%88def %89ghi%90');
|
| + }
|
| +} {0 {}}
|
| +
|
| +
|
| #-------------------------------------------------------------------------
|
|
|
| -do_unicode_token_test3 5.1 {tokenchars=} {
|
| +breakpoint
|
| +do_unicode_token_test3 5.1 {tokenchars {}} {
|
| sqlite3_reset sqlite3_column_int
|
| } {
|
| - 0 sqlite3 sqlite3
|
| - 1 reset reset
|
| - 2 sqlite3 sqlite3
|
| - 3 column column
|
| - 4 int int
|
| + sqlite3 sqlite3
|
| + reset reset
|
| + sqlite3 sqlite3
|
| + column column
|
| + int int
|
| }
|
|
|
| -do_unicode_token_test3 5.2 {tokenchars=_} {
|
| +do_unicode_token_test3 5.2 {tokenchars _} {
|
| sqlite3_reset sqlite3_column_int
|
| } {
|
| - 0 sqlite3_reset sqlite3_reset
|
| - 1 sqlite3_column_int sqlite3_column_int
|
| + sqlite3_reset sqlite3_reset
|
| + sqlite3_column_int sqlite3_column_int
|
| }
|
|
|
| -do_unicode_token_test3 5.3 {separators=xyz} {
|
| +do_unicode_token_test3 5.3 {separators xyz} {
|
| Laotianxhorseyrunszfast
|
| } {
|
| - 0 laotian Laotian
|
| - 1 horse horse
|
| - 2 runs runs
|
| - 3 fast fast
|
| + laotian Laotian
|
| + horse horse
|
| + runs runs
|
| + fast fast
|
| }
|
|
|
| -do_unicode_token_test3 5.4 {tokenchars=xyz} {
|
| +do_unicode_token_test3 5.4 {tokenchars xyz} {
|
| Laotianxhorseyrunszfast
|
| } {
|
| - 0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
|
| + laotianxhorseyrunszfast Laotianxhorseyrunszfast
|
| }
|
|
|
| -do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
|
| +do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
|
| sqlite3_resetxsqlite3_column_intyhonda_phantom
|
| } {
|
| - 0 sqlite3_reset sqlite3_reset
|
| - 1 sqlite3_column_int sqlite3_column_int
|
| - 2 honda_phantom honda_phantom
|
| + sqlite3_reset sqlite3_reset
|
| + sqlite3_column_int sqlite3_column_int
|
| + honda_phantom honda_phantom
|
| }
|
|
|
| -do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
|
| - 0 abc abc 1 def def
|
| +do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
|
| + abc abc def def
|
| }
|
|
|
| do_unicode_token_test3 5.7 \
|
| - "tokenchars=\u2444\u2445" \
|
| - "separators=\u05D0\u05D1\u05D2" \
|
| + "tokenchars \u2444\u2445" \
|
| + "separators \u05D0\u05D1\u05D2" \
|
| "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
|
| [list \
|
| - 0 \u2444fre\u2445sh \u2444fre\u2445sh \
|
| - 1 water water \
|
| - 2 fish fish \
|
| - 3 \u2445timer \u2445timer \
|
| + \u2444fre\u2445sh \u2444fre\u2445sh \
|
| + water water \
|
| + fish fish \
|
| + \u2445timer \u2445timer \
|
| ]
|
|
|
| # Check that it is not possible to add a standalone diacritic codepoint
|
| # to either separators or tokenchars.
|
| -do_unicode_token_test3 5.8 "separators=\u0301" \
|
| +do_unicode_token_test3 5.8 "separators \u0301" \
|
| "hello\u0301world \u0301helloworld" \
|
| - "0 helloworld hello\u0301world 1 helloworld helloworld"
|
| + "helloworld hello\u0301world helloworld helloworld"
|
|
|
| -do_unicode_token_test3 5.9 "tokenchars=\u0301" \
|
| +do_unicode_token_test3 5.9 "tokenchars \u0301" \
|
| "hello\u0301world \u0301helloworld" \
|
| - "0 helloworld hello\u0301world 1 helloworld helloworld"
|
| + "helloworld hello\u0301world helloworld helloworld"
|
|
|
| -do_unicode_token_test3 5.10 "separators=\u0301" \
|
| - "remove_diacritics=0" \
|
| +do_unicode_token_test3 5.10 "separators \u0301" \
|
| + "remove_diacritics 0" \
|
| "hello\u0301world \u0301helloworld" \
|
| - "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
|
| + "hello\u0301world hello\u0301world helloworld helloworld"
|
|
|
| -do_unicode_token_test3 5.11 "tokenchars=\u0301" \
|
| - "remove_diacritics=0" \
|
| +do_unicode_token_test3 5.11 "tokenchars \u0301" \
|
| + "remove_diacritics 0" \
|
| "hello\u0301world \u0301helloworld" \
|
| - "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
|
| -
|
| + "hello\u0301world hello\u0301world helloworld helloworld"
|
|
|
| #-------------------------------------------------------------------------
|
|
|
| proc do_tokenize {tokenizer txt} {
|
| set res [list]
|
| - foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
|
| + foreach {b c} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
|
| lappend res $b
|
| }
|
| set res
|
| @@ -357,7 +380,7 @@ proc do_isspace_test {tn tokenizer lCp} {
|
| }
|
|
|
| set tokenizers [list unicode61]
|
| -ifcapable icu { lappend tokenizers icu }
|
| +#ifcapable icu { lappend tokenizers icu }
|
|
|
| # Some tests to check that the tokenizers can both identify white-space
|
| # codepoints. All codepoints tested below are of type "Zs" in the
|
| @@ -389,6 +412,7 @@ foreach T $tokenizers {
|
| do_isspace_test 6.$T.23 $T {8287 12288}
|
| }
|
|
|
| +
|
| #-------------------------------------------------------------------------
|
| # Test that the private use ranges are treated as alphanumeric.
|
| #
|
| @@ -396,8 +420,8 @@ foreach {tn1 c} {
|
| 1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
|
| } {
|
| foreach {tn2 config res} {
|
| - 1 "" "0 hello*world hello*world"
|
| - 2 "separators=*" "0 hello hello 1 world world"
|
| + 1 "" "hello*world hello*world"
|
| + 2 "separators *" "hello hello world world"
|
| } {
|
| set config [string map [list * $c] $config]
|
| set input [string map [list * $c] "hello*world"]
|
| @@ -415,7 +439,9 @@ foreach {tn1 c} {
|
| # 00F6;LATIN SMALL LETTER O WITH DIAERESIS
|
| #
|
| do_execsql_test 8.1.1 "
|
| - CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
|
| + CREATE VIRTUAL TABLE t3 USING fts5(
|
| + content, tokenize='unicode61 remove_diacritics 1'
|
| + );
|
| INSERT INTO t3 VALUES('o');
|
| INSERT INTO t3 VALUES('a');
|
| INSERT INTO t3 VALUES('O');
|
| @@ -426,24 +452,27 @@ do_execsql_test 8.1.1 "
|
| INSERT INTO t3 VALUES('\xE4');
|
| "
|
| do_execsql_test 8.1.2 {
|
| - SELECT rowid FROM t3 WHERE t3 MATCH 'o';
|
| + SELECT rowid FROM t3 WHERE t3 MATCH 'o' ORDER BY rowid ASC;
|
| } {1 3 5 7}
|
| do_execsql_test 8.1.3 {
|
| - SELECT rowid FROM t3 WHERE t3 MATCH 'a';
|
| + SELECT rowid FROM t3 WHERE t3 MATCH 'a' ORDER BY rowid ASC;
|
| } {2 4 6 8}
|
| do_execsql_test 8.2.1 {
|
| - CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
|
| - INSERT INTO t4 SELECT * FROM t3;
|
| + CREATE VIRTUAL TABLE t4 USING fts5(
|
| + content, tokenize='unicode61 remove_diacritics 0'
|
| + );
|
| + INSERT INTO t4 SELECT * FROM t3 ORDER BY rowid ASC;
|
| }
|
| do_execsql_test 8.2.2 {
|
| - SELECT rowid FROM t4 WHERE t4 MATCH 'o';
|
| + SELECT rowid FROM t4 WHERE t4 MATCH 'o' ORDER BY rowid ASC;
|
| } {1 3}
|
| do_execsql_test 8.2.3 {
|
| - SELECT rowid FROM t4 WHERE t4 MATCH 'a';
|
| + SELECT rowid FROM t4 WHERE t4 MATCH 'a' ORDER BY rowid ASC;
|
| } {2 4}
|
|
|
| #-------------------------------------------------------------------------
|
| #
|
| +if 0 {
|
| foreach {tn sql} {
|
| 1 {
|
| CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
|
| @@ -555,4 +584,6 @@ do_execsql_test 11.1 {
|
| berlin@street sydney.road
|
| }
|
|
|
| +}
|
| +
|
| finish_test
|
|
|