Index: third_party/sqlite/sqlite-src-3170000/ext/fts5/test/fts5tokenizer.test |
diff --git a/third_party/sqlite/sqlite-src-3170000/ext/fts5/test/fts5tokenizer.test b/third_party/sqlite/sqlite-src-3170000/ext/fts5/test/fts5tokenizer.test |
new file mode 100644 |
index 0000000000000000000000000000000000000000..f68aa19a1057b4d271c999dddd8b5e4197053076 |
--- /dev/null |
+++ b/third_party/sqlite/sqlite-src-3170000/ext/fts5/test/fts5tokenizer.test |
@@ -0,0 +1,305 @@ |
+# 2014 Dec 20 |
+# |
+# The author disclaims copyright to this source code. In place of |
+# a legal notice, here is a blessing: |
+# |
+# May you do good and not evil. |
+# May you find forgiveness for yourself and forgive others. |
+# May you share freely, never taking more than you give. |
+# |
+#*********************************************************************** |
+# |
+# Tests focusing on the built-in fts5 tokenizers. |
+# |
+ |
+source [file join [file dirname [info script]] fts5_common.tcl] |
+set testprefix fts5tokenizer |
+ |
+# If SQLITE_ENABLE_FTS5 is defined, omit this file. |
+ifcapable !fts5 { |
+ finish_test |
+ return |
+} |
+ |
+ |
+do_execsql_test 1.0 { |
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter); |
+ DROP TABLE ft1; |
+} |
+do_execsql_test 1.1 { |
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize='porter'); |
+ DROP TABLE ft1; |
+} |
+do_execsql_test 1.2 { |
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = porter); |
+ DROP TABLE ft1; |
+} |
+do_execsql_test 1.3 { |
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter'); |
+ DROP TABLE ft1; |
+} |
+do_execsql_test 1.4 { |
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii'); |
+ DROP TABLE ft1; |
+} |
+ |
+do_catchsql_test 1.5 { |
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'nosuch'); |
+} {1 {no such tokenizer: nosuch}} |
+ |
+do_catchsql_test 1.6 { |
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter nosuch'); |
+} {1 {error in tokenizer constructor}} |
+ |
+do_execsql_test 2.0 { |
+ CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter); |
+ INSERT INTO ft1 VALUES('embedded databases'); |
+} |
+do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1 |
+do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1 |
+do_execsql_test 2.3 { |
+ SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding' |
+} 1 |
+ |
+proc tcl_create {args} { |
+ set ::targs $args |
+ error "failed" |
+} |
+sqlite3_fts5_create_tokenizer db tcl tcl_create |
+ |
+foreach {tn directive expected} { |
+ 1 {tokenize='tcl a b c'} {a b c} |
+ 2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f} |
+ 3 {tokenize="tcl 'g' 'h' 'i'"} {g h i} |
+ 4 {tokenize = tcl} {} |
+} { |
+ do_catchsql_test 3.$tn.1 " |
+ CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive) |
+ " {1 {error in tokenizer constructor}} |
+ do_test 3.$tn.2 { set ::targs } $expected |
+} |
+ |
+do_catchsql_test 4.1 { |
+ CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc); |
+} {1 {parse error in "tokenize = tcl abc"}} |
+do_catchsql_test 4.2 { |
+ CREATE VIRTUAL TABLE ft2 USING fts5(x y) |
+} {1 {unrecognized column option: y}} |
+ |
+#------------------------------------------------------------------------- |
+# Test the "separators" and "tokenchars" options a bit. |
+# |
+foreach {tn tokenizer} {1 ascii 2 unicode61} { |
+ reset_db |
+ set T "$tokenizer tokenchars ',.:' separators 'xyz'" |
+ execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")" |
+ do_execsql_test 5.$tn.1 { |
+ INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz'); |
+ } |
+ foreach {tn2 token res} { |
+ 1 abc 1 2 def 1 3 ghi 1 4 jkl {} |
+ 5 mno {} 6 pqr {} 7 stu {} 8 jkl.mno,pqr:stu 1 |
+ 9 vw 1 |
+ } { |
+ do_execsql_test 5.$tn.2.$tn2 " |
+ SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"' |
+ " $res |
+ } |
+} |
+ |
+#------------------------------------------------------------------------- |
+# Miscellaneous tests for the ascii tokenizer. |
+# |
+# 5.1.*: Test that the ascii tokenizer ignores non-ASCII characters in the |
+# 'separators' option. But unicode61 does not. |
+# |
+# 5.2.*: An option without an argument is an error. |
+# |
+ |
+do_test 5.1.1 { |
+ execsql " |
+ CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`); |
+ INSERT INTO a1 VALUES('abc\u1234def'); |
+ " |
+ execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' } |
+} {} |
+ |
+do_test 5.1.2 { |
+ execsql " |
+ CREATE VIRTUAL TABLE a2 USING fts5( |
+ x, tokenize=`unicode61 separators '\u1234'`); |
+ INSERT INTO a2 VALUES('abc\u1234def'); |
+ " |
+ execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' } |
+} {1} |
+ |
+do_catchsql_test 5.2 { |
+ CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars'); |
+} {1 {error in tokenizer constructor}} |
+do_catchsql_test 5.3 { |
+ CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg'); |
+} {1 {error in tokenizer constructor}} |
+ |
+#------------------------------------------------------------------------- |
+# Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE |
+# correctly. |
+# |
+ |
+proc test_token_cb {varname token iStart iEnd} { |
+ upvar $varname var |
+ lappend var $token |
+ if {[llength $var]==3} { return "SQLITE_DONE" } |
+ return "SQLITE_OK" |
+} |
+ |
+proc tokenize {cmd} { |
+ set res [list] |
+ $cmd xTokenize [$cmd xColumnText 0] [list test_token_cb res] |
+ set res |
+} |
+sqlite3_fts5_create_function db tokenize tokenize |
+ |
+do_execsql_test 6.0 { |
+ CREATE VIRTUAL TABLE x1 USING fts5(a, tokenize=ascii); |
+ INSERT INTO x1 VALUES('q w e r t y'); |
+ INSERT INTO x1 VALUES('y t r e w q'); |
+ SELECT tokenize(x1) FROM x1 WHERE x1 MATCH 'e AND r'; |
+} { |
+ {q w e} {y t r} |
+} |
+ |
+do_execsql_test 6.1 { |
+ CREATE VIRTUAL TABLE x2 USING fts5(a, tokenize=unicode61); |
+ INSERT INTO x2 VALUES('q w e r t y'); |
+ INSERT INTO x2 VALUES('y t r e w q'); |
+ SELECT tokenize(x2) FROM x2 WHERE x2 MATCH 'e AND r'; |
+} { |
+ {q w e} {y t r} |
+} |
+ |
+ |
+#------------------------------------------------------------------------- |
+# Miscellaneous tests for the unicode tokenizer. |
+# |
+do_catchsql_test 6.1 { |
+ CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars'); |
+} {1 {error in tokenizer constructor}} |
+do_catchsql_test 6.2 { |
+ CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b'); |
+} {1 {error in tokenizer constructor}} |
+do_catchsql_test 6.3 { |
+ CREATE VIRTUAL TABLE a3 USING fts5( |
+ x, y, tokenize = 'unicode61 remove_diacritics 2' |
+ ); |
+} {1 {error in tokenizer constructor}} |
+do_catchsql_test 6.4 { |
+ CREATE VIRTUAL TABLE a3 USING fts5( |
+ x, y, tokenize = 'unicode61 remove_diacritics 10' |
+ ); |
+} {1 {error in tokenizer constructor}} |
+ |
+#------------------------------------------------------------------------- |
+# Porter tokenizer with very large tokens. |
+# |
+set a [string repeat a 100] |
+set b [string repeat b 500] |
+set c [string repeat c 1000] |
+do_execsql_test 7.0 { |
+ CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter); |
+ INSERT INTO e5 VALUES($a || ' ' || $b); |
+ INSERT INTO e5 VALUES($b || ' ' || $c); |
+ INSERT INTO e5 VALUES($c || ' ' || $a); |
+} |
+ |
+do_execsql_test 7.1 {SELECT rowid FROM e5 WHERE e5 MATCH $a} { 1 3 } |
+do_execsql_test 7.2 {SELECT rowid FROM e5 WHERE e5 MATCH $b} { 1 2 } |
+do_execsql_test 7.3 {SELECT rowid FROM e5 WHERE e5 MATCH $c} { 2 3 } |
+ |
+#------------------------------------------------------------------------- |
+# Test the 'separators' option with the unicode61 tokenizer. |
+# |
+do_execsql_test 8.1 { |
+ BEGIN; |
+ CREATE VIRTUAL TABLE e6 USING fts5(x, |
+ tokenize="unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
+ ); |
+ INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog'); |
+ CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row'); |
+ SELECT term FROM e7; |
+ ROLLBACK; |
+} { |
+ brown dog fox jumped lazy over quick the |
+} |
+ |
+do_execsql_test 8.2 [subst { |
+ BEGIN; |
+ CREATE VIRTUAL TABLE e6 USING fts5(x, |
+ tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'" |
+ ); |
+ INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01' |
+ || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog' |
+ ); |
+ INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09'); |
+ CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row'); |
+ SELECT term FROM e7; |
+ ROLLBACK; |
+}] [subst { |
+ brown dog fox jumped lazy over quick the \u0E08 \u0E09 |
+}] |
+ |
+# Test that the porter tokenizer correctly passes arguments through to |
+# its parent tokenizer. |
+do_execsql_test 8.3 { |
+ BEGIN; |
+ CREATE VIRTUAL TABLE e6 USING fts5(x, |
+ tokenize="porter unicode61 separators ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
+ ); |
+ INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog'); |
+ CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row'); |
+ SELECT term FROM e7; |
+ ROLLBACK; |
+} { |
+ brown dog fox jump lazi over quick the |
+} |
+ |
+#------------------------------------------------------------------------- |
+# Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer |
+# implementation. |
+# |
+reset_db |
+proc tcl_create {args} { return "tcl_tokenize" } |
+sqlite3_fts5_create_tokenizer db tcl tcl_create |
+set ::flags [list] |
+proc tcl_tokenize {tflags text} { |
+ lappend ::flags $tflags |
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] { |
+ sqlite3_fts5_token $w $iStart $iEnd |
+ } |
+} |
+ |
+do_execsql_test 9.1.1 { |
+ CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl); |
+ INSERT INTO t1 VALUES('abc'); |
+ INSERT INTO t1 VALUES('xyz'); |
+} {} |
+do_test 9.1.2 { set ::flags } {document document} |
+ |
+set ::flags [list] |
+do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc} |
+do_test 9.2.2 { set ::flags } {query} |
+ |
+set ::flags [list] |
+do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc} |
+do_test 9.3.2 { set ::flags } {prefixquery} |
+ |
+set ::flags [list] |
+do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {} |
+do_test 9.4.2 { set ::flags } {prefixquery} |
+ |
+set ::flags [list] |
+do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {} |
+do_test 9.5.2 { set ::flags } {query} |
+ |
+ |
+finish_test |
+ |