OLD | NEW |
| (Empty) |
1 # 2014 Dec 20 | |
2 # | |
3 # The author disclaims copyright to this source code. In place of | |
4 # a legal notice, here is a blessing: | |
5 # | |
6 # May you do good and not evil. | |
7 # May you find forgiveness for yourself and forgive others. | |
8 # May you share freely, never taking more than you give. | |
9 # | |
10 #*********************************************************************** | |
11 # | |
12 # Tests focusing on the built-in fts5 tokenizers. | |
13 # | |
14 | |
15 source [file join [file dirname [info script]] fts5_common.tcl] | |
16 set testprefix fts5tokenizer | |
17 | |
18 # If SQLITE_ENABLE_FTS5 is not defined, omit this file. | |
19 ifcapable !fts5 { | |
20 finish_test | |
21 return | |
22 } | |
23 | |
24 | |
# 1.0 - 1.4: each spelling of the tokenize= directive below must allow the
# table to be created and dropped again without error.
foreach {tn directive} {
  0 {tokenize=porter}
  1 {tokenize='porter'}
  2 {tokenize = porter}
  3 {tokenize = 'porter'}
  4 {tokenize = 'porter ascii'}
} {
  do_execsql_test 1.$tn "
    CREATE VIRTUAL TABLE ft1 USING fts5(x, $directive);
    DROP TABLE ft1;
  "
}

# 1.5 - 1.6: an unknown tokenizer name, and an unknown name handed to porter
# as its argument, must each fail with the listed error message.
foreach {tn tokenizer errmsg} {
  5 {nosuch}        {no such tokenizer: nosuch}
  6 {porter nosuch} {error in tokenizer constructor}
} {
  do_catchsql_test 1.$tn "
    CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = '$tokenizer');
  " [list 1 $errmsg]
}
53 | |
# 2.0: one row indexed with the porter tokenizer.
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  INSERT INTO ft1 VALUES('embedded databases');
}

# 2.1 - 2.3: queries that use different word forms than the stored text
# ("embedding" vs "embedded", "database" vs "databases") must still find
# rowid 1.
foreach {tn query} {
  1 {embedding}
  2 {database}
  3 {database embedding}
} {
  do_execsql_test 2.$tn "
    SELECT rowid FROM ft1 WHERE ft1 MATCH '$query'
  " 1
}
63 | |
# Tokenizer constructor stub.
#
# Stores whatever arguments it was invoked with in the global variable
# ::targs, so that a test can inspect them afterwards, and then always
# fails with the error message "failed".
proc tcl_create {args} {
  global targs
  set targs $args
  return -code error "failed"
}
# Make tcl_create the constructor of a tokenizer named "tcl".
sqlite3_fts5_create_tokenizer db tcl tcl_create

# 3.*: for each directive, the CREATE statement must fail (tcl_create always
# raises an error) and ::targs must afterwards hold the expected constructor
# arguments.
foreach {tn directive ctor_args} {
  1 {tokenize='tcl a b c'}              {a b c}
  2 {tokenize='tcl ''d'' ''e'' ''f'''}  {d e f}
  3 {tokenize="tcl 'g' 'h' 'i'"}        {g h i}
  4 {tokenize = tcl}                    {}
} {
  set sql [format {CREATE VIRTUAL TABLE ft2 USING fts5(x, %s)} $directive]
  do_catchsql_test 3.$tn.1 $sql [list 1 {error in tokenizer constructor}]
  do_test 3.$tn.2 { set ::targs } $ctor_args
}

# 4.*: malformed table declarations and the error each must produce.
foreach {tn sql errmsg} {
  1 {CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);}
    {parse error in "tokenize = tcl abc"}
  2 {CREATE VIRTUAL TABLE ft2 USING fts5(x y)}
    {unrecognized column option: y}
} {
  do_catchsql_test 4.$tn $sql [list 1 $errmsg]
}
88 | |
#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
# The same expectations are checked for both the ascii and the unicode61
# tokenizer. The results below expect 'x', 'y' and 'z' to split tokens
# (abc, def, ghi and vw each match on their own), and '.', ',' and ':' to
# be part of a token (jkl, mno, pqr and stu do not match on their own, but
# "jkl.mno,pqr:stu" does).
#
foreach {tn tokenizer} {
  1 ascii
  2 unicode61
} {
  reset_db
  set options "$tokenizer tokenchars ',.:' separators 'xyz'"
  execsql [format {CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = "%s")} $options]

  do_execsql_test 5.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }

  foreach {tn2 phrase rowids} {
    1 abc 1
    2 def 1
    3 ghi 1
    4 jkl {}
    5 mno {}
    6 pqr {}
    7 stu {}
    8 jkl.mno,pqr:stu 1
    9 vw 1
  } {
    set query [format {SELECT rowid FROM t1 WHERE t1 MATCH '"%s"'} $phrase]
    do_execsql_test 5.$tn.2.$tn2 $query $rowids
  }
}
109 | |
#-------------------------------------------------------------------------
# Miscellaneous tests for the ascii tokenizer.
#
#   5.3.*: Test that the ascii tokenizer ignores non-ASCII characters in the
#          'separators' option. But unicode61 does not.
#
#   5.4.*: An option without an argument, or an unknown option, is an error.
#
# These tests are numbered 5.3.* and 5.4.* so that their names do not
# collide with the 5.1.* and 5.2.* names generated by the "separators" and
# "tokenchars" loop earlier in this file (which produces a test 5.1.1).
#

do_test 5.3.1 {
  execsql "
    CREATE VIRTUAL TABLE a1 USING fts5(x, tokenize=`ascii separators '\u1234'`);
    INSERT INTO a1 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a1 WHERE a1 MATCH 'def' }
} {}

do_test 5.3.2 {
  execsql "
    CREATE VIRTUAL TABLE a2 USING fts5(
        x, tokenize=`unicode61 separators '\u1234'`);
    INSERT INTO a2 VALUES('abc\u1234def');
  "
  execsql { SELECT rowid FROM a2 WHERE a2 MATCH 'def' }
} {1}

do_catchsql_test 5.4.1 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 5.4.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'ascii opt arg');
} {1 {error in tokenizer constructor}}
142 | |
143 #------------------------------------------------------------------------- | |
144 # Test that the ASCII and unicode61 tokenizers both handle SQLITE_DONE | |
145 # correctly. | |
146 # | |
147 | |
# Token callback used with xTokenize.
#
#   varname  Name of a list variable in the caller's scope; each token is
#            appended to it.
#   token    The token text.
#   iStart   Unused by this callback.
#   iEnd     Unused by this callback.
#
# Returns "SQLITE_DONE" when the list holds exactly three tokens after the
# append, and "SQLITE_OK" otherwise.
proc test_token_cb {varname token iStart iEnd} {
  upvar $varname tokens
  lappend tokens $token
  return [expr {[llength $tokens] == 3 ? "SQLITE_DONE" : "SQLITE_OK"}]
}
154 | |
# Implementation of the "tokenize" auxiliary function.
#
#   cmd  Command prefix through which the xColumnText and xTokenize
#        sub-commands are invoked.
#
# Fetches the text of column 0, tokenizes it with test_token_cb as the
# callback, and returns the list of tokens that the callback collected.
proc tokenize {cmd} {
  set tokens [list]
  set text [$cmd xColumnText 0]
  # test_token_cb appends to the variable named here, in this proc's scope.
  $cmd xTokenize $text [list test_token_cb tokens]
  return $tokens
}
# Expose the tokenize proc to SQL as the fts5 auxiliary function tokenize().
sqlite3_fts5_create_function db tokenize tokenize

# 6.0 (ascii) and 6.1 (unicode61): test_token_cb returns SQLITE_DONE once it
# has collected three tokens, so only the first three tokens of each
# matching row are expected back.
foreach {tn tbl tokenizer} {
  0 x1 ascii
  1 x2 unicode61
} {
  do_execsql_test 6.$tn "
    CREATE VIRTUAL TABLE $tbl USING fts5(a, tokenize=$tokenizer);
    INSERT INTO $tbl VALUES('q w e r t y');
    INSERT INTO $tbl VALUES('y t r e w q');
    SELECT tokenize($tbl) FROM $tbl WHERE $tbl MATCH 'e AND r';
  " {
    {q w e} {y t r}
  }
}

179 | |
180 | |
#-------------------------------------------------------------------------
# Miscellaneous tests for the unicode tokenizer.
#
# Each of the following tokenizer specifications must be rejected by the
# tokenizer constructor. The tests are numbered 6.2.* so that their names
# do not collide with tests 6.0 and 6.1 earlier in this file.
#
do_catchsql_test 6.2.1 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 2'
  );
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2.4 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 10'
  );
} {1 {error in tokenizer constructor}}
200 | |
#-------------------------------------------------------------------------
# Porter tokenizer with very large tokens.
#
# Variables a, b and c hold single-character runs of 100, 500 and 1000
# characters; they are referenced as $a, $b and $c from the SQL below.
foreach {name ch len} {
  a a 100
  b b 500
  c c 1000
} {
  set $name [string repeat $ch $len]
}

do_execsql_test 7.0 {
  CREATE VIRTUAL TABLE e5 USING fts5(x, tokenize=porter);
  INSERT INTO e5 VALUES($a || ' ' || $b);
  INSERT INTO e5 VALUES($b || ' ' || $c);
  INSERT INTO e5 VALUES($c || ' ' || $a);
}

# 7.1 - 7.3: each long token must match exactly the two rows that hold it.
foreach {tn name rowids} {
  1 a {1 3}
  2 b {1 2}
  3 c {2 3}
} {
  do_execsql_test 7.$tn "SELECT rowid FROM e5 WHERE e5 MATCH \$$name" $rowids
}
217 | |
#-------------------------------------------------------------------------
# Test the 'separators' option with the unicode61 tokenizer.
#
# Tests 8.1 and 8.3 run the same script and differ only in the tokenizer
# named ahead of the "separators" option, so the SQL is kept in a template
# with a %TOKENIZER% placeholder. Each test runs inside BEGIN/ROLLBACK, so
# tables e6 and e7 can be created again by the next test.
#
set e6_vocab_sql {
  BEGIN;
    CREATE VIRTUAL TABLE e6 USING fts5(x,
      tokenize="%TOKENIZER% separators ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    );
    INSERT INTO e6 VALUES('theAquickBbrownCfoxDjumpedWoverXtheYlazyZdog');
    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
    SELECT term FROM e7;
  ROLLBACK;
}

do_execsql_test 8.1 [string map {%TOKENIZER% unicode61} $e6_vocab_sql] {
  brown dog fox jumped lazy over quick the
}

# 8.2: non-ASCII separators. Only backslash substitution is needed to turn
# the \uXXXX escapes into characters, so the other substitutions are off.
do_execsql_test 8.2 [subst -nocommands -novariables {
  BEGIN;
    CREATE VIRTUAL TABLE e6 USING fts5(x,
      tokenize="unicode61 separators '\u0E01\u0E02\u0E03\u0E04\u0E05\u0E06\u0E07'"
    );
    INSERT INTO e6 VALUES('the\u0E01quick\u0E01brown\u0E01fox\u0E01'
                       || 'jumped\u0E01over\u0E01the\u0E01lazy\u0E01dog'
    );
    INSERT INTO e6 VALUES('\u0E08\u0E07\u0E09');
    CREATE VIRTUAL TABLE e7 USING fts5vocab(e6, 'row');
    SELECT term FROM e7;
  ROLLBACK;
}] [subst -nocommands -novariables {
  brown dog fox jumped lazy over quick the \u0E08 \u0E09
}]

# Test that the porter tokenizer correctly passes arguments through to
# its parent tokenizer.
do_execsql_test 8.3 [string map {%TOKENIZER% {porter unicode61}} $e6_vocab_sql] {
  brown dog fox jump lazi over quick the
}

finish_test
266 | |
OLD | NEW |