OLD | NEW |
| (Empty) |
1 # 2007 June 21 | |
2 # | |
3 # The author disclaims copyright to this source code. In place of | |
4 # a legal notice, here is a blessing: | |
5 # | |
6 # May you do good and not evil. | |
7 # May you find forgiveness for yourself and forgive others. | |
8 # May you share freely, never taking more than you give. | |
9 # | |
10 #************************************************************************* | |
11 # This file implements regression tests for SQLite library. The focus | |
12 # of this script is testing the pluggable tokeniser feature of the | |
13 # FTS3 module. | |
14 # | |
15 # $Id: fts3atoken.test,v 1.1 2007/08/20 17:38:42 shess Exp $ | |
16 # | |
17 | |
# Locate the shared test infrastructure relative to this script and load
# it (tester.tcl provides do_test, execsql, catchsql, ifcapable, etc.).
set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS3 is not defined, omit this file.
ifcapable !fts3 {
  finish_test
  return
}

set ::testprefix fts3token
# Return a copy of $str in which every character above the ASCII range
# (codepoint > 127) is replaced by a literal \xNNNN hex escape, so that
# non-ASCII test data can be rendered portably. ASCII characters pass
# through unchanged.
proc escape_string {str} {
  set result ""
  foreach ch [split $str ""] {
    scan $ch %c code
    if {$code > 127} {
      append result [format {\x%.4x} $code]
    } else {
      append result $ch
    }
  }
  return $result
}
41 | |
#--------------------------------------------------------------------------
# Test cases fts3token-1.* are the warm-body test for the SQL scalar
# function fts3_tokenizer(). The procedure is as follows:
#
#   1: Verify that there is no such fts3 tokenizer as 'blah'.
#
#   2: Query for the built-in tokenizer 'simple'. Insert a copy of the
#      retrieved value as tokenizer 'blah'.
#
#   3: Test that the value returned for tokenizer 'blah' is now the
#      same as that retrieved for 'simple'.
#
#   4: Test that it is now possible to create an fts3 table using
#      tokenizer 'blah' (it was not possible in step 1).
#
#   5: Test that the table created to use tokenizer 'blah' is usable.
#
# Step 1: creating a table with an unregistered tokenizer must fail.
do_test fts3token-1.1 {
  catchsql {
    CREATE VIRTUAL TABLE t1 USING fts3(content, tokenize blah);
  }
} {1 {unknown tokenizer: blah}}
# Step 2: register the value of 'simple' under the new name 'blah'.
# The two-argument form returns the old value, which is NULL-checked.
do_test fts3token-1.2 {
  execsql {
    SELECT fts3_tokenizer('blah', fts3_tokenizer('simple')) IS NULL;
  }
} {0}
# Step 3: 'blah' now resolves to the same value as 'simple'.
do_test fts3token-1.3 {
  execsql {
    SELECT fts3_tokenizer('blah') == fts3_tokenizer('simple');
  }
} {1}
# Step 4: the CREATE that failed in step 1 now succeeds.
do_test fts3token-1.4 {
  catchsql {
    CREATE VIRTUAL TABLE t1 USING fts3(content, tokenize blah);
  }
} {0 {}}
# Step 5: the table built on tokenizer 'blah' supports INSERT and MATCH.
do_test fts3token-1.5 {
  execsql {
    INSERT INTO t1(content) VALUES('There was movement at the station');
    INSERT INTO t1(content) VALUES('For the word has passed around');
    INSERT INTO t1(content) VALUES('That the colt from ol regret had got away');
    SELECT content FROM t1 WHERE content MATCH 'movement'
  }
} {{There was movement at the station}}
87 | |
#--------------------------------------------------------------------------
# Test cases fts3token-2.* test error cases in the scalar function based
# API for getting and setting tokenizers.
#
# Querying a tokenizer name that was never registered is an error.
do_test fts3token-2.1 {
  catchsql {
    SELECT fts3_tokenizer('nosuchtokenizer');
  }
} {1 {unknown tokenizer: nosuchtokenizer}}
97 | |
#--------------------------------------------------------------------------
# Test cases fts3token-3.* test the three built-in tokenizers with a
# simple input string via the built-in test function. This is as much
# to test the test function as the tokenizer implementations.
#
# The result is a list of (position, folded-token, original-token)
# triples. 'simple' splits on the apostrophe and lower-cases each token.
do_test fts3token-3.1 {
  execsql {
    SELECT fts3_tokenizer_test('simple', 'I don''t see how');
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
# 'porter' produces the same output here (none of these words stem).
do_test fts3token-3.2 {
  execsql {
    SELECT fts3_tokenizer_test('porter', 'I don''t see how');
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
# 'icu' exists only in ICU-enabled builds; unlike 'simple' it keeps the
# apostrophe inside the token "don't".
ifcapable icu {
  do_test fts3token-3.3 {
    execsql {
      SELECT fts3_tokenizer_test('icu', 'I don''t see how');
    }
  } {{0 i I 1 don't don't 2 see see 3 how how}}
}
120 | |
#--------------------------------------------------------------------------
# Test cases fts3token-4.* test the ICU tokenizer. In practice, this
# tokenizer only has two modes - "thai" and "everybody else". Some other
# Asian languages (Lao, Khmer etc.) require the same special treatment as
# Thai, but ICU doesn't support them yet.
#
ifcapable icu {

# Helper: tokenize $input with the ICU tokenizer for $locale and check
# that the single result row matches $output exactly.
proc do_icu_test {name locale input output} {
  set ::out [db eval { SELECT fts3_tokenizer_test('icu', $locale, $input) }]
  do_test $name {
    lindex $::out 0
  } $output
}

do_icu_test fts3token-4.1 en_US {} {}
do_icu_test fts3token-4.2 en_US {Test cases fts3} [list \
  0 test Test 1 cases cases 2 fts3 fts3
]

# The following test shows that ICU is smart enough to recognise
# Thai characters, even when the locale is set to English/United
# States.
#
set input "\u0e2d\u0e30\u0e44\u0e23\u0e19\u0e30\u0e04\u0e23\u0e31\u0e1a"
set output "0 \u0e2d\u0e30\u0e44\u0e23 \u0e2d\u0e30\u0e44\u0e23 "
append output "1 \u0e19\u0e30 \u0e19\u0e30 "
append output "2 \u0e04\u0e23\u0e31\u0e1a \u0e04\u0e23\u0e31\u0e1a"

do_icu_test fts3token-4.3 th_TH $input $output
do_icu_test fts3token-4.4 en_US $input $output

# ICU handles an unknown locale by falling back to the default.
# So this is not an error.
do_icu_test fts3token-4.5 MiddleOfTheOcean $input $output

# A deliberately long token (its own text says so) that will force a
# realloc inside the ICU tokenizer code.
set longtoken "AReallyReallyLongTokenOneThatWillSurelyRequire"
append longtoken "AReallocInTheIcuTokenizerCode"

set input "short tokens then "
append input $longtoken
set output "0 short short "
append output "1 tokens tokens "
append output "2 then then "
append output "3 [string tolower $longtoken] $longtoken"

do_icu_test fts3token-4.6 MiddleOfTheOcean $input $output
do_icu_test fts3token-4.7 th_TH $input $output
do_icu_test fts3token-4.8 en_US $input $output

# NULL content must be accepted by an ICU-tokenized table without error.
do_execsql_test 5.1 {
  CREATE VIRTUAL TABLE x1 USING fts3(name,TOKENIZE icu en_US);
  insert into x1 (name) values (NULL);
  insert into x1 (name) values (NULL);
  delete from x1;
}

# Build a string from a list of Unicode codepoints.
proc cp_to_str {codepoint_list} {
  set fmt [string repeat %c [llength $codepoint_list]]
  eval [list format $fmt] $codepoint_list
}

# Insert a string of CJK codepoints into the ICU-tokenized table.
do_test 5.2 {
  set str [cp_to_str {19968 26085 32822 32645 27874 23433 20986}]
  execsql { INSERT INTO x1 VALUES($str) }
} {}
}
188 | |
189 | |
# Invoke the tokenizer module's internal self-test function; it is
# expected to return the literal string 'ok'.
do_test fts3token-internal {
  execsql { SELECT fts3_tokenizer_internal_test() }
} {ok}


finish_test
OLD | NEW |