OLD | NEW |
| (Empty) |
1 # 2009 December 03 | |
2 # | |
3 # May you do good and not evil. | |
4 # May you find forgiveness for yourself and forgive others. | |
5 # May you share freely, never taking more than you give. | |
6 # | |
7 #*********************************************************************** | |
8 # | |
9 # Brute force (random data) tests for FTS3. | |
10 # | |
11 | |
12 #------------------------------------------------------------------------- | |
13 # | |
14 # The FTS3 tests implemented in this file focus on testing that FTS3 | |
15 # returns the correct set of documents for various types of full-text | |
16 # query. This is done using pseudo-randomly generated data and queries. | |
17 # The expected result of each query is calculated using Tcl code. | |
18 # | |
19 # 1. The database is initialized to contain a single table with three | |
20 # columns. 100 rows are inserted into the table. Each of the three | |
21 # values in each row is a document consisting of between 0 and 100 | |
22 # terms. Terms are selected from a vocabulary of $G(nVocab) terms. | |
23 # | |
24 # 2. The following is performed 100 times: | |
25 # | |
26 # a. A row is inserted into the database. The row contents are | |
27 # generated as in step 1. The docid is a pseudo-randomly selected | |
28 # value between 0 and 1000000. | |
29 # | |
30 # b. A psuedo-randomly selected row is updated. One of its columns is | |
31 # set to contain a new document generated in the same way as the | |
32 # documents in step 1. | |
33 # | |
34 # c. A psuedo-randomly selected row is deleted. | |
35 # | |
36 # d. For each of several types of fts3 queries, 10 SELECT queries | |
37 # of the form: | |
38 # | |
39 # SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>' | |
40 # | |
41 # are evaluated. The results are compared to those calculated by | |
42 # Tcl code in this file. The patterns used for the different query | |
43 # types are: | |
44 # | |
45 # 1. query = <term> | |
46 # 2. query = <prefix> | |
47 # 3. query = "<term> <term>" | |
48 # 4. query = "<term> <term> <term>" | |
49 # 5. query = "<prefix> <prefix> <prefix>" | |
50 # 6. query = <term> NEAR <term> | |
51 # 7. query = <term> NEAR/11 <term> NEAR/11 <term> | |
52 # 8. query = <term> OR <term> | |
53 # 9. query = <term> NOT <term> | |
54 # 10. query = <term> AND <term> | |
55 # 11. query = <term> NEAR <term> OR <term> NEAR <term> | |
56 # 12. query = <term> NEAR <term> NOT <term> NEAR <term> | |
57 # 13. query = <term> NEAR <term> AND <term> NEAR <term> | |
58 # | |
59 # where <term> is a term psuedo-randomly selected from the vocabulary | |
60 # and prefix is the first 2 characters of such a term followed by | |
61 # a "*" character. | |
62 # | |
63 # Every second iteration, steps (a) through (d) above are performed | |
64 # within a single transaction. This forces the queries in (d) to | |
65 # read data from both the database and the in-memory hash table | |
66 # that caches the full-text index entries created by steps (a), (b) | |
67 # and (c) until the transaction is committed. | |
68 # | |
69 # The procedure above is run 5 times, using advisory fts3 node sizes of 50, | |
70 # 500, 1000 and 2000 bytes. | |
71 # | |
72 # After the test using an advisory node-size of 50, an OOM test is run using | |
73 # the database. This test is similar to step (d) above, except that it tests | |
74 # the effects of transient and persistent OOM conditions encountered while | |
75 # executing each query. | |
76 # | |
77 | |
78 set testdir [file dirname $argv0] | |
79 source $testdir/tester.tcl | |
80 | |
81 # If this build does not include FTS3, skip the tests in this file. | |
82 # | |
83 ifcapable !fts3 { finish_test ; return } | |
84 source $testdir/fts3_common.tcl | |
85 source $testdir/malloc_common.tcl | |
86 | |
87 set G(nVocab) 100 | |
88 | |
89 set nVocab 100 | |
90 set lVocab [list] | |
91 | |
92 expr srand(0) | |
93 | |
94 # Generate a vocabulary of nVocab words. Each word is 3 characters long. | |
95 # | |
96 set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z} | |
97 for {set i 0} {$i < $nVocab} {incr i} { | |
98 set len [expr int(rand()*3)+2] | |
99 set word [lindex $lChar [expr int(rand()*26)]] | |
100 append word [lindex $lChar [expr int(rand()*26)]] | |
101 if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] } | |
102 if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] } | |
103 lappend lVocab $word | |
104 } | |
105 | |
106 proc random_term {} { | |
107 lindex $::lVocab [expr {int(rand()*$::nVocab)}] | |
108 } | |
109 | |
110 # Return a document consisting of $nWord arbitrarily selected terms | |
111 # from the $::lVocab list. | |
112 # | |
113 proc generate_doc {nWord} { | |
114 set doc [list] | |
115 for {set i 0} {$i < $nWord} {incr i} { | |
116 lappend doc [random_term] | |
117 } | |
118 return $doc | |
119 } | |
120 | |
121 | |
122 | |
123 # Primitives to update the table. | |
124 # | |
125 unset -nocomplain t1 | |
126 proc insert_row {rowid} { | |
127 set a [generate_doc [expr int((rand()*100))]] | |
128 set b [generate_doc [expr int((rand()*100))]] | |
129 set c [generate_doc [expr int((rand()*100))]] | |
130 execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) } | |
131 set ::t1($rowid) [list $a $b $c] | |
132 } | |
133 proc delete_row {rowid} { | |
134 execsql { DELETE FROM t1 WHERE rowid = $rowid } | |
135 catch {unset ::t1($rowid)} | |
136 } | |
137 proc update_row {rowid} { | |
138 set cols {a b c} | |
139 set iCol [expr int(rand()*3)] | |
140 set doc [generate_doc [expr int((rand()*100))]] | |
141 lset ::t1($rowid) $iCol $doc | |
142 execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid" | |
143 } | |
144 | |
145 proc simple_phrase {zPrefix} { | |
146 set ret [list] | |
147 | |
148 set reg [string map {* {[^ ]*}} $zPrefix] | |
149 set reg " $reg " | |
150 | |
151 foreach key [lsort -integer [array names ::t1]] { | |
152 set value $::t1($key) | |
153 set cnt [list] | |
154 foreach col $value { | |
155 if {[regexp $reg " $col "]} { lappend ret $key ; break } | |
156 } | |
157 } | |
158 | |
159 #lsort -uniq -integer $ret | |
160 set ret | |
161 } | |
162 | |
163 # This [proc] is used to test the FTS3 matchinfo() function. | |
164 # | |
165 proc simple_token_matchinfo {zToken bDesc} { | |
166 | |
167 set nDoc(0) 0 | |
168 set nDoc(1) 0 | |
169 set nDoc(2) 0 | |
170 set nHit(0) 0 | |
171 set nHit(1) 0 | |
172 set nHit(2) 0 | |
173 | |
174 set dir -inc | |
175 if {$bDesc} { set dir -dec } | |
176 | |
177 foreach key [array names ::t1] { | |
178 set value $::t1($key) | |
179 set a($key) [list] | |
180 foreach i {0 1 2} col $value { | |
181 set hit [llength [lsearch -all $col $zToken]] | |
182 lappend a($key) $hit | |
183 incr nHit($i) $hit | |
184 if {$hit>0} { incr nDoc($i) } | |
185 } | |
186 } | |
187 | |
188 set ret [list] | |
189 foreach docid [lsort -integer $dir [array names a]] { | |
190 if { [lindex [lsort -integer $a($docid)] end] } { | |
191 set matchinfo [list 1 3] | |
192 foreach i {0 1 2} hit $a($docid) { | |
193 lappend matchinfo $hit $nHit($i) $nDoc($i) | |
194 } | |
195 lappend ret $docid $matchinfo | |
196 } | |
197 } | |
198 | |
199 set ret | |
200 } | |
201 | |
202 proc simple_near {termlist nNear} { | |
203 set ret [list] | |
204 | |
205 foreach {key value} [array get ::t1] { | |
206 foreach v $value { | |
207 | |
208 set l [lsearch -exact -all $v [lindex $termlist 0]] | |
209 foreach T [lrange $termlist 1 end] { | |
210 set l2 [list] | |
211 foreach i $l { | |
212 set iStart [expr $i - $nNear - 1] | |
213 set iEnd [expr $i + $nNear + 1] | |
214 if {$iStart < 0} {set iStart 0} | |
215 foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] { | |
216 incr i2 $iStart | |
217 if {$i2 != $i} { lappend l2 $i2 } | |
218 } | |
219 } | |
220 set l [lsort -uniq -integer $l2] | |
221 } | |
222 | |
223 if {[llength $l]} { | |
224 #puts "MATCH($key): $v" | |
225 lappend ret $key | |
226 } | |
227 } | |
228 } | |
229 | |
230 lsort -unique -integer $ret | |
231 } | |
232 | |
233 # The following three procs: | |
234 # | |
235 # setup_not A B | |
236 # setup_or A B | |
237 # setup_and A B | |
238 # | |
239 # each take two arguments. Both arguments must be lists of integer values | |
240 # sorted by value. The return value is the list produced by evaluating | |
241 # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or | |
242 # AND. | |
243 # | |
244 proc setop_not {A B} { | |
245 foreach b $B { set n($b) {} } | |
246 set ret [list] | |
247 foreach a $A { if {![info exists n($a)]} {lappend ret $a} } | |
248 return $ret | |
249 } | |
250 proc setop_or {A B} { | |
251 lsort -integer -uniq [concat $A $B] | |
252 } | |
253 proc setop_and {A B} { | |
254 foreach b $B { set n($b) {} } | |
255 set ret [list] | |
256 foreach a $A { if {[info exists n($a)]} {lappend ret $a} } | |
257 return $ret | |
258 } | |
259 | |
260 proc mit {blob} { | |
261 set scan(littleEndian) i* | |
262 set scan(bigEndian) I* | |
263 binary scan $blob $scan($::tcl_platform(byteOrder)) r | |
264 return $r | |
265 } | |
266 db func mit mit | |
267 set sqlite_fts3_enable_parentheses 1 | |
268 | |
269 proc do_orderbydocid_test {tn sql res} { | |
270 uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res] | |
271 uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \ | |
272 [lsort -int -dec $res] | |
273 ] | |
274 } | |
275 | |
276 set NUM_TRIALS 100 | |
277 | |
278 foreach {nodesize order} { | |
279 50 DESC | |
280 50 ASC | |
281 500 ASC | |
282 1000 DESC | |
283 2000 ASC | |
284 } { | |
285 catch { array unset ::t1 } | |
286 set testname "$nodesize/$order" | |
287 | |
288 # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows. | |
289 # | |
290 db transaction { | |
291 catchsql { DROP TABLE t1 } | |
292 execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)" | |
293 execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')" | |
294 for {set i 0} {$i < 100} {incr i} { insert_row $i } | |
295 } | |
296 | |
297 for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} { | |
298 catchsql COMMIT | |
299 | |
300 set DO_MALLOC_TEST 0 | |
301 set nRep 10 | |
302 if {$iTest==100 && $nodesize==50} { | |
303 set DO_MALLOC_TEST 1 | |
304 set nRep 2 | |
305 } | |
306 | |
307 set ::testprefix fts3rnd-1.$testname.$iTest | |
308 | |
309 # Delete one row, update one row and insert one row. | |
310 # | |
311 set rows [array names ::t1] | |
312 set nRow [llength $rows] | |
313 set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]] | |
314 set iDelete $iUpdate | |
315 while {$iDelete == $iUpdate} { | |
316 set iDelete [lindex $rows [expr {int(rand()*$nRow)}]] | |
317 } | |
318 set iInsert $iUpdate | |
319 while {[info exists ::t1($iInsert)]} { | |
320 set iInsert [expr {int(rand()*1000000)}] | |
321 } | |
322 execsql BEGIN | |
323 insert_row $iInsert | |
324 update_row $iUpdate | |
325 delete_row $iDelete | |
326 if {0==($iTest%2)} { execsql COMMIT } | |
327 | |
328 if {0==($iTest%2)} { | |
329 #do_test 0 { fts3_integrity_check t1 } ok | |
330 } | |
331 | |
332 # Pick 10 terms from the vocabulary. Check that the results of querying | |
333 # the database for the set of documents containing each of these terms | |
334 # is the same as the result obtained by scanning the contents of the Tcl | |
335 # array for each term. | |
336 # | |
337 for {set i 0} {$i < 10} {incr i} { | |
338 set term [random_term] | |
339 do_select_test 1.$i.asc { | |
340 SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term | |
341 ORDER BY docid ASC | |
342 } [simple_token_matchinfo $term 0] | |
343 do_select_test 1.$i.desc { | |
344 SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term | |
345 ORDER BY docid DESC | |
346 } [simple_token_matchinfo $term 1] | |
347 } | |
348 | |
349 # This time, use the first two characters of each term as a term prefix | |
350 # to query for. Test that querying the Tcl array produces the same results | |
351 # as querying the FTS3 table for the prefix. | |
352 # | |
353 for {set i 0} {$i < $nRep} {incr i} { | |
354 set prefix [string range [random_term] 0 end-1] | |
355 set match "${prefix}*" | |
356 do_orderbydocid_test 2.$i { | |
357 SELECT docid FROM t1 WHERE t1 MATCH $match | |
358 } [simple_phrase $match] | |
359 } | |
360 | |
361 # Similar to the above, except for phrase queries. | |
362 # | |
363 for {set i 0} {$i < $nRep} {incr i} { | |
364 set term [list [random_term] [random_term]] | |
365 set match "\"$term\"" | |
366 do_orderbydocid_test 3.$i { | |
367 SELECT docid FROM t1 WHERE t1 MATCH $match | |
368 } [simple_phrase $term] | |
369 } | |
370 | |
371 # Three word phrases. | |
372 # | |
373 for {set i 0} {$i < $nRep} {incr i} { | |
374 set term [list [random_term] [random_term] [random_term]] | |
375 set match "\"$term\"" | |
376 do_orderbydocid_test 4.$i { | |
377 SELECT docid FROM t1 WHERE t1 MATCH $match | |
378 } [simple_phrase $term] | |
379 } | |
380 | |
381 # Three word phrases made up of term-prefixes. | |
382 # | |
383 for {set i 0} {$i < $nRep} {incr i} { | |
384 set query "[string range [random_term] 0 end-1]* " | |
385 append query "[string range [random_term] 0 end-1]* " | |
386 append query "[string range [random_term] 0 end-1]*" | |
387 | |
388 set match "\"$query\"" | |
389 do_orderbydocid_test 5.$i { | |
390 SELECT docid FROM t1 WHERE t1 MATCH $match | |
391 } [simple_phrase $query] | |
392 } | |
393 | |
394 # A NEAR query with terms as the arguments: | |
395 # | |
396 # ... MATCH '$term1 NEAR $term2' ... | |
397 # | |
398 for {set i 0} {$i < $nRep} {incr i} { | |
399 set terms [list [random_term] [random_term]] | |
400 set match [join $terms " NEAR "] | |
401 do_orderbydocid_test 6.$i { | |
402 SELECT docid FROM t1 WHERE t1 MATCH $match | |
403 } [simple_near $terms 10] | |
404 } | |
405 | |
406 # A 3-way NEAR query with terms as the arguments. | |
407 # | |
408 for {set i 0} {$i < $nRep} {incr i} { | |
409 set terms [list [random_term] [random_term] [random_term]] | |
410 set nNear 11 | |
411 set match [join $terms " NEAR/$nNear "] | |
412 do_orderbydocid_test 7.$i { | |
413 SELECT docid FROM t1 WHERE t1 MATCH $match | |
414 } [simple_near $terms $nNear] | |
415 } | |
416 | |
417 # Set operations on simple term queries. | |
418 # | |
419 foreach {tn op proc} { | |
420 8 OR setop_or | |
421 9 NOT setop_not | |
422 10 AND setop_and | |
423 } { | |
424 for {set i 0} {$i < $nRep} {incr i} { | |
425 set term1 [random_term] | |
426 set term2 [random_term] | |
427 set match "$term1 $op $term2" | |
428 do_orderbydocid_test $tn.$i { | |
429 SELECT docid FROM t1 WHERE t1 MATCH $match | |
430 } [$proc [simple_phrase $term1] [simple_phrase $term2]] | |
431 } | |
432 } | |
433 | |
434 # Set operations on NEAR queries. | |
435 # | |
436 foreach {tn op proc} { | |
437 11 OR setop_or | |
438 12 NOT setop_not | |
439 13 AND setop_and | |
440 } { | |
441 for {set i 0} {$i < $nRep} {incr i} { | |
442 set term1 [random_term] | |
443 set term2 [random_term] | |
444 set term3 [random_term] | |
445 set term4 [random_term] | |
446 set match "$term1 NEAR $term2 $op $term3 NEAR $term4" | |
447 do_orderbydocid_test $tn.$i { | |
448 SELECT docid FROM t1 WHERE t1 MATCH $match | |
449 } [$proc \ | |
450 [simple_near [list $term1 $term2] 10] \ | |
451 [simple_near [list $term3 $term4] 10] | |
452 ] | |
453 } | |
454 } | |
455 | |
456 catchsql COMMIT | |
457 } | |
458 } | |
459 | |
460 finish_test | |
OLD | NEW |