OLD | NEW |
(Empty) | |
| 1 # 2009 December 03 |
| 2 # |
| 3 # May you do good and not evil. |
| 4 # May you find forgiveness for yourself and forgive others. |
| 5 # May you share freely, never taking more than you give. |
| 6 # |
| 7 #*********************************************************************** |
| 8 # |
| 9 # Brute force (random data) tests for FTS3. |
| 10 # |
| 11 |
| 12 #------------------------------------------------------------------------- |
| 13 # |
| 14 # The FTS3 tests implemented in this file focus on testing that FTS3 |
| 15 # returns the correct set of documents for various types of full-text |
| 16 # query. This is done using pseudo-randomly generated data and queries. |
| 17 # The expected result of each query is calculated using Tcl code. |
| 18 # |
| 19 # 1. The database is initialized to contain a single table with three |
| 20 # columns. 100 rows are inserted into the table. Each of the three |
| 21 # values in each row is a document consisting of between 0 and 100 |
| 22 # terms. Terms are selected from a vocabulary of $G(nVocab) terms. |
| 23 # |
| 24 # 2. The following is performed 100 times: |
| 25 # |
| 26 # a. A row is inserted into the database. The row contents are |
| 27 # generated as in step 1. The docid is a pseudo-randomly selected |
| 28 # value between 0 and 1000000. |
| 29 # |
| 30 # b. A psuedo-randomly selected row is updated. One of its columns is |
| 31 # set to contain a new document generated in the same way as the |
| 32 # documents in step 1. |
| 33 # |
| 34 # c. A psuedo-randomly selected row is deleted. |
| 35 # |
| 36 # d. For each of several types of fts3 queries, 10 SELECT queries |
| 37 # of the form: |
| 38 # |
| 39 # SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>' |
| 40 # |
| 41 # are evaluated. The results are compared to those calculated by |
| 42 # Tcl code in this file. The patterns used for the different query |
| 43 # types are: |
| 44 # |
| 45 # 1. query = <term> |
| 46 # 2. query = <prefix> |
| 47 # 3. query = "<term> <term>" |
| 48 # 4. query = "<term> <term> <term>" |
| 49 # 5. query = "<prefix> <prefix> <prefix>" |
| 50 # 6. query = <term> NEAR <term> |
| 51 # 7. query = <term> NEAR/11 <term> NEAR/11 <term> |
| 52 # 8. query = <term> OR <term> |
| 53 # 9. query = <term> NOT <term> |
| 54 # 10. query = <term> AND <term> |
| 55 # 11. query = <term> NEAR <term> OR <term> NEAR <term> |
| 56 # 12. query = <term> NEAR <term> NOT <term> NEAR <term> |
| 57 # 13. query = <term> NEAR <term> AND <term> NEAR <term> |
| 58 # |
| 59 # where <term> is a term psuedo-randomly selected from the vocabulary |
| 60 # and prefix is the first 2 characters of such a term followed by |
| 61 # a "*" character. |
| 62 # |
| 63 # Every second iteration, steps (a) through (d) above are performed |
| 64 # within a single transaction. This forces the queries in (d) to |
| 65 # read data from both the database and the in-memory hash table |
| 66 # that caches the full-text index entries created by steps (a), (b) |
| 67 # and (c) until the transaction is committed. |
| 68 # |
| 69 # The procedure above is run 5 times, using advisory fts3 node sizes of 50, |
| 70 # 500, 1000 and 2000 bytes. |
| 71 # |
| 72 # After the test using an advisory node-size of 50, an OOM test is run using |
| 73 # the database. This test is similar to step (d) above, except that it tests |
| 74 # the effects of transient and persistent OOM conditions encountered while |
| 75 # executing each query. |
| 76 # |
| 77 |
| 78 set testdir [file dirname $argv0] |
| 79 source $testdir/tester.tcl |
| 80 |
| 81 # If this build does not include FTS3, skip the tests in this file. |
| 82 # |
| 83 ifcapable !fts3 { finish_test ; return } |
| 84 source $testdir/fts3_common.tcl |
| 85 source $testdir/malloc_common.tcl |
| 86 |
| 87 set G(nVocab) 100 |
| 88 |
| 89 set nVocab 100 |
| 90 set lVocab [list] |
| 91 |
| 92 expr srand(0) |
| 93 |
| 94 # Generate a vocabulary of nVocab words. Each word is 3 characters long. |
| 95 # |
| 96 set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z} |
| 97 for {set i 0} {$i < $nVocab} {incr i} { |
| 98 set len [expr int(rand()*3)+2] |
| 99 set word [lindex $lChar [expr int(rand()*26)]] |
| 100 append word [lindex $lChar [expr int(rand()*26)]] |
| 101 if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] } |
| 102 if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] } |
| 103 lappend lVocab $word |
| 104 } |
| 105 |
| 106 proc random_term {} { |
| 107 lindex $::lVocab [expr {int(rand()*$::nVocab)}] |
| 108 } |
| 109 |
| 110 # Return a document consisting of $nWord arbitrarily selected terms |
| 111 # from the $::lVocab list. |
| 112 # |
| 113 proc generate_doc {nWord} { |
| 114 set doc [list] |
| 115 for {set i 0} {$i < $nWord} {incr i} { |
| 116 lappend doc [random_term] |
| 117 } |
| 118 return $doc |
| 119 } |
| 120 |
| 121 |
| 122 |
| 123 # Primitives to update the table. |
| 124 # |
| 125 unset -nocomplain t1 |
| 126 proc insert_row {rowid} { |
| 127 set a [generate_doc [expr int((rand()*100))]] |
| 128 set b [generate_doc [expr int((rand()*100))]] |
| 129 set c [generate_doc [expr int((rand()*100))]] |
| 130 execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) } |
| 131 set ::t1($rowid) [list $a $b $c] |
| 132 } |
| 133 proc delete_row {rowid} { |
| 134 execsql { DELETE FROM t1 WHERE rowid = $rowid } |
| 135 catch {unset ::t1($rowid)} |
| 136 } |
| 137 proc update_row {rowid} { |
| 138 set cols {a b c} |
| 139 set iCol [expr int(rand()*3)] |
| 140 set doc [generate_doc [expr int((rand()*100))]] |
| 141 lset ::t1($rowid) $iCol $doc |
| 142 execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid" |
| 143 } |
| 144 |
| 145 proc simple_phrase {zPrefix} { |
| 146 set ret [list] |
| 147 |
| 148 set reg [string map {* {[^ ]*}} $zPrefix] |
| 149 set reg " $reg " |
| 150 |
| 151 foreach key [lsort -integer [array names ::t1]] { |
| 152 set value $::t1($key) |
| 153 set cnt [list] |
| 154 foreach col $value { |
| 155 if {[regexp $reg " $col "]} { lappend ret $key ; break } |
| 156 } |
| 157 } |
| 158 |
| 159 #lsort -uniq -integer $ret |
| 160 set ret |
| 161 } |
| 162 |
| 163 # This [proc] is used to test the FTS3 matchinfo() function. |
| 164 # |
| 165 proc simple_token_matchinfo {zToken} { |
| 166 |
| 167 set nDoc(0) 0 |
| 168 set nDoc(1) 0 |
| 169 set nDoc(2) 0 |
| 170 set nHit(0) 0 |
| 171 set nHit(1) 0 |
| 172 set nHit(2) 0 |
| 173 |
| 174 |
| 175 foreach key [array names ::t1] { |
| 176 set value $::t1($key) |
| 177 set a($key) [list] |
| 178 foreach i {0 1 2} col $value { |
| 179 set hit [llength [lsearch -all $col $zToken]] |
| 180 lappend a($key) $hit |
| 181 incr nHit($i) $hit |
| 182 if {$hit>0} { incr nDoc($i) } |
| 183 } |
| 184 } |
| 185 |
| 186 set ret [list] |
| 187 foreach docid [lsort -integer [array names a]] { |
| 188 if { [lindex [lsort -integer $a($docid)] end] } { |
| 189 set matchinfo [list 1 3] |
| 190 foreach i {0 1 2} hit $a($docid) { |
| 191 lappend matchinfo $hit $nHit($i) $nDoc($i) |
| 192 } |
| 193 lappend ret $docid $matchinfo |
| 194 } |
| 195 } |
| 196 |
| 197 set ret |
| 198 } |
| 199 |
| 200 proc simple_near {termlist nNear} { |
| 201 set ret [list] |
| 202 |
| 203 foreach {key value} [array get ::t1] { |
| 204 foreach v $value { |
| 205 |
| 206 set l [lsearch -exact -all $v [lindex $termlist 0]] |
| 207 foreach T [lrange $termlist 1 end] { |
| 208 set l2 [list] |
| 209 foreach i $l { |
| 210 set iStart [expr $i - $nNear - 1] |
| 211 set iEnd [expr $i + $nNear + 1] |
| 212 if {$iStart < 0} {set iStart 0} |
| 213 foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] { |
| 214 incr i2 $iStart |
| 215 if {$i2 != $i} { lappend l2 $i2 } |
| 216 } |
| 217 } |
| 218 set l [lsort -uniq -integer $l2] |
| 219 } |
| 220 |
| 221 if {[llength $l]} { |
| 222 #puts "MATCH($key): $v" |
| 223 lappend ret $key |
| 224 } |
| 225 } |
| 226 } |
| 227 |
| 228 lsort -unique -integer $ret |
| 229 } |
| 230 |
| 231 # The following three procs: |
| 232 # |
| 233 # setup_not A B |
| 234 # setup_or A B |
| 235 # setup_and A B |
| 236 # |
| 237 # each take two arguments. Both arguments must be lists of integer values |
| 238 # sorted by value. The return value is the list produced by evaluating |
| 239 # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or |
| 240 # AND. |
| 241 # |
| 242 proc setop_not {A B} { |
| 243 foreach b $B { set n($b) {} } |
| 244 set ret [list] |
| 245 foreach a $A { if {![info exists n($a)]} {lappend ret $a} } |
| 246 return $ret |
| 247 } |
| 248 proc setop_or {A B} { |
| 249 lsort -integer -uniq [concat $A $B] |
| 250 } |
| 251 proc setop_and {A B} { |
| 252 foreach b $B { set n($b) {} } |
| 253 set ret [list] |
| 254 foreach a $A { if {[info exists n($a)]} {lappend ret $a} } |
| 255 return $ret |
| 256 } |
| 257 |
| 258 proc mit {blob} { |
| 259 set scan(littleEndian) i* |
| 260 set scan(bigEndian) I* |
| 261 binary scan $blob $scan($::tcl_platform(byteOrder)) r |
| 262 return $r |
| 263 } |
| 264 db func mit mit |
| 265 |
| 266 set sqlite_fts3_enable_parentheses 1 |
| 267 |
| 268 foreach nodesize {50 500 1000 2000} { |
| 269 catch { array unset ::t1 } |
| 270 |
| 271 # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows. |
| 272 # |
| 273 db transaction { |
| 274 catchsql { DROP TABLE t1 } |
| 275 execsql "CREATE VIRTUAL TABLE t1 USING fts3(a, b, c)" |
| 276 execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')" |
| 277 for {set i 0} {$i < 100} {incr i} { insert_row $i } |
| 278 } |
| 279 |
| 280 for {set iTest 1} {$iTest <= 100} {incr iTest} { |
| 281 catchsql COMMIT |
| 282 |
| 283 set DO_MALLOC_TEST 0 |
| 284 set nRep 10 |
| 285 if {$iTest==100 && $nodesize==50} { |
| 286 set DO_MALLOC_TEST 1 |
| 287 set nRep 2 |
| 288 } |
| 289 |
| 290 # Delete one row, update one row and insert one row. |
| 291 # |
| 292 set rows [array names ::t1] |
| 293 set nRow [llength $rows] |
| 294 set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]] |
| 295 set iDelete $iUpdate |
| 296 while {$iDelete == $iUpdate} { |
| 297 set iDelete [lindex $rows [expr {int(rand()*$nRow)}]] |
| 298 } |
| 299 set iInsert $iUpdate |
| 300 while {[info exists ::t1($iInsert)]} { |
| 301 set iInsert [expr {int(rand()*1000000)}] |
| 302 } |
| 303 execsql BEGIN |
| 304 insert_row $iInsert |
| 305 update_row $iUpdate |
| 306 delete_row $iDelete |
| 307 if {0==($iTest%2)} { execsql COMMIT } |
| 308 |
| 309 if {0==($iTest%2)} { |
| 310 do_test fts3rnd-1.$nodesize.$iTest.0 { fts3_integrity_check t1 } ok |
| 311 } |
| 312 |
| 313 # Pick 10 terms from the vocabulary. Check that the results of querying |
| 314 # the database for the set of documents containing each of these terms |
| 315 # is the same as the result obtained by scanning the contents of the Tcl |
| 316 # array for each term. |
| 317 # |
| 318 for {set i 0} {$i < 10} {incr i} { |
| 319 set term [random_term] |
| 320 do_select_test fts3rnd-1.$nodesize.$iTest.1.$i { |
| 321 SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term |
| 322 } [simple_token_matchinfo $term] |
| 323 } |
| 324 |
| 325 # This time, use the first two characters of each term as a term prefix |
| 326 # to query for. Test that querying the Tcl array produces the same results |
| 327 # as querying the FTS3 table for the prefix. |
| 328 # |
| 329 for {set i 0} {$i < $nRep} {incr i} { |
| 330 set prefix [string range [random_term] 0 end-1] |
| 331 set match "${prefix}*" |
| 332 do_select_test fts3rnd-1.$nodesize.$iTest.2.$i { |
| 333 SELECT docid FROM t1 WHERE t1 MATCH $match |
| 334 } [simple_phrase $match] |
| 335 } |
| 336 |
| 337 # Similar to the above, except for phrase queries. |
| 338 # |
| 339 for {set i 0} {$i < $nRep} {incr i} { |
| 340 set term [list [random_term] [random_term]] |
| 341 set match "\"$term\"" |
| 342 do_select_test fts3rnd-1.$nodesize.$iTest.3.$i { |
| 343 SELECT docid FROM t1 WHERE t1 MATCH $match |
| 344 } [simple_phrase $term] |
| 345 } |
| 346 |
| 347 # Three word phrases. |
| 348 # |
| 349 for {set i 0} {$i < $nRep} {incr i} { |
| 350 set term [list [random_term] [random_term] [random_term]] |
| 351 set match "\"$term\"" |
| 352 do_select_test fts3rnd-1.$nodesize.$iTest.4.$i { |
| 353 SELECT docid FROM t1 WHERE t1 MATCH $match |
| 354 } [simple_phrase $term] |
| 355 } |
| 356 |
| 357 # Three word phrases made up of term-prefixes. |
| 358 # |
| 359 for {set i 0} {$i < $nRep} {incr i} { |
| 360 set query "[string range [random_term] 0 end-1]* " |
| 361 append query "[string range [random_term] 0 end-1]* " |
| 362 append query "[string range [random_term] 0 end-1]*" |
| 363 |
| 364 set match "\"$query\"" |
| 365 do_select_test fts3rnd-1.$nodesize.$iTest.5.$i { |
| 366 SELECT docid FROM t1 WHERE t1 MATCH $match |
| 367 } [simple_phrase $query] |
| 368 } |
| 369 |
| 370 # A NEAR query with terms as the arguments. |
| 371 # |
| 372 for {set i 0} {$i < $nRep} {incr i} { |
| 373 set terms [list [random_term] [random_term]] |
| 374 set match [join $terms " NEAR "] |
| 375 do_select_test fts3rnd-1.$nodesize.$iTest.6.$i { |
| 376 SELECT docid FROM t1 WHERE t1 MATCH $match |
| 377 } [simple_near $terms 10] |
| 378 } |
| 379 |
| 380 # A 3-way NEAR query with terms as the arguments. |
| 381 # |
| 382 for {set i 0} {$i < $nRep} {incr i} { |
| 383 set terms [list [random_term] [random_term] [random_term]] |
| 384 set nNear 11 |
| 385 set match [join $terms " NEAR/$nNear "] |
| 386 do_select_test fts3rnd-1.$nodesize.$iTest.7.$i { |
| 387 SELECT docid FROM t1 WHERE t1 MATCH $match |
| 388 } [simple_near $terms $nNear] |
| 389 } |
| 390 |
| 391 # Set operations on simple term queries. |
| 392 # |
| 393 foreach {tn op proc} { |
| 394 8 OR setop_or |
| 395 9 NOT setop_not |
| 396 10 AND setop_and |
| 397 } { |
| 398 for {set i 0} {$i < $nRep} {incr i} { |
| 399 set term1 [random_term] |
| 400 set term2 [random_term] |
| 401 set match "$term1 $op $term2" |
| 402 do_select_test fts3rnd-1.$nodesize.$iTest.$tn.$i { |
| 403 SELECT docid FROM t1 WHERE t1 MATCH $match |
| 404 } [$proc [simple_phrase $term1] [simple_phrase $term2]] |
| 405 } |
| 406 } |
| 407 |
| 408 # Set operations on NEAR queries. |
| 409 # |
| 410 foreach {tn op proc} { |
| 411 8 OR setop_or |
| 412 9 NOT setop_not |
| 413 10 AND setop_and |
| 414 } { |
| 415 for {set i 0} {$i < $nRep} {incr i} { |
| 416 set term1 [random_term] |
| 417 set term2 [random_term] |
| 418 set term3 [random_term] |
| 419 set term4 [random_term] |
| 420 set match "$term1 NEAR $term2 $op $term3 NEAR $term4" |
| 421 do_select_test fts3rnd-1.$nodesize.$iTest.$tn.$i { |
| 422 SELECT docid FROM t1 WHERE t1 MATCH $match |
| 423 } [$proc \ |
| 424 [simple_near [list $term1 $term2] 10] \ |
| 425 [simple_near [list $term3 $term4] 10] |
| 426 ] |
| 427 } |
| 428 } |
| 429 |
| 430 catchsql COMMIT |
| 431 } |
| 432 } |
| 433 |
| 434 finish_test |
OLD | NEW |