| OLD | NEW |
| (Empty) |
| 1 # 2009 November 04 | |
| 2 # | |
| 3 # The author disclaims copyright to this source code. In place of | |
| 4 # a legal notice, here is a blessing: | |
| 5 # | |
| 6 # May you do good and not evil. | |
| 7 # May you find forgiveness for yourself and forgive others. | |
| 8 # May you share freely, never taking more than you give. | |
| 9 # | |
| 10 #*********************************************************************** | |
| 11 # | |
| 12 # This file contains common code used the fts3 tests. At one point | |
| 13 # equivalent functionality was implemented in C code. But it is easier | |
| 14 # to use Tcl. | |
| 15 # | |
| 16 | |
| 17 #------------------------------------------------------------------------- | |
| 18 # INSTRUCTIONS | |
| 19 # | |
| 20 # The following commands are available: | |
| 21 # | |
| 22 # fts3_build_db_1 N | |
| 23 # Using database handle [db] create an FTS4 table named t1 and populate | |
| 24 # it with N rows of data. N must be less than 10,000. Refer to the | |
| 25 # header comments above the proc implementation below for details. | |
| 26 # | |
| 27 # fts3_build_db_2 N | |
| 28 # Using database handle [db] create an FTS4 table named t2 and populate | |
| 29 # it with N rows of data. N must be less than 100,000. Refer to the | |
| 30 # header comments above the proc implementation below for details. | |
| 31 # | |
| 32 # fts3_integrity_check TBL | |
| 33 # TBL must be an FTS table in the database currently opened by handle | |
| 34 # [db]. This proc loads and tokenizes all documents within the table, | |
| 35 # then checks that the current contents of the FTS index matches the | |
| 36 # results. | |
| 37 # | |
| 38 # fts3_terms TBL WHERE | |
| 39 # Todo. | |
| 40 # | |
| 41 # fts3_doclist TBL TERM WHERE | |
| 42 # Todo. | |
| 43 # | |
| 44 # | |
| 45 # | |
| 46 | |
| 47 #------------------------------------------------------------------------- | |
| 48 # USAGE: fts3_build_db_1 SWITCHES N | |
| 49 # | |
| 50 # Build a sample FTS table in the database opened by database connection | |
| 51 # [db]. The name of the new table is "t1". | |
| 52 # | |
| 53 proc fts3_build_db_1 {args} { | |
| 54 | |
| 55 set default(-module) fts4 | |
| 56 | |
| 57 set nArg [llength $args] | |
| 58 if {($nArg%2)==0} { | |
| 59 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" | |
| 60 } | |
| 61 | |
| 62 set n [lindex $args [expr $nArg-1]] | |
| 63 array set opts [array get default] | |
| 64 array set opts [lrange $args 0 [expr $nArg-2]] | |
| 65 foreach k [array names opts] { | |
| 66 if {0==[info exists default($k)]} { error "unknown option: $k" } | |
| 67 } | |
| 68 | |
| 69 if {$n > 10000} {error "n must be <= 10000"} | |
| 70 db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)" | |
| 71 | |
| 72 set xwords [list zero one two three four five six seven eight nine ten] | |
| 73 set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa] | |
| 74 | |
| 75 for {set i 0} {$i < $n} {incr i} { | |
| 76 set x "" | |
| 77 set y "" | |
| 78 | |
| 79 set x [list] | |
| 80 lappend x [lindex $xwords [expr ($i / 1000) % 10]] | |
| 81 lappend x [lindex $xwords [expr ($i / 100) % 10]] | |
| 82 lappend x [lindex $xwords [expr ($i / 10) % 10]] | |
| 83 lappend x [lindex $xwords [expr ($i / 1) % 10]] | |
| 84 | |
| 85 set y [list] | |
| 86 lappend y [lindex $ywords [expr ($i / 1000) % 10]] | |
| 87 lappend y [lindex $ywords [expr ($i / 100) % 10]] | |
| 88 lappend y [lindex $ywords [expr ($i / 10) % 10]] | |
| 89 lappend y [lindex $ywords [expr ($i / 1) % 10]] | |
| 90 | |
| 91 db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) } | |
| 92 } | |
| 93 } | |
| 94 | |
| 95 #------------------------------------------------------------------------- | |
| 96 # USAGE: fts3_build_db_2 N ARGS | |
| 97 # | |
| 98 # Build a sample FTS table in the database opened by database connection | |
| 99 # [db]. The name of the new table is "t2". | |
| 100 # | |
| 101 proc fts3_build_db_2 {args} { | |
| 102 | |
| 103 set default(-module) fts4 | |
| 104 set default(-extra) "" | |
| 105 | |
| 106 set nArg [llength $args] | |
| 107 if {($nArg%2)==0} { | |
| 108 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" | |
| 109 } | |
| 110 | |
| 111 set n [lindex $args [expr $nArg-1]] | |
| 112 array set opts [array get default] | |
| 113 array set opts [lrange $args 0 [expr $nArg-2]] | |
| 114 foreach k [array names opts] { | |
| 115 if {0==[info exists default($k)]} { error "unknown option: $k" } | |
| 116 } | |
| 117 | |
| 118 if {$n > 100000} {error "n must be <= 100000"} | |
| 119 | |
| 120 set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content" | |
| 121 if {$opts(-extra) != ""} { | |
| 122 append sql ", " $opts(-extra) | |
| 123 } | |
| 124 append sql ")" | |
| 125 db eval $sql | |
| 126 | |
| 127 set chars [list a b c d e f g h i j k l m n o p q r s t u v w x y z ""] | |
| 128 | |
| 129 for {set i 0} {$i < $n} {incr i} { | |
| 130 set word "" | |
| 131 set nChar [llength $chars] | |
| 132 append word [lindex $chars [expr {($i / 1) % $nChar}]] | |
| 133 append word [lindex $chars [expr {($i / $nChar) % $nChar}]] | |
| 134 append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]] | |
| 135 | |
| 136 db eval { INSERT INTO t2(docid, content) VALUES($i, $word) } | |
| 137 } | |
| 138 } | |
| 139 | |
| 140 #------------------------------------------------------------------------- | |
| 141 # USAGE: fts3_integrity_check TBL | |
| 142 # | |
| 143 # This proc is used to verify that the full-text index is consistent with | |
| 144 # the contents of the fts3 table. In other words, it checks that the | |
| 145 # data in the %_contents table matches that in the %_segdir and %_segments | |
| 146 # tables. | |
| 147 # | |
| 148 # This is not an efficient procedure. It uses a lot of memory and a lot | |
| 149 # of CPU. But it is better than not checking at all. | |
| 150 # | |
| 151 # The procedure is: | |
| 152 # | |
| 153 # 1) Read the entire full-text index from the %_segdir and %_segments | |
| 154 # tables into memory. For each entry in the index, the following is | |
| 155 # done: | |
| 156 # | |
| 157 # set C($iDocid,$iCol,$iPosition) $zTerm | |
| 158 # | |
| 159 # 2) Iterate through each column of each row of the %_content table. | |
| 160 # Tokenize all documents, and check that for each token there is | |
| 161 # a corresponding entry in the $C array. After checking a token, | |
| 162 # [unset] the $C array entry. | |
| 163 # | |
| 164 # 3) Check that array $C is now empty. | |
| 165 # | |
| 166 # | |
| 167 proc fts3_integrity_check {tbl} { | |
| 168 | |
| 169 fts3_read2 $tbl 1 A | |
| 170 | |
| 171 foreach zTerm [array names A] { | |
| 172 #puts $zTerm | |
| 173 foreach doclist $A($zTerm) { | |
| 174 set docid 0 | |
| 175 while {[string length $doclist]>0} { | |
| 176 set iCol 0 | |
| 177 set iPos 0 | |
| 178 set lPos [list] | |
| 179 set lCol [list] | |
| 180 | |
| 181 # First varint of a doclist-entry is the docid. Delta-compressed | |
| 182 # with respect to the docid of the previous entry. | |
| 183 # | |
| 184 incr docid [gobble_varint doclist] | |
| 185 if {[info exists D($zTerm,$docid)]} { | |
| 186 while {[set iDelta [gobble_varint doclist]] != 0} {} | |
| 187 continue | |
| 188 } | |
| 189 set D($zTerm,$docid) 1 | |
| 190 | |
| 191 # Gobble varints until the 0x00 that terminates the doclist-entry | |
| 192 # is found. | |
| 193 while {[set iDelta [gobble_varint doclist]] > 0} { | |
| 194 if {$iDelta == 1} { | |
| 195 set iCol [gobble_varint doclist] | |
| 196 set iPos 0 | |
| 197 } else { | |
| 198 incr iPos $iDelta | |
| 199 incr iPos -2 | |
| 200 set C($docid,$iCol,$iPos) $zTerm | |
| 201 } | |
| 202 } | |
| 203 } | |
| 204 } | |
| 205 } | |
| 206 | |
| 207 foreach key [array names C] { | |
| 208 #puts "$key -> $C($key)" | |
| 209 } | |
| 210 | |
| 211 | |
| 212 db eval "SELECT * FROM ${tbl}_content" E { | |
| 213 set iCol 0 | |
| 214 set iDoc $E(docid) | |
| 215 foreach col [lrange $E(*) 1 end] { | |
| 216 set c $E($col) | |
| 217 set sql {SELECT fts3_tokenizer_test('simple', $c)} | |
| 218 | |
| 219 foreach {pos term dummy} [db one $sql] { | |
| 220 if {![info exists C($iDoc,$iCol,$pos)]} { | |
| 221 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing" | |
| 222 lappend errors $es | |
| 223 } else { | |
| 224 if {[string compare $C($iDoc,$iCol,$pos) $term]} { | |
| 225 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index " | |
| 226 append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\"" | |
| 227 lappend errors $es | |
| 228 } | |
| 229 unset C($iDoc,$iCol,$pos) | |
| 230 } | |
| 231 } | |
| 232 incr iCol | |
| 233 } | |
| 234 } | |
| 235 | |
| 236 foreach c [array names C] { | |
| 237 lappend errors "Bad index entry: $c -> $C($c)" | |
| 238 } | |
| 239 | |
| 240 if {[info exists errors]} { return [join $errors "\n"] } | |
| 241 return "ok" | |
| 242 } | |
| 243 | |
| 244 # USAGE: fts3_terms TBL WHERE | |
| 245 # | |
| 246 # Argument TBL must be the name of an FTS3 table. Argument WHERE is an | |
| 247 # SQL expression that will be used as the WHERE clause when scanning | |
| 248 # the %_segdir table. As in the following query: | |
| 249 # | |
| 250 # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}" | |
| 251 # | |
| 252 # This function returns a list of all terms present in the segments | |
| 253 # selected by the statement above. | |
| 254 # | |
| 255 proc fts3_terms {tbl where} { | |
| 256 fts3_read $tbl $where a | |
| 257 return [lsort [array names a]] | |
| 258 } | |
| 259 | |
| 260 | |
| 261 # USAGE: fts3_doclist TBL TERM WHERE | |
| 262 # | |
| 263 # Argument TBL must be the name of an FTS3 table. TERM is a term that may | |
| 264 # or may not be present in the table. Argument WHERE is used to select a | |
| 265 # subset of the b-tree segments in the associated full-text index as | |
| 266 # described above for [fts3_terms]. | |
| 267 # | |
| 268 # This function returns the results of merging the doclists associated | |
| 269 # with TERM in the selected segments. Each doclist is an element of the | |
| 270 # returned list. Each doclist is formatted as follows: | |
| 271 # | |
| 272 # [$docid ?$col[$off1 $off2...]?...] | |
| 273 # | |
| 274 # The formatting is odd for a Tcl command in order to be compatible with | |
| 275 # the original C-language implementation. If argument WHERE is "1", then | |
| 276 # any empty doclists are omitted from the returned list. | |
| 277 # | |
| 278 proc fts3_doclist {tbl term where} { | |
| 279 fts3_read $tbl $where a | |
| 280 | |
| 281 | |
| 282 foreach doclist $a($term) { | |
| 283 set docid 0 | |
| 284 | |
| 285 while {[string length $doclist]>0} { | |
| 286 set iCol 0 | |
| 287 set iPos 0 | |
| 288 set lPos [list] | |
| 289 set lCol [list] | |
| 290 incr docid [gobble_varint doclist] | |
| 291 | |
| 292 while {[set iDelta [gobble_varint doclist]] > 0} { | |
| 293 if {$iDelta == 1} { | |
| 294 lappend lCol [list $iCol $lPos] | |
| 295 set iPos 0 | |
| 296 set lPos [list] | |
| 297 set iCol [gobble_varint doclist] | |
| 298 } else { | |
| 299 incr iPos $iDelta | |
| 300 incr iPos -2 | |
| 301 lappend lPos $iPos | |
| 302 } | |
| 303 } | |
| 304 | |
| 305 if {[llength $lPos]>0} { | |
| 306 lappend lCol [list $iCol $lPos] | |
| 307 } | |
| 308 | |
| 309 if {$where != "1" || [llength $lCol]>0} { | |
| 310 set ret($docid) $lCol | |
| 311 } else { | |
| 312 unset -nocomplain ret($docid) | |
| 313 } | |
| 314 } | |
| 315 } | |
| 316 | |
| 317 set lDoc [list] | |
| 318 foreach docid [lsort -integer [array names ret]] { | |
| 319 set lCol [list] | |
| 320 set cols "" | |
| 321 foreach col $ret($docid) { | |
| 322 foreach {iCol lPos} $col {} | |
| 323 append cols " $iCol\[[join $lPos { }]\]" | |
| 324 } | |
| 325 lappend lDoc "\[${docid}${cols}\]" | |
| 326 } | |
| 327 | |
| 328 join $lDoc " " | |
| 329 } | |
| 330 | |
| 331 ########################################################################### | |
| 332 | |
| 333 proc gobble_varint {varname} { | |
| 334 upvar $varname blob | |
| 335 set n [read_fts3varint $blob ret] | |
| 336 set blob [string range $blob $n end] | |
| 337 return $ret | |
| 338 } | |
| 339 proc gobble_string {varname nLength} { | |
| 340 upvar $varname blob | |
| 341 set ret [string range $blob 0 [expr $nLength-1]] | |
| 342 set blob [string range $blob $nLength end] | |
| 343 return $ret | |
| 344 } | |
| 345 | |
| 346 # The argument is a blob of data representing an FTS3 segment leaf. | |
| 347 # Return a list consisting of alternating terms (strings) and doclists | |
| 348 # (blobs of data). | |
| 349 # | |
| 350 proc fts3_readleaf {blob} { | |
| 351 set zPrev "" | |
| 352 set terms [list] | |
| 353 | |
| 354 while {[string length $blob] > 0} { | |
| 355 set nPrefix [gobble_varint blob] | |
| 356 set nSuffix [gobble_varint blob] | |
| 357 | |
| 358 set zTerm [string range $zPrev 0 [expr $nPrefix-1]] | |
| 359 append zTerm [gobble_string blob $nSuffix] | |
| 360 set nDoclist [gobble_varint blob] | |
| 361 set doclist [gobble_string blob $nDoclist] | |
| 362 | |
| 363 lappend terms $zTerm $doclist | |
| 364 set zPrev $zTerm | |
| 365 } | |
| 366 | |
| 367 return $terms | |
| 368 } | |
| 369 | |
| 370 proc fts3_read2 {tbl where varname} { | |
| 371 upvar $varname a | |
| 372 array unset a | |
| 373 db eval " SELECT start_block, leaves_end_block, root | |
| 374 FROM ${tbl}_segdir WHERE $where | |
| 375 ORDER BY level ASC, idx DESC | |
| 376 " { | |
| 377 set c 0 | |
| 378 binary scan $root c c | |
| 379 if {$c==0} { | |
| 380 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } | |
| 381 } else { | |
| 382 db eval " SELECT block | |
| 383 FROM ${tbl}_segments | |
| 384 WHERE blockid>=$start_block AND blockid<=$leaves_end_block | |
| 385 ORDER BY blockid | |
| 386 " { | |
| 387 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } | |
| 388 } | |
| 389 } | |
| 390 } | |
| 391 } | |
| 392 | |
| 393 proc fts3_read {tbl where varname} { | |
| 394 upvar $varname a | |
| 395 array unset a | |
| 396 db eval " SELECT start_block, leaves_end_block, root | |
| 397 FROM ${tbl}_segdir WHERE $where | |
| 398 ORDER BY level DESC, idx ASC | |
| 399 " { | |
| 400 if {$start_block == 0} { | |
| 401 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } | |
| 402 } else { | |
| 403 db eval " SELECT block | |
| 404 FROM ${tbl}_segments | |
| 405 WHERE blockid>=$start_block AND blockid<$leaves_end_block | |
| 406 ORDER BY blockid | |
| 407 " { | |
| 408 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } | |
| 409 | |
| 410 } | |
| 411 } | |
| 412 } | |
| 413 } | |
| 414 | |
| 415 ########################################################################## | |
| 416 | |
| OLD | NEW |