OLD | NEW |
(Empty) | |
| 1 # 2009 November 04 |
| 2 # |
| 3 # The author disclaims copyright to this source code. In place of |
| 4 # a legal notice, here is a blessing: |
| 5 # |
| 6 # May you do good and not evil. |
| 7 # May you find forgiveness for yourself and forgive others. |
| 8 # May you share freely, never taking more than you give. |
| 9 # |
| 10 #*********************************************************************** |
| 11 # |
| 12 # This file contains common code used the fts3 tests. At one point |
| 13 # equivalent functionality was implemented in C code. But it is easier |
| 14 # to use Tcl. |
| 15 # |
| 16 |
| 17 #------------------------------------------------------------------------- |
| 18 # USAGE: fts3_integrity_check TBL |
| 19 # |
| 20 # This proc is used to verify that the full-text index is consistent with |
| 21 # the contents of the fts3 table. In other words, it checks that the |
| 22 # data in the %_contents table matches that in the %_segdir and %_segments |
| 23 # tables. |
| 24 # |
| 25 # This is not an efficient procedure. It uses a lot of memory and a lot |
| 26 # of CPU. But it is better than not checking at all. |
| 27 # |
| 28 # The procedure is: |
| 29 # |
| 30 # 1) Read the entire full-text index from the %_segdir and %_segments |
| 31 # tables into memory. For each entry in the index, the following is |
| 32 # done: |
| 33 # |
| 34 # set C($iDocid,$iCol,$iPosition) $zTerm |
| 35 # |
| 36 # 2) Iterate through each column of each row of the %_content table. |
| 37 # Tokenize all documents, and check that for each token there is |
| 38 # a corresponding entry in the $C array. After checking a token, |
| 39 # [unset] the $C array entry. |
| 40 # |
| 41 # 3) Check that array $C is now empty. |
| 42 # |
| 43 # |
| 44 proc fts3_integrity_check {tbl} { |
| 45 |
| 46 fts3_read2 $tbl 1 A |
| 47 |
| 48 foreach zTerm [array names A] { |
| 49 foreach doclist $A($zTerm) { |
| 50 set docid 0 |
| 51 while {[string length $doclist]>0} { |
| 52 set iCol 0 |
| 53 set iPos 0 |
| 54 set lPos [list] |
| 55 set lCol [list] |
| 56 |
| 57 # First varint of a doclist-entry is the docid. Delta-compressed |
| 58 # with respect to the docid of the previous entry. |
| 59 # |
| 60 incr docid [gobble_varint doclist] |
| 61 if {[info exists D($zTerm,$docid)]} { |
| 62 while {[set iDelta [gobble_varint doclist]] != 0} {} |
| 63 continue |
| 64 } |
| 65 set D($zTerm,$docid) 1 |
| 66 |
| 67 # Gobble varints until the 0x00 that terminates the doclist-entry |
| 68 # is found. |
| 69 while {[set iDelta [gobble_varint doclist]] > 0} { |
| 70 if {$iDelta == 1} { |
| 71 set iCol [gobble_varint doclist] |
| 72 set iPos 0 |
| 73 } else { |
| 74 incr iPos $iDelta |
| 75 incr iPos -2 |
| 76 set C($docid,$iCol,$iPos) $zTerm |
| 77 } |
| 78 } |
| 79 } |
| 80 } |
| 81 } |
| 82 |
| 83 foreach key [array names C] { |
| 84 #puts "$key -> $C($key)" |
| 85 } |
| 86 |
| 87 |
| 88 db eval "SELECT * FROM ${tbl}_content" E { |
| 89 set iCol 0 |
| 90 set iDoc $E(docid) |
| 91 foreach col [lrange $E(*) 1 end] { |
| 92 set c $E($col) |
| 93 set sql {SELECT fts3_tokenizer_test('simple', $c)} |
| 94 |
| 95 foreach {pos term dummy} [db one $sql] { |
| 96 if {![info exists C($iDoc,$iCol,$pos)]} { |
| 97 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing" |
| 98 lappend errors $es |
| 99 } else { |
| 100 if {$C($iDoc,$iCol,$pos) != "$term"} { |
| 101 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index " |
| 102 append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\"" |
| 103 lappend errors $es |
| 104 } |
| 105 unset C($iDoc,$iCol,$pos) |
| 106 } |
| 107 } |
| 108 incr iCol |
| 109 } |
| 110 } |
| 111 |
| 112 foreach c [array names C] { |
| 113 lappend errors "Bad index entry: $c -> $C($c)" |
| 114 } |
| 115 |
| 116 if {[info exists errors]} { return [join $errors "\n"] } |
| 117 return "ok" |
| 118 } |
| 119 |
| 120 # USAGE: fts3_terms TBL WHERE |
| 121 # |
| 122 # Argument TBL must be the name of an FTS3 table. Argument WHERE is an |
| 123 # SQL expression that will be used as the WHERE clause when scanning |
| 124 # the %_segdir table. As in the following query: |
| 125 # |
| 126 # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}" |
| 127 # |
| 128 # This function returns a list of all terms present in the segments |
| 129 # selected by the statement above. |
| 130 # |
| 131 proc fts3_terms {tbl where} { |
| 132 fts3_read $tbl $where a |
| 133 return [lsort [array names a]] |
| 134 } |
| 135 |
| 136 |
| 137 # USAGE: fts3_doclist TBL TERM WHERE |
| 138 # |
| 139 # Argument TBL must be the name of an FTS3 table. TERM is a term that may |
| 140 # or may not be present in the table. Argument WHERE is used to select a |
| 141 # subset of the b-tree segments in the associated full-text index as |
| 142 # described above for [fts3_terms]. |
| 143 # |
| 144 # This function returns the results of merging the doclists associated |
| 145 # with TERM in the selected segments. Each doclist is an element of the |
| 146 # returned list. Each doclist is formatted as follows: |
| 147 # |
| 148 # [$docid ?$col[$off1 $off2...]?...] |
| 149 # |
| 150 # The formatting is odd for a Tcl command in order to be compatible with |
| 151 # the original C-language implementation. If argument WHERE is "1", then |
| 152 # any empty doclists are omitted from the returned list. |
| 153 # |
| 154 proc fts3_doclist {tbl term where} { |
| 155 fts3_read $tbl $where a |
| 156 |
| 157 |
| 158 foreach doclist $a($term) { |
| 159 set docid 0 |
| 160 |
| 161 while {[string length $doclist]>0} { |
| 162 set iCol 0 |
| 163 set iPos 0 |
| 164 set lPos [list] |
| 165 set lCol [list] |
| 166 incr docid [gobble_varint doclist] |
| 167 |
| 168 while {[set iDelta [gobble_varint doclist]] > 0} { |
| 169 if {$iDelta == 1} { |
| 170 lappend lCol [list $iCol $lPos] |
| 171 set iPos 0 |
| 172 set lPos [list] |
| 173 set iCol [gobble_varint doclist] |
| 174 } else { |
| 175 incr iPos $iDelta |
| 176 incr iPos -2 |
| 177 lappend lPos $iPos |
| 178 } |
| 179 } |
| 180 |
| 181 if {[llength $lPos]>0} { |
| 182 lappend lCol [list $iCol $lPos] |
| 183 } |
| 184 |
| 185 if {$where != "1" || [llength $lCol]>0} { |
| 186 set ret($docid) $lCol |
| 187 } else { |
| 188 unset -nocomplain ret($docid) |
| 189 } |
| 190 } |
| 191 } |
| 192 |
| 193 set lDoc [list] |
| 194 foreach docid [lsort -integer [array names ret]] { |
| 195 set lCol [list] |
| 196 set cols "" |
| 197 foreach col $ret($docid) { |
| 198 foreach {iCol lPos} $col {} |
| 199 append cols " $iCol\[[join $lPos { }]\]" |
| 200 } |
| 201 lappend lDoc "\[${docid}${cols}\]" |
| 202 } |
| 203 |
| 204 join $lDoc " " |
| 205 } |
| 206 |
| 207 ########################################################################### |
| 208 |
| 209 proc gobble_varint {varname} { |
| 210 upvar $varname blob |
| 211 set n [read_fts3varint $blob ret] |
| 212 set blob [string range $blob $n end] |
| 213 return $ret |
| 214 } |
| 215 proc gobble_string {varname nLength} { |
| 216 upvar $varname blob |
| 217 set ret [string range $blob 0 [expr $nLength-1]] |
| 218 set blob [string range $blob $nLength end] |
| 219 return $ret |
| 220 } |
| 221 |
| 222 # The argument is a blob of data representing an FTS3 segment leaf. |
| 223 # Return a list consisting of alternating terms (strings) and doclists |
| 224 # (blobs of data). |
| 225 # |
| 226 proc fts3_readleaf {blob} { |
| 227 set zPrev "" |
| 228 set terms [list] |
| 229 |
| 230 while {[string length $blob] > 0} { |
| 231 set nPrefix [gobble_varint blob] |
| 232 set nSuffix [gobble_varint blob] |
| 233 |
| 234 set zTerm [string range $zPrev 0 [expr $nPrefix-1]] |
| 235 append zTerm [gobble_string blob $nSuffix] |
| 236 set doclist [gobble_string blob [gobble_varint blob]] |
| 237 |
| 238 lappend terms $zTerm $doclist |
| 239 set zPrev $zTerm |
| 240 } |
| 241 |
| 242 return $terms |
| 243 } |
| 244 |
| 245 proc fts3_read2 {tbl where varname} { |
| 246 upvar $varname a |
| 247 array unset a |
| 248 db eval " SELECT start_block, leaves_end_block, root |
| 249 FROM ${tbl}_segdir WHERE $where |
| 250 ORDER BY level ASC, idx DESC |
| 251 " { |
| 252 if {$start_block == 0} { |
| 253 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } |
| 254 } else { |
| 255 db eval " SELECT block |
| 256 FROM ${tbl}_segments |
| 257 WHERE blockid>=$start_block AND blockid<=$leaves_end_block |
| 258 ORDER BY blockid |
| 259 " { |
| 260 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } |
| 261 |
| 262 } |
| 263 } |
| 264 } |
| 265 } |
| 266 |
| 267 proc fts3_read {tbl where varname} { |
| 268 upvar $varname a |
| 269 array unset a |
| 270 db eval " SELECT start_block, leaves_end_block, root |
| 271 FROM ${tbl}_segdir WHERE $where |
| 272 ORDER BY level DESC, idx ASC |
| 273 " { |
| 274 if {$start_block == 0} { |
| 275 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } |
| 276 } else { |
| 277 db eval " SELECT block |
| 278 FROM ${tbl}_segments |
| 279 WHERE blockid>=$start_block AND blockid<$leaves_end_block |
| 280 ORDER BY blockid |
| 281 " { |
| 282 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } |
| 283 |
| 284 } |
| 285 } |
| 286 } |
| 287 } |
| 288 |
| 289 ########################################################################## |
| 290 |
OLD | NEW |