OLD | NEW |
| (Empty) |
1 # 2009 November 04 | |
2 # | |
3 # The author disclaims copyright to this source code. In place of | |
4 # a legal notice, here is a blessing: | |
5 # | |
6 # May you do good and not evil. | |
7 # May you find forgiveness for yourself and forgive others. | |
8 # May you share freely, never taking more than you give. | |
9 # | |
10 #*********************************************************************** | |
11 # | |
12 # This file contains common code used the fts3 tests. At one point | |
13 # equivalent functionality was implemented in C code. But it is easier | |
14 # to use Tcl. | |
15 # | |
16 | |
17 #------------------------------------------------------------------------- | |
18 # INSTRUCTIONS | |
19 # | |
20 # The following commands are available: | |
21 # | |
22 # fts3_build_db_1 N | |
23 # Using database handle [db] create an FTS4 table named t1 and populate | |
24 # it with N rows of data. N must be less than 10,000. Refer to the | |
25 # header comments above the proc implementation below for details. | |
26 # | |
27 # fts3_build_db_2 N | |
28 # Using database handle [db] create an FTS4 table named t2 and populate | |
29 # it with N rows of data. N must be less than 100,000. Refer to the | |
30 # header comments above the proc implementation below for details. | |
31 # | |
32 # fts3_integrity_check TBL | |
33 # TBL must be an FTS table in the database currently opened by handle | |
34 # [db]. This proc loads and tokenizes all documents within the table, | |
35 # then checks that the current contents of the FTS index matches the | |
36 # results. | |
37 # | |
38 # fts3_terms TBL WHERE | |
39 # Todo. | |
40 # | |
41 # fts3_doclist TBL TERM WHERE | |
42 # Todo. | |
43 # | |
44 # | |
45 # | |
46 | |
47 #------------------------------------------------------------------------- | |
48 # USAGE: fts3_build_db_1 SWITCHES N | |
49 # | |
50 # Build a sample FTS table in the database opened by database connection | |
51 # [db]. The name of the new table is "t1". | |
52 # | |
53 proc fts3_build_db_1 {args} { | |
54 | |
55 set default(-module) fts4 | |
56 | |
57 set nArg [llength $args] | |
58 if {($nArg%2)==0} { | |
59 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" | |
60 } | |
61 | |
62 set n [lindex $args [expr $nArg-1]] | |
63 array set opts [array get default] | |
64 array set opts [lrange $args 0 [expr $nArg-2]] | |
65 foreach k [array names opts] { | |
66 if {0==[info exists default($k)]} { error "unknown option: $k" } | |
67 } | |
68 | |
69 if {$n > 10000} {error "n must be <= 10000"} | |
70 db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)" | |
71 | |
72 set xwords [list zero one two three four five six seven eight nine ten] | |
73 set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa] | |
74 | |
75 for {set i 0} {$i < $n} {incr i} { | |
76 set x "" | |
77 set y "" | |
78 | |
79 set x [list] | |
80 lappend x [lindex $xwords [expr ($i / 1000) % 10]] | |
81 lappend x [lindex $xwords [expr ($i / 100) % 10]] | |
82 lappend x [lindex $xwords [expr ($i / 10) % 10]] | |
83 lappend x [lindex $xwords [expr ($i / 1) % 10]] | |
84 | |
85 set y [list] | |
86 lappend y [lindex $ywords [expr ($i / 1000) % 10]] | |
87 lappend y [lindex $ywords [expr ($i / 100) % 10]] | |
88 lappend y [lindex $ywords [expr ($i / 10) % 10]] | |
89 lappend y [lindex $ywords [expr ($i / 1) % 10]] | |
90 | |
91 db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) } | |
92 } | |
93 } | |
94 | |
95 #------------------------------------------------------------------------- | |
96 # USAGE: fts3_build_db_2 N ARGS | |
97 # | |
98 # Build a sample FTS table in the database opened by database connection | |
99 # [db]. The name of the new table is "t2". | |
100 # | |
101 proc fts3_build_db_2 {args} { | |
102 | |
103 set default(-module) fts4 | |
104 set default(-extra) "" | |
105 | |
106 set nArg [llength $args] | |
107 if {($nArg%2)==0} { | |
108 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\"" | |
109 } | |
110 | |
111 set n [lindex $args [expr $nArg-1]] | |
112 array set opts [array get default] | |
113 array set opts [lrange $args 0 [expr $nArg-2]] | |
114 foreach k [array names opts] { | |
115 if {0==[info exists default($k)]} { error "unknown option: $k" } | |
116 } | |
117 | |
118 if {$n > 100000} {error "n must be <= 100000"} | |
119 | |
120 set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content" | |
121 if {$opts(-extra) != ""} { | |
122 append sql ", " $opts(-extra) | |
123 } | |
124 append sql ")" | |
125 db eval $sql | |
126 | |
127 set chars [list a b c d e f g h i j k l m n o p q r s t u v w x y z ""] | |
128 | |
129 for {set i 0} {$i < $n} {incr i} { | |
130 set word "" | |
131 set nChar [llength $chars] | |
132 append word [lindex $chars [expr {($i / 1) % $nChar}]] | |
133 append word [lindex $chars [expr {($i / $nChar) % $nChar}]] | |
134 append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]] | |
135 | |
136 db eval { INSERT INTO t2(docid, content) VALUES($i, $word) } | |
137 } | |
138 } | |
139 | |
140 #------------------------------------------------------------------------- | |
141 # USAGE: fts3_integrity_check TBL | |
142 # | |
143 # This proc is used to verify that the full-text index is consistent with | |
144 # the contents of the fts3 table. In other words, it checks that the | |
145 # data in the %_contents table matches that in the %_segdir and %_segments | |
146 # tables. | |
147 # | |
148 # This is not an efficient procedure. It uses a lot of memory and a lot | |
149 # of CPU. But it is better than not checking at all. | |
150 # | |
151 # The procedure is: | |
152 # | |
153 # 1) Read the entire full-text index from the %_segdir and %_segments | |
154 # tables into memory. For each entry in the index, the following is | |
155 # done: | |
156 # | |
157 # set C($iDocid,$iCol,$iPosition) $zTerm | |
158 # | |
159 # 2) Iterate through each column of each row of the %_content table. | |
160 # Tokenize all documents, and check that for each token there is | |
161 # a corresponding entry in the $C array. After checking a token, | |
162 # [unset] the $C array entry. | |
163 # | |
164 # 3) Check that array $C is now empty. | |
165 # | |
166 # | |
167 proc fts3_integrity_check {tbl} { | |
168 | |
169 fts3_read2 $tbl 1 A | |
170 | |
171 foreach zTerm [array names A] { | |
172 #puts $zTerm | |
173 foreach doclist $A($zTerm) { | |
174 set docid 0 | |
175 while {[string length $doclist]>0} { | |
176 set iCol 0 | |
177 set iPos 0 | |
178 set lPos [list] | |
179 set lCol [list] | |
180 | |
181 # First varint of a doclist-entry is the docid. Delta-compressed | |
182 # with respect to the docid of the previous entry. | |
183 # | |
184 incr docid [gobble_varint doclist] | |
185 if {[info exists D($zTerm,$docid)]} { | |
186 while {[set iDelta [gobble_varint doclist]] != 0} {} | |
187 continue | |
188 } | |
189 set D($zTerm,$docid) 1 | |
190 | |
191 # Gobble varints until the 0x00 that terminates the doclist-entry | |
192 # is found. | |
193 while {[set iDelta [gobble_varint doclist]] > 0} { | |
194 if {$iDelta == 1} { | |
195 set iCol [gobble_varint doclist] | |
196 set iPos 0 | |
197 } else { | |
198 incr iPos $iDelta | |
199 incr iPos -2 | |
200 set C($docid,$iCol,$iPos) $zTerm | |
201 } | |
202 } | |
203 } | |
204 } | |
205 } | |
206 | |
207 foreach key [array names C] { | |
208 #puts "$key -> $C($key)" | |
209 } | |
210 | |
211 | |
212 db eval "SELECT * FROM ${tbl}_content" E { | |
213 set iCol 0 | |
214 set iDoc $E(docid) | |
215 foreach col [lrange $E(*) 1 end] { | |
216 set c $E($col) | |
217 set sql {SELECT fts3_tokenizer_test('simple', $c)} | |
218 | |
219 foreach {pos term dummy} [db one $sql] { | |
220 if {![info exists C($iDoc,$iCol,$pos)]} { | |
221 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing" | |
222 lappend errors $es | |
223 } else { | |
224 if {[string compare $C($iDoc,$iCol,$pos) $term]} { | |
225 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index " | |
226 append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\"" | |
227 lappend errors $es | |
228 } | |
229 unset C($iDoc,$iCol,$pos) | |
230 } | |
231 } | |
232 incr iCol | |
233 } | |
234 } | |
235 | |
236 foreach c [array names C] { | |
237 lappend errors "Bad index entry: $c -> $C($c)" | |
238 } | |
239 | |
240 if {[info exists errors]} { return [join $errors "\n"] } | |
241 return "ok" | |
242 } | |
243 | |
244 # USAGE: fts3_terms TBL WHERE | |
245 # | |
246 # Argument TBL must be the name of an FTS3 table. Argument WHERE is an | |
247 # SQL expression that will be used as the WHERE clause when scanning | |
248 # the %_segdir table. As in the following query: | |
249 # | |
250 # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}" | |
251 # | |
252 # This function returns a list of all terms present in the segments | |
253 # selected by the statement above. | |
254 # | |
255 proc fts3_terms {tbl where} { | |
256 fts3_read $tbl $where a | |
257 return [lsort [array names a]] | |
258 } | |
259 | |
260 | |
261 # USAGE: fts3_doclist TBL TERM WHERE | |
262 # | |
263 # Argument TBL must be the name of an FTS3 table. TERM is a term that may | |
264 # or may not be present in the table. Argument WHERE is used to select a | |
265 # subset of the b-tree segments in the associated full-text index as | |
266 # described above for [fts3_terms]. | |
267 # | |
268 # This function returns the results of merging the doclists associated | |
269 # with TERM in the selected segments. Each doclist is an element of the | |
270 # returned list. Each doclist is formatted as follows: | |
271 # | |
272 # [$docid ?$col[$off1 $off2...]?...] | |
273 # | |
274 # The formatting is odd for a Tcl command in order to be compatible with | |
275 # the original C-language implementation. If argument WHERE is "1", then | |
276 # any empty doclists are omitted from the returned list. | |
277 # | |
278 proc fts3_doclist {tbl term where} { | |
279 fts3_read $tbl $where a | |
280 | |
281 | |
282 foreach doclist $a($term) { | |
283 set docid 0 | |
284 | |
285 while {[string length $doclist]>0} { | |
286 set iCol 0 | |
287 set iPos 0 | |
288 set lPos [list] | |
289 set lCol [list] | |
290 incr docid [gobble_varint doclist] | |
291 | |
292 while {[set iDelta [gobble_varint doclist]] > 0} { | |
293 if {$iDelta == 1} { | |
294 lappend lCol [list $iCol $lPos] | |
295 set iPos 0 | |
296 set lPos [list] | |
297 set iCol [gobble_varint doclist] | |
298 } else { | |
299 incr iPos $iDelta | |
300 incr iPos -2 | |
301 lappend lPos $iPos | |
302 } | |
303 } | |
304 | |
305 if {[llength $lPos]>0} { | |
306 lappend lCol [list $iCol $lPos] | |
307 } | |
308 | |
309 if {$where != "1" || [llength $lCol]>0} { | |
310 set ret($docid) $lCol | |
311 } else { | |
312 unset -nocomplain ret($docid) | |
313 } | |
314 } | |
315 } | |
316 | |
317 set lDoc [list] | |
318 foreach docid [lsort -integer [array names ret]] { | |
319 set lCol [list] | |
320 set cols "" | |
321 foreach col $ret($docid) { | |
322 foreach {iCol lPos} $col {} | |
323 append cols " $iCol\[[join $lPos { }]\]" | |
324 } | |
325 lappend lDoc "\[${docid}${cols}\]" | |
326 } | |
327 | |
328 join $lDoc " " | |
329 } | |
330 | |
331 ########################################################################### | |
332 | |
333 proc gobble_varint {varname} { | |
334 upvar $varname blob | |
335 set n [read_fts3varint $blob ret] | |
336 set blob [string range $blob $n end] | |
337 return $ret | |
338 } | |
339 proc gobble_string {varname nLength} { | |
340 upvar $varname blob | |
341 set ret [string range $blob 0 [expr $nLength-1]] | |
342 set blob [string range $blob $nLength end] | |
343 return $ret | |
344 } | |
345 | |
346 # The argument is a blob of data representing an FTS3 segment leaf. | |
347 # Return a list consisting of alternating terms (strings) and doclists | |
348 # (blobs of data). | |
349 # | |
350 proc fts3_readleaf {blob} { | |
351 set zPrev "" | |
352 set terms [list] | |
353 | |
354 while {[string length $blob] > 0} { | |
355 set nPrefix [gobble_varint blob] | |
356 set nSuffix [gobble_varint blob] | |
357 | |
358 set zTerm [string range $zPrev 0 [expr $nPrefix-1]] | |
359 append zTerm [gobble_string blob $nSuffix] | |
360 set nDoclist [gobble_varint blob] | |
361 set doclist [gobble_string blob $nDoclist] | |
362 | |
363 lappend terms $zTerm $doclist | |
364 set zPrev $zTerm | |
365 } | |
366 | |
367 return $terms | |
368 } | |
369 | |
370 proc fts3_read2 {tbl where varname} { | |
371 upvar $varname a | |
372 array unset a | |
373 db eval " SELECT start_block, leaves_end_block, root | |
374 FROM ${tbl}_segdir WHERE $where | |
375 ORDER BY level ASC, idx DESC | |
376 " { | |
377 set c 0 | |
378 binary scan $root c c | |
379 if {$c==0} { | |
380 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } | |
381 } else { | |
382 db eval " SELECT block | |
383 FROM ${tbl}_segments | |
384 WHERE blockid>=$start_block AND blockid<=$leaves_end_block | |
385 ORDER BY blockid | |
386 " { | |
387 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } | |
388 } | |
389 } | |
390 } | |
391 } | |
392 | |
393 proc fts3_read {tbl where varname} { | |
394 upvar $varname a | |
395 array unset a | |
396 db eval " SELECT start_block, leaves_end_block, root | |
397 FROM ${tbl}_segdir WHERE $where | |
398 ORDER BY level DESC, idx ASC | |
399 " { | |
400 if {$start_block == 0} { | |
401 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d } | |
402 } else { | |
403 db eval " SELECT block | |
404 FROM ${tbl}_segments | |
405 WHERE blockid>=$start_block AND blockid<$leaves_end_block | |
406 ORDER BY blockid | |
407 " { | |
408 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d } | |
409 | |
410 } | |
411 } | |
412 } | |
413 } | |
414 | |
415 ########################################################################## | |
416 | |
OLD | NEW |