OLD | NEW |
(Empty) | |
| 1 # 2014 Dec 19 |
| 2 # |
| 3 # The author disclaims copyright to this source code. In place of |
| 4 # a legal notice, here is a blessing: |
| 5 # |
| 6 # May you do good and not evil. |
| 7 # May you find forgiveness for yourself and forgive others. |
| 8 # May you share freely, never taking more than you give. |
| 9 # |
| 10 #*********************************************************************** |
| 11 # |
| 12 |
| 13 if {![info exists testdir]} { |
| 14 set testdir [file join [file dirname [info script]] .. .. .. test] |
| 15 } |
| 16 source $testdir/tester.tcl |
| 17 |
| 18 catch { |
| 19 sqlite3_fts5_may_be_corrupt 0 |
| 20 reset_db |
| 21 } |
| 22 |
| 23 proc fts5_test_poslist {cmd} { |
| 24 set res [list] |
| 25 for {set i 0} {$i < [$cmd xInstCount]} {incr i} { |
| 26 lappend res [string map {{ } .} [$cmd xInst $i]] |
| 27 } |
| 28 set res |
| 29 } |
| 30 |
| 31 proc fts5_test_columnsize {cmd} { |
| 32 set res [list] |
| 33 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { |
| 34 lappend res [$cmd xColumnSize $i] |
| 35 } |
| 36 set res |
| 37 } |
| 38 |
| 39 proc fts5_test_columntext {cmd} { |
| 40 set res [list] |
| 41 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { |
| 42 lappend res [$cmd xColumnText $i] |
| 43 } |
| 44 set res |
| 45 } |
| 46 |
| 47 proc fts5_test_columntotalsize {cmd} { |
| 48 set res [list] |
| 49 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { |
| 50 lappend res [$cmd xColumnTotalSize $i] |
| 51 } |
| 52 set res |
| 53 } |
| 54 |
| 55 proc test_append_token {varname token iStart iEnd} { |
| 56 upvar $varname var |
| 57 lappend var $token |
| 58 return "SQLITE_OK" |
| 59 } |
| 60 proc fts5_test_tokenize {cmd} { |
| 61 set res [list] |
| 62 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} { |
| 63 set tokens [list] |
| 64 $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens] |
| 65 lappend res $tokens |
| 66 } |
| 67 set res |
| 68 } |
| 69 |
| 70 proc fts5_test_rowcount {cmd} { |
| 71 $cmd xRowCount |
| 72 } |
| 73 |
| 74 proc test_queryphrase_cb {cnt cmd} { |
| 75 upvar $cnt L |
| 76 for {set i 0} {$i < [$cmd xInstCount]} {incr i} { |
| 77 foreach {ip ic io} [$cmd xInst $i] break |
| 78 set A($ic) 1 |
| 79 } |
| 80 foreach ic [array names A] { |
| 81 lset L $ic [expr {[lindex $L $ic] + 1}] |
| 82 } |
| 83 } |
| 84 proc fts5_test_queryphrase {cmd} { |
| 85 set res [list] |
| 86 for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} { |
| 87 set cnt [list] |
| 88 for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 } |
| 89 $cmd xQueryPhrase $i [list test_queryphrase_cb cnt] |
| 90 lappend res $cnt |
| 91 } |
| 92 set res |
| 93 } |
| 94 |
| 95 proc fts5_test_phrasecount {cmd} { |
| 96 $cmd xPhraseCount |
| 97 } |
| 98 |
| 99 proc fts5_test_all {cmd} { |
| 100 set res [list] |
| 101 lappend res columnsize [fts5_test_columnsize $cmd] |
| 102 lappend res columntext [fts5_test_columntext $cmd] |
| 103 lappend res columntotalsize [fts5_test_columntotalsize $cmd] |
| 104 lappend res poslist [fts5_test_poslist $cmd] |
| 105 lappend res tokenize [fts5_test_tokenize $cmd] |
| 106 lappend res rowcount [fts5_test_rowcount $cmd] |
| 107 set res |
| 108 } |
| 109 |
| 110 proc fts5_aux_test_functions {db} { |
| 111 foreach f { |
| 112 fts5_test_columnsize |
| 113 fts5_test_columntext |
| 114 fts5_test_columntotalsize |
| 115 fts5_test_poslist |
| 116 fts5_test_tokenize |
| 117 fts5_test_rowcount |
| 118 fts5_test_all |
| 119 |
| 120 fts5_test_queryphrase |
| 121 fts5_test_phrasecount |
| 122 } { |
| 123 sqlite3_fts5_create_function $db $f $f |
| 124 } |
| 125 } |
| 126 |
| 127 proc fts5_level_segs {tbl} { |
| 128 set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10" |
| 129 set ret [list] |
| 130 foreach L [lrange [db one $sql] 1 end] { |
| 131 lappend ret [expr [llength $L] - 3] |
| 132 } |
| 133 set ret |
| 134 } |
| 135 |
| 136 proc fts5_level_segids {tbl} { |
| 137 set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10" |
| 138 set ret [list] |
| 139 foreach L [lrange [db one $sql] 1 end] { |
| 140 set lvl [list] |
| 141 foreach S [lrange $L 3 end] { |
| 142 regexp {id=([1234567890]*)} $S -> segid |
| 143 lappend lvl $segid |
| 144 } |
| 145 lappend ret $lvl |
| 146 } |
| 147 set ret |
| 148 } |
| 149 |
| 150 proc fts5_rnddoc {n} { |
| 151 set map [list 0 a 1 b 2 c 3 d 4 e 5 f 6 g 7 h 8 i 9 j] |
| 152 set doc [list] |
| 153 for {set i 0} {$i < $n} {incr i} { |
| 154 lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]" |
| 155 } |
| 156 set doc |
| 157 } |
| 158 |
| 159 #------------------------------------------------------------------------- |
| 160 # Usage: |
| 161 # |
| 162 # nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2... |
| 163 # |
| 164 # This command is used to test if a document (set of column values) matches |
| 165 # the logical equivalent of a single FTS5 NEAR() clump and, if so, return |
| 166 # the equivalent of an FTS5 position list. |
| 167 # |
| 168 # Parameter $aCol is passed a list of the column values for the document |
| 169 # to test. Parameters $phrase1 and so on are the phrases. |
| 170 # |
| 171 # The result is a list of phrase hits. Each phrase hit is formatted as |
| 172 # three integers separated by "." characters, in the following format: |
| 173 # |
| 174 # <phrase number> . <column number> . <token offset> |
| 175 # |
| 176 # Options: |
| 177 # |
| 178 # -near N (NEAR distance. Default 10) |
| 179 # -col C (List of column indexes to match against) |
| 180 # -pc VARNAME (variable in caller frame to use for phrase numbering) |
| 181 # |
| 182 proc nearset {aCol args} { |
| 183 set O(-near) 10 |
| 184 set O(-col) {} |
| 185 set O(-pc) "" |
| 186 |
| 187 set nOpt [lsearch -exact $args --] |
| 188 if {$nOpt<0} { error "no -- option" } |
| 189 |
| 190 foreach {k v} [lrange $args 0 [expr $nOpt-1]] { |
| 191 if {[info exists O($k)]==0} { error "unrecognized option $k" } |
| 192 set O($k) $v |
| 193 } |
| 194 |
| 195 if {$O(-pc) == ""} { |
| 196 set counter 0 |
| 197 } else { |
| 198 upvar $O(-pc) counter |
| 199 } |
| 200 |
| 201 # Set $phraselist to be a list of phrases. $nPhrase its length. |
| 202 set phraselist [lrange $args [expr $nOpt+1] end] |
| 203 set nPhrase [llength $phraselist] |
| 204 |
| 205 for {set j 0} {$j < [llength $aCol]} {incr j} { |
| 206 for {set i 0} {$i < $nPhrase} {incr i} { |
| 207 set A($j,$i) [list] |
| 208 } |
| 209 } |
| 210 |
| 211 set iCol -1 |
| 212 foreach col $aCol { |
| 213 incr iCol |
| 214 if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue |
| 215 set nToken [llength $col] |
| 216 |
| 217 set iFL [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)] |
| 218 for { } {$iFL < $nToken} {incr iFL} { |
| 219 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { |
| 220 set B($iPhrase) [list] |
| 221 } |
| 222 |
| 223 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { |
| 224 set p [lindex $phraselist $iPhrase] |
| 225 set nPm1 [expr {[llength $p] - 1}] |
| 226 set iFirst [expr $iFL - $O(-near) - [llength $p]] |
| 227 |
| 228 for {set i $iFirst} {$i <= $iFL} {incr i} { |
| 229 if {[lrange $col $i [expr $i+$nPm1]] == $p} { lappend B($iPhrase) $i } |
| 230 } |
| 231 if {[llength $B($iPhrase)] == 0} break |
| 232 } |
| 233 |
| 234 if {$iPhrase==$nPhrase} { |
| 235 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { |
| 236 set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)] |
| 237 set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)] |
| 238 } |
| 239 } |
| 240 } |
| 241 } |
| 242 |
| 243 set res [list] |
| 244 #puts [array names A] |
| 245 |
| 246 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} { |
| 247 for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} { |
| 248 foreach a $A($iCol,$iPhrase) { |
| 249 lappend res "$counter.$iCol.$a" |
| 250 } |
| 251 } |
| 252 incr counter |
| 253 } |
| 254 |
| 255 #puts $res |
| 256 sort_poslist $res |
| 257 } |
| 258 |
| 259 #------------------------------------------------------------------------- |
| 260 # Usage: |
| 261 # |
| 262 # sort_poslist LIST |
| 263 # |
| 264 # Sort a position list of the type returned by command [nearset] |
| 265 # |
| 266 proc sort_poslist {L} { |
| 267 lsort -command instcompare $L |
| 268 } |
| 269 proc instcompare {lhs rhs} { |
| 270 foreach {p1 c1 o1} [split $lhs .] {} |
| 271 foreach {p2 c2 o2} [split $rhs .] {} |
| 272 |
| 273 set res [expr $c1 - $c2] |
| 274 if {$res==0} { set res [expr $o1 - $o2] } |
| 275 if {$res==0} { set res [expr $p1 - $p2] } |
| 276 |
| 277 return $res |
| 278 } |
| 279 |
| 280 #------------------------------------------------------------------------- |
| 281 # Logical operators used by the commands returned by fts5_tcl_expr(). |
| 282 # |
| 283 proc AND {args} { |
| 284 foreach a $args { |
| 285 if {[llength $a]==0} { return [list] } |
| 286 } |
| 287 sort_poslist [concat {*}$args] |
| 288 } |
| 289 proc OR {args} { |
| 290 sort_poslist [concat {*}$args] |
| 291 } |
| 292 proc NOT {a b} { |
| 293 if {[llength $b]>0} { return [list] } |
| 294 return $a |
| 295 } |
| 296 |
| 297 #------------------------------------------------------------------------- |
| 298 # This command is similar to [split], except that it also provides the |
| 299 # start and end offsets of each token. For example: |
| 300 # |
| 301 # [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8} |
| 302 # |
| 303 |
| 304 proc gobble_whitespace {textvar} { |
| 305 upvar $textvar t |
| 306 regexp {([ ]*)(.*)} $t -> space t |
| 307 return [string length $space] |
| 308 } |
| 309 |
| 310 proc gobble_text {textvar wordvar} { |
| 311 upvar $textvar t |
| 312 upvar $wordvar w |
| 313 regexp {([^ ]*)(.*)} $t -> w t |
| 314 return [string length $w] |
| 315 } |
| 316 |
| 317 proc fts5_tokenize_split {text} { |
| 318 set token "" |
| 319 set ret [list] |
| 320 set iOff [gobble_whitespace text] |
| 321 while {[set nToken [gobble_text text word]]} { |
| 322 lappend ret $word $iOff [expr $iOff+$nToken] |
| 323 incr iOff $nToken |
| 324 incr iOff [gobble_whitespace text] |
| 325 } |
| 326 |
| 327 set ret |
| 328 } |
| 329 |
OLD | NEW |