OLD | NEW |
(Empty) | |
| 1 |
| 2 # |
| 3 # Parameter $zName must be a path to the file UnicodeData.txt. This command |
| 4 # reads the file and returns a list of mappings required to remove all |
| 5 # diacritical marks from a unicode string. Each mapping is itself a list |
| 6 # consisting of two elements - the unicode codepoint and the single ASCII |
| 7 # character that it should be replaced with, or an empty string if the |
| 8 # codepoint should simply be removed from the input. Examples: |
| 9 # |
| 10 # { 224 a } (replace codepoint 224 to "a") |
| 11 # { 769 "" } (remove codepoint 769 from input) |
| 12 # |
| 13 # Mappings are only returned for non-upper case codepoints. It is assumed |
| 14 # that the input has already been folded to lower case. |
| 15 # |
| 16 proc rd_load_unicodedata_text {zName} { |
| 17 global tl_lookup_table |
| 18 |
| 19 set fd [open $zName] |
| 20 set lField { |
| 21 code |
| 22 character_name |
| 23 general_category |
| 24 canonical_combining_classes |
| 25 bidirectional_category |
| 26 character_decomposition_mapping |
| 27 decimal_digit_value |
| 28 digit_value |
| 29 numeric_value |
| 30 mirrored |
| 31 unicode_1_name |
| 32 iso10646_comment_field |
| 33 uppercase_mapping |
| 34 lowercase_mapping |
| 35 titlecase_mapping |
| 36 } |
| 37 set lRet [list] |
| 38 |
| 39 while { ![eof $fd] } { |
| 40 set line [gets $fd] |
| 41 if {$line == ""} continue |
| 42 |
| 43 set fields [split $line ";"] |
| 44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } |
| 45 foreach $lField $fields {} |
| 46 if { [llength $character_decomposition_mapping]!=2 |
| 47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 |
| 48 } { |
| 49 continue |
| 50 } |
| 51 |
| 52 set iCode [expr "0x$code"] |
| 53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] |
| 54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] |
| 55 |
| 56 if {[info exists tl_lookup_table($iCode)]} continue |
| 57 |
| 58 if { ($iAscii >= 97 && $iAscii <= 122) |
| 59 || ($iAscii >= 65 && $iAscii <= 90) |
| 60 } { |
| 61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]] |
| 62 set dia($iDia) 1 |
| 63 } |
| 64 } |
| 65 |
| 66 foreach d [array names dia] { |
| 67 lappend lRet [list $d ""] |
| 68 } |
| 69 set lRet [lsort -integer -index 0 $lRet] |
| 70 |
| 71 close $fd |
| 72 set lRet |
| 73 } |
| 74 |
| 75 |
| 76 proc print_rd {map} { |
| 77 global tl_lookup_table |
| 78 set aChar [list] |
| 79 set lRange [list] |
| 80 |
| 81 set nRange 1 |
| 82 set iFirst [lindex $map 0 0] |
| 83 set cPrev [lindex $map 0 1] |
| 84 |
| 85 foreach m [lrange $map 1 end] { |
| 86 foreach {i c} $m {} |
| 87 |
| 88 if {$cPrev == $c} { |
| 89 for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} { |
| 90 if {[info exists tl_lookup_table($j)]==0} break |
| 91 } |
| 92 |
| 93 if {$j==$i} { |
| 94 set nNew [expr {(1 + $i - $iFirst)}] |
| 95 if {$nNew<=8} { |
| 96 set nRange $nNew |
| 97 continue |
| 98 } |
| 99 } |
| 100 } |
| 101 |
| 102 lappend lRange [list $iFirst $nRange] |
| 103 lappend aChar $cPrev |
| 104 |
| 105 set iFirst $i |
| 106 set cPrev $c |
| 107 set nRange 1 |
| 108 } |
| 109 lappend lRange [list $iFirst $nRange] |
| 110 lappend aChar $cPrev |
| 111 |
| 112 puts "/*" |
| 113 puts "** If the argument is a codepoint corresponding to a lowercase letter" |
| 114 puts "** in the ASCII range with a diacritic added, return the codepoint" |
| 115 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" |
| 116 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" |
| 117 puts "** E\"). The resuls of passing a codepoint that corresponds to an" |
| 118 puts "** uppercase letter are undefined." |
| 119 puts "*/" |
| 120 puts "static int remove_diacritic(int c)\{" |
| 121 puts " unsigned short aDia\[\] = \{" |
| 122 puts -nonewline " 0, " |
| 123 set i 1 |
| 124 foreach r $lRange { |
| 125 foreach {iCode nRange} $r {} |
| 126 if {($i % 8)==0} {puts "" ; puts -nonewline " " } |
| 127 incr i |
| 128 |
| 129 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] |
| 130 puts -nonewline ", " |
| 131 } |
| 132 puts "" |
| 133 puts " \};" |
| 134 puts " char aChar\[\] = \{" |
| 135 puts -nonewline " '\\0', " |
| 136 set i 1 |
| 137 foreach c $aChar { |
| 138 set str "'$c', " |
| 139 if {$c == ""} { set str "'\\0', " } |
| 140 |
| 141 if {($i % 12)==0} {puts "" ; puts -nonewline " " } |
| 142 incr i |
| 143 puts -nonewline "$str" |
| 144 } |
| 145 puts "" |
| 146 puts " \};" |
| 147 puts { |
| 148 unsigned int key = (((unsigned int)c)<<3) | 0x00000007; |
| 149 int iRes = 0; |
| 150 int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; |
| 151 int iLo = 0; |
| 152 while( iHi>=iLo ){ |
| 153 int iTest = (iHi + iLo) / 2; |
| 154 if( key >= aDia[iTest] ){ |
| 155 iRes = iTest; |
| 156 iLo = iTest+1; |
| 157 }else{ |
| 158 iHi = iTest-1; |
| 159 } |
| 160 } |
| 161 assert( key>=aDia[iRes] ); |
| 162 return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);} |
| 163 puts "\}" |
| 164 } |
| 165 |
| 166 proc print_isdiacritic {zFunc map} { |
| 167 |
| 168 set lCode [list] |
| 169 foreach m $map { |
| 170 foreach {code char} $m {} |
| 171 if {$code && $char == ""} { lappend lCode $code } |
| 172 } |
| 173 set lCode [lsort -integer $lCode] |
| 174 set iFirst [lindex $lCode 0] |
| 175 set iLast [lindex $lCode end] |
| 176 |
| 177 set i1 0 |
| 178 set i2 0 |
| 179 |
| 180 foreach c $lCode { |
| 181 set i [expr $c - $iFirst] |
| 182 if {$i < 32} { |
| 183 set i1 [expr {$i1 | (1<<$i)}] |
| 184 } else { |
| 185 set i2 [expr {$i2 | (1<<($i-32))}] |
| 186 } |
| 187 } |
| 188 |
| 189 puts "/*" |
| 190 puts "** Return true if the argument interpreted as a unicode codepoint" |
| 191 puts "** is a diacritical modifier character." |
| 192 puts "*/" |
| 193 puts "int ${zFunc}\(int c)\{" |
| 194 puts " unsigned int mask0 = [format "0x%08X" $i1];" |
| 195 puts " unsigned int mask1 = [format "0x%08X" $i2];" |
| 196 |
| 197 puts " if( c<$iFirst || c>$iLast ) return 0;" |
| 198 puts " return (c < $iFirst+32) ?" |
| 199 puts " (mask0 & (1 << (c-$iFirst))) :" |
| 200 puts " (mask1 & (1 << (c-$iFirst-32)));" |
| 201 puts "\}" |
| 202 } |
| 203 |
| 204 |
| 205 #------------------------------------------------------------------------- |
| 206 |
| 207 # Parameter $zName must be a path to the file UnicodeData.txt. This command |
| 208 # reads the file and returns a list of codepoints (integers). The list |
| 209 # contains all codepoints in the UnicodeData.txt assigned to any "General |
| 210 # Category" that is not a "Letter" or "Number". |
| 211 # |
| 212 proc an_load_unicodedata_text {zName} { |
| 213 set fd [open $zName] |
| 214 set lField { |
| 215 code |
| 216 character_name |
| 217 general_category |
| 218 canonical_combining_classes |
| 219 bidirectional_category |
| 220 character_decomposition_mapping |
| 221 decimal_digit_value |
| 222 digit_value |
| 223 numeric_value |
| 224 mirrored |
| 225 unicode_1_name |
| 226 iso10646_comment_field |
| 227 uppercase_mapping |
| 228 lowercase_mapping |
| 229 titlecase_mapping |
| 230 } |
| 231 set lRet [list] |
| 232 |
| 233 while { ![eof $fd] } { |
| 234 set line [gets $fd] |
| 235 if {$line == ""} continue |
| 236 |
| 237 set fields [split $line ";"] |
| 238 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } |
| 239 foreach $lField $fields {} |
| 240 |
| 241 set iCode [expr "0x$code"] |
| 242 set bAlnum [expr { |
| 243 [lsearch {L N} [string range $general_category 0 0]] >= 0 |
| 244 || $general_category=="Co" |
| 245 }] |
| 246 |
| 247 if { !$bAlnum } { lappend lRet $iCode } |
| 248 } |
| 249 |
| 250 close $fd |
| 251 set lRet |
| 252 } |
| 253 |
| 254 proc an_load_separator_ranges {} { |
| 255 global unicodedata.txt |
| 256 set lSep [an_load_unicodedata_text ${unicodedata.txt}] |
| 257 unset -nocomplain iFirst |
| 258 unset -nocomplain nRange |
| 259 set lRange [list] |
| 260 foreach sep $lSep { |
| 261 if {0==[info exists iFirst]} { |
| 262 set iFirst $sep |
| 263 set nRange 1 |
| 264 } elseif { $sep == ($iFirst+$nRange) } { |
| 265 incr nRange |
| 266 } else { |
| 267 lappend lRange [list $iFirst $nRange] |
| 268 set iFirst $sep |
| 269 set nRange 1 |
| 270 } |
| 271 } |
| 272 lappend lRange [list $iFirst $nRange] |
| 273 set lRange |
| 274 } |
| 275 |
| 276 proc an_print_range_array {lRange} { |
| 277 set iFirstMax 0 |
| 278 set nRangeMax 0 |
| 279 foreach range $lRange { |
| 280 foreach {iFirst nRange} $range {} |
| 281 if {$iFirst > $iFirstMax} {set iFirstMax $iFirst} |
| 282 if {$nRange > $nRangeMax} {set nRangeMax $nRange} |
| 283 } |
| 284 if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"} |
| 285 if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"} |
| 286 |
| 287 puts -nonewline " " |
| 288 puts [string trim { |
| 289 /* Each unsigned integer in the following array corresponds to a contiguous |
| 290 ** range of unicode codepoints that are not either letters or numbers (i.e. |
| 291 ** codepoints for which this function should return 0). |
| 292 ** |
| 293 ** The most significant 22 bits in each 32-bit value contain the first |
| 294 ** codepoint in the range. The least significant 10 bits are used to store |
| 295 ** the size of the range (always at least 1). In other words, the value |
| 296 ** ((C<<22) + N) represents a range of N codepoints starting with codepoint |
| 297 ** C. It is not possible to represent a range larger than 1023 codepoints |
| 298 ** using this format. |
| 299 */ |
| 300 }] |
| 301 puts -nonewline " static const unsigned int aEntry\[\] = \{" |
| 302 set i 0 |
| 303 foreach range $lRange { |
| 304 foreach {iFirst nRange} $range {} |
| 305 set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]] |
| 306 |
| 307 if {($i % 5)==0} {puts "" ; puts -nonewline " "} |
| 308 puts -nonewline " $u32," |
| 309 incr i |
| 310 } |
| 311 puts "" |
| 312 puts " \};" |
| 313 } |
| 314 |
| 315 proc an_print_ascii_bitmap {lRange} { |
| 316 foreach range $lRange { |
| 317 foreach {iFirst nRange} $range {} |
| 318 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { |
| 319 if {$i<=127} { set a($i) 1 } |
| 320 } |
| 321 } |
| 322 |
| 323 set aAscii [list 0 0 0 0] |
| 324 foreach key [array names a] { |
| 325 set idx [expr $key >> 5] |
| 326 lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))] |
| 327 } |
| 328 |
| 329 puts " static const unsigned int aAscii\[4\] = \{" |
| 330 puts -nonewline " " |
| 331 foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] } |
| 332 puts "" |
| 333 puts " \};" |
| 334 } |
| 335 |
| 336 proc print_isalnum {zFunc lRange} { |
| 337 puts "/*" |
| 338 puts "** Return true if the argument corresponds to a unicode codepoint" |
| 339 puts "** classified as either a letter or a number. Otherwise false." |
| 340 puts "**" |
| 341 puts "** The results are undefined if the value passed to this function" |
| 342 puts "** is less than zero." |
| 343 puts "*/" |
| 344 puts "int ${zFunc}\(int c)\{" |
| 345 an_print_range_array $lRange |
| 346 an_print_ascii_bitmap $lRange |
| 347 puts { |
| 348 if( c<128 ){ |
| 349 return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 ); |
| 350 }else if( c<(1<<22) ){ |
| 351 unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; |
| 352 int iRes = 0; |
| 353 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; |
| 354 int iLo = 0; |
| 355 while( iHi>=iLo ){ |
| 356 int iTest = (iHi + iLo) / 2; |
| 357 if( key >= aEntry[iTest] ){ |
| 358 iRes = iTest; |
| 359 iLo = iTest+1; |
| 360 }else{ |
| 361 iHi = iTest-1; |
| 362 } |
| 363 } |
| 364 assert( aEntry[0]<key ); |
| 365 assert( key>=aEntry[iRes] ); |
| 366 return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); |
| 367 } |
| 368 return 1;} |
| 369 puts "\}" |
| 370 } |
| 371 |
| 372 proc print_test_isalnum {zFunc lRange} { |
| 373 foreach range $lRange { |
| 374 foreach {iFirst nRange} $range {} |
| 375 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 } |
| 376 } |
| 377 |
| 378 puts "static int isalnum_test(int *piCode)\{" |
| 379 puts -nonewline " unsigned char aAlnum\[\] = \{" |
| 380 for {set i 0} {$i < 70000} {incr i} { |
| 381 if {($i % 32)==0} { puts "" ; puts -nonewline " " } |
| 382 set bFlag [expr ![info exists a($i)]] |
| 383 puts -nonewline "${bFlag}," |
| 384 } |
| 385 puts "" |
| 386 puts " \};" |
| 387 |
| 388 puts -nonewline " int aLargeSep\[\] = \{" |
| 389 set i 0 |
| 390 foreach iSep [lsort -integer [array names a]] { |
| 391 if {$iSep<70000} continue |
| 392 if {($i % 8)==0} { puts "" ; puts -nonewline " " } |
| 393 puts -nonewline " $iSep," |
| 394 incr i |
| 395 } |
| 396 puts "" |
| 397 puts " \};" |
| 398 puts -nonewline " int aLargeOther\[\] = \{" |
| 399 set i 0 |
| 400 foreach iSep [lsort -integer [array names a]] { |
| 401 if {$iSep<70000} continue |
| 402 if {[info exists a([expr $iSep-1])]==0} { |
| 403 if {($i % 8)==0} { puts "" ; puts -nonewline " " } |
| 404 puts -nonewline " [expr $iSep-1]," |
| 405 incr i |
| 406 } |
| 407 if {[info exists a([expr $iSep+1])]==0} { |
| 408 if {($i % 8)==0} { puts "" ; puts -nonewline " " } |
| 409 puts -nonewline " [expr $iSep+1]," |
| 410 incr i |
| 411 } |
| 412 } |
| 413 puts "" |
| 414 puts " \};" |
| 415 |
| 416 puts [subst -nocommands { |
| 417 int i; |
| 418 for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){ |
| 419 if( ${zFunc}(i)!=aAlnum[i] ){ |
| 420 *piCode = i; |
| 421 return 1; |
| 422 } |
| 423 } |
| 424 for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){ |
| 425 if( ${zFunc}(aLargeSep[i])!=0 ){ |
| 426 *piCode = aLargeSep[i]; |
| 427 return 1; |
| 428 } |
| 429 } |
| 430 for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){ |
| 431 if( ${zFunc}(aLargeOther[i])!=1 ){ |
| 432 *piCode = aLargeOther[i]; |
| 433 return 1; |
| 434 } |
| 435 } |
| 436 }] |
| 437 puts " return 0;" |
| 438 puts "\}" |
| 439 } |
| 440 |
| 441 #------------------------------------------------------------------------- |
| 442 |
| 443 proc tl_load_casefolding_txt {zName} { |
| 444 global tl_lookup_table |
| 445 |
| 446 set fd [open $zName] |
| 447 while { ![eof $fd] } { |
| 448 set line [gets $fd] |
| 449 if {[string range $line 0 0] == "#"} continue |
| 450 if {$line == ""} continue |
| 451 |
| 452 foreach x {a b c d} {unset -nocomplain $x} |
| 453 foreach {a b c d} [split $line ";"] {} |
| 454 |
| 455 set a2 [list] |
| 456 set c2 [list] |
| 457 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } |
| 458 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } |
| 459 set b [string trim $b] |
| 460 set d [string trim $d] |
| 461 |
| 462 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } |
| 463 } |
| 464 } |
| 465 |
| 466 proc tl_create_records {} { |
| 467 global tl_lookup_table |
| 468 |
| 469 set iFirst "" |
| 470 set nOff 0 |
| 471 set nRange 0 |
| 472 set nIncr 0 |
| 473 |
| 474 set lRecord [list] |
| 475 foreach code [lsort -integer [array names tl_lookup_table]] { |
| 476 set mapping $tl_lookup_table($code) |
| 477 if {$iFirst == ""} { |
| 478 set iFirst $code |
| 479 set nOff [expr $mapping - $code] |
| 480 set nRange 1 |
| 481 set nIncr 1 |
| 482 } else { |
| 483 set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))] |
| 484 if { $nRange==1 && ($diff==1 || $diff==2) } { |
| 485 set nIncr $diff |
| 486 } |
| 487 |
| 488 if {$diff != $nIncr || ($mapping - $code)!=$nOff} { |
| 489 if { $nRange==1 } {set nIncr 1} |
| 490 lappend lRecord [list $iFirst $nIncr $nRange $nOff] |
| 491 set iFirst $code |
| 492 set nOff [expr $mapping - $code] |
| 493 set nRange 1 |
| 494 set nIncr 1 |
| 495 } else { |
| 496 incr nRange |
| 497 } |
| 498 } |
| 499 } |
| 500 |
| 501 lappend lRecord [list $iFirst $nIncr $nRange $nOff] |
| 502 |
| 503 set lRecord |
| 504 } |
| 505 |
| 506 proc tl_print_table_header {} { |
| 507 puts -nonewline " " |
| 508 puts [string trim { |
| 509 /* Each entry in the following array defines a rule for folding a range |
| 510 ** of codepoints to lower case. The rule applies to a range of nRange |
| 511 ** codepoints starting at codepoint iCode. |
| 512 ** |
| 513 ** If the least significant bit in flags is clear, then the rule applies |
| 514 ** to all nRange codepoints (i.e. all nRange codepoints are upper case and |
| 515 ** need to be folded). Or, if it is set, then the rule only applies to |
| 516 ** every second codepoint in the range, starting with codepoint C. |
| 517 ** |
| 518 ** The 7 most significant bits in flags are an index into the aiOff[] |
| 519 ** array. If a specific codepoint C does require folding, then its lower |
| 520 ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF). |
| 521 ** |
| 522 ** The contents of this array are generated by parsing the CaseFolding.txt |
| 523 ** file distributed as part of the "Unicode Character Database". See |
| 524 ** http://www.unicode.org for details. |
| 525 */ |
| 526 }] |
| 527 puts " static const struct TableEntry \{" |
| 528 puts " unsigned short iCode;" |
| 529 puts " unsigned char flags;" |
| 530 puts " unsigned char nRange;" |
| 531 puts " \} aEntry\[\] = \{" |
| 532 } |
| 533 |
| 534 proc tl_print_table_entry {togglevar entry liOff} { |
| 535 upvar $togglevar t |
| 536 foreach {iFirst nIncr nRange nOff} $entry {} |
| 537 |
| 538 if {$iFirst > (1<<16)} { return 1 } |
| 539 |
| 540 if {[info exists t]==0} {set t 0} |
| 541 if {$t==0} { puts -nonewline " " } |
| 542 |
| 543 set flags 0 |
| 544 if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]} |
| 545 if {$nOff<0} { incr nOff [expr (1<<16)] } |
| 546 |
| 547 set idx [lsearch $liOff $nOff] |
| 548 if {$idx<0} {error "malfunction generating aiOff"} |
| 549 set flags [expr $flags + $idx*2] |
| 550 |
| 551 set txt "{$iFirst, $flags, $nRange}," |
| 552 if {$t==2} { |
| 553 puts $txt |
| 554 } else { |
| 555 puts -nonewline [format "% -23s" $txt] |
| 556 } |
| 557 set t [expr ($t+1)%3] |
| 558 |
| 559 return 0 |
| 560 } |
| 561 |
| 562 proc tl_print_table_footer {togglevar} { |
| 563 upvar $togglevar t |
| 564 if {$t!=0} {puts ""} |
| 565 puts " \};" |
| 566 } |
| 567 |
| 568 proc tl_print_if_entry {entry} { |
| 569 foreach {iFirst nIncr nRange nOff} $entry {} |
| 570 if {$nIncr==2} {error "tl_print_if_entry needs improvement!"} |
| 571 |
| 572 puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{" |
| 573 puts " ret = c + $nOff;" |
| 574 puts " \}" |
| 575 } |
| 576 |
| 577 proc tl_generate_ioff_table {lRecord} { |
| 578 foreach entry $lRecord { |
| 579 foreach {iFirst nIncr nRange iOff} $entry {} |
| 580 if {$iOff<0} { incr iOff [expr (1<<16)] } |
| 581 if {[info exists a($iOff)]} continue |
| 582 set a($iOff) 1 |
| 583 } |
| 584 |
| 585 set liOff [lsort -integer [array names a]] |
| 586 if {[llength $liOff]>128} { error "Too many distinct ioffs" } |
| 587 return $liOff |
| 588 } |
| 589 |
| 590 proc tl_print_ioff_table {liOff} { |
| 591 puts -nonewline " static const unsigned short aiOff\[\] = \{" |
| 592 set i 0 |
| 593 foreach off $liOff { |
| 594 if {($i % 8)==0} {puts "" ; puts -nonewline " "} |
| 595 puts -nonewline [format "% -7s" "$off,"] |
| 596 incr i |
| 597 } |
| 598 puts "" |
| 599 puts " \};" |
| 600 |
| 601 } |
| 602 |
| 603 proc print_fold {zFunc} { |
| 604 |
| 605 set lRecord [tl_create_records] |
| 606 |
| 607 set lHigh [list] |
| 608 puts "/*" |
| 609 puts "** Interpret the argument as a unicode codepoint. If the codepoint" |
| 610 puts "** is an upper case character that has a lower case equivalent," |
| 611 puts "** return the codepoint corresponding to the lower case version." |
| 612 puts "** Otherwise, return a copy of the argument." |
| 613 puts "**" |
| 614 puts "** The results are undefined if the value passed to this function" |
| 615 puts "** is less than zero." |
| 616 puts "*/" |
| 617 puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{" |
| 618 |
| 619 set liOff [tl_generate_ioff_table $lRecord] |
| 620 tl_print_table_header |
| 621 foreach entry $lRecord { |
| 622 if {[tl_print_table_entry toggle $entry $liOff]} { |
| 623 lappend lHigh $entry |
| 624 } |
| 625 } |
| 626 tl_print_table_footer toggle |
| 627 tl_print_ioff_table $liOff |
| 628 |
| 629 puts { |
| 630 int ret = c; |
| 631 |
| 632 assert( c>=0 ); |
| 633 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); |
| 634 |
| 635 if( c<128 ){ |
| 636 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); |
| 637 }else if( c<65536 ){ |
| 638 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; |
| 639 int iLo = 0; |
| 640 int iRes = -1; |
| 641 |
| 642 while( iHi>=iLo ){ |
| 643 int iTest = (iHi + iLo) / 2; |
| 644 int cmp = (c - aEntry[iTest].iCode); |
| 645 if( cmp>=0 ){ |
| 646 iRes = iTest; |
| 647 iLo = iTest+1; |
| 648 }else{ |
| 649 iHi = iTest-1; |
| 650 } |
| 651 } |
| 652 assert( iRes<0 || c>=aEntry[iRes].iCode ); |
| 653 |
| 654 if( iRes>=0 ){ |
| 655 const struct TableEntry *p = &aEntry[iRes]; |
| 656 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ |
| 657 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; |
| 658 assert( ret>0 ); |
| 659 } |
| 660 } |
| 661 |
| 662 if( bRemoveDiacritic ) ret = remove_diacritic(ret); |
| 663 } |
| 664 } |
| 665 |
| 666 foreach entry $lHigh { |
| 667 tl_print_if_entry $entry |
| 668 } |
| 669 |
| 670 puts "" |
| 671 puts " return ret;" |
| 672 puts "\}" |
| 673 } |
| 674 |
| 675 proc print_fold_test {zFunc mappings} { |
| 676 global tl_lookup_table |
| 677 |
| 678 foreach m $mappings { |
| 679 set c [lindex $m 1] |
| 680 if {$c == ""} { |
| 681 set extra([lindex $m 0]) 0 |
| 682 } else { |
| 683 scan $c %c i |
| 684 set extra([lindex $m 0]) $i |
| 685 } |
| 686 } |
| 687 |
| 688 puts "static int fold_test(int *piCode)\{" |
| 689 puts -nonewline " static int aLookup\[\] = \{" |
| 690 for {set i 0} {$i < 70000} {incr i} { |
| 691 |
| 692 set expected $i |
| 693 catch { set expected $tl_lookup_table($i) } |
| 694 set expected2 $expected |
| 695 catch { set expected2 $extra($expected2) } |
| 696 |
| 697 if {($i % 4)==0} { puts "" ; puts -nonewline " " } |
| 698 puts -nonewline "$expected, $expected2, " |
| 699 } |
| 700 puts " \};" |
| 701 puts " int i;" |
| 702 puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{" |
| 703 puts " int iCode = (i/2);" |
| 704 puts " int bFlag = i & 0x0001;" |
| 705 puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{" |
| 706 puts " *piCode = iCode;" |
| 707 puts " return 1;" |
| 708 puts " \}" |
| 709 puts " \}" |
| 710 puts " return 0;" |
| 711 puts "\}" |
| 712 } |
| 713 |
| 714 |
| 715 proc print_fileheader {} { |
| 716 puts [string trim { |
| 717 /* |
| 718 ** 2012 May 25 |
| 719 ** |
| 720 ** The author disclaims copyright to this source code. In place of |
| 721 ** a legal notice, here is a blessing: |
| 722 ** |
| 723 ** May you do good and not evil. |
| 724 ** May you find forgiveness for yourself and forgive others. |
| 725 ** May you share freely, never taking more than you give. |
| 726 ** |
| 727 ****************************************************************************** |
| 728 */ |
| 729 |
| 730 /* |
| 731 ** DO NOT EDIT THIS MACHINE GENERATED FILE. |
| 732 */ |
| 733 }] |
| 734 puts "" |
| 735 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE" |
| 736 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" |
| 737 puts "" |
| 738 puts "#include <assert.h>" |
| 739 puts "" |
| 740 } |
| 741 |
| 742 proc print_test_main {} { |
| 743 puts "" |
| 744 puts "#include <stdio.h>" |
| 745 puts "" |
| 746 puts "int main(int argc, char **argv)\{" |
| 747 puts " int r1, r2;" |
| 748 puts " int code;" |
| 749 puts " r1 = isalnum_test(&code);" |
| 750 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" |
| 751 puts " else printf(\"isalnum(): test passed\\n\");" |
| 752 puts " r2 = fold_test(&code);" |
| 753 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" |
| 754 puts " else printf(\"fold(): test passed\\n\");" |
| 755 puts " return (r1 || r2);" |
| 756 puts "\}" |
| 757 } |
| 758 |
| 759 # Proces the command line arguments. Exit early if they are not to |
| 760 # our liking. |
| 761 # |
| 762 proc usage {} { |
| 763 puts -nonewline stderr "Usage: $::argv0 ?-test? " |
| 764 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>" |
| 765 exit 1 |
| 766 } |
| 767 if {[llength $argv]!=2 && [llength $argv]!=3} usage |
| 768 if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage |
| 769 set unicodedata.txt [lindex $argv end] |
| 770 set casefolding.txt [lindex $argv end-1] |
| 771 set generate_test_code [expr {[llength $argv]==3}] |
| 772 |
| 773 print_fileheader |
| 774 |
| 775 # Print the isalnum() function to stdout. |
| 776 # |
| 777 set lRange [an_load_separator_ranges] |
| 778 print_isalnum sqlite3FtsUnicodeIsalnum $lRange |
| 779 |
| 780 # Leave a gap between the two generated C functions. |
| 781 # |
| 782 puts "" |
| 783 puts "" |
| 784 |
| 785 # Load the fold data. This is used by the [rd_XXX] commands |
| 786 # as well as [print_fold]. |
| 787 tl_load_casefolding_txt ${casefolding.txt} |
| 788 |
| 789 set mappings [rd_load_unicodedata_text ${unicodedata.txt}] |
| 790 print_rd $mappings |
| 791 puts "" |
| 792 puts "" |
| 793 print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings |
| 794 puts "" |
| 795 puts "" |
| 796 |
| 797 # Print the fold() function to stdout. |
| 798 # |
| 799 print_fold sqlite3FtsUnicodeFold |
| 800 |
| 801 # Print the test routines and main() function to stdout, if -test |
| 802 # was specified. |
| 803 # |
| 804 if {$::generate_test_code} { |
| 805 print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange |
| 806 print_fold_test sqlite3FtsUnicodeFold $mappings |
| 807 print_test_main |
| 808 } |
| 809 |
| 810 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" |
| 811 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" |
OLD | NEW |