OLD | NEW |
| (Empty) |
1 | |
2 # | |
3 # Parameter $zName must be a path to the file UnicodeData.txt. This command | |
4 # reads the file and returns a list of mappings required to remove all | |
5 # diacritical marks from a unicode string. Each mapping is itself a list | |
6 # consisting of two elements - the unicode codepoint and the single ASCII | |
7 # character that it should be replaced with, or an empty string if the | |
8 # codepoint should simply be removed from the input. Examples: | |
9 # | |
10 # { 224 a } (replace codepoint 224 to "a") | |
11 # { 769 "" } (remove codepoint 769 from input) | |
12 # | |
13 # Mappings are only returned for non-upper case codepoints. It is assumed | |
14 # that the input has already been folded to lower case. | |
15 # | |
16 proc rd_load_unicodedata_text {zName} { | |
17 global tl_lookup_table | |
18 | |
19 set fd [open $zName] | |
20 set lField { | |
21 code | |
22 character_name | |
23 general_category | |
24 canonical_combining_classes | |
25 bidirectional_category | |
26 character_decomposition_mapping | |
27 decimal_digit_value | |
28 digit_value | |
29 numeric_value | |
30 mirrored | |
31 unicode_1_name | |
32 iso10646_comment_field | |
33 uppercase_mapping | |
34 lowercase_mapping | |
35 titlecase_mapping | |
36 } | |
37 set lRet [list] | |
38 | |
39 while { ![eof $fd] } { | |
40 set line [gets $fd] | |
41 if {$line == ""} continue | |
42 | |
43 set fields [split $line ";"] | |
44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } | |
45 foreach $lField $fields {} | |
46 if { [llength $character_decomposition_mapping]!=2 | |
47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 | |
48 } { | |
49 continue | |
50 } | |
51 | |
52 set iCode [expr "0x$code"] | |
53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] | |
54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] | |
55 | |
56 if {[info exists tl_lookup_table($iCode)]} continue | |
57 | |
58 if { ($iAscii >= 97 && $iAscii <= 122) | |
59 || ($iAscii >= 65 && $iAscii <= 90) | |
60 } { | |
61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]] | |
62 set dia($iDia) 1 | |
63 } | |
64 } | |
65 | |
66 foreach d [array names dia] { | |
67 lappend lRet [list $d ""] | |
68 } | |
69 set lRet [lsort -integer -index 0 $lRet] | |
70 | |
71 close $fd | |
72 set lRet | |
73 } | |
74 | |
75 | |
76 proc print_rd {map} { | |
77 global tl_lookup_table | |
78 set aChar [list] | |
79 set lRange [list] | |
80 | |
81 set nRange 1 | |
82 set iFirst [lindex $map 0 0] | |
83 set cPrev [lindex $map 0 1] | |
84 | |
85 foreach m [lrange $map 1 end] { | |
86 foreach {i c} $m {} | |
87 | |
88 if {$cPrev == $c} { | |
89 for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} { | |
90 if {[info exists tl_lookup_table($j)]==0} break | |
91 } | |
92 | |
93 if {$j==$i} { | |
94 set nNew [expr {(1 + $i - $iFirst)}] | |
95 if {$nNew<=8} { | |
96 set nRange $nNew | |
97 continue | |
98 } | |
99 } | |
100 } | |
101 | |
102 lappend lRange [list $iFirst $nRange] | |
103 lappend aChar $cPrev | |
104 | |
105 set iFirst $i | |
106 set cPrev $c | |
107 set nRange 1 | |
108 } | |
109 lappend lRange [list $iFirst $nRange] | |
110 lappend aChar $cPrev | |
111 | |
112 puts "/*" | |
113 puts "** If the argument is a codepoint corresponding to a lowercase letter" | |
114 puts "** in the ASCII range with a diacritic added, return the codepoint" | |
115 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN" | |
116 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER" | |
117 puts "** E\"). The resuls of passing a codepoint that corresponds to an" | |
118 puts "** uppercase letter are undefined." | |
119 puts "*/" | |
120 puts "static int remove_diacritic(int c)\{" | |
121 puts " unsigned short aDia\[\] = \{" | |
122 puts -nonewline " 0, " | |
123 set i 1 | |
124 foreach r $lRange { | |
125 foreach {iCode nRange} $r {} | |
126 if {($i % 8)==0} {puts "" ; puts -nonewline " " } | |
127 incr i | |
128 | |
129 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]] | |
130 puts -nonewline ", " | |
131 } | |
132 puts "" | |
133 puts " \};" | |
134 puts " char aChar\[\] = \{" | |
135 puts -nonewline " '\\0', " | |
136 set i 1 | |
137 foreach c $aChar { | |
138 set str "'$c', " | |
139 if {$c == ""} { set str "'\\0', " } | |
140 | |
141 if {($i % 12)==0} {puts "" ; puts -nonewline " " } | |
142 incr i | |
143 puts -nonewline "$str" | |
144 } | |
145 puts "" | |
146 puts " \};" | |
147 puts { | |
148 unsigned int key = (((unsigned int)c)<<3) | 0x00000007; | |
149 int iRes = 0; | |
150 int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1; | |
151 int iLo = 0; | |
152 while( iHi>=iLo ){ | |
153 int iTest = (iHi + iLo) / 2; | |
154 if( key >= aDia[iTest] ){ | |
155 iRes = iTest; | |
156 iLo = iTest+1; | |
157 }else{ | |
158 iHi = iTest-1; | |
159 } | |
160 } | |
161 assert( key>=aDia[iRes] ); | |
162 return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);} | |
163 puts "\}" | |
164 } | |
165 | |
166 proc print_isdiacritic {zFunc map} { | |
167 | |
168 set lCode [list] | |
169 foreach m $map { | |
170 foreach {code char} $m {} | |
171 if {$code && $char == ""} { lappend lCode $code } | |
172 } | |
173 set lCode [lsort -integer $lCode] | |
174 set iFirst [lindex $lCode 0] | |
175 set iLast [lindex $lCode end] | |
176 | |
177 set i1 0 | |
178 set i2 0 | |
179 | |
180 foreach c $lCode { | |
181 set i [expr $c - $iFirst] | |
182 if {$i < 32} { | |
183 set i1 [expr {$i1 | (1<<$i)}] | |
184 } else { | |
185 set i2 [expr {$i2 | (1<<($i-32))}] | |
186 } | |
187 } | |
188 | |
189 puts "/*" | |
190 puts "** Return true if the argument interpreted as a unicode codepoint" | |
191 puts "** is a diacritical modifier character." | |
192 puts "*/" | |
193 puts "int ${zFunc}\(int c)\{" | |
194 puts " unsigned int mask0 = [format "0x%08X" $i1];" | |
195 puts " unsigned int mask1 = [format "0x%08X" $i2];" | |
196 | |
197 puts " if( c<$iFirst || c>$iLast ) return 0;" | |
198 puts " return (c < $iFirst+32) ?" | |
199 puts " (mask0 & (1 << (c-$iFirst))) :" | |
200 puts " (mask1 & (1 << (c-$iFirst-32)));" | |
201 puts "\}" | |
202 } | |
203 | |
204 | |
205 #------------------------------------------------------------------------- | |
206 | |
207 # Parameter $zName must be a path to the file UnicodeData.txt. This command | |
208 # reads the file and returns a list of codepoints (integers). The list | |
209 # contains all codepoints in the UnicodeData.txt assigned to any "General | |
210 # Category" that is not a "Letter" or "Number". | |
211 # | |
212 proc an_load_unicodedata_text {zName} { | |
213 set fd [open $zName] | |
214 set lField { | |
215 code | |
216 character_name | |
217 general_category | |
218 canonical_combining_classes | |
219 bidirectional_category | |
220 character_decomposition_mapping | |
221 decimal_digit_value | |
222 digit_value | |
223 numeric_value | |
224 mirrored | |
225 unicode_1_name | |
226 iso10646_comment_field | |
227 uppercase_mapping | |
228 lowercase_mapping | |
229 titlecase_mapping | |
230 } | |
231 set lRet [list] | |
232 | |
233 while { ![eof $fd] } { | |
234 set line [gets $fd] | |
235 if {$line == ""} continue | |
236 | |
237 set fields [split $line ";"] | |
238 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } | |
239 foreach $lField $fields {} | |
240 | |
241 set iCode [expr "0x$code"] | |
242 set bAlnum [expr { | |
243 [lsearch {L N} [string range $general_category 0 0]] >= 0 | |
244 || $general_category=="Co" | |
245 }] | |
246 | |
247 if { !$bAlnum } { lappend lRet $iCode } | |
248 } | |
249 | |
250 close $fd | |
251 set lRet | |
252 } | |
253 | |
254 proc an_load_separator_ranges {} { | |
255 global unicodedata.txt | |
256 set lSep [an_load_unicodedata_text ${unicodedata.txt}] | |
257 unset -nocomplain iFirst | |
258 unset -nocomplain nRange | |
259 set lRange [list] | |
260 foreach sep $lSep { | |
261 if {0==[info exists iFirst]} { | |
262 set iFirst $sep | |
263 set nRange 1 | |
264 } elseif { $sep == ($iFirst+$nRange) } { | |
265 incr nRange | |
266 } else { | |
267 lappend lRange [list $iFirst $nRange] | |
268 set iFirst $sep | |
269 set nRange 1 | |
270 } | |
271 } | |
272 lappend lRange [list $iFirst $nRange] | |
273 set lRange | |
274 } | |
275 | |
276 proc an_print_range_array {lRange} { | |
277 set iFirstMax 0 | |
278 set nRangeMax 0 | |
279 foreach range $lRange { | |
280 foreach {iFirst nRange} $range {} | |
281 if {$iFirst > $iFirstMax} {set iFirstMax $iFirst} | |
282 if {$nRange > $nRangeMax} {set nRangeMax $nRange} | |
283 } | |
284 if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"} | |
285 if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"} | |
286 | |
287 puts -nonewline " " | |
288 puts [string trim { | |
289 /* Each unsigned integer in the following array corresponds to a contiguous | |
290 ** range of unicode codepoints that are not either letters or numbers (i.e. | |
291 ** codepoints for which this function should return 0). | |
292 ** | |
293 ** The most significant 22 bits in each 32-bit value contain the first | |
294 ** codepoint in the range. The least significant 10 bits are used to store | |
295 ** the size of the range (always at least 1). In other words, the value | |
296 ** ((C<<22) + N) represents a range of N codepoints starting with codepoint | |
297 ** C. It is not possible to represent a range larger than 1023 codepoints | |
298 ** using this format. | |
299 */ | |
300 }] | |
301 puts -nonewline " static const unsigned int aEntry\[\] = \{" | |
302 set i 0 | |
303 foreach range $lRange { | |
304 foreach {iFirst nRange} $range {} | |
305 set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]] | |
306 | |
307 if {($i % 5)==0} {puts "" ; puts -nonewline " "} | |
308 puts -nonewline " $u32," | |
309 incr i | |
310 } | |
311 puts "" | |
312 puts " \};" | |
313 } | |
314 | |
315 proc an_print_ascii_bitmap {lRange} { | |
316 foreach range $lRange { | |
317 foreach {iFirst nRange} $range {} | |
318 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { | |
319 if {$i<=127} { set a($i) 1 } | |
320 } | |
321 } | |
322 | |
323 set aAscii [list 0 0 0 0] | |
324 foreach key [array names a] { | |
325 set idx [expr $key >> 5] | |
326 lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))] | |
327 } | |
328 | |
329 puts " static const unsigned int aAscii\[4\] = \{" | |
330 puts -nonewline " " | |
331 foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] } | |
332 puts "" | |
333 puts " \};" | |
334 } | |
335 | |
336 proc print_isalnum {zFunc lRange} { | |
337 puts "/*" | |
338 puts "** Return true if the argument corresponds to a unicode codepoint" | |
339 puts "** classified as either a letter or a number. Otherwise false." | |
340 puts "**" | |
341 puts "** The results are undefined if the value passed to this function" | |
342 puts "** is less than zero." | |
343 puts "*/" | |
344 puts "int ${zFunc}\(int c)\{" | |
345 an_print_range_array $lRange | |
346 an_print_ascii_bitmap $lRange | |
347 puts { | |
348 if( c<128 ){ | |
349 return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 ); | |
350 }else if( c<(1<<22) ){ | |
351 unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; | |
352 int iRes = 0; | |
353 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; | |
354 int iLo = 0; | |
355 while( iHi>=iLo ){ | |
356 int iTest = (iHi + iLo) / 2; | |
357 if( key >= aEntry[iTest] ){ | |
358 iRes = iTest; | |
359 iLo = iTest+1; | |
360 }else{ | |
361 iHi = iTest-1; | |
362 } | |
363 } | |
364 assert( aEntry[0]<key ); | |
365 assert( key>=aEntry[iRes] ); | |
366 return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); | |
367 } | |
368 return 1;} | |
369 puts "\}" | |
370 } | |
371 | |
372 proc print_test_isalnum {zFunc lRange} { | |
373 foreach range $lRange { | |
374 foreach {iFirst nRange} $range {} | |
375 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 } | |
376 } | |
377 | |
378 puts "static int isalnum_test(int *piCode)\{" | |
379 puts -nonewline " unsigned char aAlnum\[\] = \{" | |
380 for {set i 0} {$i < 70000} {incr i} { | |
381 if {($i % 32)==0} { puts "" ; puts -nonewline " " } | |
382 set bFlag [expr ![info exists a($i)]] | |
383 puts -nonewline "${bFlag}," | |
384 } | |
385 puts "" | |
386 puts " \};" | |
387 | |
388 puts -nonewline " int aLargeSep\[\] = \{" | |
389 set i 0 | |
390 foreach iSep [lsort -integer [array names a]] { | |
391 if {$iSep<70000} continue | |
392 if {($i % 8)==0} { puts "" ; puts -nonewline " " } | |
393 puts -nonewline " $iSep," | |
394 incr i | |
395 } | |
396 puts "" | |
397 puts " \};" | |
398 puts -nonewline " int aLargeOther\[\] = \{" | |
399 set i 0 | |
400 foreach iSep [lsort -integer [array names a]] { | |
401 if {$iSep<70000} continue | |
402 if {[info exists a([expr $iSep-1])]==0} { | |
403 if {($i % 8)==0} { puts "" ; puts -nonewline " " } | |
404 puts -nonewline " [expr $iSep-1]," | |
405 incr i | |
406 } | |
407 if {[info exists a([expr $iSep+1])]==0} { | |
408 if {($i % 8)==0} { puts "" ; puts -nonewline " " } | |
409 puts -nonewline " [expr $iSep+1]," | |
410 incr i | |
411 } | |
412 } | |
413 puts "" | |
414 puts " \};" | |
415 | |
416 puts [subst -nocommands { | |
417 int i; | |
418 for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){ | |
419 if( ${zFunc}(i)!=aAlnum[i] ){ | |
420 *piCode = i; | |
421 return 1; | |
422 } | |
423 } | |
424 for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){ | |
425 if( ${zFunc}(aLargeSep[i])!=0 ){ | |
426 *piCode = aLargeSep[i]; | |
427 return 1; | |
428 } | |
429 } | |
430 for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){ | |
431 if( ${zFunc}(aLargeOther[i])!=1 ){ | |
432 *piCode = aLargeOther[i]; | |
433 return 1; | |
434 } | |
435 } | |
436 }] | |
437 puts " return 0;" | |
438 puts "\}" | |
439 } | |
440 | |
441 #------------------------------------------------------------------------- | |
442 | |
443 proc tl_load_casefolding_txt {zName} { | |
444 global tl_lookup_table | |
445 | |
446 set fd [open $zName] | |
447 while { ![eof $fd] } { | |
448 set line [gets $fd] | |
449 if {[string range $line 0 0] == "#"} continue | |
450 if {$line == ""} continue | |
451 | |
452 foreach x {a b c d} {unset -nocomplain $x} | |
453 foreach {a b c d} [split $line ";"] {} | |
454 | |
455 set a2 [list] | |
456 set c2 [list] | |
457 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } | |
458 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } | |
459 set b [string trim $b] | |
460 set d [string trim $d] | |
461 | |
462 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } | |
463 } | |
464 } | |
465 | |
466 proc tl_create_records {} { | |
467 global tl_lookup_table | |
468 | |
469 set iFirst "" | |
470 set nOff 0 | |
471 set nRange 0 | |
472 set nIncr 0 | |
473 | |
474 set lRecord [list] | |
475 foreach code [lsort -integer [array names tl_lookup_table]] { | |
476 set mapping $tl_lookup_table($code) | |
477 if {$iFirst == ""} { | |
478 set iFirst $code | |
479 set nOff [expr $mapping - $code] | |
480 set nRange 1 | |
481 set nIncr 1 | |
482 } else { | |
483 set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))] | |
484 if { $nRange==1 && ($diff==1 || $diff==2) } { | |
485 set nIncr $diff | |
486 } | |
487 | |
488 if {$diff != $nIncr || ($mapping - $code)!=$nOff} { | |
489 if { $nRange==1 } {set nIncr 1} | |
490 lappend lRecord [list $iFirst $nIncr $nRange $nOff] | |
491 set iFirst $code | |
492 set nOff [expr $mapping - $code] | |
493 set nRange 1 | |
494 set nIncr 1 | |
495 } else { | |
496 incr nRange | |
497 } | |
498 } | |
499 } | |
500 | |
501 lappend lRecord [list $iFirst $nIncr $nRange $nOff] | |
502 | |
503 set lRecord | |
504 } | |
505 | |
506 proc tl_print_table_header {} { | |
507 puts -nonewline " " | |
508 puts [string trim { | |
509 /* Each entry in the following array defines a rule for folding a range | |
510 ** of codepoints to lower case. The rule applies to a range of nRange | |
511 ** codepoints starting at codepoint iCode. | |
512 ** | |
513 ** If the least significant bit in flags is clear, then the rule applies | |
514 ** to all nRange codepoints (i.e. all nRange codepoints are upper case and | |
515 ** need to be folded). Or, if it is set, then the rule only applies to | |
516 ** every second codepoint in the range, starting with codepoint C. | |
517 ** | |
518 ** The 7 most significant bits in flags are an index into the aiOff[] | |
519 ** array. If a specific codepoint C does require folding, then its lower | |
520 ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF). | |
521 ** | |
522 ** The contents of this array are generated by parsing the CaseFolding.txt | |
523 ** file distributed as part of the "Unicode Character Database". See | |
524 ** http://www.unicode.org for details. | |
525 */ | |
526 }] | |
527 puts " static const struct TableEntry \{" | |
528 puts " unsigned short iCode;" | |
529 puts " unsigned char flags;" | |
530 puts " unsigned char nRange;" | |
531 puts " \} aEntry\[\] = \{" | |
532 } | |
533 | |
534 proc tl_print_table_entry {togglevar entry liOff} { | |
535 upvar $togglevar t | |
536 foreach {iFirst nIncr nRange nOff} $entry {} | |
537 | |
538 if {$iFirst > (1<<16)} { return 1 } | |
539 | |
540 if {[info exists t]==0} {set t 0} | |
541 if {$t==0} { puts -nonewline " " } | |
542 | |
543 set flags 0 | |
544 if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]} | |
545 if {$nOff<0} { incr nOff [expr (1<<16)] } | |
546 | |
547 set idx [lsearch $liOff $nOff] | |
548 if {$idx<0} {error "malfunction generating aiOff"} | |
549 set flags [expr $flags + $idx*2] | |
550 | |
551 set txt "{$iFirst, $flags, $nRange}," | |
552 if {$t==2} { | |
553 puts $txt | |
554 } else { | |
555 puts -nonewline [format "% -23s" $txt] | |
556 } | |
557 set t [expr ($t+1)%3] | |
558 | |
559 return 0 | |
560 } | |
561 | |
562 proc tl_print_table_footer {togglevar} { | |
563 upvar $togglevar t | |
564 if {$t!=0} {puts ""} | |
565 puts " \};" | |
566 } | |
567 | |
568 proc tl_print_if_entry {entry} { | |
569 foreach {iFirst nIncr nRange nOff} $entry {} | |
570 if {$nIncr==2} {error "tl_print_if_entry needs improvement!"} | |
571 | |
572 puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{" | |
573 puts " ret = c + $nOff;" | |
574 puts " \}" | |
575 } | |
576 | |
577 proc tl_generate_ioff_table {lRecord} { | |
578 foreach entry $lRecord { | |
579 foreach {iFirst nIncr nRange iOff} $entry {} | |
580 if {$iOff<0} { incr iOff [expr (1<<16)] } | |
581 if {[info exists a($iOff)]} continue | |
582 set a($iOff) 1 | |
583 } | |
584 | |
585 set liOff [lsort -integer [array names a]] | |
586 if {[llength $liOff]>128} { error "Too many distinct ioffs" } | |
587 return $liOff | |
588 } | |
589 | |
590 proc tl_print_ioff_table {liOff} { | |
591 puts -nonewline " static const unsigned short aiOff\[\] = \{" | |
592 set i 0 | |
593 foreach off $liOff { | |
594 if {($i % 8)==0} {puts "" ; puts -nonewline " "} | |
595 puts -nonewline [format "% -7s" "$off,"] | |
596 incr i | |
597 } | |
598 puts "" | |
599 puts " \};" | |
600 | |
601 } | |
602 | |
603 proc print_fold {zFunc} { | |
604 | |
605 set lRecord [tl_create_records] | |
606 | |
607 set lHigh [list] | |
608 puts "/*" | |
609 puts "** Interpret the argument as a unicode codepoint. If the codepoint" | |
610 puts "** is an upper case character that has a lower case equivalent," | |
611 puts "** return the codepoint corresponding to the lower case version." | |
612 puts "** Otherwise, return a copy of the argument." | |
613 puts "**" | |
614 puts "** The results are undefined if the value passed to this function" | |
615 puts "** is less than zero." | |
616 puts "*/" | |
617 puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{" | |
618 | |
619 set liOff [tl_generate_ioff_table $lRecord] | |
620 tl_print_table_header | |
621 foreach entry $lRecord { | |
622 if {[tl_print_table_entry toggle $entry $liOff]} { | |
623 lappend lHigh $entry | |
624 } | |
625 } | |
626 tl_print_table_footer toggle | |
627 tl_print_ioff_table $liOff | |
628 | |
629 puts { | |
630 int ret = c; | |
631 | |
632 assert( c>=0 ); | |
633 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 ); | |
634 | |
635 if( c<128 ){ | |
636 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A'); | |
637 }else if( c<65536 ){ | |
638 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; | |
639 int iLo = 0; | |
640 int iRes = -1; | |
641 | |
642 while( iHi>=iLo ){ | |
643 int iTest = (iHi + iLo) / 2; | |
644 int cmp = (c - aEntry[iTest].iCode); | |
645 if( cmp>=0 ){ | |
646 iRes = iTest; | |
647 iLo = iTest+1; | |
648 }else{ | |
649 iHi = iTest-1; | |
650 } | |
651 } | |
652 assert( iRes<0 || c>=aEntry[iRes].iCode ); | |
653 | |
654 if( iRes>=0 ){ | |
655 const struct TableEntry *p = &aEntry[iRes]; | |
656 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){ | |
657 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF; | |
658 assert( ret>0 ); | |
659 } | |
660 } | |
661 | |
662 if( bRemoveDiacritic ) ret = remove_diacritic(ret); | |
663 } | |
664 } | |
665 | |
666 foreach entry $lHigh { | |
667 tl_print_if_entry $entry | |
668 } | |
669 | |
670 puts "" | |
671 puts " return ret;" | |
672 puts "\}" | |
673 } | |
674 | |
675 proc print_fold_test {zFunc mappings} { | |
676 global tl_lookup_table | |
677 | |
678 foreach m $mappings { | |
679 set c [lindex $m 1] | |
680 if {$c == ""} { | |
681 set extra([lindex $m 0]) 0 | |
682 } else { | |
683 scan $c %c i | |
684 set extra([lindex $m 0]) $i | |
685 } | |
686 } | |
687 | |
688 puts "static int fold_test(int *piCode)\{" | |
689 puts -nonewline " static int aLookup\[\] = \{" | |
690 for {set i 0} {$i < 70000} {incr i} { | |
691 | |
692 set expected $i | |
693 catch { set expected $tl_lookup_table($i) } | |
694 set expected2 $expected | |
695 catch { set expected2 $extra($expected2) } | |
696 | |
697 if {($i % 4)==0} { puts "" ; puts -nonewline " " } | |
698 puts -nonewline "$expected, $expected2, " | |
699 } | |
700 puts " \};" | |
701 puts " int i;" | |
702 puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{" | |
703 puts " int iCode = (i/2);" | |
704 puts " int bFlag = i & 0x0001;" | |
705 puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{" | |
706 puts " *piCode = iCode;" | |
707 puts " return 1;" | |
708 puts " \}" | |
709 puts " \}" | |
710 puts " return 0;" | |
711 puts "\}" | |
712 } | |
713 | |
714 | |
715 proc print_fileheader {} { | |
716 puts [string trim { | |
717 /* | |
718 ** 2012 May 25 | |
719 ** | |
720 ** The author disclaims copyright to this source code. In place of | |
721 ** a legal notice, here is a blessing: | |
722 ** | |
723 ** May you do good and not evil. | |
724 ** May you find forgiveness for yourself and forgive others. | |
725 ** May you share freely, never taking more than you give. | |
726 ** | |
727 ****************************************************************************** | |
728 */ | |
729 | |
730 /* | |
731 ** DO NOT EDIT THIS MACHINE GENERATED FILE. | |
732 */ | |
733 }] | |
734 puts "" | |
735 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE" | |
736 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" | |
737 puts "" | |
738 puts "#include <assert.h>" | |
739 puts "" | |
740 } | |
741 | |
742 proc print_test_main {} { | |
743 puts "" | |
744 puts "#include <stdio.h>" | |
745 puts "" | |
746 puts "int main(int argc, char **argv)\{" | |
747 puts " int r1, r2;" | |
748 puts " int code;" | |
749 puts " r1 = isalnum_test(&code);" | |
750 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);" | |
751 puts " else printf(\"isalnum(): test passed\\n\");" | |
752 puts " r2 = fold_test(&code);" | |
753 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);" | |
754 puts " else printf(\"fold(): test passed\\n\");" | |
755 puts " return (r1 || r2);" | |
756 puts "\}" | |
757 } | |
758 | |
759 # Proces the command line arguments. Exit early if they are not to | |
760 # our liking. | |
761 # | |
762 proc usage {} { | |
763 puts -nonewline stderr "Usage: $::argv0 ?-test? " | |
764 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>" | |
765 exit 1 | |
766 } | |
767 if {[llength $argv]!=2 && [llength $argv]!=3} usage | |
768 if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage | |
769 set unicodedata.txt [lindex $argv end] | |
770 set casefolding.txt [lindex $argv end-1] | |
771 set generate_test_code [expr {[llength $argv]==3}] | |
772 | |
773 print_fileheader | |
774 | |
775 # Print the isalnum() function to stdout. | |
776 # | |
777 set lRange [an_load_separator_ranges] | |
778 print_isalnum sqlite3FtsUnicodeIsalnum $lRange | |
779 | |
780 # Leave a gap between the two generated C functions. | |
781 # | |
782 puts "" | |
783 puts "" | |
784 | |
785 # Load the fold data. This is used by the [rd_XXX] commands | |
786 # as well as [print_fold]. | |
787 tl_load_casefolding_txt ${casefolding.txt} | |
788 | |
789 set mappings [rd_load_unicodedata_text ${unicodedata.txt}] | |
790 print_rd $mappings | |
791 puts "" | |
792 puts "" | |
793 print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings | |
794 puts "" | |
795 puts "" | |
796 | |
797 # Print the fold() function to stdout. | |
798 # | |
799 print_fold sqlite3FtsUnicodeFold | |
800 | |
801 # Print the test routines and main() function to stdout, if -test | |
802 # was specified. | |
803 # | |
804 if {$::generate_test_code} { | |
805 print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange | |
806 print_fold_test sqlite3FtsUnicodeFold $mappings | |
807 print_test_main | |
808 } | |
809 | |
810 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" | |
811 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */" | |
OLD | NEW |