OLD | NEW |
| (Empty) |
1 | |
2 #-------------------------------------------------------------------------- | |
3 # Parameter $zName must be a path to the file UnicodeData.txt. This command | |
4 # reads the file and returns a list of mappings required to remove all | |
5 # diacritical marks from a unicode string. Each mapping is itself a list | |
6 # consisting of two elements - the unicode codepoint and the single ASCII | |
7 # character that it should be replaced with, or an empty string if the | |
8 # codepoint should simply be removed from the input. Examples: | |
9 # | |
10 # { 224 a } (replace codepoint 224 to "a") | |
11 # { 769 "" } (remove codepoint 769 from input) | |
12 # | |
13 # Mappings are only returned for non-upper case codepoints. It is assumed | |
14 # that the input has already been folded to lower case. | |
15 # | |
16 proc rd_load_unicodedata_text {zName} { | |
17 global tl_lookup_table | |
18 | |
19 set fd [open $zName] | |
20 set lField { | |
21 code | |
22 character_name | |
23 general_category | |
24 canonical_combining_classes | |
25 bidirectional_category | |
26 character_decomposition_mapping | |
27 decimal_digit_value | |
28 digit_value | |
29 numeric_value | |
30 mirrored | |
31 unicode_1_name | |
32 iso10646_comment_field | |
33 uppercase_mapping | |
34 lowercase_mapping | |
35 titlecase_mapping | |
36 } | |
37 set lRet [list] | |
38 | |
39 while { ![eof $fd] } { | |
40 set line [gets $fd] | |
41 if {$line == ""} continue | |
42 | |
43 set fields [split $line ";"] | |
44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } | |
45 foreach $lField $fields {} | |
46 if { [llength $character_decomposition_mapping]!=2 | |
47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 | |
48 } { | |
49 continue | |
50 } | |
51 | |
52 set iCode [expr "0x$code"] | |
53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] | |
54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] | |
55 | |
56 if {[info exists tl_lookup_table($iCode)]} continue | |
57 | |
58 if { ($iAscii >= 97 && $iAscii <= 122) | |
59 || ($iAscii >= 65 && $iAscii <= 90) | |
60 } { | |
61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]] | |
62 set dia($iDia) 1 | |
63 } | |
64 } | |
65 | |
66 foreach d [array names dia] { | |
67 lappend lRet [list $d ""] | |
68 } | |
69 set lRet [lsort -integer -index 0 $lRet] | |
70 | |
71 close $fd | |
72 set lRet | |
73 } | |
74 | |
75 #------------------------------------------------------------------------- | |
76 # Parameter $zName must be a path to the file UnicodeData.txt. This command | |
77 # reads the file and returns a list of codepoints (integers). The list | |
78 # contains all codepoints in the UnicodeData.txt assigned to any "General | |
79 # Category" that is not a "Letter" or "Number". | |
80 # | |
81 proc an_load_unicodedata_text {zName} { | |
82 set fd [open $zName] | |
83 set lField { | |
84 code | |
85 character_name | |
86 general_category | |
87 canonical_combining_classes | |
88 bidirectional_category | |
89 character_decomposition_mapping | |
90 decimal_digit_value | |
91 digit_value | |
92 numeric_value | |
93 mirrored | |
94 unicode_1_name | |
95 iso10646_comment_field | |
96 uppercase_mapping | |
97 lowercase_mapping | |
98 titlecase_mapping | |
99 } | |
100 set lRet [list] | |
101 | |
102 while { ![eof $fd] } { | |
103 set line [gets $fd] | |
104 if {$line == ""} continue | |
105 | |
106 set fields [split $line ";"] | |
107 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } | |
108 foreach $lField $fields {} | |
109 | |
110 set iCode [expr "0x$code"] | |
111 set bAlnum [expr { | |
112 [lsearch {L N} [string range $general_category 0 0]] >= 0 | |
113 || $general_category=="Co" | |
114 }] | |
115 | |
116 if { !$bAlnum } { lappend lRet $iCode } | |
117 } | |
118 | |
119 close $fd | |
120 set lRet | |
121 } | |
122 | |
123 proc tl_load_casefolding_txt {zName} { | |
124 global tl_lookup_table | |
125 | |
126 set fd [open $zName] | |
127 while { ![eof $fd] } { | |
128 set line [gets $fd] | |
129 if {[string range $line 0 0] == "#"} continue | |
130 if {$line == ""} continue | |
131 | |
132 foreach x {a b c d} {unset -nocomplain $x} | |
133 foreach {a b c d} [split $line ";"] {} | |
134 | |
135 set a2 [list] | |
136 set c2 [list] | |
137 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } | |
138 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } | |
139 set b [string trim $b] | |
140 set d [string trim $d] | |
141 | |
142 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } | |
143 } | |
144 } | |
145 | |
146 | |
OLD | NEW |