OLD | NEW |
(Empty) | |
| 1 |
| 2 #-------------------------------------------------------------------------- |
| 3 # Parameter $zName must be a path to the file UnicodeData.txt. This command |
| 4 # reads the file and returns a list of mappings required to remove all |
| 5 # diacritical marks from a unicode string. Each mapping is itself a list |
| 6 # consisting of two elements - the unicode codepoint and the single ASCII |
| 7 # character that it should be replaced with, or an empty string if the |
| 8 # codepoint should simply be removed from the input. Examples: |
| 9 # |
| 10 # { 224 a } (replace codepoint 224 to "a") |
| 11 # { 769 "" } (remove codepoint 769 from input) |
| 12 # |
| 13 # Mappings are only returned for non-upper case codepoints. It is assumed |
| 14 # that the input has already been folded to lower case. |
| 15 # |
| 16 proc rd_load_unicodedata_text {zName} { |
| 17 global tl_lookup_table |
| 18 |
| 19 set fd [open $zName] |
| 20 set lField { |
| 21 code |
| 22 character_name |
| 23 general_category |
| 24 canonical_combining_classes |
| 25 bidirectional_category |
| 26 character_decomposition_mapping |
| 27 decimal_digit_value |
| 28 digit_value |
| 29 numeric_value |
| 30 mirrored |
| 31 unicode_1_name |
| 32 iso10646_comment_field |
| 33 uppercase_mapping |
| 34 lowercase_mapping |
| 35 titlecase_mapping |
| 36 } |
| 37 set lRet [list] |
| 38 |
| 39 while { ![eof $fd] } { |
| 40 set line [gets $fd] |
| 41 if {$line == ""} continue |
| 42 |
| 43 set fields [split $line ";"] |
| 44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } |
| 45 foreach $lField $fields {} |
| 46 if { [llength $character_decomposition_mapping]!=2 |
| 47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0 |
| 48 } { |
| 49 continue |
| 50 } |
| 51 |
| 52 set iCode [expr "0x$code"] |
| 53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"] |
| 54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"] |
| 55 |
| 56 if {[info exists tl_lookup_table($iCode)]} continue |
| 57 |
| 58 if { ($iAscii >= 97 && $iAscii <= 122) |
| 59 || ($iAscii >= 65 && $iAscii <= 90) |
| 60 } { |
| 61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]] |
| 62 set dia($iDia) 1 |
| 63 } |
| 64 } |
| 65 |
| 66 foreach d [array names dia] { |
| 67 lappend lRet [list $d ""] |
| 68 } |
| 69 set lRet [lsort -integer -index 0 $lRet] |
| 70 |
| 71 close $fd |
| 72 set lRet |
| 73 } |
| 74 |
| 75 #------------------------------------------------------------------------- |
| 76 # Parameter $zName must be a path to the file UnicodeData.txt. This command |
| 77 # reads the file and returns a list of codepoints (integers). The list |
| 78 # contains all codepoints in the UnicodeData.txt assigned to any "General |
| 79 # Category" that is not a "Letter" or "Number". |
| 80 # |
| 81 proc an_load_unicodedata_text {zName} { |
| 82 set fd [open $zName] |
| 83 set lField { |
| 84 code |
| 85 character_name |
| 86 general_category |
| 87 canonical_combining_classes |
| 88 bidirectional_category |
| 89 character_decomposition_mapping |
| 90 decimal_digit_value |
| 91 digit_value |
| 92 numeric_value |
| 93 mirrored |
| 94 unicode_1_name |
| 95 iso10646_comment_field |
| 96 uppercase_mapping |
| 97 lowercase_mapping |
| 98 titlecase_mapping |
| 99 } |
| 100 set lRet [list] |
| 101 |
| 102 while { ![eof $fd] } { |
| 103 set line [gets $fd] |
| 104 if {$line == ""} continue |
| 105 |
| 106 set fields [split $line ";"] |
| 107 if {[llength $fields] != [llength $lField]} { error "parse error: $line" } |
| 108 foreach $lField $fields {} |
| 109 |
| 110 set iCode [expr "0x$code"] |
| 111 set bAlnum [expr { |
| 112 [lsearch {L N} [string range $general_category 0 0]] >= 0 |
| 113 || $general_category=="Co" |
| 114 }] |
| 115 |
| 116 if { !$bAlnum } { lappend lRet $iCode } |
| 117 } |
| 118 |
| 119 close $fd |
| 120 set lRet |
| 121 } |
| 122 |
| 123 proc tl_load_casefolding_txt {zName} { |
| 124 global tl_lookup_table |
| 125 |
| 126 set fd [open $zName] |
| 127 while { ![eof $fd] } { |
| 128 set line [gets $fd] |
| 129 if {[string range $line 0 0] == "#"} continue |
| 130 if {$line == ""} continue |
| 131 |
| 132 foreach x {a b c d} {unset -nocomplain $x} |
| 133 foreach {a b c d} [split $line ";"] {} |
| 134 |
| 135 set a2 [list] |
| 136 set c2 [list] |
| 137 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] } |
| 138 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] } |
| 139 set b [string trim $b] |
| 140 set d [string trim $d] |
| 141 |
| 142 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 } |
| 143 } |
| 144 } |
| 145 |
| 146 |
OLD | NEW |