| OLD | NEW |
| 1 #!/bin/sh | 1 #!/bin/sh |
| 2 # Copyright 2015 The Chromium Authors. All rights reserved. | 2 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
| 5 | 5 |
| 6 # References: | 6 # References: |
| 7 # https://encoding.spec.whatwg.org/#big5 | 7 # https://encoding.spec.whatwg.org/#big5 |
| 8 | 8 |
| 9 # This script downloads the following file. | 9 # This script downloads the following file. |
| 10 # https://encoding.spec.whatwg.org/index-big5.txt | 10 # https://encoding.spec.whatwg.org/index-big5.txt |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 52 | 52 |
# Prints one charmap line for each code point U+0000..U+007F, mapping it to
# the single byte of the same value with the |0 tag, e.g. "<U0041> \x41 |0".
# Written with POSIX name() syntax and a plain counter loop so that it also
# runs under a strict /bin/sh, where the `function` keyword and seq(1) may
# not be available.
ascii() {
  i=0
  while [ "$i" -le 127 ]; do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}
| 59 | 59 |
| 60 | 60 |
# Converts index-big5.txt (read from the current directory) into charmap
# lines of the form "<Uxxxx> \xLL\xTT |tag sortkey". The trailing sortkey is
# the code point, prefixed with "0" when it has fewer than five hex digits.
# Comment lines (starting with #) and empty lines of the index are skipped.
#
# An entry is tagged |3 (decoding-only) instead of |0 when any of these hold:
#  - HKSCS characters are not supported in encoding ( lead < 0xA1 ).
#  - An earlier entry for the same Unicode character was already tagged |0.
#  - Its pointer is 528[79] or 5247 ~ 5250. These have to be decoding-only
#    even though they come before the other entry with the same Unicode
#    character. The corresponding Unicode characters are U+255[0E],
#    U+256[1A], and U+534[15].
#    See https://www.w3.org/Bugs/Public/show_bug.cgi?id=27878
big5() {
  awk '
    !/^#/ && !/^$/ {
      pointer = $1 + 0
      ucs = substr($2, 3)    # drop the leading "0x"
      sortkey = (length(ucs) < 5) ? ("0" ucs) : ucs

      # Constants are written in decimal because hexadecimal literals in
      # awk program text are not portable across awk implementations:
      # 129 = 0x81, 161 = 0xA1, 63 = 0x3F, 64 = 0x40, 98 = 0x62.
      lead = int(pointer / 157) + 129
      trail = pointer % 157
      trail_offset = (trail < 63) ? 64 : 98

      is_decoding_only = (lead < 161) || seen_before[ucs] ||
                         pointer == 5287 || pointer == 5289 ||
                         (5247 <= pointer && pointer <= 5250)
      tag = is_decoding_only ? 3 : 0

      printf("<U%4s> \\x%02X\\x%02X |%d %s\n", ucs,
             lead, trail + trail_offset, tag, sortkey)

      # Only ever set the flag. Clearing it on a decoding-only entry would
      # let a third entry for the same character be tagged |0 again.
      if (!is_decoding_only) seen_before[ucs] = 1
    }
  ' index-big5.txt
}
| 82 | 85 |
# Prints the four fixed entries whose Unicode side is a two-code-point
# sequence (U+00CA or U+00EA followed by U+0304 or U+030C). Each line has the
# same "<...> \xLL\xTT |tag sortkey" layout as the big5 output and carries
# the |3 tag.
# The heredoc delimiter is quoted so the backslash sequences are emitted
# literally, with no shell expansion applied to the body.
two_char_seq() {
  cat <<'EOF'
<U00CA><U0304> \x88\x62 |3 000CA
<U00CA><U030C> \x88\x64 |3 000CA
<U00EA><U0304> \x88\xA3 |3 000EA
<U00EA><U030C> \x88\xA5 |3 000EA
EOF
}
| 91 | 94 |
# Writes the table body, not yet sorted, to stdout: the two_char_seq lines
# followed by the big5 lines. Uses POSIX name() syntax instead of the
# `function` keyword so it parses under a strict /bin/sh.
unsorted_table() {
  two_char_seq
  big5
}
| 96 | 99 |
# Refresh index-big5.txt. If the download fails, fall back to an existing
# local copy with a warning; with no usable copy, stop instead of emitting
# a charmap that has no Big5 entries.
if ! wget -N -r -nd https://encoding.spec.whatwg.org/index-big5.txt; then
  if [ -s index-big5.txt ]; then
    echo 'warning: download failed; using existing index-big5.txt' >&2
  else
    echo 'error: could not download index-big5.txt' >&2
    exit 1
  fi
fi

preamble
ascii
# Sort on the 4th field (the sort key each table line ends with), drop
# duplicate lines, then keep only the first three fields.
unsorted_table | sort -k4 | uniq | cut -f 1-3 -d ' '
echo 'END CHARMAP'
| OLD | NEW |