OLD | NEW |
(Empty) | |
| 1 #!/bin/sh |
| 2 # Copyright 2015 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 # References: |
| 7 # https://encoding.spec.whatwg.org/#big5 |
| 8 |
| 9 # This script downloads the following file. |
| 10 # https://encoding.spec.whatwg.org/index-big5.txt |
| 11 |
# Print the fixed header of the generated mapping file: the ICU copyright
# banner, the converter metadata (<code_set_name>, <subchar>, ...), the
# <icu:state> table and the opening CHARMAP line.
#
# Takes no arguments and writes to stdout.
#
# Defined with the POSIX "name() { ... }" form because the script runs under
# /bin/sh, where the "function" keyword is not guaranteed to exist. The heredoc
# delimiter is quoted so the template is emitted verbatim.
preamble() {
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for Big5
# * described at http://encoding.spec.whatwg.org/#big5
# *
# ***************************************************************************
<code_set_name> "big5-html"
<char_name_mask> "AXXXX"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \x3F
<icu:charsetFamily> "ASCII"

# 'p' is for the range that may produce non-BMP code points.
# See http://userguide.icu-project.org/conversion/data.
<icu:state> 0-7f, 87-fe:1, 87-a0:2, c8:2, fa-fe:2
<icu:state> 40-7e, a1-fe
<icu:state> 40-7e.p, a1-fe.p

CHARMAP
PREAMBLE
}
| 40 |
# Print the 128 single-byte mappings U+0000..U+007F, one per line, in the form
# "<U00hh> \xhh |0".
#
# Takes no arguments and writes to stdout.
#
# Uses a POSIX function definition and a shell arithmetic loop so that it runs
# under any /bin/sh without relying on the "function" keyword or on seq(1).
ascii() {
  i=0
  while [ "$i" -le 127 ]
  do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}
| 47 |
| 48 |
# Emit one mapping line per entry of index-big5.txt (read from the current
# directory). Each line has the form
#
#   <Uxxxx> \xLL\xTT |tag sortkey
#
# where tag 0 marks a roundtrip mapping, tag 3 marks a decoding-only mapping,
# and sortkey is the code point left-padded to 5 hex digits. The caller sorts
# on that last field and strips it afterwards.
#
# An entry is decoding-only when any of the following holds:
#   - its lead byte is below 0xA1: HKSCS characters are not supported in
#     encoding;
#   - a roundtrip entry for the same Unicode character was already emitted;
#   - its pointer is 5287 or 5289: these have to be decoding-only even though
#     they come before the other entry with the same Unicode character.
#     See https://www.w3.org/Bugs/Public/show_bug.cgi?id=27878
big5() {
  awk '
    !/^#/ && !/^$/ {
      pointer = $1
      ucs = substr($2, 3)   # drop the leading "0x"
      sortkey = (length(ucs) < 5) ? ("0" ucs) : ucs

      # Decimal constants are used because hexadecimal literals in awk
      # program text are not supported by every awk implementation:
      # 129 = 0x81, 161 = 0xA1, 63 = 0x3F, 64 = 0x40, 98 = 0x62.
      lead = int(pointer / 157) + 129
      trail = pointer % 157
      trail_offset = (trail < 63) ? 64 : 98

      is_decoding_only = (lead < 161 || seen_before[ucs] ||
                          pointer == 5287 || pointer == 5289)
      tag = is_decoding_only ? 3 : 0

      printf("<U%4s> \\x%02X\\x%02X |%d %s\n",
             ucs, lead, trail + trail_offset, tag, sortkey)

      # Once a roundtrip mapping exists for a character, every later entry
      # for it has to stay decoding-only, so the flag is only ever set and
      # never cleared by a decoding-only duplicate.
      if (!is_decoding_only)
        seen_before[ucs] = 1
    }
  ' index-big5.txt
}
| 70 |
# Print the four decoding-only (|3) entries whose Big5 byte pair maps to a
# sequence of two code points (U+00CA / U+00EA followed by U+0304 / U+030C).
# The trailing field is the sort key consumed by the caller.
#
# Takes no arguments and writes to stdout.
#
# Defined with the POSIX "name() { ... }" form because the script runs under
# /bin/sh; the heredoc delimiter is quoted so the table is emitted verbatim.
two_char_seq() {
  cat <<'EOF'
<U00CA><U0304> \x88\x62 |3 000CA
<U00CA><U030C> \x88\x64 |3 000CA
<U00EA><U0304> \x88\xA3 |3 000EA
<U00EA><U030C> \x88\xA5 |3 000EA
EOF
}
| 79 |
# Print every multi-byte mapping line, not yet sorted: first the two-code-point
# sequences, then the entries derived from index-big5.txt.
#
# Takes no arguments and writes to stdout.
#
# Defined with the POSIX "name() { ... }" form because the script runs under
# /bin/sh, where the "function" keyword is not guaranteed to exist.
unsorted_table() {
  two_char_seq
  big5
}
| 84 |
# Fetch the index file. A failed download is fatal only when no usable local
# copy exists, so the CHARMAP is never generated from a missing or empty index.
if ! wget -N -r -nd https://encoding.spec.whatwg.org/index-big5.txt
then
  if [ -s index-big5.txt ]
  then
    echo 'warning: download failed; using existing index-big5.txt' >&2
  else
    echo 'error: could not download index-big5.txt' >&2
    exit 1
  fi
fi

preamble
ascii
# Order the lines by the trailing sort key, drop exact duplicate lines, then
# strip the key. LC_ALL=C keeps the ordering independent of the caller locale.
unsorted_table | LC_ALL=C sort -k4 | uniq | cut -f 1-3 -d ' '
echo 'END CHARMAP'
OLD | NEW |