OLD | NEW |
---|---|
(Empty) | |
1 #!/bin/bash | |
2 # Copyright 2015 The Chromium Authors. All rights reserved. | |
jsbell
2015/01/20 21:47:22
2015
No (c) per http://www.chromium.org/developers
| |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 # References: | |
7 # https://encoding.spec.whatwg.org/#big5 | |
jsbell
2015/01/20 21:47:22
nit: can use https everywhere
| |
8 | |
9 # This script reads index-big5.txt from the current directory; download it first from: | |
10 # https://encoding.spec.whatwg.org/index-big5.txt | |
11 | |
12 function preamble { # Print the fixed header of the generated table: banner, big5-html settings, ICU state rows, then the CHARMAP line. | |
13 cat <<PREAMBLE | |
14 # *************************************************************************** | |
15 # * | |
16 # * Copyright (C) 1995-2014, International Business Machines | |
17 # * Corporation and others. All Rights Reserved. | |
18 # * | |
19 # * Generated per the algorithm for Big5 | |
20 # * described at http://encoding.spec.whatwg.org/#big5 | |
21 # * | |
22 # *************************************************************************** | |
23 <code_set_name> "big5-html" | |
24 <char_name_mask> "AXXXX" | |
25 <mb_cur_max> 2 | |
26 <mb_cur_min> 1 | |
27 <uconv_class> "MBCS" | |
28 <subchar> \x3F | |
29 <icu:charsetFamily> "ASCII" | |
30 | |
31 # 'p' is for the range that may produce non-BMP code points. | |
32 # See http://userguide.icu-project.org/conversion/data. | |
33 <icu:state> 0-7f, 87-fe:1, 87-a0:2, c8:2, fa-fe:2 | |
34 <icu:state> 40-7e, a1-fe | |
35 <icu:state> 40-7e.p, a1-fe.p | |
36 | |
37 CHARMAP | |
38 PREAMBLE | |
39 } | |
40 | |
41 function ascii { # Print one "<Uxxxx> \xNN |0" row for every value from 0 through 127. | |
42 for i in $(seq 0 127) | |
43 do | |
44 printf '<U%04X> \\x%02X |0\n' $i $i | |
45 done | |
46 } | |
47 | |
48 | |
49 # HKSCS characters (lead byte < 0xA1) are marked decoding-only (tag 3), as is any code point already seen earlier in the index. | |
50 function big5 { # Turn each non-comment, non-empty line of index-big5.txt (pointer, code point) into a "<Uxxxx> \xLL\xTT |tag sortkey" row. | |
51 awk '!/^#/ && !/^$/ \ | |
52 { pointer = $1; \ | |
53 ucs = substr($2, 3); \ | |
54 sortkey = (length(ucs) < 5) ? ("0" ucs) : ucs; | |
55 lead = pointer / 157 + 0x81; \ | |
56 is_decoding_only = lead < 0xA1 || seen_before[ucs]; \ | |
57 trail = $1 % 157; \ | |
58 trail_offset = trail < 0x3F ? 0x40 : 0x62; \ | |
59 tag = (is_decoding_only ? 3 : 0); \ | |
60 printf ("<U%4s> \\x%02X\\x%02X |%d %s\n", ucs,\ | |
61 lead, trail + trail_offset, tag, sortkey);\ | |
62 seen_before[ucs] = 1; \ | |
63 }' \ | |
64 index-big5.txt | |
65 } | |
66 | |
67 function two_char_seq { # Print the four hard-coded rows that map a two-code-point sequence to a \x88 byte pair, each tagged |3 with a sort key. | |
68 cat <<EOF | |
69 <U00CA><U0304> \x88\x62 |3 000CA | |
70 <U00CA><U030C> \x88\x64 |3 000CA | |
71 <U00EA><U0304> \x88\xA3 |3 000EA | |
72 <U00EA><U030C> \x88\xA5 |3 000EA | |
73 EOF | |
74 } | |
75 | |
76 function unsorted_table { # Emit the two-code-point rows followed by the index-derived rows; every row still carries its sort key. | |
77 two_char_seq | |
78 big5 | |
79 } | |
80 | |
81 #curl -o index-big5.txt https://encoding.spec.whatwg.org/index-big5.txt | |
jsbell
2015/01/20 21:47:22
Commented out...?
(I should note that I get hands
| |
82 preamble | |
83 ascii | |
84 unsorted_table | sort -k4 | uniq | cut -f 1-3 -d ' ' # Order rows by the sort key in field 4, drop repeated lines, then keep only fields 1-3 so the key is not written out. | |
85 echo 'END CHARMAP' | |
OLD | NEW |