OLD | NEW |
1 #!/bin/sh | 1 #!/bin/sh |
2 # Copyright 2015 The Chromium Authors. All rights reserved. | 2 # Copyright 2015 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 # References: | 6 # References: |
7 # https://encoding.spec.whatwg.org/#big5 | 7 # https://encoding.spec.whatwg.org/#big5 |
8 | 8 |
9 # This script downloads the following file. | 9 # This script downloads the following file. |
10 # https://encoding.spec.whatwg.org/index-big5.txt | 10 # https://encoding.spec.whatwg.org/index-big5.txt |
(...skipping 11 matching lines...) Expand all Loading... |
22 # *************************************************************************** | 22 # *************************************************************************** |
23 <code_set_name> "big5-html" | 23 <code_set_name> "big5-html" |
24 <char_name_mask> "AXXXX" | 24 <char_name_mask> "AXXXX" |
25 <mb_cur_max> 2 | 25 <mb_cur_max> 2 |
26 <mb_cur_min> 1 | 26 <mb_cur_min> 1 |
27 <uconv_class> "MBCS" | 27 <uconv_class> "MBCS" |
28 <subchar> \x3F | 28 <subchar> \x3F |
29 <icu:charsetFamily> "ASCII" | 29 <icu:charsetFamily> "ASCII" |
30 | 30 |
31 # 'p' is for the range that may produce non-BMP code points. | 31 # 'p' is for the range that may produce non-BMP code points. |
| 32 # 'i' is to make the code range illegal. |
| 33 # Big5 has a lot of small holes in the 2nd byte. If the 2nd byte of such a |
| 34 # hole is in the ASCII range, it has to be added back to the stream to comply |
| 35 # with the encoding spec. Each state adds 1kB to the data size. |
32 # See http://userguide.icu-project.org/conversion/data. | 36 # See http://userguide.icu-project.org/conversion/data. |
33 <icu:state> 0-7f, 87-fe:1, 87-a0:2, c8:2, fa-fe:2 | 37 <icu:state> 0-7f, a1-fe:1, 87-a0:2, c8:2, fa-fe:2, 87:3, 89:4,
8a:5, 8b:6, 8d:7, 9b:8, 9f:9, a0:a |
34 <icu:state> 40-7e, a1-fe | 38 <icu:state> 40-7e, a1-fe |
35 <icu:state> 40-7e.p, a1-fe.p | 39 <icu:state> 40-7e.p, a1-fe.p |
| 40 <icu:state> 40-7e.p, a1-fe.p, 66.i |
| 41 <icu:state> 40-7e.p, a1-fe.p, 42.i, 44.i, 45.i, 4a-4b.i |
| 42 <icu:state> 40-7e.p, a1-fe.p, 42.i, 63.i, 75.i |
| 43 <icu:state> 40-7e.p, a1-fe.p, 54.i |
| 44 <icu:state> 40-7e.p, a1-fe.p, 41.i |
| 45 <icu:state> 40-7e.p, a1-fe.p, 61.i |
| 46 <icu:state> 40-7e.p, a1-fe.p, 4e.i |
| 47 <icu:state> 40-7e.p, a1-fe.p, 54.i, 57.i, 5a.i, 62.i, 72.i |
36 | 48 |
37 CHARMAP | 49 CHARMAP |
38 PREAMBLE | 50 PREAMBLE |
39 } | 51 } |
40 | 52 |
41 function ascii { | 53 function ascii { |
42 for i in $(seq 0 127) | 54 for i in $(seq 0 127) |
43 do | 55 do |
44 printf '<U%04X> \\x%02X |0\n' $i $i | 56 printf '<U%04X> \\x%02X |0\n' $i $i |
45 done | 57 done |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
80 function unsorted_table { | 92 function unsorted_table { |
81 two_char_seq | 93 two_char_seq |
82 big5 | 94 big5 |
83 } | 95 } |
84 | 96 |
85 wget -N -r -nd https://encoding.spec.whatwg.org/index-big5.txt | 97 wget -N -r -nd https://encoding.spec.whatwg.org/index-big5.txt |
86 preamble | 98 preamble |
87 ascii | 99 ascii |
88 unsorted_table | sort -k4 | uniq | cut -f 1-3 -d ' ' | 100 unsorted_table | sort -k4 | uniq | cut -f 1-3 -d ' ' |
89 echo 'END CHARMAP' | 101 echo 'END CHARMAP' |
OLD | NEW |