OLD | NEW |
1 #!/bin/sh | 1 #!/bin/sh |
2 # Copyright (c) 2014 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2014 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 # References: | 6 # References: |
7 # http://encoding.spec.whatwg.org/#euc-jp | 7 # http://encoding.spec.whatwg.org/#euc-jp |
8 # http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932 | 8 # http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932 |
9 # http://www.iana.org/assignments/charset-reg/CP51932 | 9 # http://www.iana.org/assignments/charset-reg/CP51932 |
10 # Table 3-64 in CJKV Information Processing 2/e. | 10 # Table 3-64 in CJKV Information Processing 2/e. |
11 | 11 |
12 # Download the following two files, run it in source/data/mappings directory | 12 # Download the following two files, run it in source/data/mappings directory |
13 # and save the result to euc-jp-html5.ucm | 13 # and save the result to euc-jp-html5.ucm |
14 # http://encoding.spec.whatwg.org/index-jis0208.txt | 14 # http://encoding.spec.whatwg.org/index-jis0208.txt |
15 # http://encoding.spec.whatwg.org/index-jis0212.txt | 15 # http://encoding.spec.whatwg.org/index-jis0212.txt |
16 | 16 |
17 function preamble { | 17 function preamble { |
18 cat <<PREAMBLE | 18 cat <<PREAMBLE |
19 # *************************************************************************** | 19 # *************************************************************************** |
20 # * | 20 # * |
21 # * Copyright (C) 1995-2014, International Business Machines | 21 # * Copyright (C) 1995-2014, International Business Machines |
22 # * Corporation and others. All Rights Reserved. | 22 # * Corporation and others. All Rights Reserved. |
23 # * | 23 # * |
24 # * Generated per the algorithm for EUC-JP | 24 # * Generated per the algorithm for EUC-JP |
25 # * described at http://encoding.spec.whatwg.org/#euc-jp. | 25 # * described at http://encoding.spec.whatwg.org/#euc-jp. |
26 # * Added the 34 decoding only (EUC-JP to Unicode) entries from euc-jp-2007.uc
m | |
27 # * for the backward compatibility. | |
28 # * | 26 # * |
29 # *************************************************************************** | 27 # *************************************************************************** |
30 <code_set_name> "euc-jp-html5" | 28 <code_set_name> "euc-jp-html5" |
31 <char_name_mask> "AXXXX" | 29 <char_name_mask> "AXXXX" |
32 <mb_cur_max> 3 | 30 <mb_cur_max> 3 |
33 <mb_cur_min> 1 | 31 <mb_cur_min> 1 |
34 <uconv_class> "MBCS" | 32 <uconv_class> "MBCS" |
35 <subchar> \xF4\xFE | 33 <subchar> \xF4\xFE |
36 <subchar1> \x1A | 34 <subchar1> \x1A |
37 <icu:charsetFamily> "ASCII" | 35 <icu:charsetFamily> "ASCII" |
(...skipping 10 matching lines...) Expand all Loading... |
48 | 46 |
49 #<U0000> \x00 |0 | 47 #<U0000> \x00 |0 |
# Emit the 128 round-trip ('|0') mappings for US-ASCII, one per line:
#   <U0000> \x00 |0  ...  <U007F> \x7F |0
# Outputs: writes the mapping lines to stdout.
# Written with POSIX constructs only (no 'function' keyword, no seq) because
# the script's shebang is #!/bin/sh.
ascii() {
  i=0
  while [ "$i" -le 127 ]; do
    # The code point and the byte value are identical for ASCII.
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}
56 | 54 |
57 | 55 |
# Emit the round-trip ('|0') mappings for the two-byte sequences
# 0x8E 0x[A1-DF], which map to U+FF61 .. U+FF9F, one per line:
#   <UFF61> \x8E\xA1 |0  ...  <UFF9F> \x8E\xDF |0
# Outputs: writes the 63 mapping lines to stdout.
# Uses POSIX arithmetic expansion instead of 'seq 0xA1 0xDF', which depends
# on a non-POSIX tool accepting hexadecimal arguments.
half_width_kana() {
  trail=$((0xA1))
  last=$((0xDF))
  while [ "$trail" -le "$last" ]; do
    # Trail byte 0xA1 corresponds to U+FF61; the offset is constant.
    printf '<U%04X> \\x8E\\x%02X |0\n' "$((trail + 0xFF61 - 0xA1))" "$trail"
    trail=$((trail + 1))
  done
}
65 | 64 |
66 | 65 |
67 # index-jis0208.txt has index pointers larger than the size of | 66 # index-jis0208.txt has index pointers larger than the size of |
68 # the encoding space available in 2-byte Graphic plane of ISO-2022-based | 67 # the encoding space available in 2-byte Graphic plane of ISO-2022-based |
69 # encoding (94 x 94 = 8836). We have to exclude them because they're for | 68 # encoding (94 x 94 = 8836). We have to exclude them because they're for |
70 # Shift-JIS. | 69 # Shift-JIS. |
71 # In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries. | 70 # In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries. |
72 # All the bi-directional mapping entries come *before* the uni-directional | 71 # All the bi-directional mapping entries come *before* the uni-directional |
(...skipping 14 matching lines...) Expand all Loading... |
87 | 86 |
# JIS X 0212 is for decoding only (use '|3' to denote that).
#
# Convert each data line of the index file (pointer in field 1, "0xHHHH"
# code point in field 2) into a three-byte mapping line:
#   <UHHHH> \x8F\x<lead>\x<trail> |3
# where lead = pointer / 94 + 0xA1 and trail = pointer % 94 + 0xA1.
# Lines starting with '#' and empty lines are skipped.
# Arguments: $1 - index file to read (optional; default index-jis0212.txt
#                 in the current directory)
# Outputs:   writes the mapping lines to stdout.
jis212() {
  # 161 is 0xA1. It is spelled in decimal because hexadecimal constants in
  # awk program text are a gawk extension, and int() makes the truncation of
  # the quotient explicit instead of leaving it to the %X conversion.
  awk '
    !/^#/ && !/^$/ {
      lead = int($1 / 94) + 161
      trail = $1 % 94 + 161
      printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3), lead, trail)
    }
  ' "${1:-index-jis0212.txt}"
}
96 | 95 |
97 # Add the uni-directional mapping entries (EUC-JP to Unicode) that | |
98 # are only present in euc-jp-2007.ucm. There are 34 of them. They're added | |
99 # for the backward compatibility with the old behavior of Chrome. | |
100 # See https://www.w3.org/Bugs/Public/show_bug.cgi?id=25266 | |
101 # Here are the break-downs: | |
102 # 1. 0x8E0xE0 to 0x8E0xE2 | |
103 # 00A2 00A3 00AC | |
104 # 2. JIS X 0212 extra (0x8F 0xF3 0xhh) | |
105 # 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 | |
106 # 2172 2173 2174 2175 2176 2177 2178 2179 221A 2220 2229 222A 222B 2235 2252 | |
107 # 2261 22A5 3231 | |
108 # 3. JIS X 0208 extra : 0xFC 0xFB => FFE2 | |
109 | |
110 function decode_only_extra { | |
111 decode_only_list=$( | |
112 for i in $(grep '|3' euc-jp-2007.ucm | sed 's/^<U\(....\)>.*$/\1/') | |
113 do | |
114 grep 0x${i} index-jis0212.txt > /dev/null || echo $i | |
115 done) | |
116 | |
117 for u in $decode_only_list | |
118 do | |
119 grep $u euc-jp-2007.ucm | grep '|3' | |
120 done | |
121 } | |
122 | |
# Emit every mapping line of the table, in no particular order; the caller
# is expected to sort and de-duplicate the result.
# Outputs: the concatenated output of ascii, half_width_kana, jis208 and
#          jis212, followed by two '|1' entries for U+00A5 and U+203E.
# decode_only_extra is intentionally not called: that helper no longer
# exists in this version of the script.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  jis212
  # '|1' marks an encode-only fallback in the ICU .ucm format: U+00A5 and
  # U+203E are written as 0x5C and 0x7E but never produced when decoding.
  # printf '%s\n' keeps the backslashes literal regardless of which shell's
  # echo would otherwise interpret them.
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
}
131 | 105 |
# Assemble the .ucm file on stdout: header, then the sorted and
# de-duplicated mapping lines, then the CHARMAP terminator.
preamble
# LC_ALL=C forces byte-wise comparison so the generated table has the same
# line order no matter which locale the script is run under.
unsorted_table | LC_ALL=C sort -u
printf '%s\n' 'END CHARMAP'
OLD | NEW |