OLD | NEW |
1 #!/bin/sh | 1 #!/bin/sh |
2 # Copyright 2014 The Chromium Authors. All rights reserved. | 2 # Copyright 2014 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 # References: | 6 # References: |
7 # http://encoding.spec.whatwg.org/#shift_jis | 7 # https://encoding.spec.whatwg.org/#shift_jis |
8 | 8 |
9 # Download the following file, run it in source/data/mappings directory | 9 # Download the following file, run it in source/data/mappings directory |
10 # and save the result to shift_jis-html5.ucm | 10 # and save the result to shift_jis-html5.ucm |
11 # http://encoding.spec.whatwg.org/index-jis0208.txt | 11 # https://encoding.spec.whatwg.org/index-jis0208.txt |
12 | 12 |
13 function preamble { | 13 function preamble { |
14 cat <<PREAMBLE | 14 cat <<PREAMBLE |
15 # *************************************************************************** | 15 # *************************************************************************** |
16 # * | 16 # * |
17 # * Copyright (C) 1995-2014, International Business Machines | 17 # * Copyright (C) 1995-2014, International Business Machines |
18 # * Corporation and others. All Rights Reserved. | 18 # * Corporation and others. All Rights Reserved. |
19 # * | 19 # * |
20 # * Generated per the algorithm for Shift_JIS | 20 # * Generated per the algorithm for Shift_JIS |
21 # * described at http://encoding.spec.whatwg.org/#shift_jis | 21 # * described at https://encoding.spec.whatwg.org/#shift_jis |
22 # * | 22 # * |
23 # *************************************************************************** | 23 # *************************************************************************** |
24 <code_set_name> "shift_jis-html5" | 24 <code_set_name> "shift_jis-html5" |
25 <char_name_mask> "AXXXX" | 25 <char_name_mask> "AXXXX" |
26 <mb_cur_max> 2 | 26 <mb_cur_max> 2 |
27 <mb_cur_min> 1 | 27 <mb_cur_min> 1 |
28 <uconv_class> "MBCS" | 28 <uconv_class> "MBCS" |
29 <subchar> \xFC\xFC | 29 <subchar> \x3F |
30 <subchar1> \x7F | |
31 <icu:charsetFamily> "ASCII" | 30 <icu:charsetFamily> "ASCII" |
32 | 31 |
33 <icu:state> 0-80, 81-9f:1, a1-df, e0-fc:1 | 32 <icu:state> 0-80, 81-9f:1, a1-df, e0-fc:1, 82:3, 84:4, 85-86:2, 87:5, 88:2, 98:6, eb-ec:2, ef:2, f9:2, fc:7 |
| 33 |
34 <icu:state> 40-7e, 80-fc | 34 <icu:state> 40-7e, 80-fc |
| 35 <icu:state> 80-fc |
| 36 <icu:state> 4f-7e, 80-fc, 59-5f.i, 7a-7e.i |
| 37 <icu:state> 40-7e, 80-fc, 61-6f.i |
| 38 <icu:state> 40-7e, 80-fc, 76-7d.i |
| 39 <icu:state> 40-7e, 80-fc, 73-7e.i |
| 40 <icu:state> 40-4b, 80-fc |
| 41 |
35 | 42 |
36 CHARMAP | 43 CHARMAP |
37 PREAMBLE | 44 PREAMBLE |
38 } | 45 } |
39 | 46 |
40 # The encoding spec for Shift_JIS says U+0080 has to be round-tripped with | 47 # The encoding spec for Shift_JIS says U+0080 has to be round-tripped with |
41 # 0x80. So, this is one character more than ASCII up to 128 (0x80). | 48 # 0x80. So, this is one character more than ASCII up to 128 (0x80). |
42 function ascii { | 49 function ascii { |
43 for i in $(seq 0 128) | 50 for i in $(seq 0 128) |
44 do | 51 do |
45 printf '<U%04X> \\x%02X |0\n' $i $i | 52 printf '<U%04X> \\x%02X |0\n' $i $i |
46 done | 53 done |
47 } | 54 } |
48 | 55 |
49 | 56 |
50 # Map 0x[A1-DF] to U+FF61 to U+FF9F | 57 # Map 0x[A1-DF] to U+FF61 to U+FF9F |
51 function half_width_kana { | 58 function half_width_kana { |
52 for i in $(seq 0xA1 0xDF) | 59 for i in $(seq 0xA1 0xDF) |
53 do | 60 do |
54 # 65377 = 0xFF61, 161 = 0xA1 | 61 # 65377 = 0xFF61, 161 = 0xA1 |
55 printf '<U%04X> \\x%02X |0\n' $(($i + 65377 - 161)) $i | 62 printf '<U%04X> \\x%02X |0\n' $(($i + 65377 - 161)) $i |
56 done | 63 done |
57 } | 64 } |
58 | 65 |
59 | 66 |
60 # From http://encoding.spec.whatwg.org/#index-shift_jis-pointer | 67 # From https://encoding.spec.whatwg.org/#index-shift_jis-pointer |
61 # The index shift_jis pointer for code point is the return value of | 68 # The index shift_jis pointer for code point is the return value of |
62 # these steps for the round-trip code points (tag = 0) | 69 # these steps for the round-trip code points (tag = 0) |
63 # | 70 # |
64 # Let index be index jis0208 excluding all pointers in the range 8272 to 8835. | 71 # Let index be index jis0208 excluding all pointers in the range 8272 to 8835. |
65 # Return the index pointer for code point in index. | 72 # Return the index pointer for code point in index. |
66 # For a pointer ($1) inside the excluded range (8272 to 8835), the mapping | 73 # For a pointer ($1) inside the excluded range (8272 to 8835), the mapping |
67 # is for decoding only and the tag is set to '3'. | 74 # is for decoding only and the tag is set to '3'. |
68 # Besides, there are 24 more characters with multiple SJIS representations. | 75 # Besides, there are 24 more characters with multiple SJIS representations. |
69 # Only the first of multiple is tagged with '0' (bi-directional mapping) | 76 # Only the first of multiple is tagged with '0' (bi-directional mapping) |
70 # while the rest are tagged with '3'. | 77 # while the rest are tagged with '3'. |
71 | 78 |
72 function jis208 { | 79 function jis208 { |
73 awk '!/^#/ && !/^$/ \ | 80 awk '!/^#/ && !/^$/ \ |
74 { lead = $1 / 188; \ | 81 { lead = $1 / 188; \ |
75 lead_offset = lead < 0x1F ? 0x81 : 0xC1; \ | 82 lead_offset = lead < 0x1F ? 0x81 : 0xC1; \ |
76 trail = $1 % 188; \ | 83 trail = $1 % 188; \ |
77 trail_offset = trail < 0x3F ? 0x40 : 0x41; \ | 84 trail_offset = trail < 0x3F ? 0x40 : 0x41; \ |
78 is_in_range = ($1 < 8272 || $1 > 8835); \ | 85 is_in_range = ($1 < 8272 || $1 > 8835); \ |
79 tag = (is_in_range && has_seen[$2] == 0) ? 0 : 3; \ | 86 tag = (is_in_range && has_seen[$2] == 0) ? 0 : 3; \ |
80 printf ("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),\ | 87 printf ("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),\ |
81 lead + lead_offset, trail + trail_offset, tag);\ | 88 lead + lead_offset, trail + trail_offset, tag);\ |
82 if (is_in_range) has_seen[$2] = 1; \ | 89 if (is_in_range) has_seen[$2] = 1; \ |
83 }' \ | 90 }' \ |
84 index-jis0208.txt | 91 index-jis0208.txt |
85 } | 92 } |
86 | 93 |
87 # EUDC (End User Defined Characters) is for decoding only | 94 # EUDC (End User Defined Characters) is for decoding only |
88 # (use '|3' to denote that). | 95 # (use '|3' to denote that). |
89 # See http://encoding.spec.whatwg.org/#shift_jis-decoder - step 5 | 96 # See https://encoding.spec.whatwg.org/#shift_jis-decoder - step 5 |
90 # This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41} | 97 # This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41} |
91 # to implement it. | 98 # to implement it. |
92 | 99 |
93 function eudc { | 100 function eudc { |
94 # The upper bound for the lead byte is 0xF8 because each lead can | 101 # The upper bound for the lead byte is 0xF8 because each lead can |
95 # have 188 characters and the total # of characters in the EUDC | 102 # have 188 characters and the total # of characters in the EUDC |
96 # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder | 103 # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder |
97 # step 3.5 in the encoding spec.) | 104 # step 3.5 in the encoding spec.) |
98 for lead in $(seq 0xF0 0xF8) | 105 for lead in $(seq 0xF0 0xF8) |
99 do | 106 do |
(...skipping 10 matching lines...) Expand all Loading... |
110 function unsorted_table { | 117 function unsorted_table { |
111 ascii | 118 ascii |
112 half_width_kana | 119 half_width_kana |
113 jis208 | 120 jis208 |
114 eudc "0x40" "0x7E" "0x40" | 121 eudc "0x40" "0x7E" "0x40" |
115 eudc "0x80" "0xFC" "0x41" | 122 eudc "0x80" "0xFC" "0x41" |
116 echo '<U00A5> \x5C |1' | 123 echo '<U00A5> \x5C |1' |
117 echo '<U203E> \x7E |1' | 124 echo '<U203E> \x7E |1' |
118 } | 125 } |
119 | 126 |
| 127 wget -N -r -nd https://encoding.spec.whatwg.org/index-jis0208.txt |
120 preamble | 128 preamble |
121 unsorted_table | sort | uniq | 129 unsorted_table | sort | uniq |
122 echo 'END CHARMAP' | 130 echo 'END CHARMAP' |
OLD | NEW |