OLD | NEW |
(Empty) | |
| 1 #!/bin/sh |
| 2 # Copyright (c) 2014 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 # References: |
| 7 # http://encoding.spec.whatwg.org/#euc-jp |
| 8 # http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932 |
| 9 # http://www.iana.org/assignments/charset-reg/CP51932 |
| 10 # Table 3-64 in CJKV Information Processing 2/e. |
| 11 |
# Download the following two files into the source/data/mappings directory,
# run this script from that directory, and save its output to
# euc-jp-html5.ucm:
#   http://encoding.spec.whatwg.org/index-jis0208.txt
#   http://encoding.spec.whatwg.org/index-jis0212.txt
| 16 |
# Print the fixed head of the generated UCM file: the copyright banner, the
# converter attributes, the <icu:state> table and the line that opens the
# CHARMAP section. Takes no arguments and reads no files.
#
# Defined with the POSIX "name() { ... }" form because the script runs under
# #!/bin/sh, where the "function" keyword is not guaranteed to exist.
preamble() {
  # The delimiter is quoted so the header is emitted verbatim: nothing in it
  # is meant to be expanded by the shell (it contains backslash sequences).
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for EUC-JP
# * described at http://encoding.spec.whatwg.org/#euc-jp.
# * Added the 34 decoding only (EUC-JP to Unicode) entries from euc-jp-2007.ucm
# * for the backward compatibility.
# *
# ***************************************************************************
<code_set_name> "euc-jp-html5"
<char_name_mask> "AXXXX"
<mb_cur_max> 3
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \xF4\xFE
<subchar1> \x1A
<icu:charsetFamily> "ASCII"

<icu:state> 0-7f, 8e:2, 8f:3, a1-fe:1
<icu:state> a1-fe
<icu:state> a1-e2
<icu:state> a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
<icu:state> a1-fe.u

CHARMAP
PREAMBLE
}
| 48 |
# Print the 128 round-trip ('|0') mappings for U+0000..U+007F, one per line,
# each code point mapping to the single byte of the same value, e.g.
#<U0000> \x00 |0
#
# Uses the POSIX function form and a plain counting loop so that it runs under
# any #!/bin/sh without relying on the "function" keyword or on seq(1).
ascii() {
  i=0
  while [ "$i" -le 127 ]
  do
    # '\\x' in the format yields a literal backslash followed by 'x'.
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}
| 56 |
| 57 |
# Print one round-trip ('|0') mapping line for each byte value 0xA1..0xDF,
# pairing byte B with code point 0xFF61 + (B - 0xA1), i.e. U+FF61..U+FF9F.
#
# The code point is computed with shell arithmetic. The earlier version
# invoked a non-existent "exp" command, so the substitution was empty and the
# byte value was printed in the code point slot instead.
#
# NOTE(review): nothing in this script calls this function. The state table
# in the header routes lead byte 8e to a second byte in a1-e2, which suggests
# these entries need a \x8E prefix, and U+FF61.. is the halfwidth katakana
# block rather than "fullwidth ASCII" - confirm before wiring this in.
fullwidth_ascii() {
  byte=161                       # 0xA1
  while [ "$byte" -le 223 ]      # 0xDF
  do
    # 65377 = 0xFF61, 161 = 0xA1
    printf '<U%04X> \\x%02X |0\n' "$((byte + 65377 - 161))" "$byte"
    byte=$((byte + 1))
  done
}
| 65 |
| 66 |
| 67 # index-jis0208.txt has index pointers larger than the size of |
| 68 # the encoding space available in 2-byte Graphic plane of ISO-2022-based |
| 69 # encoding (94 x 94 = 8836). We have to exclude them because they're for |
| 70 # Shift-JIS. |
| 71 # In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries. |
| 72 # All the bi-directional mapping entries come *before* the uni-directional |
| 73 # (EUC-JP to Unicode) entries so that we put '|3' if we have seen |
| 74 # the same Unicode code point earlier in the list. According to the definition |
| 75 # of 'index pointer' in the W3C encoding spec, it's the first entry in the |
| 76 # file for a given Unicode code point. |
| 77 |
# Convert index-jis0208.txt (read from the current directory) into UCM mapping
# lines of the form "<Uxxxx> \xLL\xTT |f".
#
# For each data line (not a '#' comment, not empty) with index pointer P:
#   - only 0 <= P < 8836 (94 x 94) is kept; pointer 8836 itself would already
#     produce lead byte 0xFF, so the bound is strict;
#   - lead byte  = int(P / 94) + 0xA1, trail byte = P % 94 + 0xA1;
#   - the flag is 0 the first time a code point is seen and 3 (decode only)
#     on every later occurrence.
#
# The awk program uses decimal 161 for 0xA1 because hexadecimal constants in
# awk source text are not portable across awk implementations.
jis208() {
  awk '
    !/^#/ && !/^$/ && $1 < 8836 {
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161,
             ($2 in seen) ? 3 : 0)
      seen[$2] = 1
    }' index-jis0208.txt
}
| 87 |
# JIS X 0212 is for decoding only (use '|3' to denote that).
| 89 |
# Convert index-jis0212.txt (read from the current directory) into decode-only
# UCM mapping lines of the form "<Uxxxx> \x8F\xLL\xTT |3".
#
# Every data line (not a '#' comment, not empty) with index pointer P yields
# lead byte int(P / 94) + 0xA1 and trail byte P % 94 + 0xA1 after the fixed
# 0x8F prefix.
#
# The awk program uses decimal 161 for 0xA1 because hexadecimal constants in
# awk source text are not portable, and int() makes the truncating division
# explicit instead of leaving it to the %X conversion.
jis212() {
  awk '
    !/^#/ && !/^$/ {
      printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161)
    }' index-jis0212.txt
}
| 96 |
| 97 # Add the uni-directional mapping entries (EUC-JP to Unicode) that |
| 98 # are only present in euc-jp-2007.ucm. There are 34 of them. They're added |
| 99 # for the backward compatibility with the old behavior of Chrome. |
| 100 # Here are the break-downs: |
| 101 # 1. 0x8E0xE0 to 0x8E0xE2 |
| 102 # 00A2 00A3 00AC |
| 103 # 2. JIS X 0212 extra (0x8F 0xF3 0xhh) |
| 104 # 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 |
| 105 # 2172 2173 2174 2175 2176 2177 2178 2179 221A 2220 2229 222A 222B 2235 2252 |
| 106 # 2261 22A5 3231 |
| 107 # 3. JIS X 0208 extra : FFE2 |
| 108 |
# Print the decode-only ('|3') lines of euc-jp-2007.ucm whose code point does
# not occur in index-jis0212.txt. Both files are read from the current
# directory.
#
# Step 1 collects the four-hex-digit code points of all '|3' lines that have
# no "0x<code>" match in index-jis0212.txt. Step 2 prints, for each collected
# code point, the '|3' lines of euc-jp-2007.ucm that start with "<U<code>>".
#
# Hardening over the earlier version:
#   - sed -n ... p drops lines that do not look like "<Uxxxx>..." instead of
#     passing them through as stray tokens; a stray "|3" token would
#     otherwise match every decode-only line in step 2;
#   - the step 2 match is anchored to the start of the line;
#   - code points are emitted with printf and every expansion is quoted.
# A code point that appears on several '|3' lines is still listed once per
# line, so its lines can be printed more than once; the caller de-duplicates.
decode_only_extra() {
  decode_only_list=$(
    grep -F '|3' euc-jp-2007.ucm |
      sed -n 's/^<U\(....\)>.*$/\1/p' |
      while read -r code
      do
        grep -q "0x${code}" index-jis0212.txt || printf '%s\n' "$code"
      done)

  # The list holds bare hex code points, so word splitting here is intended.
  for u in $decode_only_list
  do
    grep "^<U${u}>" euc-jp-2007.ucm | grep -F '|3'
  done
}
| 121 |
# Print every CHARMAP entry in generation order: the output of ascii, jis208,
# jis212 and decode_only_extra, followed by two fixed '|1' (fallback) lines
# for U+00A5 and U+203E. Sorting and de-duplication are left to the caller.
#
# The fixed lines go through printf '%s\n' rather than echo: they contain
# backslash sequences, and some shells' echo rewrites such sequences instead
# of printing them literally.
unsorted_table() {
  ascii
  jis208
  jis212
  decode_only_extra
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
}
| 130 |
# Assemble the complete UCM file on standard output: header, mapping table,
# closing line.
preamble
# Sort bytewise in the C locale and drop duplicate lines in the same step, so
# the generated file does not depend on the locale of whoever runs the script.
unsorted_table | LC_ALL=C sort -u
echo 'END CHARMAP'
OLD | NEW |