Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(175)

Side by Side Diff: icu52/scripts/eucjp_gen.sh

Issue 251203003: Update EUC-JP per WHATWG encoding spec (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/
Patch Set: Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « icu52/android/icudtl.dat ('k') | icu52/source/data/in/icudtl.dat » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/bin/sh 1 #!/bin/sh
2 # Copyright (c) 2014 The Chromium Authors. All rights reserved. 2 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 # References: 6 # References:
7 # http://encoding.spec.whatwg.org/#euc-jp 7 # http://encoding.spec.whatwg.org/#euc-jp
8 # http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932 8 # http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
9 # http://www.iana.org/assignments/charset-reg/CP51932 9 # http://www.iana.org/assignments/charset-reg/CP51932
10 # Table 3-64 in CJKV Information Processing 2/e. 10 # Table 3-64 in CJKV Information Processing 2/e.
11 11
12 # Download the following two files, run this script in the source/data/mappings directory 12 # Download the following two files, run this script in the source/data/mappings directory
13 # and save the result to euc-jp-html5.ucm 13 # and save the result to euc-jp-html5.ucm
14 # http://encoding.spec.whatwg.org/index-jis0208.txt 14 # http://encoding.spec.whatwg.org/index-jis0208.txt
15 # http://encoding.spec.whatwg.org/index-jis0212.txt 15 # http://encoding.spec.whatwg.org/index-jis0212.txt
16 16
17 function preamble { 17 function preamble {
18 cat <<PREAMBLE 18 cat <<PREAMBLE
19 # *************************************************************************** 19 # ***************************************************************************
20 # * 20 # *
21 # * Copyright (C) 1995-2014, International Business Machines 21 # * Copyright (C) 1995-2014, International Business Machines
22 # * Corporation and others. All Rights Reserved. 22 # * Corporation and others. All Rights Reserved.
23 # * 23 # *
24 # * Generated per the algorithm for EUC-JP 24 # * Generated per the algorithm for EUC-JP
25 # * described at http://encoding.spec.whatwg.org/#euc-jp. 25 # * described at http://encoding.spec.whatwg.org/#euc-jp.
26 # * Added the 34 decoding only (EUC-JP to Unicode) entries from euc-jp-2007.ucm
27 # * for the backward compatibility.
28 # * 26 # *
29 # *************************************************************************** 27 # ***************************************************************************
30 <code_set_name> "euc-jp-html5" 28 <code_set_name> "euc-jp-html5"
31 <char_name_mask> "AXXXX" 29 <char_name_mask> "AXXXX"
32 <mb_cur_max> 3 30 <mb_cur_max> 3
33 <mb_cur_min> 1 31 <mb_cur_min> 1
34 <uconv_class> "MBCS" 32 <uconv_class> "MBCS"
35 <subchar> \xF4\xFE 33 <subchar> \xF4\xFE
36 <subchar1> \x1A 34 <subchar1> \x1A
37 <icu:charsetFamily> "ASCII" 35 <icu:charsetFamily> "ASCII"
(...skipping 10 matching lines...) Expand all
48 46
49 #<U0000> \x00 |0 47 #<U0000> \x00 |0
50 function ascii { 48 function ascii {
51 for i in $(seq 0 127) 49 for i in $(seq 0 127)
52 do 50 do
53 printf '<U%04X> \\x%02X |0\n' $i $i 51 printf '<U%04X> \\x%02X |0\n' $i $i
54 done 52 done
55 } 53 }
56 54
57 55
58 function fullwidth_ascii { 56 # Map 0x8E 0x[A1-DF] to U+FF61 through U+FF9F
57 function half_width_kana {
59 for i in $(seq 0xA1 0xDF) 58 for i in $(seq 0xA1 0xDF)
60 do 59 do
61 # 65377 = 0xFF61, 161 = 0xA1 60 # 65377 = 0xFF61, 161 = 0xA1
62 printf '<U%04X> \\x%02X |0\n' $(($i + 65377 - 161)) $i 61 printf '<U%04X> \\x8E\\x%02X |0\n' $(($i + 65377 - 161)) $i
63 done 62 done
64 } 63 }
65 64
66 65
67 # index-jis0208.txt has index pointers larger than the size of 66 # index-jis0208.txt has index pointers larger than the size of
68 # the encoding space available in 2-byte Graphic plane of ISO-2022-based 67 # the encoding space available in 2-byte Graphic plane of ISO-2022-based
69 # encoding (94 x 94 = 8836). We have to exclude them because they're for 68 # encoding (94 x 94 = 8836). We have to exclude them because they're for
70 # Shift-JIS. 69 # Shift-JIS.
71 # In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries. 70 # In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
72 # All the bi-directional mapping entries come *before* the uni-directional 71 # All the bi-directional mapping entries come *before* the uni-directional
(...skipping 14 matching lines...) Expand all
87 86
88 # JIS X 0212 is for decoding only (use '|3' to denote that). 87 # JIS X 0212 is for decoding only (use '|3' to denote that).
89 88
90 function jis212 { 89 function jis212 {
91 awk '!/^#/ && !/^$/ \ 90 awk '!/^#/ && !/^$/ \
92 { printf ("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),\ 91 { printf ("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),\
93 $1 / 94 + 0xA1, $1 % 94 + 0xA1);}' \ 92 $1 / 94 + 0xA1, $1 % 94 + 0xA1);}' \
94 index-jis0212.txt 93 index-jis0212.txt
95 } 94 }
96 95
97 # Add the uni-directional mapping entries (EUC-JP to Unicode) that
98 # are only present in euc-jp-2007.ucm. There are 34 of them. They're added
99 # for backward compatibility with the old behavior of Chrome.
100 # See https://www.w3.org/Bugs/Public/show_bug.cgi?id=25266
101 # Here are the break-downs:
102 # 1. 0x8E 0xE0 to 0x8E 0xE2
103 # 00A2 00A3 00AC
104 # 2. JIS X 0212 extra (0x8F 0xF3 0xhh)
105 # 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171
106 # 2172 2173 2174 2175 2176 2177 2178 2179 221A 2220 2229 222A 222B 2235 2252
107 # 2261 22A5 3231
108 # 3. JIS X 0208 extra : 0xFC 0xFB => FFE2
109
110 function decode_only_extra {
111 decode_only_list=$(
112 for i in $(grep '|3' euc-jp-2007.ucm | sed 's/^<U\(....\)>.*$/\1/')
113 do
114 grep 0x${i} index-jis0212.txt > /dev/null || echo $i
115 done)
116
117 for u in $decode_only_list
118 do
119 grep $u euc-jp-2007.ucm | grep '|3'
120 done
121 }
122
123 function unsorted_table { 96 function unsorted_table {
124 ascii 97 ascii
98 half_width_kana
125 jis208 99 jis208
126 jis212 100 jis212
127 decode_only_extra 101 decode_only_extra
128 echo '<U00A5> \x5C |1' 102 echo '<U00A5> \x5C |1'
129 echo '<U203E> \x7E |1' 103 echo '<U203E> \x7E |1'
130 } 104 }
131 105
132 preamble 106 preamble
133 unsorted_table | sort | uniq 107 unsorted_table | sort | uniq
134 echo 'END CHARMAP' 108 echo 'END CHARMAP'
OLDNEW
« no previous file with comments | « icu52/android/icudtl.dat ('k') | icu52/source/data/in/icudtl.dat » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698