Index: icu52/scripts/eucjp_gen.sh |
=================================================================== |
--- icu52/scripts/eucjp_gen.sh (revision 0) |
+++ icu52/scripts/eucjp_gen.sh (revision 0) |
@@ -0,0 +1,133 @@ |
+#!/bin/sh |
+# Copyright (c) 2014 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+# References: |
+# http://encoding.spec.whatwg.org/#euc-jp |
+# http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932 |
+# http://www.iana.org/assignments/charset-reg/CP51932 |
+# Table 3-64 in CJKV Information Processing 2/e. |
+ |
+# Download the following two files into the source/data/mappings directory, |
+# run this script from that directory (it also reads euc-jp-2007.ucm from the |
+# current directory) and save its standard output as euc-jp-html5.ucm: |
+# http://encoding.spec.whatwg.org/index-jis0208.txt |
+# http://encoding.spec.whatwg.org/index-jis0212.txt |
+ |
+function preamble { |
+cat <<PREAMBLE |
+# *************************************************************************** |
+# * |
+# * Copyright (C) 1995-2014, International Business Machines |
+# * Corporation and others. All Rights Reserved. |
+# * |
+# * Generated per the algorithm for EUC-JP |
+# * described at http://encoding.spec.whatwg.org/#euc-jp. |
+# * Added the 34 decoding only (EUC-JP to Unicode) entries from euc-jp-2007.ucm |
jsbell
2014/04/04 18:16:20
Do we want to propose these as additions to the En
jungshik at Google
2014/04/04 22:20:27
I don't have a good sense of how important they're
|
+# * for the backward compatibility. |
+# * |
+# *************************************************************************** |
+<code_set_name> "euc-jp-html5" |
+<char_name_mask> "AXXXX" |
+<mb_cur_max> 3 |
+<mb_cur_min> 1 |
+<uconv_class> "MBCS" |
+<subchar> \xF4\xFE |
+<subchar1> \x1A |
+<icu:charsetFamily> "ASCII" |
+ |
+<icu:state> 0-7f, 8e:2, 8f:3, a1-fe:1 |
+<icu:state> a1-fe |
+<icu:state> a1-e2 |
+<icu:state> a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4 |
+<icu:state> a1-fe.u |
+ |
+CHARMAP |
+PREAMBLE |
+} |
+ |
+#<U0000> \x00 |0 |
+function ascii { |
+for i in $(seq 0 127) |
+do |
+ printf '<U%04X> \\x%02X |0\n' $i $i |
+done |
+} |
+ |
+ |
+function fullwidth_ascii { |
+for i in $(seq 0xA1 0xDF) |
+do |
+ # 65377 = 0xFF61, 161 = 0xA1 |
+ printf '<U%04X> \\x%02X |0\n' $(exp $i + 65377 - 161) $i |
+done |
+} |
+ |
+ |
+# index-jis0208.txt has index pointers larger than the size of |
+# the encoding space available in 2-byte Graphic plane of ISO-2022-based |
+# encoding (94 x 94 = 8836). We have to exclude them because they're for |
+# Shift-JIS. |
+# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries. |
+# All the bi-directional mapping entries come *before* the uni-directional |
+# (EUC-JP to Unicode) entries so that we put '|3' if we have seen |
+# the same Unicode code point earlier in the list. According to the definition |
+# of 'index pointer' in the W3C encoding spec, it's the first entry in the |
+# file for a given Unicode code point. |
+ |
+function jis208 { |
+awk '!/^#/ && !/^$/ && $1 <= 8836 \ |
+ { printf ("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),\ |
+ $1 / 94 + 0xA1, $1 % 94 + 0xA1,\ |
+ ($2 in uset) ? 3 : 0); \ |
+ uset[$2] = 1; |
+ }' \ |
+ index-jis0208.txt |
+} |
+ |
+# JIS X 0212 is for decoding only (use '|3' to denote that). |
+# |
+# Reads index-jis0212.txt from the current directory and prints one |
+# three-byte mapping line (0x8F lead byte, then row and cell bytes) per |
+# entry. Decimal 161 (= 0xA1) and int() are used because hexadecimal |
+# constants in awk source and implicit truncation of a fractional %X |
+# argument are not portable across awk implementations. |
+ |
+jis212() { |
+  awk '!/^#/ && !/^$/ { |
+      printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3), |
+             int($1 / 94) + 161, $1 % 94 + 161) |
+    }' index-jis0212.txt |
+} |
+ |
+# Add the uni-directional mapping entries (EUC-JP to Unicode) that |
+# are only present in euc-jp-2007.ucm. There are 34 of them. They're added |
+# for backward compatibility with the old behavior of Chrome. |
+# Here is the breakdown: |
+# 1. 0x8E0xE0 to 0x8E0xE2 |
+# 00A2 00A3 00AC |
+# 2. JIS X 0212 extra (0x8F 0xF3 0xhh) |
+# 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 |
+# 2172 2173 2174 2175 2176 2177 2178 2179 221A 2220 2229 222A 222B 2235 2252 |
+# 2261 22A5 3231 |
+# 3. JIS X 0208 extra : FFE2 |
+# |
+# Reads euc-jp-2007.ucm and index-jis0212.txt from the current directory. |
+# For every '|3' line of euc-jp-2007.ucm that starts with a four-digit |
+# <Uxxxx> code point, the code point is looked up as 0xXXXX in |
+# index-jis0212.txt; when it is absent, the '|3' line(s) for that code point |
+# are printed. sed -n ... p drops lines that do not match, so they can no |
+# longer leak through and be word-split into bogus search patterns, and the |
+# final lookup matches the whole <Uxxxx> token as a fixed string. |
+ |
+decode_only_extra() { |
+  grep -F '|3' euc-jp-2007.ucm | |
+    sed -n 's/^<U\([0-9A-Fa-f]\{4\}\)>.*$/\1/p' | |
+    while read -r cp; do |
+      grep -q "0x${cp}" index-jis0212.txt || |
+        grep -F "<U${cp}>" euc-jp-2007.ucm | grep -F '|3' |
+    done |
+} |
+ |
+# Prints every mapping group back to back, unsorted: ascii, jis208, jis212, |
+# decode_only_extra, then two fixed '|1' entries for U+00A5 and U+203E. |
+# The fixed entries go through printf '%s\n' so their backslashes are |
+# written literally whatever the shell's echo does with escapes. |
+# NOTE(review): fullwidth_ascii is defined in this script but not called |
+# here - confirm whether that is intentional. |
+unsorted_table() { |
+  ascii |
+  jis208 |
+  jis212 |
+  decode_only_extra |
+  printf '%s\n' '<U00A5> \x5C |1' '<U203E> \x7E |1' |
+} |
+ |
+preamble |
+unsorted_table | sort | uniq |
+echo 'END CHARMAP' |
Property changes on: icu52/scripts/eucjp_gen.sh |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |
Added: svn:executable |
+ * |