| Index: icu52/scripts/eucjp_gen.sh
|
| ===================================================================
|
| --- icu52/scripts/eucjp_gen.sh (revision 0)
|
| +++ icu52/scripts/eucjp_gen.sh (revision 0)
|
| @@ -0,0 +1,134 @@
|
| +#!/bin/sh
|
| +# Copyright (c) 2014 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be
|
| +# found in the LICENSE file.
|
| +
|
| +# References:
|
| +# http://encoding.spec.whatwg.org/#euc-jp
|
| +# http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
|
| +# http://www.iana.org/assignments/charset-reg/CP51932
|
| +# Table 3-64 in CJKV Information Processing 2/e.
|
| +
|
| +# Download the following two files into the source/data/mappings directory,
|
| +# run this script there, and save its output to euc-jp-html5.ucm:
|
| +# http://encoding.spec.whatwg.org/index-jis0208.txt
|
| +# http://encoding.spec.whatwg.org/index-jis0212.txt
|
| +
|
| +preamble() {  # Print the fixed .ucm header: banner, attributes, states, CHARMAP.
|
| +cat <<'PREAMBLE'  # quoted delimiter: the body below is emitted verbatim
|
| +# ***************************************************************************
|
| +# *
|
| +# * Copyright (C) 1995-2014, International Business Machines
|
| +# * Corporation and others. All Rights Reserved.
|
| +# *
|
| +# * Generated per the algorithm for EUC-JP
|
| +# * described at http://encoding.spec.whatwg.org/#euc-jp.
|
| +# * Added the 34 decoding only (EUC-JP to Unicode) entries from euc-jp-2007.ucm
|
| +# * for the backward compatibility.
|
| +# *
|
| +# ***************************************************************************
|
| +<code_set_name> "euc-jp-html5"
|
| +<char_name_mask> "AXXXX"
|
| +<mb_cur_max> 3
|
| +<mb_cur_min> 1
|
| +<uconv_class> "MBCS"
|
| +<subchar> \xF4\xFE
|
| +<subchar1> \x1A
|
| +<icu:charsetFamily> "ASCII"
|
| +
|
| +<icu:state> 0-7f, 8e:2, 8f:3, a1-fe:1
|
| +<icu:state> a1-fe
|
| +<icu:state> a1-e2
|
| +<icu:state> a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
|
| +<icu:state> a1-fe.u
|
| +
|
| +CHARMAP
|
| +PREAMBLE
|
| +}
|
| +
|
| +# Emit the 128 ASCII identity mappings, one per line, e.g. "<U0000> \x00 |0".
|
| +ascii() {
|
| +  # seq yields decimal 0..127; printf renders each as hex code point and byte.
|
| +  for i in $(seq 0 127); do
|
| +    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
|
| +  done
|
| +}
|
| +
|
| +
|
| +fullwidth_ascii() {  # Misnomer: emits halfwidth katakana U+FF61..U+FF9F.
|
| +  # EUC-JP encodes these as 0x8E + 0xA1..0xDF, matching the '8e:2' state.
|
| +  # NOTE(review): unsorted_table never calls this function -- confirm intent.
|
| +  for i in $(seq 161 223); do  # 161 = 0xA1, 223 = 0xDF
|
| +    printf '<U%04X> \\x8E\\x%02X |0\n' "$((i + 65377 - 161))" "$i"  # 65377 = 0xFF61
|
| +  done
|
| +}
|
| +
|
| +
|
| +# index-jis0208.txt has index pointers larger than the size of
|
| +# the encoding space available in 2-byte Graphic plane of ISO-2022-based
|
| +# encoding (94 x 94 = 8836). We have to exclude them because they're for
|
| +# Shift-JIS.
|
| +# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
|
| +# All the bi-directional mapping entries come *before* the uni-directional
|
| +# (EUC-JP to Unicode) entries so that we put '|3' if we have seen
|
| +# the same Unicode code point earlier in the list. According to the definition
|
| +# of 'index pointer' in the W3C encoding spec, it's the first entry in the
|
| +# file for a given Unicode code point.
|
| +
|
| +jis208() {  # Emit two-byte JIS X 0208 mappings read from index-jis0208.txt.
|
| +  # Valid pointers are 0..8835 (94 x 94 cells); 161 = 0xA1, kept decimal for awk.
|
| +  awk '!/^#/ && !/^$/ && $1 < 8836 {
|
| +         printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
|
| +                int($1 / 94) + 161, $1 % 94 + 161,
|
| +                ($2 in uset) ? 3 : 0)
|
| +         uset[$2] = 1
|
| +       }' index-jis0208.txt
|
| +}
|
| +
|
| +# JIS X 0212 is for decoding only (use '|3' to denote that).
|
| +# 161 = 0xA1, spelled in decimal because not every awk parses hex literals.
|
| +jis212() {
|
| +  awk '!/^#/ && !/^$/ {
|
| +         printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
|
| +                int($1 / 94) + 161, $1 % 94 + 161)
|
| +       }' index-jis0212.txt
|
| +}
|
| +
|
| +# Add the uni-directional mapping entries (EUC-JP to Unicode) that
|
| +# are only present in euc-jp-2007.ucm. There are 34 of them. They're added
|
| +# for backward compatibility with the old behavior of Chrome.
|
| +# See https://www.w3.org/Bugs/Public/show_bug.cgi?id=25266
|
| +# Here is the breakdown:
|
| +# 1. 0x8E 0xE0 to 0x8E 0xE2
|
| +# 00A2 00A3 00AC
|
| +# 2. JIS X 0212 extra (0x8F 0xF3 0xhh)
|
| +# 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171
|
| +# 2172 2173 2174 2175 2176 2177 2178 2179 221A 2220 2229 222A 222B 2235 2252
|
| +# 2261 22A5 3231
|
| +# 3. JIS X 0208 extra : 0xFC 0xFB => FFE2
|
| +
|
| +decode_only_extra() {
|
| +  # Collect code points that euc-jp-2007.ucm marks '|3' (decode-only) but that
|
| +  # index-jis0212.txt does not provide; sed -n drops lines it cannot parse.
|
| +  decode_only_list=$(
|
| +    for i in $(grep -F '|3' euc-jp-2007.ucm | sed -n 's/^<U\(....\)>.*$/\1/p'); do
|
| +      grep -q "0x${i}" index-jis0212.txt || echo "$i"
|
| +    done)
|
| +  # Re-emit the original '|3' lines; anchor on '<Uxxxx>' to avoid stray matches.
|
| +  for u in $decode_only_list; do
|
| +    grep "^<U${u}>" euc-jp-2007.ucm | grep -F '|3'
|
| +  done
|
| +}
|
| +
|
| +unsorted_table() {  # Emit every mapping group; the caller sorts and dedups.
|
| +  ascii
|
| +  jis208
|
| +  jis212
|
| +  decode_only_extra
|
| +  # NOTE(review): fullwidth_ascii is defined but never invoked here -- confirm.
|
| +  printf '%s\n' '<U00A5> \x5C |1' '<U203E> \x7E |1'  # yen sign, overline
|
| +}
|
| +
|
| +preamble  # header through the opening CHARMAP line
|
| +unsorted_table | LC_ALL=C sort -u  # byte-wise order, the same in every locale
|
| +echo 'END CHARMAP'
|
|
|
| Property changes on: icu52/scripts/eucjp_gen.sh
|
| ___________________________________________________________________
|
| Added: svn:executable
|
| + *
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|