Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(269)

Unified Diff: icu52/scripts/eucjp_gen.sh

Issue 224943002: icu local change part1 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/deps/third_party/
Patch Set: function indentation changed Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « icu52/patches/xopen_source.patch ('k') | icu52/scripts/ibm866_gen.sh » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: icu52/scripts/eucjp_gen.sh
===================================================================
--- icu52/scripts/eucjp_gen.sh (revision 0)
+++ icu52/scripts/eucjp_gen.sh (revision 0)
@@ -0,0 +1,134 @@
+#!/bin/sh
+# Copyright (c) 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# References:
+# http://encoding.spec.whatwg.org/#euc-jp
+# http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
+# http://www.iana.org/assignments/charset-reg/CP51932
+# Table 3-64 in CJKV Information Processing 2/e.
+
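+# Usage: download the two index files listed below into the
+# source/data/mappings directory, run this script from that directory (it
+# also reads euc-jp-2007.ucm from the current directory), and save the
+# script's standard output as euc-jp-html5.ucm.
+# http://encoding.spec.whatwg.org/index-jis0208.txt
+# http://encoding.spec.whatwg.org/index-jis0212.txt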
+
+# Prints the fixed header of the generated .ucm file on stdout: the
+# copyright banner, the converter attributes (code set name, byte-length
+# limits, substitution characters, charset family), the <icu:state> rows
+# and the opening CHARMAP line. Takes no arguments and reads no files.
+#
+# Declared in the POSIX 'name()' form because the script runs under
+# #!/bin/sh, where the 'function' keyword is not guaranteed to exist.
+# The here-document delimiter is quoted so that the text, including the
+# '\x..' byte escapes, is always emitted literally.
+preamble() {
+cat <<'PREAMBLE'
+# ***************************************************************************
+# *
+# * Copyright (C) 1995-2014, International Business Machines
+# * Corporation and others. All Rights Reserved.
+# *
+# * Generated per the algorithm for EUC-JP
+# * described at http://encoding.spec.whatwg.org/#euc-jp.
+# * Added the 34 decoding only (EUC-JP to Unicode) entries from euc-jp-2007.ucm
+# * for the backward compatibility.
+# *
+# ***************************************************************************
+<code_set_name> "euc-jp-html5"
+<char_name_mask> "AXXXX"
+<mb_cur_max> 3
+<mb_cur_min> 1
+<uconv_class> "MBCS"
+<subchar> \xF4\xFE
+<subchar1> \x1A
+<icu:charsetFamily> "ASCII"
+
+<icu:state> 0-7f, 8e:2, 8f:3, a1-fe:1
+<icu:state> a1-fe
+<icu:state> a1-e2
+<icu:state> a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
+<icu:state> a1-fe.u
+
+CHARMAP
+PREAMBLE
+}
+
+# Prints the 128 ASCII mappings on stdout: each code point U+0000..U+007F
+# is mapped to the single byte of the same value and flagged '|0', e.g.
+#   <U0000> \x00 |0
+# Takes no arguments.
+#
+# Declared in the POSIX 'name()' form and driven by a counting loop rather
+# than seq(1): the script runs under #!/bin/sh, and neither the 'function'
+# keyword nor the seq utility is specified by POSIX.
+ascii() {
+  i=0
+  while [ "$i" -le 127 ]; do
+    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
+    i=$((i + 1))
+  done
+}
+
+
+# Prints the mappings for the half-width forms U+FF61..U+FF9F on stdout.
+# In EUC-JP these are two-byte sequences: the lead byte 0x8E followed by a
+# trail byte in 0xA1..0xDF (the '8e:2' transition in the state table that
+# preamble prints), so every entry is written as '\x8E\xNN', e.g.
+#   <UFF61> \x8E\xA1 |0
+# A bare '\xNN' would be a truncated sequence, because a1-fe in the initial
+# state starts a two-byte character.
+# The function name is kept unchanged even though the range it covers is
+# not full-width ASCII.
+#
+# Declared in the POSIX 'name()' form and driven by a decimal counting loop
+# instead of 'seq 0xA1 0xDF', so it does not depend on the 'function'
+# keyword, on seq(1), or on seq accepting hexadecimal operands.
+fullwidth_ascii() {
+  # 161 = 0xA1 and 223 = 0xDF bound the trail byte; 65377 = 0xFF61 is the
+  # code point that trail byte 0xA1 maps to.
+  i=161
+  while [ "$i" -le 223 ]; do
+    printf '<U%04X> \\x8E\\x%02X |0\n' "$((i + 65377 - 161))" "$i"
+    i=$((i + 1))
+  done
+}
+
+
+# Prints the JIS X 0208 part of the table on stdout, read from
+# index-jis0208.txt in the current directory. Lines starting with '#' and
+# empty lines are skipped; on every other line field 1 is the index pointer
+# and field 2 is the code point written as 0xHHHH. Each entry becomes
+#   <UHHHH> \xLL\xTT |flag
+# with LL = pointer / 94 + 0xA1 and TT = pointer % 94 + 0xA1.
+#
+# index-jis0208.txt has index pointers larger than the size of the encoding
+# space available in the 2-byte graphic plane of an ISO-2022-based encoding
+# (94 x 94 = 8836 cells, i.e. pointers 0..8835). They are excluded because
+# they're for Shift-JIS. The test is therefore 'pointer < 8836': pointer
+# 8836 itself would produce the lead byte 0xFF, outside the a1-fe range.
+#
+# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
+# All the bi-directional mapping entries come *before* the uni-directional
+# (EUC-JP to Unicode) entries, so an entry is flagged '|3' if the same
+# Unicode code point was seen earlier in the list and '|0' otherwise.
+# According to the definition of 'index pointer' in the encoding spec, it's
+# the first entry in the file for a given Unicode code point.
+#
+# The byte offset is written as decimal 161 (= 0xA1) and the quotient is
+# truncated with int(), because hexadecimal constants in awk program text
+# are not understood by every awk implementation. The function is declared
+# in the POSIX 'name()' form since the script runs under #!/bin/sh.
+jis208() {
+  awk '
+    !/^#/ && !/^$/ && $1 < 8836 {
+      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
+             int($1 / 94) + 161, $1 % 94 + 161,
+             ($2 in uset) ? 3 : 0)
+      uset[$2] = 1
+    }' index-jis0208.txt
+}
+
+# Prints the JIS X 0212 part of the table on stdout, read from
+# index-jis0212.txt in the current directory. Lines starting with '#' and
+# empty lines are skipped; on every other line field 1 is the index pointer
+# and field 2 is the code point written as 0xHHHH. Each entry becomes the
+# three-byte sequence
+#   <UHHHH> \x8F\xLL\xTT |3
+# with LL = pointer / 94 + 0xA1 and TT = pointer % 94 + 0xA1.
+# JIS X 0212 is for decoding only, which is what the '|3' flag denotes.
+#
+# The byte offset is written as decimal 161 (= 0xA1) and the quotient is
+# truncated with int(), because hexadecimal constants in awk program text
+# are not understood by every awk implementation. The function is declared
+# in the POSIX 'name()' form since the script runs under #!/bin/sh.
+jis212() {
+  awk '
+    !/^#/ && !/^$/ {
+      printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
+             int($1 / 94) + 161, $1 % 94 + 161)
+    }' index-jis0212.txt
+}
+
+# Add the uni-directional mapping entries (EUC-JP to Unicode) that
+# are only present in euc-jp-2007.ucm. There are 34 of them. They're added
+# for the backward compatibility with the old behavior of Chrome.
+# See https://www.w3.org/Bugs/Public/show_bug.cgi?id=25266
+# Here are the break-downs:
+# 1. 0x8E0xE0 to 0x8E0xE2
+# 00A2 00A3 00AC
+# 2. JIS X 0212 extra (0x8F 0xF3 0xhh)
+# 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171
+# 2172 2173 2174 2175 2176 2177 2178 2179 221A 2220 2229 222A 222B 2235 2252
+# 2261 22A5 3231
+# 3. JIS X 0208 extra : 0xFC 0xFB => FFE2
+#
+# How it works: every '|3' line of euc-jp-2007.ucm that starts with
+# '<Uhhhh>' yields one code point. If '0x<code point>' does not occur in
+# index-jis0212.txt, the '|3' lines of euc-jp-2007.ucm for that code point
+# are printed on stdout unchanged. Both files are read from the current
+# directory.
+#
+# The code point is extracted with 'sed -n ... p' so that lines which do
+# not start with '<Uhhhh>' are dropped instead of leaking through, and the
+# final lookup is anchored to '^<Uhhhh>' so that it can only select the
+# entries of that exact code point. All expansions are quoted, and the
+# function is declared in the POSIX 'name()' form since the script runs
+# under #!/bin/sh.
+decode_only_extra() {
+  grep '|3' euc-jp-2007.ucm |
+    sed -n 's/^<U\([0-9A-Fa-f]*\)>.*$/\1/p' |
+    while IFS= read -r u; do
+      # Already covered by the JIS X 0212 index: nothing extra to add.
+      if grep -q -- "0x${u}" index-jis0212.txt; then
+        continue
+      fi
+      grep -- "^<U${u}>" euc-jp-2007.ucm | grep '|3'
+    done
+}
+
+# Prints every mapping entry of the table on stdout, in generation order
+# (the caller sorts and de-duplicates): the ASCII entries, the JIS X 0208
+# entries, the JIS X 0212 entries, the extra decoding-only entries taken
+# from euc-jp-2007.ucm, and finally two '|1' entries that map U+00A5 to
+# 0x5C and U+203E to 0x7E. In the ICU .ucm format '|1' marks a fallback
+# that is used only when converting from Unicode.
+#
+# The two literal entries are written with printf '%s\n' rather than echo,
+# because echo's treatment of backslashes differs between sh
+# implementations. The function is declared in the POSIX 'name()' form
+# since the script runs under #!/bin/sh.
+#
+# NOTE(review): fullwidth_ascii is defined in this script but is not called
+# here, so no U+FF61..U+FF9F entries are generated - confirm whether that
+# is intended.
+unsorted_table() {
+  ascii
+  jis208
+  jis212
+  decode_only_extra
+  printf '%s\n' '<U00A5> \x5C |1'
+  printf '%s\n' '<U203E> \x7E |1'
+}
+
+# Write the complete .ucm text to stdout: the fixed header, then the
+# mapping entries sorted with duplicate lines removed, then the line that
+# closes the CHARMAP section.
+preamble
+unsorted_table |
+  sort |
+  uniq
+printf '%s\n' 'END CHARMAP'
Property changes on: icu52/scripts/eucjp_gen.sh
___________________________________________________________________
Added: svn:executable
+ *
Added: svn:eol-style
+ LF
« no previous file with comments | « icu52/patches/xopen_source.patch ('k') | icu52/scripts/ibm866_gen.sh » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698