Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(164)

Side by Side Diff: scripts/sjis_gen.sh

Issue 984233002: Update CJK converters and their generating scripts (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: add EUC-KR to README.chromium Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « scripts/euckr_gen.sh ('k') | source/data/in/icudtl.dat » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/bin/sh 1 #!/bin/sh
2 # Copyright 2014 The Chromium Authors. All rights reserved. 2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 # References: 6 # References:
7 # http://encoding.spec.whatwg.org/#shift_jis 7 # https://encoding.spec.whatwg.org/#shift_jis
8 8
9 # Download the following file, run this script in the source/data/mappings 9 # Download the following file, run this script in the source/data/mappings
10 # directory and save the result to shift_jis-html5.ucm 10 # directory and save the result to shift_jis-html5.ucm
11 # http://encoding.spec.whatwg.org/index-jis0208.txt 11 # https://encoding.spec.whatwg.org/index-jis0208.txt
12 12
13 function preamble { 13 function preamble {
14 cat <<PREAMBLE 14 cat <<PREAMBLE
15 # *************************************************************************** 15 # ***************************************************************************
16 # * 16 # *
17 # * Copyright (C) 1995-2014, International Business Machines 17 # * Copyright (C) 1995-2014, International Business Machines
18 # * Corporation and others. All Rights Reserved. 18 # * Corporation and others. All Rights Reserved.
19 # * 19 # *
20 # * Generated per the algorithm for Shift_JIS 20 # * Generated per the algorithm for Shift_JIS
21 # * described at http://encoding.spec.whatwg.org/#shift_jis 21 # * described at https://encoding.spec.whatwg.org/#shift_jis
22 # * 22 # *
23 # *************************************************************************** 23 # ***************************************************************************
24 <code_set_name> "shift_jis-html5" 24 <code_set_name> "shift_jis-html5"
25 <char_name_mask> "AXXXX" 25 <char_name_mask> "AXXXX"
26 <mb_cur_max> 2 26 <mb_cur_max> 2
27 <mb_cur_min> 1 27 <mb_cur_min> 1
28 <uconv_class> "MBCS" 28 <uconv_class> "MBCS"
29 <subchar> \xFC\xFC 29 <subchar> \x3F
30 <subchar1> \x7F
31 <icu:charsetFamily> "ASCII" 30 <icu:charsetFamily> "ASCII"
32 31
33 <icu:state> 0-80, 81-9f:1, a1-df, e0-fc:1 32 <icu:state> 0-80, 81-9f:1, a1-df, e0-fc:1, 82:3, 84:4, 85-86:2 , 87:5, 88:2, 98:6, eb-ec:2, ef:2, f9:2, fc:7
33
34 <icu:state> 40-7e, 80-fc 34 <icu:state> 40-7e, 80-fc
35 <icu:state> 80-fc
36 <icu:state> 4f-7e, 80-fc, 59-5f.i, 7a-7e.i
37 <icu:state> 40-7e, 80-fc, 61-6f.i
38 <icu:state> 40-7e, 80-fc, 76-7d.i
39 <icu:state> 40-7e, 80-fc, 73-7e.i
40 <icu:state> 40-4b, 80-fc
41
35 42
36 CHARMAP 43 CHARMAP
37 PREAMBLE 44 PREAMBLE
38 } 45 }
39 46
40 # The encoding spec for Shift_JIS says U+0080 has to be round-tripped with 47 # The encoding spec for Shift_JIS says U+0080 has to be round-tripped with
41 # 0x80. So, this is one character more than ASCII up to 128 (0x80). 48 # 0x80. So, this is one character more than ASCII up to 128 (0x80).
42 function ascii { 49 function ascii {
43 for i in $(seq 0 128) 50 for i in $(seq 0 128)
44 do 51 do
45 printf '<U%04X> \\x%02X |0\n' $i $i 52 printf '<U%04X> \\x%02X |0\n' $i $i
46 done 53 done
47 } 54 }
48 55
49 56
50 # Map 0x[A1-DF] to U+FF61 to U+FF9F 57 # Map 0x[A1-DF] to U+FF61 to U+FF9F
51 function half_width_kana { 58 function half_width_kana {
52 for i in $(seq 0xA1 0xDF) 59 for i in $(seq 0xA1 0xDF)
53 do 60 do
54 # 65377 = 0xFF61, 161 = 0xA1 61 # 65377 = 0xFF61, 161 = 0xA1
55 printf '<U%04X> \\x%02X |0\n' $(($i + 65377 - 161)) $i 62 printf '<U%04X> \\x%02X |0\n' $(($i + 65377 - 161)) $i
56 done 63 done
57 } 64 }
58 65
59 66
60 # From http://encoding.spec.whatwg.org/#index-shift_jis-pointer 67 # From https://encoding.spec.whatwg.org/#index-shift_jis-pointer
61 # The index shift_jis pointer for code point is the return value of 68 # The index shift_jis pointer for code point is the return value of
62 # these steps for the round-trip code points (tag = 0) 69 # these steps for the round-trip code points (tag = 0)
63 # 70 #
64 # Let index be index jis0208 excluding all pointers in the range 8272 to 8835. 71 # Let index be index jis0208 excluding all pointers in the range 8272 to 8835.
65 # Return the index pointer for code point in index. 72 # Return the index pointer for code point in index.
66 # For a pointer ($1) inside the range 8272 to 8835, the mapping is for 73 # For a pointer ($1) inside the range 8272 to 8835, the mapping is for
67 # decoding only and the tag is set to '3'. 74 # decoding only and the tag is set to '3'.
68 # Besides, there are 24 more characters with multiple SJIS representations. 75 # Besides, there are 24 more characters with multiple SJIS representations.
69 # Only the first of multiple is tagged with '0' (bi-directional mapping) 76 # Only the first of multiple is tagged with '0' (bi-directional mapping)
70 # while the rest is tagged with '3'. 77 # while the rest is tagged with '3'.
71 78
72 function jis208 { 79 function jis208 {
73 awk '!/^#/ && !/^$/ \ 80 awk '!/^#/ && !/^$/ \
74 { lead = $1 / 188; \ 81 { lead = $1 / 188; \
75 lead_offset = lead < 0x1F ? 0x81 : 0xC1; \ 82 lead_offset = lead < 0x1F ? 0x81 : 0xC1; \
76 trail = $1 % 188; \ 83 trail = $1 % 188; \
77 trail_offset = trail < 0x3F ? 0x40 : 0x41; \ 84 trail_offset = trail < 0x3F ? 0x40 : 0x41; \
78 is_in_range = ($1 < 8272 || $1 > 8835); \ 85 is_in_range = ($1 < 8272 || $1 > 8835); \
79 tag = (is_in_range && has_seen[$2] == 0) ? 0 : 3; \ 86 tag = (is_in_range && has_seen[$2] == 0) ? 0 : 3; \
80 printf ("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),\ 87 printf ("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),\
81 lead + lead_offset, trail + trail_offset, tag);\ 88 lead + lead_offset, trail + trail_offset, tag);\
82 if (is_in_range) has_seen[$2] = 1; \ 89 if (is_in_range) has_seen[$2] = 1; \
83 }' \ 90 }' \
84 index-jis0208.txt 91 index-jis0208.txt
85 } 92 }
86 93
87 # EUDC (End User Defined Characters) is for decoding only 94 # EUDC (End User Defined Characters) is for decoding only
88 # (use '|3' to denote that). 95 # (use '|3' to denote that).
89 # See http://encoding.spec.whatwg.org/#shift_jis-decoder - step 5 96 # See https://encoding.spec.whatwg.org/#shift_jis-decoder - step 5
90 # This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41} 97 # This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41}
91 # to implement it. 98 # to implement it.
92 99
93 function eudc { 100 function eudc {
94 # The upper bound for the lead byte is 0xF8 because each lead can 101 # The upper bound for the lead byte is 0xF8 because each lead can
95 # have 188 characters and the total # of characters in the EUDC 102 # have 188 characters and the total # of characters in the EUDC
96 # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder 103 # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder
97 # step 3.5 in the encoding spec.) 104 # step 3.5 in the encoding spec.)
98 for lead in $(seq 0xF0 0xF8) 105 for lead in $(seq 0xF0 0xF8)
99 do 106 do
(...skipping 10 matching lines...) Expand all
110 function unsorted_table { 117 function unsorted_table {
111 ascii 118 ascii
112 half_width_kana 119 half_width_kana
113 jis208 120 jis208
114 eudc "0x40" "0x7E" "0x40" 121 eudc "0x40" "0x7E" "0x40"
115 eudc "0x80" "0xFC" "0x41" 122 eudc "0x80" "0xFC" "0x41"
116 echo '<U00A5> \x5C |1' 123 echo '<U00A5> \x5C |1'
117 echo '<U203E> \x7E |1' 124 echo '<U203E> \x7E |1'
118 } 125 }
119 126
127 wget -N -r -nd https://encoding.spec.whatwg.org/index-jis0208.txt
120 preamble 128 preamble
121 unsorted_table | sort | uniq 129 unsorted_table | sort | uniq
122 echo 'END CHARMAP' 130 echo 'END CHARMAP'
OLDNEW
« no previous file with comments | « scripts/euckr_gen.sh ('k') | source/data/in/icudtl.dat » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698