OLD | NEW |
(Empty) | |
| 1 #!/bin/sh |
| 2 # Copyright (c) 2014 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 # References: |
| 7 # http://encoding.spec.whatwg.org/#shift_jis |
| 8 |
| 9 # Download the following file, run this script in the source/data/mappings |
| 10 # directory, and save the result to shift_jis-html5.ucm |
| 11 # http://encoding.spec.whatwg.org/index-jis0208.txt |
| 12 |
# preamble: write the fixed .ucm header for the shift_jis-html5 converter
# (copyright banner, converter properties, MBCS state table and the opening
# CHARMAP keyword) to stdout. Takes no arguments.
#
# Defined with the POSIX name() form because the script runs under #!/bin/sh,
# where the 'function' keyword is not guaranteed to exist.
preamble() {
  # The delimiter is quoted so the header is emitted verbatim: no parameter,
  # command or backslash processing can ever touch the \xFC\xFC bytes below.
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for Shift_JIS
# * described at http://encoding.spec.whatwg.org/#shift_jis
# *
# ***************************************************************************
<code_set_name> "shift_jis-html5"
<char_name_mask> "AXXXX"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \xFC\xFC
<subchar1> \x7F
<icu:charsetFamily> "ASCII"

<icu:state> 0-80, 81-9f:1, a1-df, e0-fc:1
<icu:state> 40-7e, 80-fc

CHARMAP
PREAMBLE
}
| 39 |
| 40 # The encoding spec for Shift_JIS says U+0080 has to be round-tripped with |
| 41 # 0x80. So, this is one character more than ASCII up to 128 (0x80). |
# ascii: print one round-trip (|0) single-byte mapping per line for code
# points 0 through 128 inclusive, i.e. <U0000> \x00 up to <U0080> \x80.
# Takes no arguments; writes 129 lines to stdout.
#
# Uses the POSIX name() definition form and a shell arithmetic loop so it
# works under the #!/bin/sh shebang without the 'function' keyword or an
# external seq process.
ascii() {
  code=0
  while [ "$code" -le 128 ]
  do
    # '\\x' in the format is printf's escape for a literal backslash + 'x'.
    printf '<U%04X> \\x%02X |0\n' "$code" "$code"
    code=$(($code + 1))
  done
}
| 48 |
| 49 |
| 50 # Map 0x[A1-DF] to U+FF61 to U+FF9F |
# half_width_kana: print one round-trip (|0) mapping per line for the
# single bytes 0xA1 through 0xDF, which map linearly onto U+FF61..U+FF9F.
# Takes no arguments; writes 63 lines to stdout.
#
# The loop is plain POSIX shell arithmetic: the original depended on the
# 'function' keyword and on seq accepting hexadecimal operands, neither of
# which is guaranteed under #!/bin/sh.
half_width_kana() {
  byte=$((0xA1))
  last_byte=$((0xDF))
  while [ "$byte" -le "$last_byte" ]
  do
    # Constant distance between the byte value and its code point:
    # 0xFF61 - 0xA1 (65377 - 161).
    printf '<U%04X> \\x%02X |0\n' $(($byte + 0xFF61 - 0xA1)) "$byte"
    byte=$(($byte + 1))
  done
}
| 58 |
| 59 |
| 60 # From http://encoding.spec.whatwg.org/#index-shift_jis-pointer |
| 61 # The index shift_jis pointer for code point is the return value of |
| 62 # these steps for the round-trip code points (tag = 0) |
| 63 # |
| 64 # Let index be index jis0208 excluding all pointers in the range 8272 to 8835. |
| 65 # Return the index pointer for code point in index. |
| 66 # Pointers ($1) inside the excluded range 8272 to 8835 are for decoding |
| 67 # only, so their tag is set to '3'. |
| 68 # Besides, there are 24 more characters with multiple SJIS representations. |
| 69 # Only the first of multiple is tagged with '0' (bi-directional mapping) |
| 70 # while the rest is tagged with '3'. |
| 71 |
# jis208 [INDEX_FILE]: convert the WHATWG jis0208 index into .ucm mapping
# lines of the form "<UXXXX> \xLL\xTT |tag" on stdout.
#
#   INDEX_FILE  path of the index file; defaults to index-jis0208.txt in the
#               current directory. Comment (#) and empty lines are skipped;
#               on every other line field 1 is the pointer and field 2 the
#               code point written as 0xXXXX.
#
# Tag 0 (round trip) is given to the first occurrence of a code point whose
# pointer lies outside 8272..8835; every other line gets tag 3 (decode only).
#
# The awk program sticks to decimal constants and an explicit int(): hex
# literals such as 0x1F are not POSIX awk numeric constants, and the lead
# byte must not depend on printf truncating a fractional quotient.
jis208() {
  awk '
    !/^#/ && !/^$/ {
      pointer = $1 + 0
      code_point = $2

      lead = int(pointer / 188)
      lead_offset = (lead < 31) ? 129 : 193      # lead < 0x1F ? 0x81 : 0xC1
      trail = pointer % 188
      trail_offset = (trail < 63) ? 64 : 65      # trail < 0x3F ? 0x40 : 0x41

      # Pointers 8272..8835 are excluded from the encoder index.
      encodable = (pointer < 8272 || pointer > 8835)
      tag = (encodable && !(code_point in has_seen)) ? 0 : 3

      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr(code_point, 3),
             lead + lead_offset, trail + trail_offset, tag)

      # Only encodable occurrences claim the round-trip slot.
      if (encodable) has_seen[code_point] = 1
    }
  ' "${1:-index-jis0208.txt}"
}
| 86 |
| 87 # EUDC (End User Defined Characters) is for decoding only |
| 88 # (use '|3' to denote that). |
| 89 # See http://encoding.spec.whatwg.org/#shift_jis-decoder - step 5 |
| 90 # This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41} |
| 91 # to implement it. |
| 92 |
# eudc FIRST_TRAIL LAST_TRAIL TRAIL_OFFSET: print decode-only (|3) mappings
# for the EUDC (End User Defined Characters) area, one per line on stdout.
#
#   FIRST_TRAIL, LAST_TRAIL  inclusive range of trail bytes to emit
#   TRAIL_OFFSET             value subtracted from the trail byte when
#                            computing the pointer
# All three may be given in any form shell arithmetic accepts (e.g. 0x40).
#
# Written with POSIX arithmetic loops: the original needed the 'function'
# keyword and a seq that understands hexadecimal operands, neither of which
# is guaranteed under #!/bin/sh.
eudc() {
  # Normalise the arguments once; they do not change inside the loops.
  first_trail=$(($1))
  last_trail=$(($2))
  trail_offset=$(($3))

  # The upper bound for the lead byte is 0xF8 because each lead can
  # have 188 characters and the total # of characters in the EUDC
  # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder
  # step 3.5 in the encoding spec.)
  lead=$((0xF0))
  last_lead=$((0xF8))
  while [ "$lead" -le "$last_lead" ]
  do
    byte=$first_trail
    while [ "$byte" -le "$last_trail" ]
    do
      pointer=$(( ($lead - 0xC1) * 188 + $byte - $trail_offset ))
      # Pointer 8836 corresponds to U+E000.
      unicode=$(($pointer - 8836 + 0xE000))
      printf '<U%04X> \\x%02X\\x%02X |3\n' "$unicode" "$lead" "$byte"
      byte=$(($byte + 1))
    done
    lead=$(($lead + 1))
  done
}
| 109 |
# unsorted_table: write every mapping line of the table to stdout, in
# generation order (the caller is expected to sort and de-duplicate):
# the single-byte range, half-width kana, the jis0208 index, both halves of
# the EUDC trail-byte range, and finally two fallback (|1) mappings.
# Takes no arguments.
#
# Uses the POSIX name() form for #!/bin/sh, and printf rather than echo for
# the literal lines: how echo treats operands containing backslashes is
# implementation-defined, whereas printf '%s\n' always prints them verbatim.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  # Trail bytes 0x40-0x7E use offset 0x40; 0x80-0xFC use offset 0x41.
  eudc "0x40" "0x7E" "0x40"
  eudc "0x80" "0xFC" "0x41"
  # Tag |1 entries: U+00A5 -> 0x5C and U+203E -> 0x7E.
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
}
| 119 |
# Emit the complete .ucm file on stdout: the header first, then the mapping
# lines sorted with duplicates collapsed, and finally the line that closes
# the CHARMAP section.
preamble
unsorted_table |
  sort |
  uniq
printf '%s\n' 'END CHARMAP'
OLD | NEW |