Chromium Code Reviews| Index: scripts/big5_gen.sh |
| diff --git a/scripts/big5_gen.sh b/scripts/big5_gen.sh |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..91a444a6f5426e07e8fc078966c44c60ee1f684c |
| --- /dev/null |
| +++ b/scripts/big5_gen.sh |
| @@ -0,0 +1,85 @@ |
| +#!/bin/sh |
| +# Copyright (c) 2014 The Chromium Authors. All rights reserved. |
|
jsbell
2015/01/20 21:47:22
2015
No (c) per http://www.chromium.org/developers
|
| +# Use of this source code is governed by a BSD-style license that can be |
| +# found in the LICENSE file. |
| + |
| +# References: |
| +# http://encoding.spec.whatwg.org/#big5 |
|
jsbell
2015/01/20 21:47:22
nit: can use https everywhere
|
| + |
| +# This script downloads the following file. |
| +# https://encoding.spec.whatwg.org/index-big5.txt |
| + |
# Emit the fixed header of the generated charmap: the copyright banner, the
# converter metadata lines (<code_set_name>, <mb_cur_max>, ...), the
# <icu:state> table, and the opening CHARMAP line.
# Arguments: none
# Outputs:   the header text on stdout
#
# Written in the POSIX "name() { ...; }" form: the script is run by /bin/sh,
# and the "function" keyword is a bash/ksh extension that shells such as dash
# do not accept.
preamble() {
  # The delimiter is quoted so the body is copied verbatim, with no parameter
  # expansion or backslash processing (the text contains "\x3F").
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for Big5
# * described at http://encoding.spec.whatwg.org/#big5
# *
# ***************************************************************************
<code_set_name> "big5-html"
<char_name_mask> "AXXXX"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \x3F
<icu:charsetFamily> "ASCII"

# 'p' is for the range that may produce non-BMP code points.
# See http://userguide.icu-project.org/conversion/data.
<icu:state> 0-7f, 87-fe:1, 87-a0:2, c8:2, fa-fe:2
<icu:state> 40-7e, a1-fe
<icu:state> 40-7e.p, a1-fe.p

CHARMAP
PREAMBLE
}
| + |
# Emit one round-trip mapping line for each of the 128 ASCII code points,
# in the form "<UXXXX> \xNN |0" with XXXX and NN in upper-case hex.
# Arguments: none
# Outputs:   128 mapping lines on stdout, U+0000 through U+007F
#
# Uses only POSIX sh constructs: no "function" keyword and no dependency on
# the non-standard seq(1) utility.
ascii() {
  # "i" is a plain (global) shell variable, as in the original loop; POSIX sh
  # has no "local".
  i=0
  while [ "$i" -le 127 ]; do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}
| + |
| + |
# Convert the WHATWG Big5 index into charmap lines with a trailing sort key.
#
# Each data row of the index is "<pointer> 0x<code point> ...". For every row
# one line is written:
#     <Uxxxx> \xLL\xTT |tag sortkey
# where
#   LL      = int(pointer / 157) + 0x81          (lead byte)
#   TT      = pointer % 157, plus 0x40 when that remainder is below 0x3F and
#             0x62 otherwise                     (trail byte)
#   tag     = 3 (decoding only) when the lead byte is below 0xA1 or the code
#             point was already emitted by an earlier row; 0 otherwise.
#             HKSCS characters (lead < 0xA1) are not supported in encoding.
#   sortkey = the code point left-padded with "0" to five hex digits.
#
# Arguments: $1 - index file to read (optional; default: index-big5.txt)
# Outputs:   one mapping line per data row on stdout, in input order
# Returns:   awk's exit status (non-zero if the file cannot be read)
#
# The awk program uses decimal constants and an explicit int(): hex literals
# are not part of the POSIX awk grammar, and "/" is floating-point division.
big5() {
  awk '
    # Skip comment lines and anything without a pointer and a code point.
    !/^[ \t]*#/ && NF >= 2 {
      pointer = $1
      ucs = substr($2, 3)                       # drop the leading "0x"
      sortkey = (length(ucs) < 5) ? ("0" ucs) : ucs
      lead = int(pointer / 157) + 129           # 129 = 0x81
      trail = pointer % 157
      trail_byte = trail + (trail < 63 ? 64 : 98)   # 0x3F ? 0x40 : 0x62
      decoding_only = (lead < 161 || (ucs in seen_before))   # 161 = 0xA1
      printf("<U%4s> \\x%02X\\x%02X |%d %s\n",
             ucs, lead, trail_byte, (decoding_only ? 3 : 0), sortkey)
      seen_before[ucs] = 1
    }
  ' "${1:-index-big5.txt}"
}
| + |
# Emit the four fixed mappings whose Unicode side is a two-code-point
# sequence (U+00CA / U+00EA followed by U+0304 or U+030C). Each line carries
# tag 3 and a five-digit sort key in the fourth field, matching the line
# layout produced for the single-code-point entries.
# Arguments: none
# Outputs:   four mapping lines on stdout
#
# POSIX function syntax is used because the script runs under /bin/sh; the
# quoted here-doc delimiter keeps the "\x.." byte sequences literal.
two_char_seq() {
  cat <<'EOF'
<U00CA><U0304> \x88\x62 |3 000CA
<U00CA><U030C> \x88\x64 |3 000CA
<U00EA><U0304> \x88\xA3 |3 000EA
<U00EA><U030C> \x88\xA5 |3 000EA
EOF
}
| + |
# Emit every multi-byte mapping line, unsorted: first the fixed two-code-point
# sequences, then the entries derived from the Big5 index. Each line ends with
# a sort key that the caller is expected to sort on and then strip.
# Arguments: none
# Outputs:   mapping lines (with sort key) on stdout
#
# POSIX function syntax is used because the script runs under /bin/sh, where
# the "function" keyword is not portable.
unsorted_table() {
  two_char_seq
  big5
}
| + |
| +#curl -o index-big5.txt https://encoding.spec.whatwg.org/index-big5.txt |
|
jsbell
2015/01/20 21:47:22
Commented out...?
(I should note that I get hands
|
# Write the complete charmap to stdout: header, ASCII range, then the
# multi-byte table sorted on its fourth field (the sort key), de-duplicated,
# and cut back to the first three fields.

# Fail up front if the index is missing; otherwise the script would still
# print a header and an incomplete table.
if [ ! -r index-big5.txt ]; then
  echo 'big5_gen.sh: cannot read index-big5.txt (download it from https://encoding.spec.whatwg.org/index-big5.txt)' >&2
  exit 1
fi

preamble
ascii
# LC_ALL=C makes the ordering byte-wise, so the output does not depend on the
# locale of whoever runs the script.
unsorted_table | LC_ALL=C sort -k4 | uniq | cut -f 1-3 -d ' '
echo 'END CHARMAP'