Index: scripts/big5_gen.sh |
diff --git a/scripts/big5_gen.sh b/scripts/big5_gen.sh |
new file mode 100644 |
index 0000000000000000000000000000000000000000..58bb680f5b6c817c8baa2f2bbf534c19cb885caf |
--- /dev/null |
+++ b/scripts/big5_gen.sh |
@@ -0,0 +1,89 @@ |
+#!/bin/sh |
+# Copyright 2015 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+# References: |
+# https://encoding.spec.whatwg.org/#big5 |
+ |
+# This script downloads the following file. |
+# https://encoding.spec.whatwg.org/index-big5.txt |
+ |
+preamble() {  # Print the fixed header of the mapping table, up to CHARMAP. |
+cat <<'PREAMBLE' |
+# *************************************************************************** |
+# * |
+# * Copyright (C) 1995-2014, International Business Machines |
+# * Corporation and others. All Rights Reserved. |
+# * |
+# * Generated per the algorithm for Big5 |
+# * described at http://encoding.spec.whatwg.org/#big5 |
+# * |
+# *************************************************************************** |
+<code_set_name> "big5-html" |
+<char_name_mask> "AXXXX" |
+<mb_cur_max> 2 |
+<mb_cur_min> 1 |
+<uconv_class> "MBCS" |
+<subchar> \x3F |
+<icu:charsetFamily> "ASCII" |
+ |
+# 'p' is for the range that may produce non-BMP code points. |
+# See http://userguide.icu-project.org/conversion/data. |
+<icu:state> 0-7f, 87-fe:1, 87-a0:2, c8:2, fa-fe:2 |
+<icu:state> 40-7e, a1-fe |
+<icu:state> 40-7e.p, a1-fe.p |
+ |
+CHARMAP |
+PREAMBLE |
+} |
+ |
+# Print one round-trip (|0) mapping line for each code point 0..127. |
+ascii() { |
+  for i in $(seq 0 127); do |
+    printf '<U%04X> \\x%02X |0\n' "$i" "$i" |
+  done |
+} |
+ |
+ |
+# Print one mapping line per index-big5.txt entry, followed by a sort key. |
+# Decoding-only (|3): HKSCS ( |lead < 0xA1| ), pointers 5287 and 5289, and |
+# any code point that already has a round-trip (|0) entry. |
+# See https://www.w3.org/Bugs/Public/show_bug.cgi?id=27878 |
+big5() { |
+  # Decimal constants only: hex literals in awk source are a gawk extension. |
+  awk '!/^#/ && !/^$/ { |
+    pointer = $1 + 0 |
+    ucs = substr($2, 3) |
+    sortkey = (length(ucs) < 5) ? ("0" ucs) : ucs |
+    lead = int(pointer / 157) + 129             # 0x81 |
+    trail = pointer % 157 |
+    trail += (trail < 63) ? 64 : 98             # 0x3F, 0x40, 0x62 |
+    decoding_only = (lead < 161 || seen_before[ucs] || |
+      pointer == 5287 || pointer == 5289)       # 161 is 0xA1 |
+    printf "<U%4s> \\x%02X\\x%02X |%d %s\n", ucs, lead, trail, |
+      (decoding_only ? 3 : 0), sortkey |
+    # Latch, never reset: a later repeat must not become round-trip again. |
+    if (!decoding_only) seen_before[ucs] = 1 |
+  }' index-big5.txt |
+} |
+ |
+two_char_seq() {  # Decoding-only (|3) two-code-point lines, with sort key. |
+cat <<'EOF' |
+<U00CA><U0304> \x88\x62 |3 000CA |
+<U00CA><U030C> \x88\x64 |3 000CA |
+<U00EA><U0304> \x88\xA3 |3 000EA |
+<U00EA><U030C> \x88\xA5 |3 000EA |
+EOF |
+} |
+ |
+unsorted_table() {  # All non-ASCII mapping lines, each with a sort key. |
+  two_char_seq |
+  big5 |
+} |
+wget -N -r -nd https://encoding.spec.whatwg.org/index-big5.txt |
+[ -s index-big5.txt ] || { echo 'index-big5.txt is missing' >&2; exit 1; } |
+preamble; ascii |
+# Sort on field 4 in the C locale (locale-independent output), then drop it. |
+unsorted_table | LC_ALL=C sort -k4 | uniq | cut -f 1-3 -d ' ' |
+echo 'END CHARMAP' |