Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(791)

Side by Side Diff: scripts/big5_gen.sh

Issue 839713003: ICU update to 54 step 3 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/icu.git@master
Patch Set: add euc-kr-html.ucm (not yet used) Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/bin/sh
2 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
jsbell 2015/01/20 21:47:22 2015 No (c) per http://www.chromium.org/developers
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 # References:
7 # http://encoding.spec.whatwg.org/#big5
jsbell 2015/01/20 21:47:22 nit: can use https everywhere
8
9 # This script reads a local copy (index-big5.txt) of the following file.
10 # https://encoding.spec.whatwg.org/index-big5.txt
11
# Print the fixed header of the generated .ucm file on stdout: the banner
# comment, the converter attributes for "big5-html", the three <icu:state>
# rows, and the opening CHARMAP line.
#
# Defined with POSIX `name()` syntax because the script runs under #!/bin/sh,
# where the `function` keyword is not guaranteed to exist. The heredoc
# delimiter is quoted so the body (which contains backslash sequences such as
# \x3F) is emitted literally, with no shell expansion.
preamble() {
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for Big5
# * described at http://encoding.spec.whatwg.org/#big5
# *
# ***************************************************************************
<code_set_name> "big5-html"
<char_name_mask> "AXXXX"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \x3F
<icu:charsetFamily> "ASCII"

# 'p' is for the range that may produce non-BMP code points.
# See http://userguide.icu-project.org/conversion/data.
<icu:state> 0-7f, 87-fe:1, 87-a0:2, c8:2, fa-fe:2
<icu:state> 40-7e, a1-fe
<icu:state> 40-7e.p, a1-fe.p

CHARMAP
PREAMBLE
}
40
# Print the identity mappings for the ASCII range on stdout: 128 lines, one
# per code point 0..127, each of the form `<UXXXX> \xXX |0`.
#
# Uses POSIX function syntax and a plain counting loop instead of `seq`,
# which is not a POSIX utility, so the function works under any /bin/sh.
# POSIX sh has no `local`; the counter is an ordinary shell variable.
ascii() {
  code=0
  while [ "$code" -lt 128 ]; do
    # '\\x' in the format yields a literal backslash followed by 'x'.
    printf '<U%04X> \\x%02X |0\n' "$code" "$code"
    code=$((code + 1))
  done
}
47
48
# Convert index-big5.txt (read from the current directory) into unsorted
# mapping lines on stdout, one per index entry:
#
#   <Uxxxx> \xLL\xTT |tag sortkey
#
# Input lines starting with '#' and empty lines are skipped. For the rest,
# field 1 is the decimal pointer and field 2 the code point written as
# 0xHHHH; the leading "0x" is stripped.
#
#   lead    = pointer / 157 (integer part) + 0x81
#   trail   = pointer % 157, plus 0x40 when below 0x3F, otherwise plus 0x62
#   tag     = 3 (decoding only) when lead < 0xA1 -- HKSCS characters are not
#             supported in encoding -- or when the code point already
#             appeared on an earlier line; 0 otherwise
#   sortkey = the code point left-padded with one "0" when shorter than five
#             digits; it is a 4th column for the caller to sort on and strip
#
# The awk program uses decimal constants only: hexadecimal literals in awk
# source are not part of POSIX awk (gawk ignores them in POSIX mode), so the
# hex values are given in comments instead. `lead` is truncated explicitly
# with int() rather than relying on printf to truncate a fractional value.
big5() {
  awk '
    !/^#/ && !/^$/ {
      pointer = $1
      ucs = substr($2, 3)
      sortkey = (length(ucs) < 5) ? ("0" ucs) : ucs

      lead = int(pointer / 157) + 129                # + 0x81
      trail = pointer % 157
      trail_offset = (trail < 63) ? 64 : 98          # < 0x3F ? 0x40 : 0x62

      # 161 == 0xA1: lead bytes below it are decode-only, as is every
      # repeated occurrence of a code point after its first line.
      is_decoding_only = (lead < 161) || seen_before[ucs]
      tag = is_decoding_only ? 3 : 0

      printf("<U%4s> \\x%02X\\x%02X |%d %s\n",
             ucs, lead, trail + trail_offset, tag, sortkey)
      seen_before[ucs] = 1
    }
  ' index-big5.txt
}
66
# Print the four fixed mappings whose Unicode side is a sequence of two code
# points (U+00CA or U+00EA followed by U+0304 or U+030C). Each line carries
# tag |3 and, as a 4th column, the sort key of its base character, in the
# same line layout that big5 produces.
#
# Uses POSIX function syntax (the script runs under #!/bin/sh) and a quoted
# heredoc delimiter so the backslash sequences are emitted literally.
two_char_seq() {
  cat <<'EOF'
<U00CA><U0304> \x88\x62 |3 000CA
<U00CA><U030C> \x88\x64 |3 000CA
<U00EA><U0304> \x88\xA3 |3 000EA
<U00EA><U030C> \x88\xA5 |3 000EA
EOF
}
75
# Print every non-ASCII mapping line, unsorted: first the fixed two-code-point
# sequences, then the entries derived from index-big5.txt. Both producers
# append a sort key as a 4th column.
#
# Uses POSIX function syntax because the script runs under #!/bin/sh.
unsorted_table() {
  two_char_seq
  big5
}
80
81 #curl -o index-big5.txt https://encoding.spec.whatwg.org/index-big5.txt
jsbell 2015/01/20 21:47:22 Commented out...? (I should note that I get hands
# Write the complete .ucm file to stdout: header, ASCII mappings, then the
# remaining mappings ordered by their sort key (4th column), de-duplicated,
# with the sort key column removed, and finally the closing marker.
preamble
ascii
# LC_ALL=C forces byte-wise collation so the line order of the generated
# file does not depend on the locale of whoever runs the script.
unsorted_table | LC_ALL=C sort -k4 | uniq | cut -f 1-3 -d ' '
echo 'END CHARMAP'
OLDNEW
« no previous file with comments | « patches/uconv.patch ('k') | scripts/euckr_gen.sh » ('j') | scripts/euckr_gen.sh » ('J')

Powered by Google App Engine
This is Rietveld 408576698