OLD | NEW |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
| 5 #include "CharacterData.h" |
| 6 |
5 #include "CharacterProperty.h" | 7 #include "CharacterProperty.h" |
6 | |
7 #include <cassert> | 8 #include <cassert> |
8 #include <cstring> | 9 #include <cstring> |
9 #include <stdio.h> | 10 #include <stdio.h> |
10 #include <unicode/uobject.h> | 11 #if !defined(USING_SYSTEM_ICU) |
11 #define MUTEX_H // Prevent compile failure of utrie2.h on Windows | 12 #define MUTEX_H // Prevent compile failure of utrie2.h on Windows |
12 #include <utrie2.h> | 13 #include <utrie2.h> |
| 14 #endif |
| 15 |
| 16 #if defined(USING_SYSTEM_ICU) |
// Placeholder used when USING_SYSTEM_ICU is defined: utrie2.h is not included
// in that configuration, and this version of generate() writes nothing to the
// stream it is given.
static void generate(FILE*) {}
| 20 #else |
| 21 |
| 22 using namespace blink; |
13 | 23 |
14 const UChar32 kMaxCodepoint = 0x10FFFF; | 24 const UChar32 kMaxCodepoint = 0x10FFFF; |
15 using CharacterProperty = blink::CharacterProperty; | |
16 #define ARRAY_LENGTH(a) (sizeof(a) / sizeof((a)[0])) | 25 #define ARRAY_LENGTH(a) (sizeof(a) / sizeof((a)[0])) |
17 | 26 |
18 static const UChar32 cjkIsolatedSymbolsArray[] = { | |
19 // 0x2C7 Caron, Mandarin Chinese 3rd Tone | |
20 0x2C7, | |
21 // 0x2CA Modifier Letter Acute Accent, Mandarin Chinese 2nd Tone | |
22 0x2CA, | |
23 // 0x2CB Modifier Letter Grave Accent, Mandarin Chinese 4th Tone | |
24 0x2CB, | |
25 // 0x2D9 Dot Above, Mandarin Chinese 5th Tone | |
26 0x2D9, | |
27 0x2020, 0x2021, 0x2030, 0x203B, 0x203C, 0x2042, 0x2047, 0x2048, 0x2049, 0x20
51, | |
28 0x20DD, 0x20DE, 0x2100, 0x2103, 0x2105, 0x2109, 0x210A, 0x2113, 0x2116, 0x21
21, | |
29 0x212B, 0x213B, 0x2150, 0x2151, 0x2152, 0x217F, 0x2189, 0x2307, 0x2312, 0x23
CE, | |
30 0x2423, 0x25A0, 0x25A1, 0x25A2, 0x25AA, 0x25AB, 0x25B1, 0x25B2, 0x25B3, 0x25
B6, | |
31 0x25B7, 0x25BC, 0x25BD, 0x25C0, 0x25C1, 0x25C6, 0x25C7, 0x25C9, 0x25CB, 0x25
CC, | |
32 0x25EF, 0x2605, 0x2606, 0x260E, 0x2616, 0x2617, 0x2640, 0x2642, 0x26A0, 0x26
BD, | |
33 0x26BE, 0x2713, 0x271A, 0x273F, 0x2740, 0x2756, 0x2B1A, 0xFE10, 0xFE11, 0xFE
12, | |
34 0xFE19, 0xFF1D, | |
35 // Emoji. | |
36 0x1F100 | |
37 }; | |
38 | |
39 static const UChar32 cjkIdeographRanges[] = { | |
40 // CJK Radicals Supplement and Kangxi Radicals. | |
41 0x2E80, 0x2FDF, | |
42 // CJK Strokes. | |
43 0x31C0, 0x31EF, | |
44 // CJK Unified Ideographs Extension A. | |
45 0x3400, 0x4DBF, | |
46 // The basic CJK Unified Ideographs block. | |
47 0x4E00, 0x9FFF, | |
48 // CJK Compatibility Ideographs. | |
49 0xF900, 0xFAFF, | |
50 // CJK Unified Ideographs Extension B. | |
51 0x20000, 0x2A6DF, | |
52 // CJK Unified Ideographs Extension C. | |
53 // CJK Unified Ideographs Extension D. | |
54 0x2A700, 0x2B81F, | |
55 // CJK Compatibility Ideographs Supplement. | |
56 0x2F800, 0x2FA1F | |
57 }; | |
58 | |
59 static const UChar32 cjkSymbolRanges[] = { | |
60 0x2156, 0x215A, | |
61 0x2160, 0x216B, | |
62 0x2170, 0x217B, | |
63 0x23BE, 0x23CC, | |
64 0x2460, 0x2492, | |
65 0x249C, 0x24FF, | |
66 0x25CE, 0x25D3, | |
67 0x25E2, 0x25E6, | |
68 0x2600, 0x2603, | |
69 0x2660, 0x266F, | |
70 // Emoji HEAVY HEART EXCLAMATION MARK ORNAMENT..HEAVY BLACK HEART | |
71 // Needed in order not to break Emoji heart-kiss sequences in | |
72 // CachingWordShapeIterator. | |
73 // cmp. http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html | |
74 0x2763, 0x2764, | |
75 0x2672, 0x267D, | |
76 0x2776, 0x277F, | |
77 // Ideographic Description Characters, with CJK Symbols and Punctuation, | |
78 // excluding 0x3030. | |
79 // Then Hiragana 0x3040 .. 0x309F, Katakana 0x30A0 .. 0x30FF, Bopomofo | |
80 // 0x3100 .. 0x312F | |
81 0x2FF0, 0x302F, | |
82 0x3031, 0x312F, | |
83 // More Bopomofo and Bopomofo Extended 0x31A0 .. 0x31BF | |
84 0x3190, 0x31BF, | |
85 // Enclosed CJK Letters and Months (0x3200 .. 0x32FF). | |
86 // CJK Compatibility (0x3300 .. 0x33FF). | |
87 0x3200, 0x33FF, | |
88 0xF860, 0xF862, | |
89 // CJK Compatibility Forms. | |
90 0xFE30, 0xFE4F, | |
91 // Halfwidth and Fullwidth Forms | |
92 // Usually only used in CJK | |
93 0xFF00, 0xFF0C, | |
94 0xFF0E, 0xFF1A, | |
95 0xFF1F, 0xFFEF, | |
96 // Emoji. | |
97 0x1F110, 0x1F129, | |
98 0x1F130, 0x1F149, | |
99 0x1F150, 0x1F169, | |
100 0x1F170, 0x1F189, | |
101 0x1F200, 0x1F6FF | |
102 }; | |
103 | |
104 // Individual codepoints needed for Unicode vertical text layout according to | |
105 // http://www.unicode.org/reports/tr50/ | |
106 // Taken from the corresponding data file: | |
107 // http://www.unicode.org/Public/vertical/revision-13/VerticalOrientation-13.txt | |
108 static const UChar32 isUprightInMixedVerticalArray[] = { | |
109 0x000A7, | |
110 0x000A9, | |
111 0x000AE, | |
112 0x000B1, | |
113 0x000D7, | |
114 0x000F7 | |
115 }; | |
116 | |
117 static const UChar32 isUprightInMixedVerticalRanges[] = { | |
118 0x000BC, 0x000BE, | |
119 // Spacing Modifier Letters (Part of) | |
120 0x002EA, 0x002EB, | |
121 // Hangul Jamo | |
122 0x01100, 0x011FF, | |
123 // Unified Canadian Aboriginal Syllabics | |
124 0x01401, 0x0167F, | |
125 // Unified Canadian Aboriginal Syllabics Extended | |
126 0x018B0, 0x018FF, | |
127 // General Punctuation (Part of) | |
128 0x02016, 0x02016, | |
129 0x02020, 0x02021, | |
130 0x02030, 0x02031, | |
131 0x0203B, 0x0203C, | |
132 0x02042, 0x02042, | |
133 0x02047, 0x02049, | |
134 0x02051, 0x02051, | |
135 0x02065, 0x02069, | |
136 // Combining Diacritical Marks for Symbols (Part of) | |
137 0x020DD, 0x020E0, | |
138 0x020E2, 0x020E4, | |
139 // Letterlike Symbols (Part of)/Number Forms | |
140 0x02100, 0x02101, | |
141 0x02103, 0x02109, | |
142 0x0210F, 0x0210F, | |
143 0x02113, 0x02114, | |
144 0x02116, 0x02117, | |
145 0x0211E, 0x02123, | |
146 0x02125, 0x02125, | |
147 0x02127, 0x02127, | |
148 0x02129, 0x02129, | |
149 0x0212E, 0x0212E, | |
150 0x02135, 0x0213F, | |
151 0x02145, 0x0214A, | |
152 0x0214C, 0x0214D, | |
153 0x0214F, 0x0218F, | |
154 // Mathematical Operators (Part of) | |
155 0x0221E, 0x0221E, | |
156 0x02234, 0x02235, | |
157 // Miscellaneous Technical (Part of) | |
158 0x02300, 0x02307, | |
159 0x0230C, 0x0231F, | |
160 0x02324, 0x0232B, | |
161 0x0237D, 0x0239A, | |
162 0x023BE, 0x023CD, | |
163 0x023CF, 0x023CF, | |
164 0x023D1, 0x023DB, | |
165 0x023E2, 0x02422, | |
166 // Control Pictures (Part of)/Optical Character Recognition/Enclosed | |
167 // Alphanumerics | |
168 0x02424, 0x024FF, | |
169 // Geometric Shapes/Miscellaneous Symbols (Part of) | |
170 0x025A0, 0x02619, | |
171 0x02620, 0x02767, | |
172 0x02776, 0x02793, | |
173 // Miscellaneous Symbols and Arrows (Part of) | |
174 0x02B12, 0x02B2F, | |
175 0x02B50, 0x02B59, | |
176 0x02BB8, 0x02BFF, | |
177 // Common CJK | |
178 0x02E80, 0x0A4CF, | |
179 // Hangul Jamo Extended-A | |
180 0x0A960, 0x0A97F, | |
181 // Hangul Syllables/Hangul Jamo Extended-B | |
182 0x0AC00, 0x0D7FF, | |
183 // Private Use Area/CJK Compatibility Ideographs | |
184 0x0E000, 0x0FAFF, | |
185 // Vertical Forms | |
186 0x0FE10, 0x0FE1F, | |
187 // CJK Compatibility Forms (Part of) | |
188 0x0FE30, 0x0FE48, | |
189 // Small Form Variants (Part of) | |
190 0x0FE50, 0x0FE57, | |
191 0x0FE59, 0x0FE62, | |
192 0x0FE67, 0x0FE6F, | |
193 // Halfwidth and Fullwidth Forms | |
194 0x0FF01, 0x0FF0C, | |
195 0x0FF0E, 0x0FF1B, | |
196 0x0FF1F, 0x0FF60, | |
197 0x0FFE0, 0x0FFE7, | |
198 // Specials (Part of) | |
199 0x0FFF0, 0x0FFF8, | |
200 0x0FFFC, 0x0FFFD, | |
201 // Meroitic Hieroglyphs | |
202 0x10980, 0x1099F, | |
203 // Siddham | |
204 0x11580, 0x115FF, | |
205 // Egyptian Hieroglyphs | |
206 0x13000, 0x1342F, | |
207 // Kana Supplement | |
208 0x1B000, 0x1B0FF, | |
209 // Byzantine Musical Symbols/Musical Symbols | |
210 0x1D000, 0x1D1FF, | |
211 // Tai Xuan Jing Symbols/Counting Rod Numerals | |
212 0x1D300, 0x1D37F, | |
213 // Mahjong Tiles/Domino Tiles/Playing Cards/Enclosed Alphanumeric Supplement | |
214 // Enclosed Ideographic Supplement/Miscellaneous Symbols and Pictographs | |
215 // Emoticons/Ornamental Dingbats/Transport and Map Symbols/Alchemical | |
216 // Symbols/Geometric Shapes Extended | |
217 0x1F000, 0x1F7FF, | |
218 // CJK Unified Ideographs Extension B/C/D | |
219 // CJK Compatibility Ideographs Supplement | |
220 0x20000, 0x2FFFD, | |
221 0x30000, 0x3FFFD, | |
222 // Supplementary Private Use Area-A | |
223 0xF0000, 0xFFFFD, | |
224 // Supplementary Private Use Area-B | |
225 0x100000, 0x10FFFD, | |
226 }; | |
227 | |
228 static void setRanges(CharacterProperty* values, | 27 static void setRanges(CharacterProperty* values, |
229 const UChar32* ranges, size_t length, | 28 const UChar32* ranges, size_t length, |
230 CharacterProperty value) | 29 CharacterProperty value) |
231 { | 30 { |
232 assert(length % 2 == 0); | 31 assert(length % 2 == 0); |
233 const UChar32* end = ranges + length; | 32 const UChar32* end = ranges + length; |
234 for (; ranges != end; ranges += 2) { | 33 for (; ranges != end; ranges += 2) { |
235 assert(ranges[0] <= ranges[1] | 34 assert(ranges[0] <= ranges[1] |
236 && ranges[1] <= kMaxCodepoint); | 35 && ranges[1] <= kMaxCodepoint); |
237 for (UChar32 c = ranges[0]; c <= ranges[1]; c++) | 36 for (UChar32 c = ranges[0]; c <= ranges[1]; c++) |
238 values[c] |= value; | 37 values[c] |= value; |
239 } | 38 } |
240 } | 39 } |
241 | 40 |
242 static void setValues(CharacterProperty* values, | 41 static void setValues(CharacterProperty* values, |
243 const UChar32* begin, size_t length, | 42 const UChar32* begin, size_t length, |
244 CharacterProperty value) | 43 CharacterProperty value) |
245 { | 44 { |
246 const UChar32* end = begin + length; | 45 const UChar32* end = begin + length; |
247 for (; begin != end; begin++) { | 46 for (; begin != end; begin++) { |
248 assert(*begin <= kMaxCodepoint); | 47 assert(*begin <= kMaxCodepoint); |
249 values[*begin] |= value; | 48 values[*begin] |= value; |
250 } | 49 } |
251 } | 50 } |
252 | 51 |
// Writes a C++ source file to |fp| that defines, inside namespace blink,
// serializedCharacterDataSize (set to |size|) and serializedCharacterData
// (the first |size| bytes of |array| as hex literals, 16 per row).
static void generateUTrieSerialized(FILE* fp, int32_t size, uint8_t* array)
{
    const int32_t kBytesPerRow = 16;

    // File prologue and the opening of the byte array.
    fputs("#include <cstdint>\n\n"
          "namespace blink {\n\n", fp);
    fprintf(fp, "int32_t serializedCharacterDataSize = %d;\n", size);
    fputs("uint8_t serializedCharacterData[] = {", fp);

    // One hex literal per byte; a new row starts before every 16th byte.
    for (int32_t index = 0; index < size; ++index) {
        if (index % kBytesPerRow == 0)
            fputs("\n ", fp);
        fprintf(fp, " 0x%02X,", array[index]);
    }

    // Close the array and the namespace.
    fputs("\n};\n\n"
          "} // namespace blink\n", fp);
}
269 | 68 |
270 int main(int argc, char** argv) | 69 static void generate(FILE* fp) |
271 { | 70 { |
272 // Create a value array of all possible code points. | 71 // Create a value array of all possible code points. |
273 const UChar32 size = kMaxCodepoint + 1; | 72 const UChar32 size = kMaxCodepoint + 1; |
274 CharacterProperty* values = new CharacterProperty[size]; | 73 CharacterProperty* values = new CharacterProperty[size]; |
275 memset(values, 0, sizeof(CharacterProperty) * size); | 74 memset(values, 0, sizeof(CharacterProperty) * size); |
276 | 75 |
277 setRanges(values, | 76 #define SET(name) \ |
278 cjkIdeographRanges, ARRAY_LENGTH(cjkIdeographRanges), | 77 setRanges(values, name##Ranges, ARRAY_LENGTH(name##Ranges), \ |
279 CharacterProperty::isCJKIdeographOrSymbol); | 78 CharacterProperty::name); \ |
280 setRanges(values, | 79 setValues(values, name##Array, ARRAY_LENGTH(name##Array), \ |
281 cjkSymbolRanges, ARRAY_LENGTH(cjkSymbolRanges), | 80 CharacterProperty::name); |
282 CharacterProperty::isCJKIdeographOrSymbol); | |
283 setValues(values, | |
284 cjkIsolatedSymbolsArray, ARRAY_LENGTH(cjkIsolatedSymbolsArray), | |
285 CharacterProperty::isCJKIdeographOrSymbol); | |
286 | 81 |
287 setRanges(values, | 82 SET(isCJKIdeographOrSymbol); |
288 isUprightInMixedVerticalRanges, | 83 SET(isUprightInMixedVertical); |
289 ARRAY_LENGTH(isUprightInMixedVerticalRanges), | |
290 CharacterProperty::isUprightInMixedVertical); | |
291 setValues(values, | |
292 isUprightInMixedVerticalArray, | |
293 ARRAY_LENGTH(isUprightInMixedVerticalArray), | |
294 CharacterProperty::isUprightInMixedVertical); | |
295 | 84 |
296 // Create a trie from the value array. | 85 // Create a trie from the value array. |
297 UErrorCode error = U_ZERO_ERROR; | 86 UErrorCode error = U_ZERO_ERROR; |
298 UTrie2* trie = utrie2_open(0, 0, &error); | 87 UTrie2* trie = utrie2_open(0, 0, &error); |
299 assert(error == U_ZERO_ERROR); | 88 assert(error == U_ZERO_ERROR); |
300 UChar32 start = 0; | 89 UChar32 start = 0; |
301 CharacterProperty value = values[0]; | 90 CharacterProperty value = values[0]; |
302 for (UChar32 c = 1;; c++) { | 91 for (UChar32 c = 1;; c++) { |
303 if (c < size && values[c] == value) | 92 if (c < size && values[c] == value) |
304 continue; | 93 continue; |
(...skipping 10 matching lines...) Expand all Loading... |
315 | 104 |
316 // Freeze and serialize the trie to a byte array. | 105 // Freeze and serialize the trie to a byte array. |
317 utrie2_freeze(trie, UTrie2ValueBits::UTRIE2_16_VALUE_BITS, &error); | 106 utrie2_freeze(trie, UTrie2ValueBits::UTRIE2_16_VALUE_BITS, &error); |
318 assert(error == U_ZERO_ERROR); | 107 assert(error == U_ZERO_ERROR); |
319 int32_t serializedSize = utrie2_serialize(trie, nullptr, 0, &error); | 108 int32_t serializedSize = utrie2_serialize(trie, nullptr, 0, &error); |
320 error = U_ZERO_ERROR; | 109 error = U_ZERO_ERROR; |
321 uint8_t* serialized = new uint8_t[serializedSize]; | 110 uint8_t* serialized = new uint8_t[serializedSize]; |
322 serializedSize = utrie2_serialize(trie, serialized, serializedSize, &error); | 111 serializedSize = utrie2_serialize(trie, serialized, serializedSize, &error); |
323 assert(error == U_ZERO_ERROR); | 112 assert(error == U_ZERO_ERROR); |
324 | 113 |
| 114 generateUTrieSerialized(fp, serializedSize, serialized); |
| 115 |
| 116 utrie2_close(trie); |
| 117 } |
| 118 #endif |
| 119 |
| 120 int main(int argc, char** argv) |
| 121 { |
| 122 |
325 // Write the serialized array to the source file. | 123 // Write the serialized array to the source file. |
326 if (argc <= 1) { | 124 if (argc <= 1) { |
327 generate(stdout, serializedSize, serialized); | 125 generate(stdout); |
328 } else { | 126 } else { |
329 FILE* fp = fopen(argv[1], "wb"); | 127 FILE* fp = fopen(argv[1], "wb"); |
330 generate(fp, serializedSize, serialized); | 128 generate(fp); |
331 fclose(fp); | 129 fclose(fp); |
332 } | 130 } |
333 | 131 |
334 utrie2_close(trie); | |
335 | |
336 return 0; | 132 return 0; |
337 } | 133 } |
OLD | NEW |