OLD | NEW |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "CharacterProperty.h" | 5 #ifndef CharacterData_h |
| 6 #define CharacterData_h |
6 | 7 |
7 #include <cassert> | |
8 #include <cstring> | |
9 #include <stdio.h> | |
10 #include <unicode/uobject.h> | 8 #include <unicode/uobject.h> |
11 #define MUTEX_H // Prevent compile failure of utrie2.h on Windows | |
12 #include <utrie2.h> | |
13 | 9 |
14 const UChar32 kMaxCodepoint = 0x10FFFF; | 10 namespace blink { |
15 using CharacterProperty = blink::CharacterProperty; | |
16 #define ARRAY_LENGTH(a) (sizeof(a) / sizeof((a)[0])) | |
17 | 11 |
18 static const UChar32 cjkIsolatedSymbolsArray[] = { | 12 static const UChar32 isCJKIdeographOrSymbolArray[] = { |
19 // 0x2C7 Caron, Mandarin Chinese 3rd Tone | 13 // 0x2C7 Caron, Mandarin Chinese 3rd Tone |
20 0x2C7, | 14 0x2C7, |
21 // 0x2CA Modifier Letter Acute Accent, Mandarin Chinese 2nd Tone | 15 // 0x2CA Modifier Letter Acute Accent, Mandarin Chinese 2nd Tone |
22 0x2CA, | 16 0x2CA, |
23 // 0x2CB Modifier Letter Grave Accent, Mandarin Chinese 4th Tone | 17 // 0x2CB Modifier Letter Grave Accent, Mandarin Chinese 4th Tone |
24 0x2CB, | 18 0x2CB, |
25 // 0x2D9 Dot Above, Mandarin Chinese 5th Tone | 19 // 0x2D9 Dot Above, Mandarin Chinese 5th Tone |
26 0x2D9, | 20 0x2D9, |
27 0x2020, 0x2021, 0x2030, 0x203B, 0x203C, 0x2042, 0x2047, 0x2048, 0x2049, 0x20
51, | 21 0x2020, 0x2021, 0x2030, 0x203B, 0x203C, 0x2042, 0x2047, 0x2048, 0x2049, 0x20
51, |
28 0x20DD, 0x20DE, 0x2100, 0x2103, 0x2105, 0x2109, 0x210A, 0x2113, 0x2116, 0x21
21, | 22 0x20DD, 0x20DE, 0x2100, 0x2103, 0x2105, 0x2109, 0x210A, 0x2113, 0x2116, 0x21
21, |
29 0x212B, 0x213B, 0x2150, 0x2151, 0x2152, 0x217F, 0x2189, 0x2307, 0x2312, 0x23
CE, | 23 0x212B, 0x213B, 0x2150, 0x2151, 0x2152, 0x217F, 0x2189, 0x2307, 0x2312, 0x23
CE, |
30 0x2423, 0x25A0, 0x25A1, 0x25A2, 0x25AA, 0x25AB, 0x25B1, 0x25B2, 0x25B3, 0x25
B6, | 24 0x2423, 0x25A0, 0x25A1, 0x25A2, 0x25AA, 0x25AB, 0x25B1, 0x25B2, 0x25B3, 0x25
B6, |
31 0x25B7, 0x25BC, 0x25BD, 0x25C0, 0x25C1, 0x25C6, 0x25C7, 0x25C9, 0x25CB, 0x25
CC, | 25 0x25B7, 0x25BC, 0x25BD, 0x25C0, 0x25C1, 0x25C6, 0x25C7, 0x25C9, 0x25CB, 0x25
CC, |
32 0x25EF, 0x2605, 0x2606, 0x260E, 0x2616, 0x2617, 0x2640, 0x2642, 0x26A0, 0x26
BD, | 26 0x25EF, 0x2605, 0x2606, 0x260E, 0x2616, 0x2617, 0x2640, 0x2642, 0x26A0, 0x26
BD, |
33 0x26BE, 0x2713, 0x271A, 0x273F, 0x2740, 0x2756, 0x2B1A, 0xFE10, 0xFE11, 0xFE
12, | 27 0x26BE, 0x2713, 0x271A, 0x273F, 0x2740, 0x2756, 0x2B1A, 0xFE10, 0xFE11, 0xFE
12, |
34 0xFE19, 0xFF1D, | 28 0xFE19, 0xFF1D, |
35 // Emoji. | 29 // Emoji. |
36 0x1F100 | 30 0x1F100 |
37 }; | 31 }; |
38 | 32 |
39 static const UChar32 cjkIdeographRanges[] = { | 33 static const UChar32 isCJKIdeographOrSymbolRanges[] = { |
| 34 // cjkIdeographRanges |
40 // CJK Radicals Supplement and Kangxi Radicals. | 35 // CJK Radicals Supplement and Kangxi Radicals. |
41 0x2E80, 0x2FDF, | 36 0x2E80, 0x2FDF, |
42 // CJK Strokes. | 37 // CJK Strokes. |
43 0x31C0, 0x31EF, | 38 0x31C0, 0x31EF, |
44 // CJK Unified Ideographs Extension A. | 39 // CJK Unified Ideographs Extension A. |
45 0x3400, 0x4DBF, | 40 0x3400, 0x4DBF, |
46 // The basic CJK Unified Ideographs block. | 41 // The basic CJK Unified Ideographs block. |
47 0x4E00, 0x9FFF, | 42 0x4E00, 0x9FFF, |
48 // CJK Compatibility Ideographs. | 43 // CJK Compatibility Ideographs. |
49 0xF900, 0xFAFF, | 44 0xF900, 0xFAFF, |
50 // CJK Unified Ideographs Extension B. | 45 // CJK Unified Ideographs Extension B. |
51 0x20000, 0x2A6DF, | 46 0x20000, 0x2A6DF, |
52 // CJK Unified Ideographs Extension C. | 47 // CJK Unified Ideographs Extension C. |
53 // CJK Unified Ideographs Extension D. | 48 // CJK Unified Ideographs Extension D. |
54 0x2A700, 0x2B81F, | 49 0x2A700, 0x2B81F, |
55 // CJK Compatibility Ideographs Supplement. | 50 // CJK Compatibility Ideographs Supplement. |
56 0x2F800, 0x2FA1F | 51 0x2F800, 0x2FA1F, |
57 }; | |
58 | 52 |
59 static const UChar32 cjkSymbolRanges[] = { | 53 // cjkSymbolRanges |
60 0x2156, 0x215A, | 54 0x2156, 0x215A, |
61 0x2160, 0x216B, | 55 0x2160, 0x216B, |
62 0x2170, 0x217B, | 56 0x2170, 0x217B, |
63 0x23BE, 0x23CC, | 57 0x23BE, 0x23CC, |
64 0x2460, 0x2492, | 58 0x2460, 0x2492, |
65 0x249C, 0x24FF, | 59 0x249C, 0x24FF, |
66 0x25CE, 0x25D3, | 60 0x25CE, 0x25D3, |
67 0x25E2, 0x25E6, | 61 0x25E2, 0x25E6, |
68 0x2600, 0x2603, | 62 0x2600, 0x2603, |
69 0x2660, 0x266F, | 63 0x2660, 0x266F, |
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
218 // CJK Unified Ideographs Extension B/C/D | 212 // CJK Unified Ideographs Extension B/C/D |
219 // CJK Compatibility Ideographs Supplement | 213 // CJK Compatibility Ideographs Supplement |
220 0x20000, 0x2FFFD, | 214 0x20000, 0x2FFFD, |
221 0x30000, 0x3FFFD, | 215 0x30000, 0x3FFFD, |
222 // Supplementary Private Use Area-A | 216 // Supplementary Private Use Area-A |
223 0xF0000, 0xFFFFD, | 217 0xF0000, 0xFFFFD, |
224 // Supplementary Private Use Area-B | 218 // Supplementary Private Use Area-B |
225 0x100000, 0x10FFFD, | 219 0x100000, 0x10FFFD, |
226 }; | 220 }; |
227 | 221 |
// (OLD side) setRanges: ORs `value` into values[c] for every code point c
// covered by `ranges`. `ranges` is read as consecutive inclusive
// [first, last] pairs and `length` is the number of array elements, so it
// must be even. Each pair is asserted to be ordered and to end at or below
// kMaxCodepoint; `values` is indexed directly by code point.
228 static void setRanges(CharacterProperty* values, | 222 } // namespace blink |
229 const UChar32* ranges, size_t length, | |
230 CharacterProperty value) | |
231 { | |
232 assert(length % 2 == 0); | |
233 const UChar32* end = ranges + length; | |
// Step two elements at a time: ranges[0] is the first code point of the
// pair and ranges[1] the last, both included.
234 for (; ranges != end; ranges += 2) { | |
235 assert(ranges[0] <= ranges[1] | |
236 && ranges[1] <= kMaxCodepoint); | |
237 for (UChar32 c = ranges[0]; c <= ranges[1]; c++) | |
238 values[c] |= value; | |
239 } | |
240 } | |
241 | 223 |
// (OLD side) setValues: ORs `value` into values[c] for each of the `length`
// individual code points stored starting at `begin`. Every code point is
// asserted to be at or below kMaxCodepoint before it is used as an index.
242 static void setValues(CharacterProperty* values, | 224 #endif |
243 const UChar32* begin, size_t length, | |
244 CharacterProperty value) | |
245 { | |
246 const UChar32* end = begin + length; | |
247 for (; begin != end; begin++) { | |
248 assert(*begin <= kMaxCodepoint); | |
249 values[*begin] |= value; | |
250 } | |
251 } | |
252 | |
// (OLD side) generate: writes C++ source text to `fp`. The emitted text
// includes <cstdint> and, inside namespace blink, defines
// serializedCharacterDataSize (set to `size`) and serializedCharacterData,
// whose initializer lists the first `size` bytes of `array`.
// NOTE(review): `fp` is used without a null check and the fprintf return
// values are ignored.
253 static void generate(FILE* fp, int32_t size, uint8_t* array) | |
254 { | |
255 fprintf(fp, | |
256 "#include <cstdint>\n\n" | |
257 "namespace blink {\n\n" | |
258 "int32_t serializedCharacterDataSize = %d;\n" | |
259 "uint8_t serializedCharacterData[] = {", size); | |
// The outer loop starts a new output line; the inner loop emits at most 16
// bytes on it and is the only place `i` advances.
260 for (int32_t i = 0; i < size; ) { | |
261 fprintf(fp, "\n "); | |
262 for (int col = 0; col < 16 && i < size; col++, i++) | |
263 fprintf(fp, " 0x%02X,", array[i]); | |
264 } | |
265 fprintf(fp, | |
266 "\n};\n\n" | |
267 "} // namespace blink\n"); | |
268 } | |
269 | |
// (OLD side) main: fills a table with one CharacterProperty per code point
// (0..kMaxCodepoint) from the range and value tables, copies the non-zero
// runs into a UTrie2, freezes and serializes the trie, and passes the bytes
// to generate(). Output goes to the file named by argv[1], or to stdout when
// no argument is given. Always returns 0.
270 int main(int argc, char** argv) | |
271 { | |
272 // Create a value array of all possible code points. | |
273 const UChar32 size = kMaxCodepoint + 1; | |
// NOTE(review): no delete[] for `values` appears anywhere in this function.
274 CharacterProperty* values = new CharacterProperty[size]; | |
275 memset(values, 0, sizeof(CharacterProperty) * size); | |
276 | |
// Three tables all contribute the same isCJKIdeographOrSymbol bit.
277 setRanges(values, | |
278 cjkIdeographRanges, ARRAY_LENGTH(cjkIdeographRanges), | |
279 CharacterProperty::isCJKIdeographOrSymbol); | |
280 setRanges(values, | |
281 cjkSymbolRanges, ARRAY_LENGTH(cjkSymbolRanges), | |
282 CharacterProperty::isCJKIdeographOrSymbol); | |
283 setValues(values, | |
284 cjkIsolatedSymbolsArray, ARRAY_LENGTH(cjkIsolatedSymbolsArray), | |
285 CharacterProperty::isCJKIdeographOrSymbol); | |
286 | |
287 setRanges(values, | |
288 isUprightInMixedVerticalRanges, | |
289 ARRAY_LENGTH(isUprightInMixedVerticalRanges), | |
290 CharacterProperty::isUprightInMixedVertical); | |
291 setValues(values, | |
292 isUprightInMixedVerticalArray, | |
293 ARRAY_LENGTH(isUprightInMixedVerticalArray), | |
294 CharacterProperty::isUprightInMixedVertical); | |
295 | |
296 // Create a trie from the value array. | |
297 UErrorCode error = U_ZERO_ERROR; | |
298 UTrie2* trie = utrie2_open(0, 0, &error); | |
299 assert(error == U_ZERO_ERROR); | |
// Walk the table as runs of equal values. `start`/`value` describe the
// current run; when the value changes (or c reaches `size`) the run
// [start, c - 1] is written to the trie if its value is non-zero. The loop
// deliberately goes one step past the last code point so the final run is
// flushed before the break.
300 UChar32 start = 0; | |
301 CharacterProperty value = values[0]; | |
302 for (UChar32 c = 1;; c++) { | |
303 if (c < size && values[c] == value) | |
304 continue; | |
305 if (static_cast<uint32_t>(value)) { | |
306 utrie2_setRange32(trie, start, c - 1, | |
307 static_cast<uint32_t>(value), TRUE, &error); | |
308 assert(error == U_ZERO_ERROR); | |
309 } | |
310 if (c >= size) | |
311 break; | |
312 start = c; | |
313 value = values[start]; | |
314 } | |
315 | |
316 // Freeze and serialize the trie to a byte array. | |
317 utrie2_freeze(trie, UTrie2ValueBits::UTRIE2_16_VALUE_BITS, &error); | |
318 assert(error == U_ZERO_ERROR); | |
// The first serialize call passes no buffer; its return value is used only
// as the allocation size below, and whatever error it sets is discarded by
// resetting `error` before the real call.
319 int32_t serializedSize = utrie2_serialize(trie, nullptr, 0, &error); | |
320 error = U_ZERO_ERROR; | |
// NOTE(review): no delete[] for `serialized` appears anywhere in this function.
321 uint8_t* serialized = new uint8_t[serializedSize]; | |
322 serializedSize = utrie2_serialize(trie, serialized, serializedSize, &error); | |
323 assert(error == U_ZERO_ERROR); | |
324 | |
325 // Write the serialized array to the source file. | |
326 if (argc <= 1) { | |
327 generate(stdout, serializedSize, serialized); | |
328 } else { | |
// NOTE(review): the fopen result is passed to generate() and fclose()
// without a null check, so an unwritable path is not reported here.
329 FILE* fp = fopen(argv[1], "wb"); | |
330 generate(fp, serializedSize, serialized); | |
331 fclose(fp); | |
332 } | |
333 | |
334 utrie2_close(trie); | |
335 | |
336 return 0; | |
337 } | |
OLD | NEW |