| OLD | NEW |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "CharacterProperty.h" | 5 #ifndef CharacterData_h |
| | 6 #define CharacterData_h |
| 6 | 7 |
| 7 #include <cassert> | |
| 8 #include <cstring> | |
| 9 #include <stdio.h> | |
| 10 #include <unicode/uobject.h> | 8 #include <unicode/uobject.h> |
| 11 #define MUTEX_H // Prevent compile failure of utrie2.h on Windows | |
| 12 #include <utrie2.h> | |
| 13 | 9 |
| 14 const UChar32 kMaxCodepoint = 0x10FFFF; | 10 namespace blink { |
| 15 using CharacterProperty = blink::CharacterProperty; | |
| 16 #define ARRAY_LENGTH(a) (sizeof(a) / sizeof((a)[0])) | |
| 17 | 11 |
| 18 static const UChar32 cjkIsolatedSymbolsArray[] = { | 12 static const UChar32 isCJKIdeographOrSymbolArray[] = { |
| 19 // 0x2C7 Caron, Mandarin Chinese 3rd Tone | 13 // 0x2C7 Caron, Mandarin Chinese 3rd Tone |
| 20 0x2C7, | 14 0x2C7, |
| 21 // 0x2CA Modifier Letter Acute Accent, Mandarin Chinese 2nd Tone | 15 // 0x2CA Modifier Letter Acute Accent, Mandarin Chinese 2nd Tone |
| 22 0x2CA, | 16 0x2CA, |
| 23 // 0x2CB Modifier Letter Grave Accent, Mandarin Chinese 4th Tone | 17 // 0x2CB Modifier Letter Grave Accent, Mandarin Chinese 4th Tone |
| 24 0x2CB, | 18 0x2CB, |
| 25 // 0x2D9 Dot Above, Mandarin Chinese 5th Tone | 19 // 0x2D9 Dot Above, Mandarin Chinese 5th Tone |
| 26 0x2D9, | 20 0x2D9, |
| 27 0x2020, 0x2021, 0x2030, 0x203B, 0x203C, 0x2042, 0x2047, 0x2048, 0x2049, 0x2051, | 21 0x2020, 0x2021, 0x2030, 0x203B, 0x203C, 0x2042, 0x2047, 0x2048, 0x2049, 0x2051, |
| 28 0x20DD, 0x20DE, 0x2100, 0x2103, 0x2105, 0x2109, 0x210A, 0x2113, 0x2116, 0x2121, | 22 0x20DD, 0x20DE, 0x2100, 0x2103, 0x2105, 0x2109, 0x210A, 0x2113, 0x2116, 0x2121, |
| 29 0x212B, 0x213B, 0x2150, 0x2151, 0x2152, 0x217F, 0x2189, 0x2307, 0x2312, 0x23CE, | 23 0x212B, 0x213B, 0x2150, 0x2151, 0x2152, 0x217F, 0x2189, 0x2307, 0x2312, 0x23CE, |
| 30 0x2423, 0x25A0, 0x25A1, 0x25A2, 0x25AA, 0x25AB, 0x25B1, 0x25B2, 0x25B3, 0x25B6, | 24 0x2423, 0x25A0, 0x25A1, 0x25A2, 0x25AA, 0x25AB, 0x25B1, 0x25B2, 0x25B3, 0x25B6, |
| 31 0x25B7, 0x25BC, 0x25BD, 0x25C0, 0x25C1, 0x25C6, 0x25C7, 0x25C9, 0x25CB, 0x25CC, | 25 0x25B7, 0x25BC, 0x25BD, 0x25C0, 0x25C1, 0x25C6, 0x25C7, 0x25C9, 0x25CB, 0x25CC, |
| 32 0x25EF, 0x2605, 0x2606, 0x260E, 0x2616, 0x2617, 0x2640, 0x2642, 0x26A0, 0x26BD, | 26 0x25EF, 0x2605, 0x2606, 0x260E, 0x2616, 0x2617, 0x2640, 0x2642, 0x26A0, 0x26BD, |
| 33 0x26BE, 0x2713, 0x271A, 0x273F, 0x2740, 0x2756, 0x2B1A, 0xFE10, 0xFE11, 0xFE12, | 27 0x26BE, 0x2713, 0x271A, 0x273F, 0x2740, 0x2756, 0x2B1A, 0xFE10, 0xFE11, 0xFE12, |
| 34 0xFE19, 0xFF1D, | 28 0xFE19, 0xFF1D, |
| 35 // Emoji. | 29 // Emoji. |
| 36 0x1F100 | 30 0x1F100 |
| 37 }; | 31 }; |
| 38 | 32 |
| 39 static const UChar32 cjkIdeographRanges[] = { | 33 static const UChar32 isCJKIdeographOrSymbolRanges[] = { |
| | 34 // cjkIdeographRanges |
| 40 // CJK Radicals Supplement and Kangxi Radicals. | 35 // CJK Radicals Supplement and Kangxi Radicals. |
| 41 0x2E80, 0x2FDF, | 36 0x2E80, 0x2FDF, |
| 42 // CJK Strokes. | 37 // CJK Strokes. |
| 43 0x31C0, 0x31EF, | 38 0x31C0, 0x31EF, |
| 44 // CJK Unified Ideographs Extension A. | 39 // CJK Unified Ideographs Extension A. |
| 45 0x3400, 0x4DBF, | 40 0x3400, 0x4DBF, |
| 46 // The basic CJK Unified Ideographs block. | 41 // The basic CJK Unified Ideographs block. |
| 47 0x4E00, 0x9FFF, | 42 0x4E00, 0x9FFF, |
| 48 // CJK Compatibility Ideographs. | 43 // CJK Compatibility Ideographs. |
| 49 0xF900, 0xFAFF, | 44 0xF900, 0xFAFF, |
| 50 // CJK Unified Ideographs Extension B. | 45 // CJK Unified Ideographs Extension B. |
| 51 0x20000, 0x2A6DF, | 46 0x20000, 0x2A6DF, |
| 52 // CJK Unified Ideographs Extension C. | 47 // CJK Unified Ideographs Extension C. |
| 53 // CJK Unified Ideographs Extension D. | 48 // CJK Unified Ideographs Extension D. |
| 54 0x2A700, 0x2B81F, | 49 0x2A700, 0x2B81F, |
| 55 // CJK Compatibility Ideographs Supplement. | 50 // CJK Compatibility Ideographs Supplement. |
| 56 0x2F800, 0x2FA1F | 51 0x2F800, 0x2FA1F, |
| 57 }; | |
| 58 | 52 |
| 59 static const UChar32 cjkSymbolRanges[] = { | 53 // cjkSymbolRanges |
| 60 0x2156, 0x215A, | 54 0x2156, 0x215A, |
| 61 0x2160, 0x216B, | 55 0x2160, 0x216B, |
| 62 0x2170, 0x217B, | 56 0x2170, 0x217B, |
| 63 0x23BE, 0x23CC, | 57 0x23BE, 0x23CC, |
| 64 0x2460, 0x2492, | 58 0x2460, 0x2492, |
| 65 0x249C, 0x24FF, | 59 0x249C, 0x24FF, |
| 66 0x25CE, 0x25D3, | 60 0x25CE, 0x25D3, |
| 67 0x25E2, 0x25E6, | 61 0x25E2, 0x25E6, |
| 68 0x2600, 0x2603, | 62 0x2600, 0x2603, |
| 69 0x2660, 0x266F, | 63 0x2660, 0x266F, |
| (...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 218 // CJK Unified Ideographs Extension B/C/D | 212 // CJK Unified Ideographs Extension B/C/D |
| 219 // CJK Compatibility Ideographs Supplement | 213 // CJK Compatibility Ideographs Supplement |
| 220 0x20000, 0x2FFFD, | 214 0x20000, 0x2FFFD, |
| 221 0x30000, 0x3FFFD, | 215 0x30000, 0x3FFFD, |
| 222 // Supplementary Private Use Area-A | 216 // Supplementary Private Use Area-A |
| 223 0xF0000, 0xFFFFD, | 217 0xF0000, 0xFFFFD, |
| 224 // Supplementary Private Use Area-B | 218 // Supplementary Private Use Area-B |
| 225 0x100000, 0x10FFFD, | 219 0x100000, 0x10FFFD, |
| 226 }; | 220 }; |
| 227 | 221 |
| 228 static void setRanges(CharacterProperty* values, | 222 } // namespace blink |
| 229 const UChar32* ranges, size_t length, | |
| 230 CharacterProperty value) | |
| 231 { | |
| 232 assert(length % 2 == 0); | |
| 233 const UChar32* end = ranges + length; | |
| 234 for (; ranges != end; ranges += 2) { | |
| 235 assert(ranges[0] <= ranges[1] | |
| 236 && ranges[1] <= kMaxCodepoint); | |
| 237 for (UChar32 c = ranges[0]; c <= ranges[1]; c++) | |
| 238 values[c] |= value; | |
| 239 } | |
| 240 } | |
| 241 | 223 |
| 242 static void setValues(CharacterProperty* values, | 224 #endif |
| 243 const UChar32* begin, size_t length, | |
| 244 CharacterProperty value) | |
| 245 { | |
| 246 const UChar32* end = begin + length; | |
| 247 for (; begin != end; begin++) { | |
| 248 assert(*begin <= kMaxCodepoint); | |
| 249 values[*begin] |= value; | |
| 250 } | |
| 251 } | |
| 252 | |
| 253 static void generate(FILE* fp, int32_t size, uint8_t* array) | |
| 254 { | |
| 255 fprintf(fp, | |
| 256 "#include <cstdint>\n\n" | |
| 257 "namespace blink {\n\n" | |
| 258 "int32_t serializedCharacterDataSize = %d;\n" | |
| 259 "uint8_t serializedCharacterData[] = {", size); | |
| 260 for (int32_t i = 0; i < size; ) { | |
| 261 fprintf(fp, "\n "); | |
| 262 for (int col = 0; col < 16 && i < size; col++, i++) | |
| 263 fprintf(fp, " 0x%02X,", array[i]); | |
| 264 } | |
| 265 fprintf(fp, | |
| 266 "\n};\n\n" | |
| 267 "} // namespace blink\n"); | |
| 268 } | |
| 269 | |
| 270 int main(int argc, char** argv) | |
| 271 { | |
| 272 // Create a value array of all possible code points. | |
| 273 const UChar32 size = kMaxCodepoint + 1; | |
| 274 CharacterProperty* values = new CharacterProperty[size]; | |
| 275 memset(values, 0, sizeof(CharacterProperty) * size); | |
| 276 | |
| 277 setRanges(values, | |
| 278 cjkIdeographRanges, ARRAY_LENGTH(cjkIdeographRanges), | |
| 279 CharacterProperty::isCJKIdeographOrSymbol); | |
| 280 setRanges(values, | |
| 281 cjkSymbolRanges, ARRAY_LENGTH(cjkSymbolRanges), | |
| 282 CharacterProperty::isCJKIdeographOrSymbol); | |
| 283 setValues(values, | |
| 284 cjkIsolatedSymbolsArray, ARRAY_LENGTH(cjkIsolatedSymbolsArray), | |
| 285 CharacterProperty::isCJKIdeographOrSymbol); | |
| 286 | |
| 287 setRanges(values, | |
| 288 isUprightInMixedVerticalRanges, | |
| 289 ARRAY_LENGTH(isUprightInMixedVerticalRanges), | |
| 290 CharacterProperty::isUprightInMixedVertical); | |
| 291 setValues(values, | |
| 292 isUprightInMixedVerticalArray, | |
| 293 ARRAY_LENGTH(isUprightInMixedVerticalArray), | |
| 294 CharacterProperty::isUprightInMixedVertical); | |
| 295 | |
| 296 // Create a trie from the value array. | |
| 297 UErrorCode error = U_ZERO_ERROR; | |
| 298 UTrie2* trie = utrie2_open(0, 0, &error); | |
| 299 assert(error == U_ZERO_ERROR); | |
| 300 UChar32 start = 0; | |
| 301 CharacterProperty value = values[0]; | |
| 302 for (UChar32 c = 1;; c++) { | |
| 303 if (c < size && values[c] == value) | |
| 304 continue; | |
| 305 if (static_cast<uint32_t>(value)) { | |
| 306 utrie2_setRange32(trie, start, c - 1, | |
| 307 static_cast<uint32_t>(value), TRUE, &error); | |
| 308 assert(error == U_ZERO_ERROR); | |
| 309 } | |
| 310 if (c >= size) | |
| 311 break; | |
| 312 start = c; | |
| 313 value = values[start]; | |
| 314 } | |
| 315 | |
| 316 // Freeze and serialize the trie to a byte array. | |
| 317 utrie2_freeze(trie, UTrie2ValueBits::UTRIE2_16_VALUE_BITS, &error); | |
| 318 assert(error == U_ZERO_ERROR); | |
| 319 int32_t serializedSize = utrie2_serialize(trie, nullptr, 0, &error); | |
| 320 error = U_ZERO_ERROR; | |
| 321 uint8_t* serialized = new uint8_t[serializedSize]; | |
| 322 serializedSize = utrie2_serialize(trie, serialized, serializedSize, &error); | |
| 323 assert(error == U_ZERO_ERROR); | |
| 324 | |
| 325 // Write the serialized array to the source file. | |
| 326 if (argc <= 1) { | |
| 327 generate(stdout, serializedSize, serialized); | |
| 328 } else { | |
| 329 FILE* fp = fopen(argv[1], "wb"); | |
| 330 generate(fp, serializedSize, serialized); | |
| 331 fclose(fp); | |
| 332 } | |
| 333 | |
| 334 utrie2_close(trie); | |
| 335 | |
| 336 return 0; | |
| 337 } | |
| OLD | NEW |