OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
11 namespace dart { | 11 namespace dart { |
12 | 12 |
| 13 // clang-format off |
13 const int8_t Utf8::kTrailBytes[256] = { | 14 const int8_t Utf8::kTrailBytes[256] = { |
14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
26 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 28 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
28 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | 29 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
29 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 | 30 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 |
30 }; | 31 }; |
| 32 // clang-format on |
31 | 33 |
32 | 34 const uint32_t Utf8::kMagicBits[7] = {0, // Padding. |
33 const uint32_t Utf8::kMagicBits[7] = { | 35 0x00000000, 0x00003080, 0x000E2080, |
34 0, // Padding. | 36 0x03C82080, 0xFA082080, 0x82082080}; |
35 0x00000000, | |
36 0x00003080, | |
37 0x000E2080, | |
38 0x03C82080, | |
39 0xFA082080, | |
40 0x82082080 | |
41 }; | |
42 | 37 |
43 | 38 |
44 // Minimum values of code points used to check shortest form. | 39 // Minimum values of code points used to check shortest form. |
45 const uint32_t Utf8::kOverlongMinimum[7] = { | 40 const uint32_t Utf8::kOverlongMinimum[7] = {0, // Padding. |
46 0, // Padding. | 41 0x0, 0x80, 0x800, |
47 0x0, | 42 0x10000, 0xFFFFFFFF, 0xFFFFFFFF}; |
48 0x80, | |
49 0x800, | |
50 0x10000, | |
51 0xFFFFFFFF, | |
52 0xFFFFFFFF | |
53 }; | |
54 | 43 |
55 | 44 |
56 // Returns the most restricted coding form in which the sequence of utf8 | 45 // Returns the most restricted coding form in which the sequence of utf8 |
57 // characters in 'utf8_array' can be represented in, and the number of | 46 // characters in 'utf8_array' can be represented in, and the number of |
58 // code units needed in that form. | 47 // code units needed in that form. |
59 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, | 48 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, |
60 intptr_t array_len, | 49 intptr_t array_len, |
61 Type* type) { | 50 Type* type) { |
62 intptr_t len = 0; | 51 intptr_t len = 0; |
63 Type char_type = kLatin1; | 52 Type char_type = kLatin1; |
64 for (intptr_t i = 0; i < array_len; i++) { | 53 for (intptr_t i = 0; i < array_len; i++) { |
65 uint8_t code_unit = utf8_array[i]; | 54 uint8_t code_unit = utf8_array[i]; |
66 if (!IsTrailByte(code_unit)) { | 55 if (!IsTrailByte(code_unit)) { |
67 ++len; | 56 ++len; |
68 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | 57 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF |
69 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 | 58 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 |
70 char_type = kSupplementary; | 59 char_type = kSupplementary; |
71 ++len; | 60 ++len; |
72 } else if (char_type == kLatin1) { | 61 } else if (char_type == kLatin1) { |
73 char_type = kBMP; | 62 char_type = kBMP; |
74 } | 63 } |
75 } | 64 } |
76 } | 65 } |
77 } | 66 } |
78 *type = char_type; | 67 *type = char_type; |
(...skipping 13 matching lines...) Expand all Loading... |
92 for (; j < num_trail_bytes; ++j) { | 81 for (; j < num_trail_bytes; ++j) { |
93 if ((i + j) < array_len) { | 82 if ((i + j) < array_len) { |
94 uint8_t code_unit = utf8_array[i + j]; | 83 uint8_t code_unit = utf8_array[i + j]; |
95 is_malformed |= !IsTrailByte(code_unit); | 84 is_malformed |= !IsTrailByte(code_unit); |
96 ch = (ch << 6) + code_unit; | 85 ch = (ch << 6) + code_unit; |
97 } else { | 86 } else { |
98 return false; | 87 return false; |
99 } | 88 } |
100 } | 89 } |
101 ch -= kMagicBits[num_trail_bytes]; | 90 ch -= kMagicBits[num_trail_bytes]; |
102 if (!((is_malformed == false) && | 91 if (!((is_malformed == false) && (j == num_trail_bytes) && |
103 (j == num_trail_bytes) && | 92 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, j))) { |
104 !Utf::IsOutOfRange(ch) && | |
105 !IsNonShortestForm(ch, j))) { | |
106 return false; | 93 return false; |
107 } | 94 } |
108 } | 95 } |
109 i += j; | 96 i += j; |
110 } | 97 } |
111 return true; | 98 return true; |
112 } | 99 } |
113 | 100 |
114 | 101 |
115 intptr_t Utf8::Length(int32_t ch) { | 102 intptr_t Utf8::Length(int32_t ch) { |
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
190 if (i < array_len) { | 177 if (i < array_len) { |
191 uint8_t code_unit = utf8_array[i]; | 178 uint8_t code_unit = utf8_array[i]; |
192 is_malformed |= !IsTrailByte(code_unit); | 179 is_malformed |= !IsTrailByte(code_unit); |
193 ch = (ch << 6) + code_unit; | 180 ch = (ch << 6) + code_unit; |
194 } else { | 181 } else { |
195 *dst = -1; | 182 *dst = -1; |
196 return 0; | 183 return 0; |
197 } | 184 } |
198 } | 185 } |
199 ch -= kMagicBits[num_trail_bytes]; | 186 ch -= kMagicBits[num_trail_bytes]; |
200 if (!((is_malformed == false) && | 187 if (!((is_malformed == false) && (i == num_trail_bytes) && |
201 (i == num_trail_bytes) && | 188 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, i))) { |
202 !Utf::IsOutOfRange(ch) && | |
203 !IsNonShortestForm(ch, i))) { | |
204 *dst = -1; | 189 *dst = -1; |
205 return 0; | 190 return 0; |
206 } | 191 } |
207 } | 192 } |
208 *dst = ch; | 193 *dst = ch; |
209 return i; | 194 return i; |
210 } | 195 } |
211 | 196 |
212 | 197 |
213 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, | 198 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, |
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
293 | 278 |
294 | 279 |
295 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 280 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
296 ASSERT(codepoint > Utf16::kMaxCodeUnit); | 281 ASSERT(codepoint > Utf16::kMaxCodeUnit); |
297 ASSERT(dst != NULL); | 282 ASSERT(dst != NULL); |
298 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 283 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
299 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 284 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
300 } | 285 } |
301 | 286 |
302 } // namespace dart | 287 } // namespace dart |
OLD | NEW |