OLD | NEW |
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
11 namespace dart { | 11 namespace dart { |
12 | 12 |
13 static const uint8_t kTrailBytes[256] = { | 13 static const int8_t kTrailBytes[256] = { |
14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
(...skipping 27 matching lines...) Expand all Loading... |
51 0xFFFFFFFF, | 51 0xFFFFFFFF, |
52 0xFFFFFFFF | 52 0xFFFFFFFF |
53 }; | 53 }; |
54 | 54 |
55 | 55 |
56 static bool IsTrailByte(uint8_t code_unit) { | 56 static bool IsTrailByte(uint8_t code_unit) { |
57 return (code_unit & 0xc0) == 0x80; | 57 return (code_unit & 0xc0) == 0x80; |
58 } | 58 } |
59 | 59 |
60 | 60 |
| 61 static bool IsAsciiSequenceStart(uint8_t code_unit) { |
| 62 // Check if codepoint is <= U+007F |
| 63 return (code_unit <= Utf8::kMaxOneByteChar); |
| 64 } |
| 65 |
| 66 |
| 67 static bool IsSmpSequenceStart(uint8_t code_unit) { |
| 68 // Check if codepoint is >= U+10000. |
| 69 return (code_unit >= 0xF0); |
| 70 } |
| 71 |
| 72 |
61 // Returns true if the code point is a high- or low-surrogate. | 73 // Returns true if the code point is a high- or low-surrogate. |
62 static bool IsSurrogate(uint32_t code_point) { | 74 static bool IsSurrogate(uint32_t code_point) { |
63 return (code_point & 0xfffff800) == 0xd800; | 75 return (code_point & 0xfffff800) == 0xd800; |
64 } | 76 } |
65 | 77 |
66 | 78 |
67 // Returns true if the code point value is above Plane 17. | 79 // Returns true if the code point value is above Plane 17. |
68 static bool IsOutOfRange(uint32_t code_point) { | 80 static bool IsOutOfRange(uint32_t code_point) { |
69 return code_point > 0x10FFFF; | 81 return (code_point > 0x10FFFF); |
70 } | 82 } |
71 | 83 |
72 | 84 |
73 // Returns true if the byte sequence is ill-formed. | 85 // Returns true if the byte sequence is ill-formed. |
74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { | 86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { |
75 return code_point < kOverlongMinimum[num_bytes]; | 87 return code_point < kOverlongMinimum[num_bytes]; |
76 } | 88 } |
77 | 89 |
78 | 90 |
| 91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { |
| 92 ASSERT(codepoint > kMaxBmpCodepoint); |
| 93 ASSERT(dst != NULL); |
| 94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); |
| 95 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
| 96 } |
| 97 |
| 98 |
79 // Returns a count of the number of UTF-8 trail bytes. | 99 // Returns a count of the non-trail UTF-8 bytes, counting bytes >= 0xF0 twice. |
80 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) { | 100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, |
81 bool is_two_byte_string = false; | 101 intptr_t array_len, |
82 bool is_four_byte_string = false; | 102 Type* type) { |
83 intptr_t len = 0; | 103 intptr_t len = 0; |
84 for (; *str != '\0'; ++str) { | 104 Type char_type = kAscii; |
85 uint8_t code_unit = *str; | 105 for (intptr_t i = 0; i < array_len; i++) { |
| 106 uint8_t code_unit = utf8_array[i]; |
86 if (!IsTrailByte(code_unit)) { | 107 if (!IsTrailByte(code_unit)) { |
87 ++len; | 108 ++len; |
88 } | 109 } |
89 if (code_unit > 0xC3) { // > U+00FF | 110 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F |
90 if (code_unit < 0xF0) { // < U+10000 | 111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 |
91 is_two_byte_string = true; | 112 char_type = kSMP; |
92 } else { | 113 ++len; |
93 is_four_byte_string = true; | 114 } else if (char_type == kAscii) { |
| 115 char_type = kBMP; |
94 } | 116 } |
95 } | 117 } |
96 } | 118 } |
97 if (is_four_byte_string) { | 119 *type = char_type; |
98 *width = 4; | |
99 } else if (is_two_byte_string) { | |
100 *width = 2; | |
101 } else { | |
102 *width = 1; | |
103 } | |
104 return len; | 120 return len; |
105 } | 121 } |
106 | 122 |
107 | 123 |
108 // Returns true if str is a valid NUL-terminated UTF-8 string. | 124 // Returns true if the first array_len bytes of utf8_array are valid UTF-8. |
109 bool Utf8::IsValid(const char* str) { | 125 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { |
110 intptr_t i = 0; | 126 intptr_t i = 0; |
111 while (str[i] != '\0') { | 127 while (i < array_len) { |
112 uint32_t ch = str[i] & 0xFF; | 128 uint32_t ch = utf8_array[i] & 0xFF; |
113 intptr_t j = 1; | 129 intptr_t j = 1; |
114 if (ch >= 0x80) { | 130 if (ch >= 0x80) { |
115 uint8_t num_trail_bytes = kTrailBytes[ch]; | 131 int8_t num_trail_bytes = kTrailBytes[ch]; |
116 bool is_malformed = false; | 132 bool is_malformed = false; |
117 for (; j < num_trail_bytes; ++j) { | 133 for (; j < num_trail_bytes; ++j) { |
118 if (str[i + j] != '\0') { | 134 if ((i + j) < array_len) { |
119 uint8_t code_unit = str[i + j]; | 135 uint8_t code_unit = utf8_array[i + j]; |
120 is_malformed |= !IsTrailByte(code_unit); | 136 is_malformed |= !IsTrailByte(code_unit); |
121 ch = (ch << 6) + code_unit; | 137 ch = (ch << 6) + code_unit; |
122 } else { | 138 } else { |
123 return false; | 139 return false; |
124 } | 140 } |
125 } | 141 } |
126 ch -= kMagicBits[num_trail_bytes]; | 142 ch -= kMagicBits[num_trail_bytes]; |
127 if (!((is_malformed == false) && | 143 if (!((is_malformed == false) && |
128 (j == num_trail_bytes) && | 144 (j == num_trail_bytes) && |
129 !IsOutOfRange(ch) && | 145 !IsOutOfRange(ch) && |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
195 if (pos + num_bytes > len) { | 211 if (pos + num_bytes > len) { |
196 break; | 212 break; |
197 } | 213 } |
198 Utf8::Encode(ch, &dst[pos]); | 214 Utf8::Encode(ch, &dst[pos]); |
199 pos += num_bytes; | 215 pos += num_bytes; |
200 } | 216 } |
201 return pos; | 217 return pos; |
202 } | 218 } |
203 | 219 |
204 | 220 |
205 intptr_t Utf8::Decode(const char* src, int32_t* dst) { | 221 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
206 uint32_t ch = src[0] & 0xFF; | 222 intptr_t array_len, |
207 uint32_t i = 1; | 223 int32_t* dst) { |
| 224 uint32_t ch = utf8_array[0] & 0xFF; |
| 225 intptr_t i = 1; |
208 if (ch >= 0x80) { | 226 if (ch >= 0x80) { |
209 uint32_t num_trail_bytes = kTrailBytes[ch]; | 227 int32_t num_trail_bytes = kTrailBytes[ch]; |
210 bool is_malformed = false; | 228 bool is_malformed = false; |
211 for (; i < num_trail_bytes; ++i) { | 229 for (; i < num_trail_bytes; ++i) { |
212 if (src[i] != '\0') { | 230 if (i < array_len) { |
213 uint8_t code_unit = src[i]; | 231 uint8_t code_unit = utf8_array[i]; |
214 is_malformed |= !IsTrailByte(code_unit); | 232 is_malformed |= !IsTrailByte(code_unit); |
215 ch = (ch << 6) + code_unit; | 233 ch = (ch << 6) + code_unit; |
216 } else { | 234 } else { |
217 *dst = -1; | 235 *dst = -1; |
218 return 0; | 236 return 0; |
219 } | 237 } |
220 } | 238 } |
221 ch -= kMagicBits[num_trail_bytes]; | 239 ch -= kMagicBits[num_trail_bytes]; |
222 if (!((is_malformed == false) && | 240 if (!((is_malformed == false) && |
223 (i == num_trail_bytes) && | 241 (i == num_trail_bytes) && |
224 !IsOutOfRange(ch) && | 242 !IsOutOfRange(ch) && |
225 !IsNonShortestForm(ch, i) && | 243 !IsNonShortestForm(ch, i) && |
226 !IsSurrogate(ch))) { | 244 !IsSurrogate(ch))) { |
227 *dst = -1; | 245 *dst = -1; |
228 return 0; | 246 return 0; |
229 } | 247 } |
230 } | 248 } |
231 *dst = ch; | 249 *dst = ch; |
232 return i; | 250 return i; |
233 } | 251 } |
234 | 252 |
235 | 253 |
236 template<typename T> | 254 bool Utf8::DecodeToAscii(const uint8_t* utf8_array, |
237 static bool DecodeImpl(const char* src, T* dst, intptr_t len) { | 255 intptr_t array_len, |
| 256 uint8_t* dst, |
| 257 intptr_t len) { |
| 258 if (len < array_len) { |
| 259 return false; // output overflow |
| 260 } |
| 261 #ifdef DEBUG |
| 262 for (intptr_t i = 0; i < array_len; i++) { |
| 263 ASSERT(IsAsciiSequenceStart(utf8_array[i])); |
| 264 } |
| 265 #endif |
| 266 memmove(dst, utf8_array, array_len); |
| 267 return true; // success |
| 268 } |
| 269 |
| 270 |
| 271 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
| 272 intptr_t array_len, |
| 273 uint16_t* dst, |
| 274 intptr_t len) { |
238 intptr_t i = 0; | 275 intptr_t i = 0; |
239 intptr_t j = 0; | 276 intptr_t j = 0; |
240 intptr_t num_bytes; | 277 intptr_t num_bytes; |
241 for (; src[i] != '\0' && j < len; i += num_bytes, ++j) { | 278 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
242 int32_t ch; | 279 int32_t ch; |
243 num_bytes = Utf8::Decode(&src[i], &ch); | 280 bool is_smp = IsSmpSequenceStart(utf8_array[i]); |
| 281 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
244 if (ch == -1) { | 282 if (ch == -1) { |
245 return false; // invalid input | 283 return false; // invalid input |
246 } | 284 } |
247 dst[j] = ch; | 285 if (is_smp) { |
| 286 ConvertUTF32ToUTF16(ch, &(dst[j])); |
| 287 j = j + 1; |
| 288 } else { |
| 289 dst[j] = ch; |
| 290 } |
248 } | 291 } |
249 if (src[i] != '\0' && j == len) { | 292 if ((i < array_len) && (j == len)) { |
250 return false; // output overflow | 293 return false; // output overflow |
251 } | 294 } |
252 return true; // success | 295 return true; // success |
253 } | 296 } |
254 | 297 |
255 | 298 |
256 bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) { | 299 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
257 return DecodeImpl(src, dst, len); | 300 intptr_t array_len, |
258 } | 301 uint32_t* dst, |
259 | 302 intptr_t len) { |
260 | 303 intptr_t i = 0; |
261 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) { | 304 intptr_t j = 0; |
262 return DecodeImpl(src, dst, len); | 305 intptr_t num_bytes; |
263 } | 306 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
264 | 307 int32_t ch; |
265 | 308 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
266 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) { | 309 if (ch == -1) { |
267 return DecodeImpl(src, dst, len); | 310 return false; // invalid input |
| 311 } |
| 312 dst[j] = ch; |
| 313 } |
| 314 if ((i < array_len) && (j == len)) { |
| 315 return false; // output overflow |
| 316 } |
| 317 return true; // success |
268 } | 318 } |
269 | 319 |
270 } // namespace dart | 320 } // namespace dart |
OLD | NEW |