OLD | NEW |
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
11 namespace dart { | 11 namespace dart { |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
52 0xFFFFFFFF | 52 0xFFFFFFFF |
53 }; | 53 }; |
54 | 54 |
55 | 55 |
// Returns true when code_unit has the bit pattern 10xxxxxx, i.e. it is a
// UTF-8 continuation (trail) byte rather than the first byte of a sequence.
static bool IsTrailByte(uint8_t code_unit) {
  const uint8_t kTopTwoBitsMask = 0xc0;
  const uint8_t kTrailByteTag = 0x80;
  return (code_unit & kTopTwoBitsMask) == kTrailByteTag;
}
59 | 59 |
60 | 60 |
// Returns true if code_unit can be the first byte of the UTF-8 encoding of a
// code point <= U+00FF.
//
// Code points U+0000..U+007F are encoded as a single byte, and U+0080..U+00FF
// are encoded as two bytes whose lead byte is 0xC2 or 0xC3, so every
// qualifying lead byte is <= 0xC3. The previous check compared against the
// one-byte (ASCII) limit and therefore rejected the 0xC2/0xC3 lead bytes.
//
// Note: this is a pure range test. Values 0x80..0xC1 (trail bytes and
// invalid lead bytes) also satisfy it; it does not validate the sequence.
static bool IsLatin1SequenceStart(uint8_t code_unit) {
  return (code_unit <= 0xC3);
}
65 | 65 |
66 | 66 |
// Returns true if code_unit is a lead byte of 0xF0 or above, which is how
// the encoding of a code point >= U+10000 begins.
static bool IsSupplementarySequenceStart(uint8_t code_unit) {
  const uint8_t kFirstSupplementaryLeadByte = 0xF0;
  return !(code_unit < kFirstSupplementaryLeadByte);
}
71 | 71 |
72 | 72 |
(...skipping 28 matching lines...) Expand all Loading... |
101 char_type = kBMP; | 101 char_type = kBMP; |
102 } | 102 } |
103 } | 103 } |
104 } | 104 } |
105 *type = char_type; | 105 *type = char_type; |
106 return len; | 106 return len; |
107 } | 107 } |
108 | 108 |
109 | 109 |
110 // Returns true if str is a valid NUL-terminated UTF-8 string. | 110 // Returns true if str is a valid NUL-terminated UTF-8 string. |
111 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { | 111 static bool IsValidUtf8( |
| 112 const uint8_t* utf8_array, intptr_t array_len, bool allow_surrogates) { |
112 intptr_t i = 0; | 113 intptr_t i = 0; |
113 while (i < array_len) { | 114 while (i < array_len) { |
114 uint32_t ch = utf8_array[i] & 0xFF; | 115 uint32_t ch = utf8_array[i] & 0xFF; |
115 intptr_t j = 1; | 116 intptr_t j = 1; |
116 if (ch >= 0x80) { | 117 if (ch >= 0x80) { |
117 int8_t num_trail_bytes = kTrailBytes[ch]; | 118 int8_t num_trail_bytes = kTrailBytes[ch]; |
118 bool is_malformed = false; | 119 bool is_malformed = false; |
119 for (; j < num_trail_bytes; ++j) { | 120 for (; j < num_trail_bytes; ++j) { |
120 if ((i + j) < array_len) { | 121 if ((i + j) < array_len) { |
121 uint8_t code_unit = utf8_array[i + j]; | 122 uint8_t code_unit = utf8_array[i + j]; |
122 is_malformed |= !IsTrailByte(code_unit); | 123 is_malformed |= !IsTrailByte(code_unit); |
123 ch = (ch << 6) + code_unit; | 124 ch = (ch << 6) + code_unit; |
124 } else { | 125 } else { |
125 return false; | 126 return false; |
126 } | 127 } |
127 } | 128 } |
128 ch -= kMagicBits[num_trail_bytes]; | 129 ch -= kMagicBits[num_trail_bytes]; |
129 if (!((is_malformed == false) && | 130 if (!((is_malformed == false) && |
130 (j == num_trail_bytes) && | 131 (j == num_trail_bytes) && |
131 !IsOutOfRange(ch) && | 132 !IsOutOfRange(ch) && |
132 !IsNonShortestForm(ch, j) && | 133 !IsNonShortestForm(ch, j) && |
133 !Utf16::IsSurrogate(ch))) { | 134 (!Utf16::IsSurrogate(ch) || allow_surrogates))) { |
134 return false; | 135 return false; |
135 } | 136 } |
136 } | 137 } |
137 i += j; | 138 i += j; |
138 } | 139 } |
139 return true; | 140 return true; |
140 } | 141 } |
141 | 142 |
142 | 143 |
| 144 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { |
| 145 return IsValidUtf8(utf8_array, array_len, false); |
| 146 } |
| 147 |
| 148 |
| 149 bool Utf8::IsValidAllowSurrogates( |
| 150 const uint8_t* utf8_array, intptr_t array_len) { |
| 151 return IsValidUtf8(utf8_array, array_len, true); |
| 152 } |
| 153 |
| 154 |
143 intptr_t Utf8::Length(int32_t ch) { | 155 intptr_t Utf8::Length(int32_t ch) { |
144 if (ch <= kMaxOneByteChar) { | 156 if (ch <= kMaxOneByteChar) { |
145 return 1; | 157 return 1; |
146 } else if (ch <= kMaxTwoByteChar) { | 158 } else if (ch <= kMaxTwoByteChar) { |
147 return 2; | 159 return 2; |
148 } else if (ch <= kMaxThreeByteChar) { | 160 } else if (ch <= kMaxThreeByteChar) { |
149 return 3; | 161 return 3; |
150 } | 162 } |
151 ASSERT(ch <= kMaxFourByteChar); | 163 ASSERT(ch <= kMaxFourByteChar); |
152 return 4; | 164 return 4; |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
199 if (pos + num_bytes > len) { | 211 if (pos + num_bytes > len) { |
200 break; | 212 break; |
201 } | 213 } |
202 Utf8::Encode(ch, &dst[pos]); | 214 Utf8::Encode(ch, &dst[pos]); |
203 pos += num_bytes; | 215 pos += num_bytes; |
204 } | 216 } |
205 return pos; | 217 return pos; |
206 } | 218 } |
207 | 219 |
208 | 220 |
209 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 221 static intptr_t DecodeUTF8(const uint8_t* utf8_array, |
210 intptr_t array_len, | 222 intptr_t array_len, |
211 int32_t* dst) { | 223 int32_t* dst, |
| 224 bool allow_surrogates) { |
212 uint32_t ch = utf8_array[0] & 0xFF; | 225 uint32_t ch = utf8_array[0] & 0xFF; |
213 intptr_t i = 1; | 226 intptr_t i = 1; |
214 if (ch >= 0x80) { | 227 if (ch >= 0x80) { |
215 intptr_t num_trail_bytes = kTrailBytes[ch]; | 228 intptr_t num_trail_bytes = kTrailBytes[ch]; |
216 bool is_malformed = false; | 229 bool is_malformed = false; |
217 for (; i < num_trail_bytes; ++i) { | 230 for (; i < num_trail_bytes; ++i) { |
218 if (i < array_len) { | 231 if (i < array_len) { |
219 uint8_t code_unit = utf8_array[i]; | 232 uint8_t code_unit = utf8_array[i]; |
220 is_malformed |= !IsTrailByte(code_unit); | 233 is_malformed |= !IsTrailByte(code_unit); |
221 ch = (ch << 6) + code_unit; | 234 ch = (ch << 6) + code_unit; |
222 } else { | 235 } else { |
223 *dst = -1; | 236 *dst = -1; |
224 return 0; | 237 return 0; |
225 } | 238 } |
226 } | 239 } |
227 ch -= kMagicBits[num_trail_bytes]; | 240 ch -= kMagicBits[num_trail_bytes]; |
228 if (!((is_malformed == false) && | 241 if (!((is_malformed == false) && |
229 (i == num_trail_bytes) && | 242 (i == num_trail_bytes) && |
230 !IsOutOfRange(ch) && | 243 !IsOutOfRange(ch) && |
231 !IsNonShortestForm(ch, i) && | 244 !IsNonShortestForm(ch, i) && |
232 !Utf16::IsSurrogate(ch))) { | 245 (!Utf16::IsSurrogate(ch) || allow_surrogates))) { |
233 *dst = -1; | 246 *dst = -1; |
234 return 0; | 247 return 0; |
235 } | 248 } |
236 } | 249 } |
237 *dst = ch; | 250 *dst = ch; |
238 return i; | 251 return i; |
239 } | 252 } |
240 | 253 |
241 | 254 |
| 255 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
| 256 intptr_t array_len, |
| 257 int32_t* dst) { |
| 258 return DecodeUTF8(utf8_array, array_len, dst, false); |
| 259 } |
| 260 |
| 261 |
| 262 intptr_t Utf8::DecodeAllowSurrogates(const uint8_t* utf8_array, |
| 263 intptr_t array_len, |
| 264 int32_t* dst) { |
| 265 return DecodeUTF8(utf8_array, array_len, dst, true); |
| 266 } |
| 267 |
| 268 |
242 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, | 269 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, |
243 intptr_t array_len, | 270 intptr_t array_len, |
244 uint8_t* dst, | 271 uint8_t* dst, |
245 intptr_t len) { | 272 intptr_t len) { |
246 intptr_t i = 0; | 273 intptr_t i = 0; |
247 intptr_t j = 0; | 274 intptr_t j = 0; |
248 intptr_t num_bytes; | 275 intptr_t num_bytes; |
249 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 276 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
250 int32_t ch; | 277 int32_t ch; |
251 ASSERT(IsLatin1SequenceStart(utf8_array[i])); | 278 ASSERT(IsLatin1SequenceStart(utf8_array[i])); |
252 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 279 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
253 if (ch == -1) { | 280 if (ch == -1) { |
254 return false; // invalid input | 281 return false; // Invalid input. |
255 } | 282 } |
256 ASSERT(ch <= 0xff); | 283 ASSERT(ch <= 0xff); |
257 dst[j] = ch; | 284 dst[j] = ch; |
258 } | 285 } |
259 if ((i < array_len) && (j == len)) { | 286 if ((i < array_len) && (j == len)) { |
260 return false; // output overflow | 287 return false; // Output overflow. |
261 } | 288 } |
262 return true; // success | 289 return true; // Success. |
263 } | 290 } |
264 | 291 |
265 | 292 |
266 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, | 293 bool DecodeUTF8ToUTF16(const uint8_t* utf8_array, |
267 intptr_t array_len, | 294 intptr_t array_len, |
268 uint16_t* dst, | 295 uint16_t* dst, |
269 intptr_t len) { | 296 intptr_t len, |
| 297 bool allow_surrogates) { |
270 intptr_t i = 0; | 298 intptr_t i = 0; |
271 intptr_t j = 0; | 299 intptr_t j = 0; |
272 intptr_t num_bytes; | 300 intptr_t num_bytes; |
273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
274 int32_t ch; | 302 int32_t ch; |
275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); | 303 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); |
276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 304 num_bytes = DecodeUTF8( |
| 305 &utf8_array[i], (array_len - i), &ch, allow_surrogates); |
277 if (ch == -1) { | 306 if (ch == -1) { |
278 return false; // invalid input | 307 return false; // Invalid input. |
279 } | 308 } |
280 if (is_supplementary) { | 309 if (is_supplementary) { |
281 Utf16::Encode(ch, &dst[j]); | 310 Utf16::Encode(ch, &dst[j]); |
282 j = j + 1; | 311 j = j + 1; |
283 } else { | 312 } else { |
284 dst[j] = ch; | 313 dst[j] = ch; |
285 } | 314 } |
286 } | 315 } |
287 if ((i < array_len) && (j == len)) { | 316 if ((i < array_len) && (j == len)) { |
288 return false; // output overflow | 317 return false; // Output overflow. |
289 } | 318 } |
290 return true; // success | 319 return true; // Success. |
291 } | 320 } |
292 | 321 |
293 | 322 |
| 323 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
| 324 intptr_t array_len, |
| 325 uint16_t* dst, |
| 326 intptr_t len) { |
| 327 return DecodeUTF8ToUTF16(utf8_array, array_len, dst, len, false); |
| 328 } |
| 329 |
| 330 |
| 331 bool Utf8::DecodeToUTF16AllowSurrogates(const uint8_t* utf8_array, |
| 332 intptr_t array_len, |
| 333 uint16_t* dst, |
| 334 intptr_t len) { |
| 335 return DecodeUTF8ToUTF16(utf8_array, array_len, dst, len, true); |
| 336 } |
| 337 |
| 338 |
294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, | 339 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
295 intptr_t array_len, | 340 intptr_t array_len, |
296 int32_t* dst, | 341 int32_t* dst, |
297 intptr_t len) { | 342 intptr_t len) { |
298 intptr_t i = 0; | 343 intptr_t i = 0; |
299 intptr_t j = 0; | 344 intptr_t j = 0; |
300 intptr_t num_bytes; | 345 intptr_t num_bytes; |
301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 346 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
302 int32_t ch; | 347 int32_t ch; |
303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 348 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
304 if (ch == -1) { | 349 if (ch == -1) { |
305 return false; // invalid input | 350 return false; // Invalid input. |
306 } | 351 } |
307 dst[j] = ch; | 352 dst[j] = ch; |
308 } | 353 } |
309 if ((i < array_len) && (j == len)) { | 354 if ((i < array_len) && (j == len)) { |
310 return false; // output overflow | 355 return false; // Output overflow. |
311 } | 356 } |
312 return true; // success | 357 return true; // Success. |
313 } | 358 } |
314 | 359 |
315 | 360 |
316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 361 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
317 ASSERT(codepoint > kMaxBmpCodepoint); | 362 ASSERT(codepoint > kMaxBmpCodepoint); |
318 ASSERT(dst != NULL); | 363 ASSERT(dst != NULL); |
319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 364 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
320 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 365 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
321 } | 366 } |
322 | 367 |
323 } // namespace dart | 368 } // namespace dart |
OLD | NEW |