OLD | NEW |
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
(...skipping 30 matching lines...) Expand all Loading... |
41 }; | 41 }; |
42 | 42 |
43 | 43 |
44 // Minimum values of code points used to check shortest form. | 44 // Minimum values of code points used to check shortest form. |
45 static const uint32_t kOverlongMinimum[7] = { | 45 static const uint32_t kOverlongMinimum[7] = { |
46 0, // padding | 46 0, // padding |
47 0x0, | 47 0x0, |
48 0x80, | 48 0x80, |
49 0x800, | 49 0x800, |
50 0x10000, | 50 0x10000, |
51 0xFFFFFFFF, // We never allow 5 byte sequences. | 51 0xFFFFFFFF, |
52 0xFFFFFFFF // We never allow 6 byte sequences. | 52 0xFFFFFFFF |
53 }; | 53 }; |
54 | 54 |
55 | 55 |
56 static bool IsTrailByte(uint8_t code_unit) { | 56 static bool IsTrailByte(uint8_t code_unit) { |
57 return (code_unit & 0xc0) == 0x80; | 57 return (code_unit & 0xc0) == 0x80; |
58 } | 58 } |
59 | 59 |
60 | 60 |
61 static bool IsLatin1SequenceStart(uint8_t code_unit) { | 61 static bool IsLatin1SequenceStart(uint8_t code_unit) { |
62 // Check is codepoint is <= U+00FF | 62 // Check is codepoint is <= U+00FF |
63 return (code_unit <= Utf8::kMaxOneByteChar); | 63 return (code_unit <= Utf8::kMaxOneByteChar); |
64 } | 64 } |
65 | 65 |
66 | 66 |
67 static bool IsSupplementarySequenceStart(uint8_t code_unit) { | 67 static bool IsSupplementarySequenceStart(uint8_t code_unit) { |
68 // Check the UTF-8 code unit to determine if it is a sequence start for a | 68 // Check is codepoint is >= U+10000. |
69 // code point >= U+10000. | |
70 return (code_unit >= 0xF0); | 69 return (code_unit >= 0xF0); |
71 } | 70 } |
72 | 71 |
73 | 72 |
74 // Returns true if the code point value is above Plane 17. | 73 // Returns true if the code point value is above Plane 17. |
75 static bool IsOutOfRange(int32_t code_point) { | 74 static bool IsOutOfRange(uint32_t code_point) { |
76 return (code_point > Utf16::kMaxCodePoint); | 75 return (code_point > 0x10FFFF); |
77 } | 76 } |
78 | 77 |
79 | 78 |
80 // Returns true if the byte sequence is ill-formed. | 79 // Returns true if the byte sequence is ill-formed. |
81 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { | 80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { |
82 return code_point < kOverlongMinimum[num_bytes]; | 81 return code_point < kOverlongMinimum[num_bytes]; |
83 } | 82 } |
84 | 83 |
85 | 84 |
86 // Returns a count of the number of UTF-16 code units represented by this UTF-8 | 85 // Returns a count of the number of UTF-8 trail bytes. |
87 // array. Type is kASCII for 7-bit-only. If there are surrogate pairs then | 86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, |
88 // the type is kSupplementary. Otherwise it is kBMP. | 87 intptr_t array_len, |
89 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, | 88 Type* type) { |
90 intptr_t array_len, | |
91 Type* type) { | |
92 intptr_t len = 0; | 89 intptr_t len = 0; |
93 Type char_type = kLatin1; | 90 Type char_type = kLatin1; |
94 for (intptr_t i = 0; i < array_len; i++) { | 91 for (intptr_t i = 0; i < array_len; i++) { |
95 uint8_t code_unit = utf8_array[i]; | 92 uint8_t code_unit = utf8_array[i]; |
96 if (!IsTrailByte(code_unit)) { | 93 if (!IsTrailByte(code_unit)) { |
97 ++len; | 94 ++len; |
98 } | 95 } |
99 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | 96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF |
100 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 | 97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 |
101 char_type = kSupplementary; | 98 char_type = kSupplementary; |
102 ++len; // Surrogate pair in the UTF-16 encoding. | 99 ++len; |
103 } else if (char_type == kLatin1) { | 100 } else if (char_type == kLatin1) { |
104 char_type = kBMP; | 101 char_type = kBMP; |
105 } | 102 } |
106 } | 103 } |
107 } | 104 } |
108 *type = char_type; | 105 *type = char_type; |
109 return len; | 106 return len; |
110 } | 107 } |
111 | 108 |
112 | 109 |
113 // Returns true if str is a valid UTF-8 string. | 110 // Returns true if str is a valid NUL-terminated UTF-8 string. |
114 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { | 111 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { |
115 intptr_t i = 0; | 112 intptr_t i = 0; |
116 while (i < array_len) { | 113 while (i < array_len) { |
117 uint32_t ch = utf8_array[i] & 0xFF; | 114 uint32_t ch = utf8_array[i] & 0xFF; |
118 intptr_t j = 1; | 115 intptr_t j = 1; |
119 if (ch >= 0x80) { | 116 if (ch >= 0x80) { |
120 int8_t num_trail_bytes = kTrailBytes[ch]; | 117 int8_t num_trail_bytes = kTrailBytes[ch]; |
121 bool is_malformed = false; | 118 bool is_malformed = false; |
122 for (; j < num_trail_bytes; ++j) { | 119 for (; j < num_trail_bytes; ++j) { |
123 if ((i + j) < array_len) { | 120 if ((i + j) < array_len) { |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
205 Utf8::Encode(ch, &dst[pos]); | 202 Utf8::Encode(ch, &dst[pos]); |
206 pos += num_bytes; | 203 pos += num_bytes; |
207 } | 204 } |
208 return pos; | 205 return pos; |
209 } | 206 } |
210 | 207 |
211 | 208 |
212 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 209 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
213 intptr_t array_len, | 210 intptr_t array_len, |
214 int32_t* dst) { | 211 int32_t* dst) { |
215 int32_t ch = utf8_array[0] & 0xFF; | 212 uint32_t ch = utf8_array[0] & 0xFF; |
216 intptr_t i = 1; | 213 intptr_t i = 1; |
217 if (ch >= 0x80) { | 214 if (ch >= 0x80) { |
218 intptr_t num_trail_bytes = kTrailBytes[ch]; | 215 intptr_t num_trail_bytes = kTrailBytes[ch]; |
219 bool is_malformed = false; | 216 bool is_malformed = false; |
220 for (; i < num_trail_bytes; ++i) { | 217 for (; i < num_trail_bytes; ++i) { |
221 if (i < array_len) { | 218 if (i < array_len) { |
222 uint8_t code_unit = utf8_array[i]; | 219 uint8_t code_unit = utf8_array[i]; |
223 is_malformed |= !IsTrailByte(code_unit); | 220 is_malformed |= !IsTrailByte(code_unit); |
224 ch = (ch << 6) + code_unit; | 221 ch = (ch << 6) + code_unit; |
225 } else { | 222 } else { |
226 *dst = kInvalidCodePoint; | 223 *dst = -1; |
227 return 0; | 224 return 0; |
228 } | 225 } |
229 } | 226 } |
230 ch -= kMagicBits[num_trail_bytes]; | 227 ch -= kMagicBits[num_trail_bytes]; |
231 if (!((is_malformed == false) && | 228 if (!((is_malformed == false) && |
232 (i == num_trail_bytes) && | 229 (i == num_trail_bytes) && |
233 !IsOutOfRange(ch) && | 230 !IsOutOfRange(ch) && |
234 !IsNonShortestForm(ch, i) && | 231 !IsNonShortestForm(ch, i) && |
235 !Utf16::IsSurrogate(ch))) { | 232 !Utf16::IsSurrogate(ch))) { |
236 *dst = kInvalidCodePoint; | 233 *dst = -1; |
237 return 0; | 234 return 0; |
238 } | 235 } |
239 } | 236 } |
240 *dst = ch; | 237 *dst = ch; |
241 return i; | 238 return i; |
242 } | 239 } |
243 | 240 |
244 | 241 |
245 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, | 242 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, |
246 intptr_t array_len, | 243 intptr_t array_len, |
(...skipping 23 matching lines...) Expand all Loading... |
270 intptr_t array_len, | 267 intptr_t array_len, |
271 uint16_t* dst, | 268 uint16_t* dst, |
272 intptr_t len) { | 269 intptr_t len) { |
273 intptr_t i = 0; | 270 intptr_t i = 0; |
274 intptr_t j = 0; | 271 intptr_t j = 0; |
275 intptr_t num_bytes; | 272 intptr_t num_bytes; |
276 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
277 int32_t ch; | 274 int32_t ch; |
278 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); | 275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); |
279 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
280 if (ch == kInvalidCodePoint) { | 277 if (ch == -1) { |
281 return false; // invalid input | 278 return false; // invalid input |
282 } | 279 } |
283 if (is_supplementary) { | 280 if (is_supplementary) { |
284 Utf16::Encode(ch, &dst[j]); | 281 Utf16::Encode(ch, &dst[j]); |
285 j = j + 1; | 282 j = j + 1; |
286 } else { | 283 } else { |
287 dst[j] = ch; | 284 dst[j] = ch; |
288 } | 285 } |
289 } | 286 } |
290 if ((i < array_len) && (j == len)) { | 287 if ((i < array_len) && (j == len)) { |
291 return false; // output overflow | 288 return false; // output overflow |
292 } | 289 } |
293 return true; // success | 290 return true; // success |
294 } | 291 } |
295 | 292 |
296 | 293 |
297 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, | 294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
298 intptr_t array_len, | 295 intptr_t array_len, |
299 int32_t* dst, | 296 uint32_t* dst, |
300 intptr_t len) { | 297 intptr_t len) { |
301 intptr_t i = 0; | 298 intptr_t i = 0; |
302 intptr_t j = 0; | 299 intptr_t j = 0; |
303 intptr_t num_bytes; | 300 intptr_t num_bytes; |
304 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
305 int32_t ch; | 302 int32_t ch; |
306 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
307 if (ch == kInvalidCodePoint) { | 304 if (ch == -1) { |
308 return false; // invalid input | 305 return false; // invalid input |
309 } | 306 } |
310 dst[j] = ch; | 307 dst[j] = ch; |
311 } | 308 } |
312 if ((i < array_len) && (j == len)) { | 309 if ((i < array_len) && (j == len)) { |
313 return false; // output overflow | 310 return false; // output overflow |
314 } | 311 } |
315 return true; // success | 312 return true; // success |
316 } | 313 } |
317 | 314 |
318 | 315 |
319 int32_t Utf16::CodePointAt(const String& str, int index) { | 316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
320 int32_t code = str.CharAt(index); | 317 ASSERT(codepoint > kMaxBmpCodepoint); |
321 if (!IsLeadSurrogate(code)) return code; | |
322 if (index + 1 == str.Length()) return code; | |
323 int32_t trail = str.CharAt(index + 1); | |
324 if (!IsTrailSurrogate(trail)) return code; | |
325 return Decode(code, trail); | |
326 } | |
327 | |
328 | |
329 void Utf16::Encode(int32_t codePoint, uint16_t* dst) { | |
330 ASSERT(codePoint > kMaxBmpCodepoint); | |
331 ASSERT(dst != NULL); | 318 ASSERT(dst != NULL); |
332 dst[0] = LeadFromCodePoint(codePoint); | 319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
333 dst[1] = TrailFromCodePoint(codePoint); | 320 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
334 } | 321 } |
335 | 322 |
336 } // namespace dart | 323 } // namespace dart |
OLD | NEW |