OLD | NEW |
---|---|
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
(...skipping 13 matching lines...) | |
24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
26 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 26 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
28 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | 28 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
29 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 | 29 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 |
30 }; | 30 }; |
31 | 31 |
32 | 32 |
33 static const uint32_t kMagicBits[7] = { | 33 static const uint32_t kMagicBits[7] = { |
34 0, // padding | 34 0, // Padding. |
35 0x00000000, | 35 0x00000000, |
36 0x00003080, | 36 0x00003080, |
37 0x000E2080, | 37 0x000E2080, |
38 0x03C82080, | 38 0x03C82080, |
39 0xFA082080, | 39 0xFA082080, |
40 0x82082080 | 40 0x82082080 |
41 }; | 41 }; |
42 | 42 |
43 | 43 |
44 // Minimum values of code points used to check shortest form. | 44 // Minimum values of code points used to check shortest form. |
45 static const uint32_t kOverlongMinimum[7] = { | 45 static const uint32_t kOverlongMinimum[7] = { |
46 0, // padding | 46 0, // Padding. |
47 0x0, | 47 0x0, |
48 0x80, | 48 0x80, |
49 0x800, | 49 0x800, |
50 0x10000, | 50 0x10000, |
51 0xFFFFFFFF, | 51 0xFFFFFFFF, |
52 0xFFFFFFFF | 52 0xFFFFFFFF |
53 }; | 53 }; |
54 | 54 |
55 | 55 |
56 static bool IsTrailByte(uint8_t code_unit) { | 56 static bool IsTrailByte(uint8_t code_unit) { |
57 return (code_unit & 0xc0) == 0x80; | 57 return (code_unit & 0xc0) == 0x80; |
58 } | 58 } |
59 | 59 |
60 | 60 |
61 static bool IsLatin1SequenceStart(uint8_t code_unit) { | 61 static bool IsLatin1SequenceStart(uint8_t code_unit) { |
62 // Check is codepoint is <= U+00FF | 62 // Check if codepoint is <= U+00FF. |
63 return (code_unit <= Utf8::kMaxOneByteChar); | 63 return (code_unit <= Utf8::kMaxOneByteChar); |
64 } | 64 } |
65 | 65 |
66 | 66 |
67 static bool IsSupplementarySequenceStart(uint8_t code_unit) { | 67 static bool IsSupplementarySequenceStart(uint8_t code_unit) { |
68 // Check is codepoint is >= U+10000. | 68 // Check if codepoint is >= U+10000. |
69 return (code_unit >= 0xF0); | 69 return (code_unit >= 0xF0); |
70 } | 70 } |
71 | 71 |
72 | 72 |
73 // Returns true if the code point value is beyond the last Unicode plane (> U+10FFFF). | 73 // Returns true if the code point value is beyond the last Unicode plane (> U+10FFFF). |
74 static bool IsOutOfRange(uint32_t code_point) { | 74 static bool IsOutOfRange(uint32_t code_point) { |
75 return (code_point > 0x10FFFF); | 75 return (code_point > 0x10FFFF); |
76 } | 76 } |
77 | 77 |
78 | 78 |
79 // Returns true if the byte sequence is ill-formed. | 79 // Returns true if the byte sequence is ill-formed. |
80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { | 80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { |
81 return code_point < kOverlongMinimum[num_bytes]; | 81 return code_point < kOverlongMinimum[num_bytes]; |
82 } | 82 } |
83 | 83 |
84 | 84 |
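85 // Returns the number of non-trail bytes in the UTF-8 array, counting each supplementary sequence start (>= 0xF0) twice. | 85 // Returns the number of non-trail bytes in the UTF-8 array, counting each supplementary sequence start (>= 0xF0) twice. |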
86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, | 86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, |
87 intptr_t array_len, | 87 intptr_t array_len, |
88 Type* type) { | 88 Type* type) { |
89 intptr_t len = 0; | 89 intptr_t len = 0; |
90 Type char_type = kLatin1; | 90 Type char_type = kLatin1; |
91 for (intptr_t i = 0; i < array_len; i++) { | 91 for (intptr_t i = 0; i < array_len; i++) { |
92 uint8_t code_unit = utf8_array[i]; | 92 uint8_t code_unit = utf8_array[i]; |
93 if (!IsTrailByte(code_unit)) { | 93 if (!IsTrailByte(code_unit)) { |
94 ++len; | 94 ++len; |
95 } | 95 } |
96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | 96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF. |
cshapiro
2012/11/30 02:49:08
no period
Søren Gjesse
2012/11/30 12:23:07
Done.
| |
97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 | 97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000. |
cshapiro
2012/11/30 02:49:08
ditto
Søren Gjesse
2012/11/30 12:23:07
Done.
| |
98 char_type = kSupplementary; | 98 char_type = kSupplementary; |
99 ++len; | 99 ++len; |
100 } else if (char_type == kLatin1) { | 100 } else if (char_type == kLatin1) { |
101 char_type = kBMP; | 101 char_type = kBMP; |
102 } | 102 } |
103 } | 103 } |
104 } | 104 } |
105 *type = char_type; | 105 *type = char_type; |
106 return len; | 106 return len; |
107 } | 107 } |
(...skipping 50 matching lines...) | |
158 String::CodePointIterator it(str); | 158 String::CodePointIterator it(str); |
159 while (it.Next()) { | 159 while (it.Next()) { |
160 int32_t ch = it.Current(); | 160 int32_t ch = it.Current(); |
161 length += Utf8::Length(ch); | 161 length += Utf8::Length(ch); |
162 } | 162 } |
163 return length; | 163 return length; |
164 } | 164 } |
165 | 165 |
166 | 166 |
167 intptr_t Utf8::Encode(int32_t ch, char* dst) { | 167 intptr_t Utf8::Encode(int32_t ch, char* dst) { |
168 ASSERT(!Utf16::IsSurrogate(ch)); | |
168 static const int kMask = ~(1 << 6); | 169 static const int kMask = ~(1 << 6); |
169 if (ch <= kMaxOneByteChar) { | 170 if (ch <= kMaxOneByteChar) { |
170 dst[0] = ch; | 171 dst[0] = ch; |
171 return 1; | 172 return 1; |
172 } | 173 } |
173 if (ch <= kMaxTwoByteChar) { | 174 if (ch <= kMaxTwoByteChar) { |
174 dst[0] = 0xC0 | (ch >> 6); | 175 dst[0] = 0xC0 | (ch >> 6); |
175 dst[1] = 0x80 | (ch & kMask); | 176 dst[1] = 0x80 | (ch & kMask); |
176 return 2; | 177 return 2; |
177 } | 178 } |
(...skipping 66 matching lines...) | |
244 uint8_t* dst, | 245 uint8_t* dst, |
245 intptr_t len) { | 246 intptr_t len) { |
246 intptr_t i = 0; | 247 intptr_t i = 0; |
247 intptr_t j = 0; | 248 intptr_t j = 0; |
248 intptr_t num_bytes; | 249 intptr_t num_bytes; |
249 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 250 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
250 int32_t ch; | 251 int32_t ch; |
251 ASSERT(IsLatin1SequenceStart(utf8_array[i])); | 252 ASSERT(IsLatin1SequenceStart(utf8_array[i])); |
252 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 253 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
253 if (ch == -1) { | 254 if (ch == -1) { |
254 return false; // invalid input | 255 return false; // Invalid input. |
255 } | 256 } |
256 ASSERT(ch <= 0xff); | 257 ASSERT(ch <= 0xff); |
257 dst[j] = ch; | 258 dst[j] = ch; |
258 } | 259 } |
259 if ((i < array_len) && (j == len)) { | 260 if ((i < array_len) && (j == len)) { |
260 return false; // output overflow | 261 return false; // Output overflow. |
261 } | 262 } |
262 return true; // success | 263 return true; // Success. |
263 } | 264 } |
264 | 265 |
265 | 266 |
266 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, | 267 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
267 intptr_t array_len, | 268 intptr_t array_len, |
268 uint16_t* dst, | 269 uint16_t* dst, |
269 intptr_t len) { | 270 intptr_t len) { |
270 intptr_t i = 0; | 271 intptr_t i = 0; |
271 intptr_t j = 0; | 272 intptr_t j = 0; |
272 intptr_t num_bytes; | 273 intptr_t num_bytes; |
273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 274 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
274 int32_t ch; | 275 int32_t ch; |
275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); | 276 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); |
276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 277 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
277 if (ch == -1) { | 278 if (ch == -1) { |
278 return false; // invalid input | 279 return false; // Invalid input. |
279 } | 280 } |
280 if (is_supplementary) { | 281 if (is_supplementary) { |
281 Utf16::Encode(ch, &dst[j]); | 282 Utf16::Encode(ch, &dst[j]); |
282 j = j + 1; | 283 j = j + 1; |
283 } else { | 284 } else { |
284 dst[j] = ch; | 285 dst[j] = ch; |
285 } | 286 } |
286 } | 287 } |
287 if ((i < array_len) && (j == len)) { | 288 if ((i < array_len) && (j == len)) { |
288 return false; // output overflow | 289 return false; // Output overflow. |
289 } | 290 } |
290 return true; // success | 291 return true; // Success. |
291 } | 292 } |
292 | 293 |
293 | 294 |
294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, | 295 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
295 intptr_t array_len, | 296 intptr_t array_len, |
296 int32_t* dst, | 297 int32_t* dst, |
297 intptr_t len) { | 298 intptr_t len) { |
298 intptr_t i = 0; | 299 intptr_t i = 0; |
299 intptr_t j = 0; | 300 intptr_t j = 0; |
300 intptr_t num_bytes; | 301 intptr_t num_bytes; |
301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 302 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
302 int32_t ch; | 303 int32_t ch; |
303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 304 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
304 if (ch == -1) { | 305 if (ch == -1) { |
305 return false; // invalid input | 306 return false; // Invalid input. |
306 } | 307 } |
307 dst[j] = ch; | 308 dst[j] = ch; |
308 } | 309 } |
309 if ((i < array_len) && (j == len)) { | 310 if ((i < array_len) && (j == len)) { |
310 return false; // output overflow | 311 return false; // Output overflow. |
311 } | 312 } |
312 return true; // success | 313 return true; // Success. |
313 } | 314 } |
314 | 315 |
315 | 316 |
316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 317 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
317 ASSERT(codepoint > kMaxBmpCodepoint); | 318 ASSERT(codepoint > kMaxBmpCodepoint); |
318 ASSERT(dst != NULL); | 319 ASSERT(dst != NULL); |
319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 320 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
320 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 321 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
321 } | 322 } |
322 | 323 |
324 | |
325 bool Utf16::CodePointIterator::Next() { | |
326 ASSERT(index_ >= -1); | |
327 ASSERT(index_ < array_len_); | |
328 int d = Length(ch_); | |
329 if (index_ == (array_len_ - d)) { | |
330 return false; | |
331 } | |
332 index_ += d; | |
333 ch_ = utf16_array_[index_]; | |
334 if (IsLeadSurrogate(ch_) && (index_ != (array_len_ - 1))) { | |
335 int32_t ch2 = utf16_array_[index_ + 1]; | |
336 if (IsTrailSurrogate(ch2)) { | |
337 ch_ = Decode(ch_, ch2); | |
338 } | |
339 } | |
340 return true; | |
341 } | |
342 | |
323 } // namespace dart | 343 } // namespace dart |
OLD | NEW |