OLD | NEW |
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
58 } | 58 } |
59 | 59 |
60 | 60 |
61 static bool IsAsciiSequenceStart(uint8_t code_unit) { | 61 static bool IsAsciiSequenceStart(uint8_t code_unit) { |
62 // Check is codepoint is <= U+007F | 62 // Check is codepoint is <= U+007F |
63 return (code_unit <= Utf8::kMaxOneByteChar); | 63 return (code_unit <= Utf8::kMaxOneByteChar); |
64 } | 64 } |
65 | 65 |
66 | 66 |
// Returns true if the UTF-8 code unit is 0xF0 or above, which is how a
// sequence encoding a code point >= U+10000 begins.
static bool IsSmpSequenceStart(uint8_t code_unit) {
  const uint8_t kFirstFourByteLead = 0xF0;
  return kFirstFourByteLead <= code_unit;
}
71 | 72 |
72 | 73 |
73 // Returns true if the code point is a high- or low-surrogate. | |
74 static bool IsSurrogate(uint32_t code_point) { | |
75 return (code_point & 0xfffff800) == 0xd800; | |
76 } | |
77 | |
78 | |
79 // Returns true if the code point value is above Plane 17. | 74 // Returns true if the code point value is above Plane 17. |
80 static bool IsOutOfRange(uint32_t code_point) { | 75 static bool IsOutOfRange(uint32_t code_point) { |
81 return (code_point > 0x10FFFF); | 76 return (code_point > Utf16::kMaxCodePoint); |
82 } | 77 } |
83 | 78 |
84 | 79 |
85 // Returns true if the byte sequence is ill-formed. | 80 // Returns true if the byte sequence is ill-formed. |
86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { | 81 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { |
87 return code_point < kOverlongMinimum[num_bytes]; | 82 return code_point < kOverlongMinimum[num_bytes]; |
88 } | 83 } |
89 | 84 |
90 | 85 |
91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { | 86 // Returns a count of the number of UTF-16 code units represented by this UTF-8 |
92 ASSERT(codepoint > kMaxBmpCodepoint); | 87 // array. Type is kASCII for 7-bit-only. If there are surrogate pairs then |
93 ASSERT(dst != NULL); | 88 // the type is kSMP. Otherwise it is kBMP. |
94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); | 89 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, |
95 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 90 intptr_t array_len, |
96 } | 91 Type* type) { |
97 | |
98 | |
99 // Returns a count of the number of UTF-8 trail bytes. | |
100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, | |
101 intptr_t array_len, | |
102 Type* type) { | |
103 intptr_t len = 0; | 92 intptr_t len = 0; |
104 Type char_type = kAscii; | 93 Type char_type = kAscii; |
105 for (intptr_t i = 0; i < array_len; i++) { | 94 for (intptr_t i = 0; i < array_len; i++) { |
106 uint8_t code_unit = utf8_array[i]; | 95 uint8_t code_unit = utf8_array[i]; |
107 if (!IsTrailByte(code_unit)) { | 96 if (!IsTrailByte(code_unit)) { |
108 ++len; | 97 ++len; |
109 } | 98 } |
110 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F | 99 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F |
111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 | 100 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 |
112 char_type = kSMP; | 101 char_type = kSMP; |
113 ++len; | 102 ++len; // Surrogate pair in the UTF-16 encoding. |
114 } else if (char_type == kAscii) { | 103 } else if (char_type == kAscii) { |
115 char_type = kBMP; | 104 char_type = kBMP; |
116 } | 105 } |
117 } | 106 } |
118 } | 107 } |
119 *type = char_type; | 108 *type = char_type; |
120 return len; | 109 return len; |
121 } | 110 } |
122 | 111 |
123 | 112 |
(...skipping 13 matching lines...) Expand all Loading... |
137 ch = (ch << 6) + code_unit; | 126 ch = (ch << 6) + code_unit; |
138 } else { | 127 } else { |
139 return false; | 128 return false; |
140 } | 129 } |
141 } | 130 } |
142 ch -= kMagicBits[num_trail_bytes]; | 131 ch -= kMagicBits[num_trail_bytes]; |
143 if (!((is_malformed == false) && | 132 if (!((is_malformed == false) && |
144 (j == num_trail_bytes) && | 133 (j == num_trail_bytes) && |
145 !IsOutOfRange(ch) && | 134 !IsOutOfRange(ch) && |
146 !IsNonShortestForm(ch, j) && | 135 !IsNonShortestForm(ch, j) && |
147 !IsSurrogate(ch))) { | 136 !Utf16::IsSurrogate(ch))) { |
148 return false; | 137 return false; |
149 } | 138 } |
150 } | 139 } |
151 i += j; | 140 i += j; |
152 } | 141 } |
153 return true; | 142 return true; |
154 } | 143 } |
155 | 144 |
156 | 145 |
157 intptr_t Utf8::Length(int32_t ch) { | 146 intptr_t Utf8::Length(int32_t ch) { |
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
213 } | 202 } |
214 Utf8::Encode(ch, &dst[pos]); | 203 Utf8::Encode(ch, &dst[pos]); |
215 pos += num_bytes; | 204 pos += num_bytes; |
216 } | 205 } |
217 return pos; | 206 return pos; |
218 } | 207 } |
219 | 208 |
220 | 209 |
221 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 210 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
222 intptr_t array_len, | 211 intptr_t array_len, |
223 int32_t* dst) { | 212 uint32_t* dst) { |
224 uint32_t ch = utf8_array[0] & 0xFF; | 213 uint32_t ch = utf8_array[0] & 0xFF; |
225 intptr_t i = 1; | 214 intptr_t i = 1; |
226 if (ch >= 0x80) { | 215 if (ch >= 0x80) { |
227 int32_t num_trail_bytes = kTrailBytes[ch]; | 216 int32_t num_trail_bytes = kTrailBytes[ch]; |
228 bool is_malformed = false; | 217 bool is_malformed = false; |
229 for (; i < num_trail_bytes; ++i) { | 218 for (; i < num_trail_bytes; ++i) { |
230 if (i < array_len) { | 219 if (i < array_len) { |
231 uint8_t code_unit = utf8_array[i]; | 220 uint8_t code_unit = utf8_array[i]; |
232 is_malformed |= !IsTrailByte(code_unit); | 221 is_malformed |= !IsTrailByte(code_unit); |
233 ch = (ch << 6) + code_unit; | 222 ch = (ch << 6) + code_unit; |
234 } else { | 223 } else { |
235 *dst = -1; | 224 *dst = kInvalidCodePoint; |
236 return 0; | 225 return 0; |
237 } | 226 } |
238 } | 227 } |
239 ch -= kMagicBits[num_trail_bytes]; | 228 ch -= kMagicBits[num_trail_bytes]; |
240 if (!((is_malformed == false) && | 229 if (!((is_malformed == false) && |
241 (i == num_trail_bytes) && | 230 (i == num_trail_bytes) && |
242 !IsOutOfRange(ch) && | 231 !IsOutOfRange(ch) && |
243 !IsNonShortestForm(ch, i) && | 232 !IsNonShortestForm(ch, i) && |
244 !IsSurrogate(ch))) { | 233 !Utf16::IsSurrogate(ch))) { |
245 *dst = -1; | 234 *dst = kInvalidCodePoint; |
246 return 0; | 235 return 0; |
247 } | 236 } |
248 } | 237 } |
249 *dst = ch; | 238 *dst = ch; |
250 return i; | 239 return i; |
251 } | 240 } |
252 | 241 |
253 | 242 |
254 bool Utf8::DecodeToAscii(const uint8_t* utf8_array, | 243 bool Utf8::DecodeToAscii(const uint8_t* utf8_array, |
255 intptr_t array_len, | 244 intptr_t array_len, |
(...skipping 13 matching lines...) Expand all Loading... |
269 | 258 |
270 | 259 |
271 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, | 260 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
272 intptr_t array_len, | 261 intptr_t array_len, |
273 uint16_t* dst, | 262 uint16_t* dst, |
274 intptr_t len) { | 263 intptr_t len) { |
275 intptr_t i = 0; | 264 intptr_t i = 0; |
276 intptr_t j = 0; | 265 intptr_t j = 0; |
277 intptr_t num_bytes; | 266 intptr_t num_bytes; |
278 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 267 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
279 int32_t ch; | 268 uint32_t ch; |
280 bool is_smp = IsSmpSequenceStart(utf8_array[i]); | 269 bool is_smp = IsSmpSequenceStart(utf8_array[i]); |
281 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 270 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
282 if (ch == -1) { | 271 if (ch == kInvalidCodePoint) { |
283 return false; // invalid input | 272 return false; // invalid input |
284 } | 273 } |
285 if (is_smp) { | 274 if (is_smp) { |
286 ConvertUTF32ToUTF16(ch, &(dst[j])); | 275 dst[j] = Utf16::LeadFromCodePoint(ch); |
287 j = j + 1; | 276 dst[j + 1] = Utf16::TrailFromCodePoint(ch); |
| 277 ++j; |
288 } else { | 278 } else { |
289 dst[j] = ch; | 279 dst[j] = ch; |
290 } | 280 } |
291 } | 281 } |
292 if ((i < array_len) && (j == len)) { | 282 if ((i < array_len) && (j == len)) { |
293 return false; // output overflow | 283 return false; // output overflow |
294 } | 284 } |
295 return true; // success | 285 return true; // success |
296 } | 286 } |
297 | 287 |
298 | 288 |
299 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, | 289 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
300 intptr_t array_len, | 290 intptr_t array_len, |
301 uint32_t* dst, | 291 uint32_t* dst, |
302 intptr_t len) { | 292 intptr_t len) { |
303 intptr_t i = 0; | 293 intptr_t i = 0; |
304 intptr_t j = 0; | 294 intptr_t j = 0; |
305 intptr_t num_bytes; | 295 intptr_t num_bytes; |
306 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 296 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
307 int32_t ch; | 297 uint32_t ch; |
308 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 298 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
309 if (ch == -1) { | 299 if (ch == kInvalidCodePoint) { |
310 return false; // invalid input | 300 return false; // invalid input |
311 } | 301 } |
312 dst[j] = ch; | 302 dst[j] = ch; |
313 } | 303 } |
314 if ((i < array_len) && (j == len)) { | 304 if ((i < array_len) && (j == len)) { |
315 return false; // output overflow | 305 return false; // output overflow |
316 } | 306 } |
317 return true; // success | 307 return true; // success |
318 } | 308 } |
319 | 309 |
320 } // namespace dart | 310 } // namespace dart |
OLD | NEW |