| OLD | NEW |
| 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
| 6 | 6 |
| 7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
| 8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
| 9 #include "vm/object.h" | 9 #include "vm/object.h" |
| 10 | 10 |
| 11 namespace dart { | 11 namespace dart { |
| 12 | 12 |
| 13 static const uint8_t kTrailBytes[256] = { | 13 static const int8_t kTrailBytes[256] = { |
| 14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| (...skipping 27 matching lines...) |
| 51 0xFFFFFFFF, | 51 0xFFFFFFFF, |
| 52 0xFFFFFFFF | 52 0xFFFFFFFF |
| 53 }; | 53 }; |
| 54 | 54 |
| 55 | 55 |
| 56 static bool IsTrailByte(uint8_t code_unit) { | 56 static bool IsTrailByte(uint8_t code_unit) { |
| 57 return (code_unit & 0xc0) == 0x80; | 57 return (code_unit & 0xc0) == 0x80; |
| 58 } | 58 } |
| 59 | 59 |
| 60 | 60 |
| 61 static bool IsAsciiSequenceStart(uint8_t code_unit) { |
| 62 // Check if codepoint is <= U+007F. |
| 63 return (code_unit <= Utf8::kMaxOneByteChar); |
| 64 } |
| 65 |
| 66 |
| 67 static bool IsSmpSequenceStart(uint8_t code_unit) { |
| 68 // Check if codepoint is >= U+10000. |
| 69 return (code_unit >= 0xF0); |
| 70 } |
| 71 |
| 72 |
| 61 // Returns true if the code point is a high- or low-surrogate. | 73 // Returns true if the code point is a high- or low-surrogate. |
| 62 static bool IsSurrogate(uint32_t code_point) { | 74 static bool IsSurrogate(uint32_t code_point) { |
| 63 return (code_point & 0xfffff800) == 0xd800; | 75 return (code_point & 0xfffff800) == 0xd800; |
| 64 } | 76 } |
| 65 | 77 |
| 66 | 78 |
| 67 // Returns true if the code point value is above Plane 17. | 79 // Returns true if the code point value is above Plane 17. |
| 68 static bool IsOutOfRange(uint32_t code_point) { | 80 static bool IsOutOfRange(uint32_t code_point) { |
| 69 return code_point > 0x10FFFF; | 81 return (code_point > 0x10FFFF); |
| 70 } | 82 } |
| 71 | 83 |
| 72 | 84 |
| 73 // Returns true if the byte sequence is ill-formed. | 85 // Returns true if the byte sequence is ill-formed. |
| 74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { | 86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { |
| 75 return code_point < kOverlongMinimum[num_bytes]; | 87 return code_point < kOverlongMinimum[num_bytes]; |
| 76 } | 88 } |
| 77 | 89 |
| 78 | 90 |
| 91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { |
| 92 ASSERT(codepoint > kMaxBmpCodepoint); |
| 93 ASSERT(dst != NULL); |
| 94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); |
| 95 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
| 96 } |
| 97 |
| 98 |
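| 79 // Returns a count of the bytes that are not UTF-8 trail bytes (one per code point in well-formed input). | 99 // Returns a count of the bytes that are not UTF-8 trail bytes, plus one extra for each lead byte >= 0xF0 (code point >= U+10000). |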
| 80 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) { | 100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, |
| 81 bool is_two_byte_string = false; | 101 intptr_t array_len, |
| 82 bool is_four_byte_string = false; | 102 Type* type) { |
| 83 intptr_t len = 0; | 103 intptr_t len = 0; |
| 84 for (; *str != '\0'; ++str) { | 104 Type char_type = kAscii; |
| 85 uint8_t code_unit = *str; | 105 for (intptr_t i = 0; i < array_len; i++) { |
| 106 uint8_t code_unit = utf8_array[i]; |
| 86 if (!IsTrailByte(code_unit)) { | 107 if (!IsTrailByte(code_unit)) { |
| 87 ++len; | 108 ++len; |
| 88 } | 109 } |
| 89 if (code_unit > 0xC3) { // > U+00FF | 110 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F |
| 90 if (code_unit < 0xF0) { // < U+10000 | 111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 |
| 91 is_two_byte_string = true; | 112 char_type = kSMP; |
| 92 } else { | 113 ++len; |
| 93 is_four_byte_string = true; | 114 } else if (char_type == kAscii) { |
| 115 char_type = kBMP; |
| 94 } | 116 } |
| 95 } | 117 } |
| 96 } | 118 } |
| 97 if (is_four_byte_string) { | 119 *type = char_type; |
| 98 *width = 4; | |
| 99 } else if (is_two_byte_string) { | |
| 100 *width = 2; | |
| 101 } else { | |
| 102 *width = 1; | |
| 103 } | |
| 104 return len; | 120 return len; |
| 105 } | 121 } |
| 106 | 122 |
| 107 | 123 |
| 108 // Returns true if str is a valid NUL-terminated UTF-8 string. | 124 // Returns true if the first array_len bytes of utf8_array are valid UTF-8. |
| 109 bool Utf8::IsValid(const char* str) { | 125 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { |
| 110 intptr_t i = 0; | 126 intptr_t i = 0; |
| 111 while (str[i] != '\0') { | 127 while (i < array_len) { |
| 112 uint32_t ch = str[i] & 0xFF; | 128 uint32_t ch = utf8_array[i] & 0xFF; |
| 113 intptr_t j = 1; | 129 intptr_t j = 1; |
| 114 if (ch >= 0x80) { | 130 if (ch >= 0x80) { |
| 115 uint8_t num_trail_bytes = kTrailBytes[ch]; | 131 int8_t num_trail_bytes = kTrailBytes[ch]; |
| 116 bool is_malformed = false; | 132 bool is_malformed = false; |
| 117 for (; j < num_trail_bytes; ++j) { | 133 for (; j < num_trail_bytes; ++j) { |
| 118 if (str[i + j] != '\0') { | 134 if ((i + j) < array_len) { |
| 119 uint8_t code_unit = str[i + j]; | 135 uint8_t code_unit = utf8_array[i + j]; |
| 120 is_malformed |= !IsTrailByte(code_unit); | 136 is_malformed |= !IsTrailByte(code_unit); |
| 121 ch = (ch << 6) + code_unit; | 137 ch = (ch << 6) + code_unit; |
| 122 } else { | 138 } else { |
| 123 return false; | 139 return false; |
| 124 } | 140 } |
| 125 } | 141 } |
| 126 ch -= kMagicBits[num_trail_bytes]; | 142 ch -= kMagicBits[num_trail_bytes]; |
| 127 if (!((is_malformed == false) && | 143 if (!((is_malformed == false) && |
| 128 (j == num_trail_bytes) && | 144 (j == num_trail_bytes) && |
| 129 !IsOutOfRange(ch) && | 145 !IsOutOfRange(ch) && |
| (...skipping 65 matching lines...) |
| 195 if (pos + num_bytes > len) { | 211 if (pos + num_bytes > len) { |
| 196 break; | 212 break; |
| 197 } | 213 } |
| 198 Utf8::Encode(ch, &dst[pos]); | 214 Utf8::Encode(ch, &dst[pos]); |
| 199 pos += num_bytes; | 215 pos += num_bytes; |
| 200 } | 216 } |
| 201 return pos; | 217 return pos; |
| 202 } | 218 } |
| 203 | 219 |
| 204 | 220 |
| 205 intptr_t Utf8::Decode(const char* src, int32_t* dst) { | 221 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
| 206 uint32_t ch = src[0] & 0xFF; | 222 intptr_t array_len, |
| 207 uint32_t i = 1; | 223 int32_t* dst) { |
| 224 uint32_t ch = utf8_array[0] & 0xFF; |
| 225 intptr_t i = 1; |
| 208 if (ch >= 0x80) { | 226 if (ch >= 0x80) { |
| 209 uint32_t num_trail_bytes = kTrailBytes[ch]; | 227 int32_t num_trail_bytes = kTrailBytes[ch]; |
| 210 bool is_malformed = false; | 228 bool is_malformed = false; |
| 211 for (; i < num_trail_bytes; ++i) { | 229 for (; i < num_trail_bytes; ++i) { |
| 212 if (src[i] != '\0') { | 230 if (i < array_len) { |
| 213 uint8_t code_unit = src[i]; | 231 uint8_t code_unit = utf8_array[i]; |
| 214 is_malformed |= !IsTrailByte(code_unit); | 232 is_malformed |= !IsTrailByte(code_unit); |
| 215 ch = (ch << 6) + code_unit; | 233 ch = (ch << 6) + code_unit; |
| 216 } else { | 234 } else { |
| 217 *dst = -1; | 235 *dst = -1; |
| 218 return 0; | 236 return 0; |
| 219 } | 237 } |
| 220 } | 238 } |
| 221 ch -= kMagicBits[num_trail_bytes]; | 239 ch -= kMagicBits[num_trail_bytes]; |
| 222 if (!((is_malformed == false) && | 240 if (!((is_malformed == false) && |
| 223 (i == num_trail_bytes) && | 241 (i == num_trail_bytes) && |
| 224 !IsOutOfRange(ch) && | 242 !IsOutOfRange(ch) && |
| 225 !IsNonShortestForm(ch, i) && | 243 !IsNonShortestForm(ch, i) && |
| 226 !IsSurrogate(ch))) { | 244 !IsSurrogate(ch))) { |
| 227 *dst = -1; | 245 *dst = -1; |
| 228 return 0; | 246 return 0; |
| 229 } | 247 } |
| 230 } | 248 } |
| 231 *dst = ch; | 249 *dst = ch; |
| 232 return i; | 250 return i; |
| 233 } | 251 } |
| 234 | 252 |
| 235 | 253 |
| 236 template<typename T> | 254 bool Utf8::DecodeToAscii(const uint8_t* utf8_array, |
| 237 static bool DecodeImpl(const char* src, T* dst, intptr_t len) { | 255 intptr_t array_len, |
| 256 uint8_t* dst, |
| 257 intptr_t len) { |
| 258 if (len < array_len) { |
| 259 return false; // output overflow |
| 260 } |
| 261 #ifdef DEBUG |
| 262 for (intptr_t i = 0; i < array_len; i++) { |
| 263 ASSERT(IsAsciiSequenceStart(utf8_array[i])); |
| 264 } |
| 265 #endif |
| 266 memmove(dst, utf8_array, array_len); |
| 267 return true; // success |
| 268 } |
| 269 |
| 270 |
| 271 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
| 272 intptr_t array_len, |
| 273 uint16_t* dst, |
| 274 intptr_t len) { |
| 238 intptr_t i = 0; | 275 intptr_t i = 0; |
| 239 intptr_t j = 0; | 276 intptr_t j = 0; |
| 240 intptr_t num_bytes; | 277 intptr_t num_bytes; |
| 241 for (; src[i] != '\0' && j < len; i += num_bytes, ++j) { | 278 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| 242 int32_t ch; | 279 int32_t ch; |
| 243 num_bytes = Utf8::Decode(&src[i], &ch); | 280 bool is_smp = IsSmpSequenceStart(utf8_array[i]); |
| 281 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| 244 if (ch == -1) { | 282 if (ch == -1) { |
| 245 return false; // invalid input | 283 return false; // invalid input |
| 246 } | 284 } |
| 247 dst[j] = ch; | 285 if (is_smp) { |
| 286 ConvertUTF32ToUTF16(ch, &(dst[j])); |
| 287 j = j + 1; |
| 288 } else { |
| 289 dst[j] = ch; |
| 290 } |
| 248 } | 291 } |
| 249 if (src[i] != '\0' && j == len) { | 292 if ((i < array_len) && (j == len)) { |
| 250 return false; // output overflow | 293 return false; // output overflow |
| 251 } | 294 } |
| 252 return true; // success | 295 return true; // success |
| 253 } | 296 } |
| 254 | 297 |
| 255 | 298 |
| 256 bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) { | 299 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
| 257 return DecodeImpl(src, dst, len); | 300 intptr_t array_len, |
| 258 } | 301 uint32_t* dst, |
| 259 | 302 intptr_t len) { |
| 260 | 303 intptr_t i = 0; |
| 261 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) { | 304 intptr_t j = 0; |
| 262 return DecodeImpl(src, dst, len); | 305 intptr_t num_bytes; |
| 263 } | 306 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| 264 | 307 int32_t ch; |
| 265 | 308 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| 266 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) { | 309 if (ch == -1) { |
| 267 return DecodeImpl(src, dst, len); | 310 return false; // invalid input |
| 311 } |
| 312 dst[j] = ch; |
| 313 } |
| 314 if ((i < array_len) && (j == len)) { |
| 315 return false; // output overflow |
| 316 } |
| 317 return true; // success |
| 268 } | 318 } |
| 269 | 319 |
| 270 } // namespace dart | 320 } // namespace dart |
| OLD | NEW |