| OLD | NEW |
| 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
| 6 | 6 |
| 7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
| 8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
| 9 #include "vm/object.h" | 9 #include "vm/object.h" |
| 10 | 10 |
| (...skipping 30 matching lines...) Expand all Loading... |
| 41 }; | 41 }; |
| 42 | 42 |
| 43 | 43 |
| 44 // Minimum values of code points used to check shortest form. | 44 // Minimum values of code points used to check shortest form. |
| 45 static const uint32_t kOverlongMinimum[7] = { | 45 static const uint32_t kOverlongMinimum[7] = { |
| 46 0, // padding | 46 0, // padding |
| 47 0x0, | 47 0x0, |
| 48 0x80, | 48 0x80, |
| 49 0x800, | 49 0x800, |
| 50 0x10000, | 50 0x10000, |
| 51 0xFFFFFFFF, | 51 0xFFFFFFFF, // We never allow 5 byte sequences. |
| 52 0xFFFFFFFF | 52 0xFFFFFFFF // We never allow 6 byte sequences. |
| 53 }; | 53 }; |
| 54 | 54 |
| 55 | 55 |
| 56 static bool IsTrailByte(uint8_t code_unit) { | 56 static bool IsTrailByte(uint8_t code_unit) { |
| 57 return (code_unit & 0xc0) == 0x80; | 57 return (code_unit & 0xc0) == 0x80; |
| 58 } | 58 } |
| 59 | 59 |
| 60 | 60 |
| 61 static bool IsLatin1SequenceStart(uint8_t code_unit) { | 61 static bool IsLatin1SequenceStart(uint8_t code_unit) { |
| 62 // Check if the code point is <= U+00FF | 62 // Check if the code point is <= U+00FF |
| 63 return (code_unit <= Utf8::kMaxOneByteChar); | 63 return (code_unit <= Utf8::kMaxOneByteChar); |
| 64 } | 64 } |
| 65 | 65 |
| 66 | 66 |
| 67 static bool IsSmpSequenceStart(uint8_t code_unit) { | 67 static bool IsSmpSequenceStart(uint8_t code_unit) { |
| 68 // Check is codepoint is >= U+10000. | 68 // Check the UTF-8 code unit to determine if it is a sequence start for a |
| 69 // code point >= U+10000. |
| 69 return (code_unit >= 0xF0); | 70 return (code_unit >= 0xF0); |
| 70 } | 71 } |
| 71 | 72 |
| 72 | 73 |
| 73 // Returns true if the code point is a high- or low-surrogate. | |
| 74 static bool IsSurrogate(uint32_t code_point) { | |
| 75 return (code_point & 0xfffff800) == 0xd800; | |
| 76 } | |
| 77 | |
| 78 | |
| 79 // Returns true if the code point value is above U+10FFFF, the last code point of Plane 16. | 74 // Returns true if the code point value is above U+10FFFF, the last code point of Plane 16. |
| 80 static bool IsOutOfRange(uint32_t code_point) { | 75 static bool IsOutOfRange(int32_t code_point) { |
| 81 return (code_point > 0x10FFFF); | 76 return (code_point > Utf16::kMaxCodePoint); |
| 82 } | 77 } |
| 83 | 78 |
| 84 | 79 |
| 85 // Returns true if the byte sequence is ill-formed. | 80 // Returns true if the byte sequence is ill-formed. |
| 86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { | 81 static bool IsNonShortestForm(int32_t code_point, size_t num_bytes) { |
| 87 return code_point < kOverlongMinimum[num_bytes]; | 82 return static_cast<uint32_t>(code_point) < kOverlongMinimum[num_bytes]; |
| 88 } | 83 } |
| 89 | 84 |
| 90 | 85 |
| 91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) { | 86 // Returns a count of the number of UTF-16 code units represented by this UTF-8 |
| 92 ASSERT(codepoint > kMaxBmpCodepoint); | 87 // array. Type is kLatin1 for 7-bit-only. If there are surrogate pairs then |
| 93 ASSERT(dst != NULL); | 88 // the type is kSMP. Otherwise it is kBMP. |
| 94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10)); | 89 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, |
| 95 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 90 intptr_t array_len, |
| 96 } | 91 Type* type) { |
| 97 | |
| 98 | |
| 99 // Returns a count of the number of UTF-8 trail bytes. | |
| 100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, | |
| 101 intptr_t array_len, | |
| 102 Type* type) { | |
| 103 intptr_t len = 0; | 92 intptr_t len = 0; |
| 104 Type char_type = kLatin1; | 93 Type char_type = kLatin1; |
| 105 for (intptr_t i = 0; i < array_len; i++) { | 94 for (intptr_t i = 0; i < array_len; i++) { |
| 106 uint8_t code_unit = utf8_array[i]; | 95 uint8_t code_unit = utf8_array[i]; |
| 107 if (!IsTrailByte(code_unit)) { | 96 if (!IsTrailByte(code_unit)) { |
| 108 ++len; | 97 ++len; |
| 109 } | 98 } |
| 110 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | 99 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF |
| 111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 | 100 if (IsSmpSequenceStart(code_unit)) { // >= U+10000 |
| 112 char_type = kSMP; | 101 char_type = kSMP; |
| 113 ++len; | 102 ++len; // Surrogate pair in the UTF-16 encoding. |
| 114 } else if (char_type == kLatin1) { | 103 } else if (char_type == kLatin1) { |
| 115 char_type = kBMP; | 104 char_type = kBMP; |
| 116 } | 105 } |
| 117 } | 106 } |
| 118 } | 107 } |
| 119 *type = char_type; | 108 *type = char_type; |
| 120 return len; | 109 return len; |
| 121 } | 110 } |
| 122 | 111 |
| 123 | 112 |
| 124 // Returns true if str is a valid NUL-terminated UTF-8 string. | 113 // Returns true if str is a valid UTF-8 string. |
| 125 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { | 114 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { |
| 126 intptr_t i = 0; | 115 intptr_t i = 0; |
| 127 while (i < array_len) { | 116 while (i < array_len) { |
| 128 uint32_t ch = utf8_array[i] & 0xFF; | 117 uint32_t ch = utf8_array[i] & 0xFF; |
| 129 intptr_t j = 1; | 118 intptr_t j = 1; |
| 130 if (ch >= 0x80) { | 119 if (ch >= 0x80) { |
| 131 int8_t num_trail_bytes = kTrailBytes[ch]; | 120 int8_t num_trail_bytes = kTrailBytes[ch]; |
| 132 bool is_malformed = false; | 121 bool is_malformed = false; |
| 133 for (; j < num_trail_bytes; ++j) { | 122 for (; j < num_trail_bytes; ++j) { |
| 134 if ((i + j) < array_len) { | 123 if ((i + j) < array_len) { |
| 135 uint8_t code_unit = utf8_array[i + j]; | 124 uint8_t code_unit = utf8_array[i + j]; |
| 136 is_malformed |= !IsTrailByte(code_unit); | 125 is_malformed |= !IsTrailByte(code_unit); |
| 137 ch = (ch << 6) + code_unit; | 126 ch = (ch << 6) + code_unit; |
| 138 } else { | 127 } else { |
| 139 return false; | 128 return false; |
| 140 } | 129 } |
| 141 } | 130 } |
| 142 ch -= kMagicBits[num_trail_bytes]; | 131 ch -= kMagicBits[num_trail_bytes]; |
| 143 if (!((is_malformed == false) && | 132 if (!((is_malformed == false) && |
| 144 (j == num_trail_bytes) && | 133 (j == num_trail_bytes) && |
| 145 !IsOutOfRange(ch) && | 134 !IsOutOfRange(ch) && |
| 146 !IsNonShortestForm(ch, j) && | 135 !IsNonShortestForm(ch, j) && |
| 147 !IsSurrogate(ch))) { | 136 !Utf16::IsSurrogate(ch))) { |
| 148 return false; | 137 return false; |
| 149 } | 138 } |
| 150 } | 139 } |
| 151 i += j; | 140 i += j; |
| 152 } | 141 } |
| 153 return true; | 142 return true; |
| 154 } | 143 } |
| 155 | 144 |
| 156 | 145 |
| 157 intptr_t Utf8::Length(int32_t ch) { | 146 intptr_t Utf8::Length(int32_t ch) { |
| 158 if (ch <= kMaxOneByteChar) { | 147 if (ch <= kMaxOneByteChar) { |
| 159 return 1; | 148 return 1; |
| 160 } else if (ch <= kMaxTwoByteChar) { | 149 } else if (ch <= kMaxTwoByteChar) { |
| 161 return 2; | 150 return 2; |
| 162 } else if (ch <= kMaxThreeByteChar) { | 151 } else if (ch <= kMaxThreeByteChar) { |
| 163 return 3; | 152 return 3; |
| 164 } | 153 } |
| 165 ASSERT(ch <= kMaxFourByteChar); | 154 ASSERT(ch <= kMaxFourByteChar); |
| 166 return 4; | 155 return 4; |
| 167 } | 156 } |
| 168 | 157 |
| 169 | 158 |
| 170 intptr_t Utf8::Length(const String& str) { | 159 intptr_t Utf8::Length(const String& str) { |
| 171 intptr_t length = 0; | 160 intptr_t length = 0; |
| 172 for (intptr_t i = 0; i < str.Length(); ++i) { | 161 for (intptr_t i = 0; i < str.Length(); ++i) { |
| 173 int32_t ch = str.CharAt(i); | 162 int32_t ch = Utf16::CodePointAt(str, i); |
| 174 length += Utf8::Length(ch); | 163 length += Utf8::Length(ch); |
| 164 if (ch >= 0x10000) i++; // Surrogate pair in input |
| 175 } | 165 } |
| 176 return length; | 166 return length; |
| 177 } | 167 } |
| 178 | 168 |
| 179 | 169 |
| 180 intptr_t Utf8::Encode(int32_t ch, char* dst) { | 170 intptr_t Utf8::Encode(int32_t ch, char* dst) { |
| 181 static const int kMask = ~(1 << 6); | 171 static const int kMask = ~(1 << 6); |
| 182 if (ch <= kMaxOneByteChar) { | 172 if (ch <= kMaxOneByteChar) { |
| 183 dst[0] = ch; | 173 dst[0] = ch; |
| 184 return 1; | 174 return 1; |
| (...skipping 14 matching lines...) Expand all Loading... |
| 199 dst[1] = 0x80 | ((ch >> 12) & kMask); | 189 dst[1] = 0x80 | ((ch >> 12) & kMask); |
| 200 dst[2] = 0x80 | ((ch >> 6) & kMask); | 190 dst[2] = 0x80 | ((ch >> 6) & kMask); |
| 201 dst[3] = 0x80 | (ch & kMask); | 191 dst[3] = 0x80 | (ch & kMask); |
| 202 return 4; | 192 return 4; |
| 203 } | 193 } |
| 204 | 194 |
| 205 | 195 |
| 206 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { | 196 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { |
| 207 intptr_t pos = 0; | 197 intptr_t pos = 0; |
| 208 for (intptr_t i = 0; i < src.Length(); ++i) { | 198 for (intptr_t i = 0; i < src.Length(); ++i) { |
| 209 intptr_t ch = src.CharAt(i); | 199 intptr_t ch = Utf16::CodePointAt(src, i); |
| 210 intptr_t num_bytes = Utf8::Length(ch); | 200 intptr_t num_bytes = Utf8::Length(ch); |
| 211 if (pos + num_bytes > len) { | 201 if (pos + num_bytes > len) { |
| 212 break; | 202 break; |
| 213 } | 203 } |
| 214 Utf8::Encode(ch, &dst[pos]); | 204 Utf8::Encode(ch, &dst[pos]); |
| 215 pos += num_bytes; | 205 pos += num_bytes; |
| 206 if (num_bytes > 3) i++; // Surrogate pair in input. |
| 216 } | 207 } |
| 217 return pos; | 208 return pos; |
| 218 } | 209 } |
| 219 | 210 |
| 220 | 211 |
| 221 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 212 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
| 222 intptr_t array_len, | 213 intptr_t array_len, |
| 223 int32_t* dst) { | 214 int32_t* dst) { |
| 224 uint32_t ch = utf8_array[0] & 0xFF; | 215 int32_t ch = utf8_array[0] & 0xFF; |
| 225 intptr_t i = 1; | 216 intptr_t i = 1; |
| 226 if (ch >= 0x80) { | 217 if (ch >= 0x80) { |
| 227 int32_t num_trail_bytes = kTrailBytes[ch]; | 218 int32_t num_trail_bytes = kTrailBytes[ch]; |
| 228 bool is_malformed = false; | 219 bool is_malformed = false; |
| 229 for (; i < num_trail_bytes; ++i) { | 220 for (; i < num_trail_bytes; ++i) { |
| 230 if (i < array_len) { | 221 if (i < array_len) { |
| 231 uint8_t code_unit = utf8_array[i]; | 222 uint8_t code_unit = utf8_array[i]; |
| 232 is_malformed |= !IsTrailByte(code_unit); | 223 is_malformed |= !IsTrailByte(code_unit); |
| 233 ch = (ch << 6) + code_unit; | 224 ch = (ch << 6) + code_unit; |
| 234 } else { | 225 } else { |
| 235 *dst = -1; | 226 *dst = kInvalidCodePoint; |
| 236 return 0; | 227 return 0; |
| 237 } | 228 } |
| 238 } | 229 } |
| 239 ch -= kMagicBits[num_trail_bytes]; | 230 ch -= kMagicBits[num_trail_bytes]; |
| 240 if (!((is_malformed == false) && | 231 if (!((is_malformed == false) && |
| 241 (i == num_trail_bytes) && | 232 (i == num_trail_bytes) && |
| 242 !IsOutOfRange(ch) && | 233 !IsOutOfRange(ch) && |
| 243 !IsNonShortestForm(ch, i) && | 234 !IsNonShortestForm(ch, i) && |
| 244 !IsSurrogate(ch))) { | 235 !Utf16::IsSurrogate(ch))) { |
| 245 *dst = -1; | 236 *dst = kInvalidCodePoint; |
| 246 return 0; | 237 return 0; |
| 247 } | 238 } |
| 248 } | 239 } |
| 249 *dst = ch; | 240 *dst = ch; |
| 250 return i; | 241 return i; |
| 251 } | 242 } |
| 252 | 243 |
| 253 | 244 |
| 254 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, | 245 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, |
| 255 intptr_t array_len, | 246 intptr_t array_len, |
| (...skipping 23 matching lines...) Expand all Loading... |
| 279 intptr_t array_len, | 270 intptr_t array_len, |
| 280 uint16_t* dst, | 271 uint16_t* dst, |
| 281 intptr_t len) { | 272 intptr_t len) { |
| 282 intptr_t i = 0; | 273 intptr_t i = 0; |
| 283 intptr_t j = 0; | 274 intptr_t j = 0; |
| 284 intptr_t num_bytes; | 275 intptr_t num_bytes; |
| 285 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 276 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| 286 int32_t ch; | 277 int32_t ch; |
| 287 bool is_smp = IsSmpSequenceStart(utf8_array[i]); | 278 bool is_smp = IsSmpSequenceStart(utf8_array[i]); |
| 288 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 279 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| 289 if (ch == -1) { | 280 if (ch == kInvalidCodePoint) { |
| 290 return false; // invalid input | 281 return false; // invalid input |
| 291 } | 282 } |
| 292 if (is_smp) { | 283 if (is_smp) { |
| 293 ConvertUTF32ToUTF16(ch, &(dst[j])); | 284 dst[j] = Utf16::LeadFromCodePoint(ch); |
| 294 j = j + 1; | 285 dst[j + 1] = Utf16::TrailFromCodePoint(ch); |
| 286 ++j; |
| 295 } else { | 287 } else { |
| 296 dst[j] = ch; | 288 dst[j] = ch; |
| 297 } | 289 } |
| 298 } | 290 } |
| 299 if ((i < array_len) && (j == len)) { | 291 if ((i < array_len) && (j == len)) { |
| 300 return false; // output overflow | 292 return false; // output overflow |
| 301 } | 293 } |
| 302 return true; // success | 294 return true; // success |
| 303 } | 295 } |
| 304 | 296 |
| 305 | 297 |
| 306 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, | 298 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
| 307 intptr_t array_len, | 299 intptr_t array_len, |
| 308 uint32_t* dst, | 300 int32_t* dst, |
| 309 intptr_t len) { | 301 intptr_t len) { |
| 310 intptr_t i = 0; | 302 intptr_t i = 0; |
| 311 intptr_t j = 0; | 303 intptr_t j = 0; |
| 312 intptr_t num_bytes; | 304 intptr_t num_bytes; |
| 313 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 305 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| 314 int32_t ch; | 306 int32_t ch; |
| 315 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 307 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| 316 if (ch == -1) { | 308 if (ch == kInvalidCodePoint) { |
| 317 return false; // invalid input | 309 return false; // invalid input |
| 318 } | 310 } |
| 319 dst[j] = ch; | 311 dst[j] = ch; |
| 320 } | 312 } |
| 321 if ((i < array_len) && (j == len)) { | 313 if ((i < array_len) && (j == len)) { |
| 322 return false; // output overflow | 314 return false; // output overflow |
| 323 } | 315 } |
| 324 return true; // success | 316 return true; // success |
| 325 } | 317 } |
| 326 | 318 |
| 319 |
| 320 int32_t Utf16::CodePointAt(const String& str, int index) { |
| 321 int32_t code = str.CharAt(index); |
| 322 if (!IsLeadSurrogate(code)) return code; |
| 323 if (index + 1 == str.Length()) return code; |
| 324 int32_t trail = str.CharAt(index + 1); |
| 325 if (!IsTrailSurrogate(trail)) return code; |
| 326 return CodePointFromCodeUnits(code, trail); |
| 327 } |
| 328 |
| 327 } // namespace dart | 329 } // namespace dart |
| OLD | NEW |