| OLD | NEW |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
| 6 | 6 |
| 7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
| 8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
| 9 #include "vm/object.h" | 9 #include "vm/object.h" |
| 10 | 10 |
| (...skipping 17 matching lines...) Expand all Loading... |
| 28 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 28 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 29 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | 29 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 30 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 | 30 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 |
| 31 }; | 31 }; |
| 32 // clang-format on | 32 // clang-format on |
| 33 | 33 |
| 34 const uint32_t Utf8::kMagicBits[7] = {0, // Padding. | 34 const uint32_t Utf8::kMagicBits[7] = {0, // Padding. |
| 35 0x00000000, 0x00003080, 0x000E2080, | 35 0x00000000, 0x00003080, 0x000E2080, |
| 36 0x03C82080, 0xFA082080, 0x82082080}; | 36 0x03C82080, 0xFA082080, 0x82082080}; |
| 37 | 37 |
| 38 | |
| 39 // Minimum values of code points used to check shortest form. | 38 // Minimum values of code points used to check shortest form. |
| 40 const uint32_t Utf8::kOverlongMinimum[7] = {0, // Padding. | 39 const uint32_t Utf8::kOverlongMinimum[7] = {0, // Padding. |
| 41 0x0, 0x80, 0x800, | 40 0x0, 0x80, 0x800, |
| 42 0x10000, 0xFFFFFFFF, 0xFFFFFFFF}; | 41 0x10000, 0xFFFFFFFF, 0xFFFFFFFF}; |
| 43 | 42 |
| 44 | |
| 45 // Returns the most restricted coding form in which the sequence of utf8 | 43 // Returns the most restricted coding form in which the sequence of utf8 |
| 46 // characters in 'utf8_array' can be represented in, and the number of | 44 // characters in 'utf8_array' can be represented in, and the number of |
| 47 // code units needed in that form. | 45 // code units needed in that form. |
| 48 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, | 46 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, |
| 49 intptr_t array_len, | 47 intptr_t array_len, |
| 50 Type* type) { | 48 Type* type) { |
| 51 intptr_t len = 0; | 49 intptr_t len = 0; |
| 52 Type char_type = kLatin1; | 50 Type char_type = kLatin1; |
| 53 for (intptr_t i = 0; i < array_len; i++) { | 51 for (intptr_t i = 0; i < array_len; i++) { |
| 54 uint8_t code_unit = utf8_array[i]; | 52 uint8_t code_unit = utf8_array[i]; |
| 55 if (!IsTrailByte(code_unit)) { | 53 if (!IsTrailByte(code_unit)) { |
| 56 ++len; | 54 ++len; |
| 57 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | 55 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF |
| 58 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 | 56 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 |
| 59 char_type = kSupplementary; | 57 char_type = kSupplementary; |
| 60 ++len; | 58 ++len; |
| 61 } else if (char_type == kLatin1) { | 59 } else if (char_type == kLatin1) { |
| 62 char_type = kBMP; | 60 char_type = kBMP; |
| 63 } | 61 } |
| 64 } | 62 } |
| 65 } | 63 } |
| 66 } | 64 } |
| 67 *type = char_type; | 65 *type = char_type; |
| 68 return len; | 66 return len; |
| 69 } | 67 } |
| 70 | 68 |
| 71 | |
| 72 // Returns true if str is a valid NUL-terminated UTF-8 string. | 69 // Returns true if str is a valid NUL-terminated UTF-8 string. |
| 73 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { | 70 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { |
| 74 intptr_t i = 0; | 71 intptr_t i = 0; |
| 75 while (i < array_len) { | 72 while (i < array_len) { |
| 76 uint32_t ch = utf8_array[i] & 0xFF; | 73 uint32_t ch = utf8_array[i] & 0xFF; |
| 77 intptr_t j = 1; | 74 intptr_t j = 1; |
| 78 if (ch >= 0x80) { | 75 if (ch >= 0x80) { |
| 79 int8_t num_trail_bytes = kTrailBytes[ch]; | 76 int8_t num_trail_bytes = kTrailBytes[ch]; |
| 80 bool is_malformed = false; | 77 bool is_malformed = false; |
| 81 for (; j < num_trail_bytes; ++j) { | 78 for (; j < num_trail_bytes; ++j) { |
| 82 if ((i + j) < array_len) { | 79 if ((i + j) < array_len) { |
| 83 uint8_t code_unit = utf8_array[i + j]; | 80 uint8_t code_unit = utf8_array[i + j]; |
| 84 is_malformed |= !IsTrailByte(code_unit); | 81 is_malformed |= !IsTrailByte(code_unit); |
| 85 ch = (ch << 6) + code_unit; | 82 ch = (ch << 6) + code_unit; |
| 86 } else { | 83 } else { |
| 87 return false; | 84 return false; |
| 88 } | 85 } |
| 89 } | 86 } |
| 90 ch -= kMagicBits[num_trail_bytes]; | 87 ch -= kMagicBits[num_trail_bytes]; |
| 91 if (!((is_malformed == false) && (j == num_trail_bytes) && | 88 if (!((is_malformed == false) && (j == num_trail_bytes) && |
| 92 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, j))) { | 89 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, j))) { |
| 93 return false; | 90 return false; |
| 94 } | 91 } |
| 95 } | 92 } |
| 96 i += j; | 93 i += j; |
| 97 } | 94 } |
| 98 return true; | 95 return true; |
| 99 } | 96 } |
| 100 | 97 |
| 101 | |
| 102 intptr_t Utf8::Length(int32_t ch) { | 98 intptr_t Utf8::Length(int32_t ch) { |
| 103 if (ch <= kMaxOneByteChar) { | 99 if (ch <= kMaxOneByteChar) { |
| 104 return 1; | 100 return 1; |
| 105 } else if (ch <= kMaxTwoByteChar) { | 101 } else if (ch <= kMaxTwoByteChar) { |
| 106 return 2; | 102 return 2; |
| 107 } else if (ch <= kMaxThreeByteChar) { | 103 } else if (ch <= kMaxThreeByteChar) { |
| 108 return 3; | 104 return 3; |
| 109 } | 105 } |
| 110 ASSERT(ch <= kMaxFourByteChar); | 106 ASSERT(ch <= kMaxFourByteChar); |
| 111 return 4; | 107 return 4; |
| 112 } | 108 } |
| 113 | 109 |
| 114 | |
| 115 intptr_t Utf8::Length(const String& str) { | 110 intptr_t Utf8::Length(const String& str) { |
| 116 intptr_t length = 0; | 111 intptr_t length = 0; |
| 117 String::CodePointIterator it(str); | 112 String::CodePointIterator it(str); |
| 118 while (it.Next()) { | 113 while (it.Next()) { |
| 119 int32_t ch = it.Current(); | 114 int32_t ch = it.Current(); |
| 120 length += Utf8::Length(ch); | 115 length += Utf8::Length(ch); |
| 121 } | 116 } |
| 122 return length; | 117 return length; |
| 123 } | 118 } |
| 124 | 119 |
| 125 | |
| 126 intptr_t Utf8::Encode(int32_t ch, char* dst) { | 120 intptr_t Utf8::Encode(int32_t ch, char* dst) { |
| 127 static const int kMask = ~(1 << 6); | 121 static const int kMask = ~(1 << 6); |
| 128 if (ch <= kMaxOneByteChar) { | 122 if (ch <= kMaxOneByteChar) { |
| 129 dst[0] = ch; | 123 dst[0] = ch; |
| 130 return 1; | 124 return 1; |
| 131 } | 125 } |
| 132 if (ch <= kMaxTwoByteChar) { | 126 if (ch <= kMaxTwoByteChar) { |
| 133 dst[0] = 0xC0 | (ch >> 6); | 127 dst[0] = 0xC0 | (ch >> 6); |
| 134 dst[1] = 0x80 | (ch & kMask); | 128 dst[1] = 0x80 | (ch & kMask); |
| 135 return 2; | 129 return 2; |
| 136 } | 130 } |
| 137 if (ch <= kMaxThreeByteChar) { | 131 if (ch <= kMaxThreeByteChar) { |
| 138 dst[0] = 0xE0 | (ch >> 12); | 132 dst[0] = 0xE0 | (ch >> 12); |
| 139 dst[1] = 0x80 | ((ch >> 6) & kMask); | 133 dst[1] = 0x80 | ((ch >> 6) & kMask); |
| 140 dst[2] = 0x80 | (ch & kMask); | 134 dst[2] = 0x80 | (ch & kMask); |
| 141 return 3; | 135 return 3; |
| 142 } | 136 } |
| 143 ASSERT(ch <= kMaxFourByteChar); | 137 ASSERT(ch <= kMaxFourByteChar); |
| 144 dst[0] = 0xF0 | (ch >> 18); | 138 dst[0] = 0xF0 | (ch >> 18); |
| 145 dst[1] = 0x80 | ((ch >> 12) & kMask); | 139 dst[1] = 0x80 | ((ch >> 12) & kMask); |
| 146 dst[2] = 0x80 | ((ch >> 6) & kMask); | 140 dst[2] = 0x80 | ((ch >> 6) & kMask); |
| 147 dst[3] = 0x80 | (ch & kMask); | 141 dst[3] = 0x80 | (ch & kMask); |
| 148 return 4; | 142 return 4; |
| 149 } | 143 } |
| 150 | 144 |
| 151 | |
| 152 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { | 145 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { |
| 153 intptr_t pos = 0; | 146 intptr_t pos = 0; |
| 154 String::CodePointIterator it(src); | 147 String::CodePointIterator it(src); |
| 155 while (it.Next()) { | 148 while (it.Next()) { |
| 156 int32_t ch = it.Current(); | 149 int32_t ch = it.Current(); |
| 157 intptr_t num_bytes = Utf8::Length(ch); | 150 intptr_t num_bytes = Utf8::Length(ch); |
| 158 if (pos + num_bytes > len) { | 151 if (pos + num_bytes > len) { |
| 159 break; | 152 break; |
| 160 } | 153 } |
| 161 Utf8::Encode(ch, &dst[pos]); | 154 Utf8::Encode(ch, &dst[pos]); |
| 162 pos += num_bytes; | 155 pos += num_bytes; |
| 163 } | 156 } |
| 164 return pos; | 157 return pos; |
| 165 } | 158 } |
| 166 | 159 |
| 167 | |
| 168 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 160 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
| 169 intptr_t array_len, | 161 intptr_t array_len, |
| 170 int32_t* dst) { | 162 int32_t* dst) { |
| 171 uint32_t ch = utf8_array[0] & 0xFF; | 163 uint32_t ch = utf8_array[0] & 0xFF; |
| 172 intptr_t i = 1; | 164 intptr_t i = 1; |
| 173 if (ch >= 0x80) { | 165 if (ch >= 0x80) { |
| 174 intptr_t num_trail_bytes = kTrailBytes[ch]; | 166 intptr_t num_trail_bytes = kTrailBytes[ch]; |
| 175 bool is_malformed = false; | 167 bool is_malformed = false; |
| 176 for (; i < num_trail_bytes; ++i) { | 168 for (; i < num_trail_bytes; ++i) { |
| 177 if (i < array_len) { | 169 if (i < array_len) { |
| 178 uint8_t code_unit = utf8_array[i]; | 170 uint8_t code_unit = utf8_array[i]; |
| 179 is_malformed |= !IsTrailByte(code_unit); | 171 is_malformed |= !IsTrailByte(code_unit); |
| 180 ch = (ch << 6) + code_unit; | 172 ch = (ch << 6) + code_unit; |
| 181 } else { | 173 } else { |
| 182 *dst = -1; | 174 *dst = -1; |
| 183 return 0; | 175 return 0; |
| 184 } | 176 } |
| 185 } | 177 } |
| 186 ch -= kMagicBits[num_trail_bytes]; | 178 ch -= kMagicBits[num_trail_bytes]; |
| 187 if (!((is_malformed == false) && (i == num_trail_bytes) && | 179 if (!((is_malformed == false) && (i == num_trail_bytes) && |
| 188 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, i))) { | 180 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, i))) { |
| 189 *dst = -1; | 181 *dst = -1; |
| 190 return 0; | 182 return 0; |
| 191 } | 183 } |
| 192 } | 184 } |
| 193 *dst = ch; | 185 *dst = ch; |
| 194 return i; | 186 return i; |
| 195 } | 187 } |
| 196 | 188 |
| 197 | |
| 198 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, | 189 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, |
| 199 intptr_t array_len, | 190 intptr_t array_len, |
| 200 uint8_t* dst, | 191 uint8_t* dst, |
| 201 intptr_t len) { | 192 intptr_t len) { |
| 202 intptr_t i = 0; | 193 intptr_t i = 0; |
| 203 intptr_t j = 0; | 194 intptr_t j = 0; |
| 204 intptr_t num_bytes; | 195 intptr_t num_bytes; |
| 205 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 196 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| 206 int32_t ch; | 197 int32_t ch; |
| 207 ASSERT(IsLatin1SequenceStart(utf8_array[i])); | 198 ASSERT(IsLatin1SequenceStart(utf8_array[i])); |
| 208 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 199 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| 209 if (ch == -1) { | 200 if (ch == -1) { |
| 210 return false; // Invalid input. | 201 return false; // Invalid input. |
| 211 } | 202 } |
| 212 ASSERT(Utf::IsLatin1(ch)); | 203 ASSERT(Utf::IsLatin1(ch)); |
| 213 dst[j] = ch; | 204 dst[j] = ch; |
| 214 } | 205 } |
| 215 if ((i < array_len) && (j == len)) { | 206 if ((i < array_len) && (j == len)) { |
| 216 return false; // Output overflow. | 207 return false; // Output overflow. |
| 217 } | 208 } |
| 218 return true; // Success. | 209 return true; // Success. |
| 219 } | 210 } |
| 220 | 211 |
| 221 | |
| 222 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, | 212 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
| 223 intptr_t array_len, | 213 intptr_t array_len, |
| 224 uint16_t* dst, | 214 uint16_t* dst, |
| 225 intptr_t len) { | 215 intptr_t len) { |
| 226 intptr_t i = 0; | 216 intptr_t i = 0; |
| 227 intptr_t j = 0; | 217 intptr_t j = 0; |
| 228 intptr_t num_bytes; | 218 intptr_t num_bytes; |
| 229 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 219 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| 230 int32_t ch; | 220 int32_t ch; |
| 231 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); | 221 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); |
| 232 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 222 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| 233 if (ch == -1) { | 223 if (ch == -1) { |
| 234 return false; // Invalid input. | 224 return false; // Invalid input. |
| 235 } | 225 } |
| 236 if (is_supplementary) { | 226 if (is_supplementary) { |
| 237 Utf16::Encode(ch, &dst[j]); | 227 Utf16::Encode(ch, &dst[j]); |
| 238 j = j + 1; | 228 j = j + 1; |
| 239 } else { | 229 } else { |
| 240 dst[j] = ch; | 230 dst[j] = ch; |
| 241 } | 231 } |
| 242 } | 232 } |
| 243 if ((i < array_len) && (j == len)) { | 233 if ((i < array_len) && (j == len)) { |
| 244 return false; // Output overflow. | 234 return false; // Output overflow. |
| 245 } | 235 } |
| 246 return true; // Success. | 236 return true; // Success. |
| 247 } | 237 } |
| 248 | 238 |
| 249 | |
| 250 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, | 239 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
| 251 intptr_t array_len, | 240 intptr_t array_len, |
| 252 int32_t* dst, | 241 int32_t* dst, |
| 253 intptr_t len) { | 242 intptr_t len) { |
| 254 intptr_t i = 0; | 243 intptr_t i = 0; |
| 255 intptr_t j = 0; | 244 intptr_t j = 0; |
| 256 intptr_t num_bytes; | 245 intptr_t num_bytes; |
| 257 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 246 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
| 258 int32_t ch; | 247 int32_t ch; |
| 259 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 248 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
| 260 if (ch == -1) { | 249 if (ch == -1) { |
| 261 return false; // Invalid input. | 250 return false; // Invalid input. |
| 262 } | 251 } |
| 263 dst[j] = ch; | 252 dst[j] = ch; |
| 264 } | 253 } |
| 265 if ((i < array_len) && (j == len)) { | 254 if ((i < array_len) && (j == len)) { |
| 266 return false; // Output overflow. | 255 return false; // Output overflow. |
| 267 } | 256 } |
| 268 return true; // Success. | 257 return true; // Success. |
| 269 } | 258 } |
| 270 | 259 |
| 271 | |
| 272 bool Utf8::DecodeCStringToUTF32(const char* str, int32_t* dst, intptr_t len) { | 260 bool Utf8::DecodeCStringToUTF32(const char* str, int32_t* dst, intptr_t len) { |
| 273 ASSERT(str != NULL); | 261 ASSERT(str != NULL); |
| 274 intptr_t array_len = strlen(str); | 262 intptr_t array_len = strlen(str); |
| 275 const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str); | 263 const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str); |
| 276 return Utf8::DecodeToUTF32(utf8_array, array_len, dst, len); | 264 return Utf8::DecodeToUTF32(utf8_array, array_len, dst, len); |
| 277 } | 265 } |
| 278 | 266 |
| 279 | |
| 280 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 267 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
| 281 ASSERT(codepoint > Utf16::kMaxCodeUnit); | 268 ASSERT(codepoint > Utf16::kMaxCodeUnit); |
| 282 ASSERT(dst != NULL); | 269 ASSERT(dst != NULL); |
| 283 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 270 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
| 284 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 271 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
| 285 } | 272 } |
| 286 | 273 |
| 287 } // namespace dart | 274 } // namespace dart |
| OLD | NEW |