| OLD | NEW |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
| 6 | 6 |
| 7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
| 8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
| 9 #include "vm/object.h" | 9 #include "vm/object.h" |
| 10 | 10 |
| 11 namespace dart { | 11 namespace dart { |
| 12 | 12 |
| 13 // clang-format off |
| 13 const int8_t Utf8::kTrailBytes[256] = { | 14 const int8_t Utf8::kTrailBytes[256] = { |
| 14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | 22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
| 22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | 26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
| 26 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 28 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
| 28 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | 29 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
| 29 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 | 30 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 |
| 30 }; | 31 }; |
| 32 // clang-format on |
| 31 | 33 |
| 32 | 34 const uint32_t Utf8::kMagicBits[7] = {0, // Padding. |
| 33 const uint32_t Utf8::kMagicBits[7] = { | 35 0x00000000, 0x00003080, 0x000E2080, |
| 34 0, // Padding. | 36 0x03C82080, 0xFA082080, 0x82082080}; |
| 35 0x00000000, | |
| 36 0x00003080, | |
| 37 0x000E2080, | |
| 38 0x03C82080, | |
| 39 0xFA082080, | |
| 40 0x82082080 | |
| 41 }; | |
| 42 | 37 |
| 43 | 38 |
| 44 // Minimum values of code points used to check shortest form. | 39 // Minimum values of code points used to check shortest form. |
| 45 const uint32_t Utf8::kOverlongMinimum[7] = { | 40 const uint32_t Utf8::kOverlongMinimum[7] = {0, // Padding. |
| 46 0, // Padding. | 41 0x0, 0x80, 0x800, |
| 47 0x0, | 42 0x10000, 0xFFFFFFFF, 0xFFFFFFFF}; |
| 48 0x80, | |
| 49 0x800, | |
| 50 0x10000, | |
| 51 0xFFFFFFFF, | |
| 52 0xFFFFFFFF | |
| 53 }; | |
| 54 | 43 |
| 55 | 44 |
| 56 // Returns the most restricted coding form in which the sequence of utf8 | 45 // Returns the most restricted coding form in which the sequence of utf8 |
| 57 // characters in 'utf8_array' can be represented in, and the number of | 46 // characters in 'utf8_array' can be represented in, and the number of |
| 58 // code units needed in that form. | 47 // code units needed in that form. |
| 59 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, | 48 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, |
| 60 intptr_t array_len, | 49 intptr_t array_len, |
| 61 Type* type) { | 50 Type* type) { |
| 62 intptr_t len = 0; | 51 intptr_t len = 0; |
| 63 Type char_type = kLatin1; | 52 Type char_type = kLatin1; |
| 64 for (intptr_t i = 0; i < array_len; i++) { | 53 for (intptr_t i = 0; i < array_len; i++) { |
| 65 uint8_t code_unit = utf8_array[i]; | 54 uint8_t code_unit = utf8_array[i]; |
| 66 if (!IsTrailByte(code_unit)) { | 55 if (!IsTrailByte(code_unit)) { |
| 67 ++len; | 56 ++len; |
| 68 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | 57 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF |
| 69 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 | 58 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 |
| 70 char_type = kSupplementary; | 59 char_type = kSupplementary; |
| 71 ++len; | 60 ++len; |
| 72 } else if (char_type == kLatin1) { | 61 } else if (char_type == kLatin1) { |
| 73 char_type = kBMP; | 62 char_type = kBMP; |
| 74 } | 63 } |
| 75 } | 64 } |
| 76 } | 65 } |
| 77 } | 66 } |
| 78 *type = char_type; | 67 *type = char_type; |
| (...skipping 13 matching lines...) Expand all Loading... |
| 92 for (; j < num_trail_bytes; ++j) { | 81 for (; j < num_trail_bytes; ++j) { |
| 93 if ((i + j) < array_len) { | 82 if ((i + j) < array_len) { |
| 94 uint8_t code_unit = utf8_array[i + j]; | 83 uint8_t code_unit = utf8_array[i + j]; |
| 95 is_malformed |= !IsTrailByte(code_unit); | 84 is_malformed |= !IsTrailByte(code_unit); |
| 96 ch = (ch << 6) + code_unit; | 85 ch = (ch << 6) + code_unit; |
| 97 } else { | 86 } else { |
| 98 return false; | 87 return false; |
| 99 } | 88 } |
| 100 } | 89 } |
| 101 ch -= kMagicBits[num_trail_bytes]; | 90 ch -= kMagicBits[num_trail_bytes]; |
| 102 if (!((is_malformed == false) && | 91 if (!((is_malformed == false) && (j == num_trail_bytes) && |
| 103 (j == num_trail_bytes) && | 92 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, j))) { |
| 104 !Utf::IsOutOfRange(ch) && | |
| 105 !IsNonShortestForm(ch, j))) { | |
| 106 return false; | 93 return false; |
| 107 } | 94 } |
| 108 } | 95 } |
| 109 i += j; | 96 i += j; |
| 110 } | 97 } |
| 111 return true; | 98 return true; |
| 112 } | 99 } |
| 113 | 100 |
| 114 | 101 |
| 115 intptr_t Utf8::Length(int32_t ch) { | 102 intptr_t Utf8::Length(int32_t ch) { |
| (...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 190 if (i < array_len) { | 177 if (i < array_len) { |
| 191 uint8_t code_unit = utf8_array[i]; | 178 uint8_t code_unit = utf8_array[i]; |
| 192 is_malformed |= !IsTrailByte(code_unit); | 179 is_malformed |= !IsTrailByte(code_unit); |
| 193 ch = (ch << 6) + code_unit; | 180 ch = (ch << 6) + code_unit; |
| 194 } else { | 181 } else { |
| 195 *dst = -1; | 182 *dst = -1; |
| 196 return 0; | 183 return 0; |
| 197 } | 184 } |
| 198 } | 185 } |
| 199 ch -= kMagicBits[num_trail_bytes]; | 186 ch -= kMagicBits[num_trail_bytes]; |
| 200 if (!((is_malformed == false) && | 187 if (!((is_malformed == false) && (i == num_trail_bytes) && |
| 201 (i == num_trail_bytes) && | 188 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, i))) { |
| 202 !Utf::IsOutOfRange(ch) && | |
| 203 !IsNonShortestForm(ch, i))) { | |
| 204 *dst = -1; | 189 *dst = -1; |
| 205 return 0; | 190 return 0; |
| 206 } | 191 } |
| 207 } | 192 } |
| 208 *dst = ch; | 193 *dst = ch; |
| 209 return i; | 194 return i; |
| 210 } | 195 } |
| 211 | 196 |
| 212 | 197 |
| 213 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, | 198 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, |
| (...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 293 | 278 |
| 294 | 279 |
| 295 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 280 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
| 296 ASSERT(codepoint > Utf16::kMaxCodeUnit); | 281 ASSERT(codepoint > Utf16::kMaxCodeUnit); |
| 297 ASSERT(dst != NULL); | 282 ASSERT(dst != NULL); |
| 298 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 283 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
| 299 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 284 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
| 300 } | 285 } |
| 301 | 286 |
| 302 } // namespace dart | 287 } // namespace dart |
| OLD | NEW |