| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | |
| 2 // for details. All rights reserved. Use of this source code is governed by a | |
| 3 // BSD-style license that can be found in the LICENSE file. | |
| 4 | |
| 5 #include "unicode.h" | |
| 6 | |
| 7 const int8_t Utf8::kTrailBytes[256] = { | |
| 8 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 9 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 10 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 11 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 12 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 13 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | |
| 16 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 17 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 19 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
| 20 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
| 21 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | |
| 22 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | |
| 23 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 | |
| 24 }; | |
| 25 | |
| 26 | |
| 27 const uint32_t Utf8::kMagicBits[7] = { | |
| 28 0, // Padding. | |
| 29 0x00000000, | |
| 30 0x00003080, | |
| 31 0x000E2080, | |
| 32 0x03C82080, | |
| 33 0xFA082080, | |
| 34 0x82082080 | |
| 35 }; | |
| 36 | |
| 37 | |
| 38 // Minimum values of code points used to check shortest form. | |
| 39 const uint32_t Utf8::kOverlongMinimum[7] = { | |
| 40 0, // Padding. | |
| 41 0x0, | |
| 42 0x80, | |
| 43 0x800, | |
| 44 0x10000, | |
| 45 0xFFFFFFFF, | |
| 46 0xFFFFFFFF | |
| 47 }; | |
| 48 | |
| 49 class CodePointIterator { | |
| 50 public: | |
| 51 explicit CodePointIterator(List<uint16_t> str) | |
| 52 : str_(str), | |
| 53 ch_(0), | |
| 54 index_(-1), | |
| 55 end_(str.length()) { | |
| 56 } | |
| 57 | |
| 58 int32_t Current() const { | |
| 59 return ch_; | |
| 60 } | |
| 61 | |
| 62 bool Next() { | |
| 63 intptr_t length = Utf16::Length(ch_); | |
| 64 if (index_ < (end_ - length)) { | |
| 65 index_ += length; | |
| 66 ch_ = str_[index_]; | |
| 67 if (Utf16::IsLeadSurrogate(ch_) && (index_ < (end_ - 1))) { | |
| 68 int32_t ch2 = str_[index_ + 1]; | |
| 69 if (Utf16::IsTrailSurrogate(ch2)) { | |
| 70 ch_ = Utf16::Decode(ch_, ch2); | |
| 71 } | |
| 72 } | |
| 73 return true; | |
| 74 } | |
| 75 index_ = end_; | |
| 76 return false; | |
| 77 } | |
| 78 | |
| 79 private: | |
| 80 List<uint16_t> str_; | |
| 81 int32_t ch_; | |
| 82 intptr_t index_; | |
| 83 intptr_t end_; | |
| 84 }; | |
| 85 | |
| 86 // Returns the most restricted coding form in which the sequence of utf8 | |
| 87 // characters in 'utf8_array' can be represented in, and the number of | |
| 88 // code units needed in that form. | |
| 89 intptr_t Utf8::CodeUnitCount(const char* utf8_array, | |
| 90 intptr_t array_len, | |
| 91 Type* type) { | |
| 92 intptr_t len = 0; | |
| 93 Type char_type = kLatin1; | |
| 94 for (intptr_t i = 0; i < array_len; i++) { | |
| 95 uint8_t code_unit = utf8_array[i]; | |
| 96 if (!IsTrailByte(code_unit)) { | |
| 97 ++len; | |
| 98 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | |
| 99 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 | |
| 100 char_type = kSupplementary; | |
| 101 ++len; | |
| 102 } else if (char_type == kLatin1) { | |
| 103 char_type = kBMP; | |
| 104 } | |
| 105 } | |
| 106 } | |
| 107 } | |
| 108 *type = char_type; | |
| 109 return len; | |
| 110 } | |
| 111 | |
| 112 intptr_t Utf8::Length(int32_t ch) { | |
| 113 if (ch <= kMaxOneByteChar) { | |
| 114 return 1; | |
| 115 } else if (ch <= kMaxTwoByteChar) { | |
| 116 return 2; | |
| 117 } else if (ch <= kMaxThreeByteChar) { | |
| 118 return 3; | |
| 119 } | |
| 120 return 4; | |
| 121 } | |
| 122 | |
| 123 intptr_t Utf8::Length(List<uint16_t> str) { | |
| 124 intptr_t length = 0; | |
| 125 CodePointIterator it(str); | |
| 126 while (it.Next()) { | |
| 127 int32_t ch = it.Current(); | |
| 128 length += Utf8::Length(ch); | |
| 129 } | |
| 130 return length; | |
| 131 } | |
| 132 | |
| 133 intptr_t Utf8::Encode(int32_t ch, char* dst) { | |
| 134 static const int kMask = ~(1 << 6); | |
| 135 if (ch <= kMaxOneByteChar) { | |
| 136 dst[0] = ch; | |
| 137 return 1; | |
| 138 } | |
| 139 if (ch <= kMaxTwoByteChar) { | |
| 140 dst[0] = 0xC0 | (ch >> 6); | |
| 141 dst[1] = 0x80 | (ch & kMask); | |
| 142 return 2; | |
| 143 } | |
| 144 if (ch <= kMaxThreeByteChar) { | |
| 145 dst[0] = 0xE0 | (ch >> 12); | |
| 146 dst[1] = 0x80 | ((ch >> 6) & kMask); | |
| 147 dst[2] = 0x80 | (ch & kMask); | |
| 148 return 3; | |
| 149 } | |
| 150 dst[0] = 0xF0 | (ch >> 18); | |
| 151 dst[1] = 0x80 | ((ch >> 12) & kMask); | |
| 152 dst[2] = 0x80 | ((ch >> 6) & kMask); | |
| 153 dst[3] = 0x80 | (ch & kMask); | |
| 154 return 4; | |
| 155 } | |
| 156 | |
| 157 intptr_t Utf8::Encode(List<uint16_t> src, char* dst, intptr_t len) { | |
| 158 intptr_t pos = 0; | |
| 159 CodePointIterator it(src); | |
| 160 while (it.Next()) { | |
| 161 int32_t ch = it.Current(); | |
| 162 intptr_t num_bytes = Utf8::Length(ch); | |
| 163 if (pos + num_bytes > len) { | |
| 164 break; | |
| 165 } | |
| 166 Utf8::Encode(ch, &dst[pos]); | |
| 167 pos += num_bytes; | |
| 168 } | |
| 169 return pos; | |
| 170 } | |
| 171 | |
| 172 intptr_t Utf8::Decode(const char* utf8_array, | |
| 173 intptr_t array_len, | |
| 174 int32_t* dst) { | |
| 175 uint32_t ch = utf8_array[0] & 0xFF; | |
| 176 intptr_t i = 1; | |
| 177 if (ch >= 0x80) { | |
| 178 intptr_t num_trail_bytes = kTrailBytes[ch]; | |
| 179 bool is_malformed = false; | |
| 180 for (; i < num_trail_bytes; ++i) { | |
| 181 if (i < array_len) { | |
| 182 uint8_t code_unit = utf8_array[i]; | |
| 183 is_malformed |= !IsTrailByte(code_unit); | |
| 184 ch = (ch << 6) + code_unit; | |
| 185 } else { | |
| 186 *dst = -1; | |
| 187 return 0; | |
| 188 } | |
| 189 } | |
| 190 ch -= kMagicBits[num_trail_bytes]; | |
| 191 if (!((is_malformed == false) && | |
| 192 (i == num_trail_bytes) && | |
| 193 !Utf::IsOutOfRange(ch) && | |
| 194 !IsNonShortestForm(ch, i) && | |
| 195 !Utf16::IsSurrogate(ch))) { | |
| 196 *dst = -1; | |
| 197 return 0; | |
| 198 } | |
| 199 } | |
| 200 *dst = ch; | |
| 201 return i; | |
| 202 } | |
| 203 | |
| 204 bool Utf8::DecodeToUTF16(const char* utf8_array, | |
| 205 intptr_t array_len, | |
| 206 uint16_t* dst, | |
| 207 intptr_t len) { | |
| 208 intptr_t i = 0; | |
| 209 intptr_t j = 0; | |
| 210 intptr_t num_bytes; | |
| 211 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | |
| 212 int32_t ch; | |
| 213 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); | |
| 214 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | |
| 215 if (ch == -1) { | |
| 216 return false; // Invalid input. | |
| 217 } | |
| 218 if (is_supplementary) { | |
| 219 Utf16::Encode(ch, &dst[j]); | |
| 220 j = j + 1; | |
| 221 } else { | |
| 222 dst[j] = ch; | |
| 223 } | |
| 224 } | |
| 225 if ((i < array_len) && (j == len)) { | |
| 226 return false; // Output overflow. | |
| 227 } | |
| 228 return true; // Success. | |
| 229 } | |
| 230 | |
| 231 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | |
| 232 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | |
| 233 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | |
| 234 } | |
| OLD | NEW |