OLD | NEW |
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 #include "vm/unicode.h" | 5 #include "vm/unicode.h" |
6 | 6 |
7 #include "vm/allocation.h" | 7 #include "vm/allocation.h" |
8 #include "vm/globals.h" | 8 #include "vm/globals.h" |
9 #include "vm/object.h" | 9 #include "vm/object.h" |
10 | 10 |
(...skipping 17 matching lines...) Expand all Loading... |
28 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | 28 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
29 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, | 29 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
30 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 | 30 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 |
31 }; | 31 }; |
32 // clang-format on | 32 // clang-format on |
33 | 33 |
34 const uint32_t Utf8::kMagicBits[7] = {0, // Padding. | 34 const uint32_t Utf8::kMagicBits[7] = {0, // Padding. |
35 0x00000000, 0x00003080, 0x000E2080, | 35 0x00000000, 0x00003080, 0x000E2080, |
36 0x03C82080, 0xFA082080, 0x82082080}; | 36 0x03C82080, 0xFA082080, 0x82082080}; |
37 | 37 |
38 | |
39 // Minimum values of code points used to check shortest form. | 38 // Minimum values of code points used to check shortest form. |
40 const uint32_t Utf8::kOverlongMinimum[7] = {0, // Padding. | 39 const uint32_t Utf8::kOverlongMinimum[7] = {0, // Padding. |
41 0x0, 0x80, 0x800, | 40 0x0, 0x80, 0x800, |
42 0x10000, 0xFFFFFFFF, 0xFFFFFFFF}; | 41 0x10000, 0xFFFFFFFF, 0xFFFFFFFF}; |
43 | 42 |
44 | |
45 // Returns the most restricted coding form in which the sequence of utf8 | 43 // Returns the most restricted coding form in which the sequence of utf8 |
46 // characters in 'utf8_array' can be represented in, and the number of | 44 // characters in 'utf8_array' can be represented in, and the number of |
47 // code units needed in that form. | 45 // code units needed in that form. |
48 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, | 46 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array, |
49 intptr_t array_len, | 47 intptr_t array_len, |
50 Type* type) { | 48 Type* type) { |
51 intptr_t len = 0; | 49 intptr_t len = 0; |
52 Type char_type = kLatin1; | 50 Type char_type = kLatin1; |
53 for (intptr_t i = 0; i < array_len; i++) { | 51 for (intptr_t i = 0; i < array_len; i++) { |
54 uint8_t code_unit = utf8_array[i]; | 52 uint8_t code_unit = utf8_array[i]; |
55 if (!IsTrailByte(code_unit)) { | 53 if (!IsTrailByte(code_unit)) { |
56 ++len; | 54 ++len; |
57 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF | 55 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF |
58 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 | 56 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 |
59 char_type = kSupplementary; | 57 char_type = kSupplementary; |
60 ++len; | 58 ++len; |
61 } else if (char_type == kLatin1) { | 59 } else if (char_type == kLatin1) { |
62 char_type = kBMP; | 60 char_type = kBMP; |
63 } | 61 } |
64 } | 62 } |
65 } | 63 } |
66 } | 64 } |
67 *type = char_type; | 65 *type = char_type; |
68 return len; | 66 return len; |
69 } | 67 } |
70 | 68 |
71 | |
72 // Returns true if str is a valid NUL-terminated UTF-8 string. | 69 // Returns true if str is a valid NUL-terminated UTF-8 string. |
73 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { | 70 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { |
74 intptr_t i = 0; | 71 intptr_t i = 0; |
75 while (i < array_len) { | 72 while (i < array_len) { |
76 uint32_t ch = utf8_array[i] & 0xFF; | 73 uint32_t ch = utf8_array[i] & 0xFF; |
77 intptr_t j = 1; | 74 intptr_t j = 1; |
78 if (ch >= 0x80) { | 75 if (ch >= 0x80) { |
79 int8_t num_trail_bytes = kTrailBytes[ch]; | 76 int8_t num_trail_bytes = kTrailBytes[ch]; |
80 bool is_malformed = false; | 77 bool is_malformed = false; |
81 for (; j < num_trail_bytes; ++j) { | 78 for (; j < num_trail_bytes; ++j) { |
82 if ((i + j) < array_len) { | 79 if ((i + j) < array_len) { |
83 uint8_t code_unit = utf8_array[i + j]; | 80 uint8_t code_unit = utf8_array[i + j]; |
84 is_malformed |= !IsTrailByte(code_unit); | 81 is_malformed |= !IsTrailByte(code_unit); |
85 ch = (ch << 6) + code_unit; | 82 ch = (ch << 6) + code_unit; |
86 } else { | 83 } else { |
87 return false; | 84 return false; |
88 } | 85 } |
89 } | 86 } |
90 ch -= kMagicBits[num_trail_bytes]; | 87 ch -= kMagicBits[num_trail_bytes]; |
91 if (!((is_malformed == false) && (j == num_trail_bytes) && | 88 if (!((is_malformed == false) && (j == num_trail_bytes) && |
92 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, j))) { | 89 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, j))) { |
93 return false; | 90 return false; |
94 } | 91 } |
95 } | 92 } |
96 i += j; | 93 i += j; |
97 } | 94 } |
98 return true; | 95 return true; |
99 } | 96 } |
100 | 97 |
101 | |
102 intptr_t Utf8::Length(int32_t ch) { | 98 intptr_t Utf8::Length(int32_t ch) { |
103 if (ch <= kMaxOneByteChar) { | 99 if (ch <= kMaxOneByteChar) { |
104 return 1; | 100 return 1; |
105 } else if (ch <= kMaxTwoByteChar) { | 101 } else if (ch <= kMaxTwoByteChar) { |
106 return 2; | 102 return 2; |
107 } else if (ch <= kMaxThreeByteChar) { | 103 } else if (ch <= kMaxThreeByteChar) { |
108 return 3; | 104 return 3; |
109 } | 105 } |
110 ASSERT(ch <= kMaxFourByteChar); | 106 ASSERT(ch <= kMaxFourByteChar); |
111 return 4; | 107 return 4; |
112 } | 108 } |
113 | 109 |
114 | |
115 intptr_t Utf8::Length(const String& str) { | 110 intptr_t Utf8::Length(const String& str) { |
116 intptr_t length = 0; | 111 intptr_t length = 0; |
117 String::CodePointIterator it(str); | 112 String::CodePointIterator it(str); |
118 while (it.Next()) { | 113 while (it.Next()) { |
119 int32_t ch = it.Current(); | 114 int32_t ch = it.Current(); |
120 length += Utf8::Length(ch); | 115 length += Utf8::Length(ch); |
121 } | 116 } |
122 return length; | 117 return length; |
123 } | 118 } |
124 | 119 |
125 | |
126 intptr_t Utf8::Encode(int32_t ch, char* dst) { | 120 intptr_t Utf8::Encode(int32_t ch, char* dst) { |
127 static const int kMask = ~(1 << 6); | 121 static const int kMask = ~(1 << 6); |
128 if (ch <= kMaxOneByteChar) { | 122 if (ch <= kMaxOneByteChar) { |
129 dst[0] = ch; | 123 dst[0] = ch; |
130 return 1; | 124 return 1; |
131 } | 125 } |
132 if (ch <= kMaxTwoByteChar) { | 126 if (ch <= kMaxTwoByteChar) { |
133 dst[0] = 0xC0 | (ch >> 6); | 127 dst[0] = 0xC0 | (ch >> 6); |
134 dst[1] = 0x80 | (ch & kMask); | 128 dst[1] = 0x80 | (ch & kMask); |
135 return 2; | 129 return 2; |
136 } | 130 } |
137 if (ch <= kMaxThreeByteChar) { | 131 if (ch <= kMaxThreeByteChar) { |
138 dst[0] = 0xE0 | (ch >> 12); | 132 dst[0] = 0xE0 | (ch >> 12); |
139 dst[1] = 0x80 | ((ch >> 6) & kMask); | 133 dst[1] = 0x80 | ((ch >> 6) & kMask); |
140 dst[2] = 0x80 | (ch & kMask); | 134 dst[2] = 0x80 | (ch & kMask); |
141 return 3; | 135 return 3; |
142 } | 136 } |
143 ASSERT(ch <= kMaxFourByteChar); | 137 ASSERT(ch <= kMaxFourByteChar); |
144 dst[0] = 0xF0 | (ch >> 18); | 138 dst[0] = 0xF0 | (ch >> 18); |
145 dst[1] = 0x80 | ((ch >> 12) & kMask); | 139 dst[1] = 0x80 | ((ch >> 12) & kMask); |
146 dst[2] = 0x80 | ((ch >> 6) & kMask); | 140 dst[2] = 0x80 | ((ch >> 6) & kMask); |
147 dst[3] = 0x80 | (ch & kMask); | 141 dst[3] = 0x80 | (ch & kMask); |
148 return 4; | 142 return 4; |
149 } | 143 } |
150 | 144 |
151 | |
152 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { | 145 intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) { |
153 intptr_t pos = 0; | 146 intptr_t pos = 0; |
154 String::CodePointIterator it(src); | 147 String::CodePointIterator it(src); |
155 while (it.Next()) { | 148 while (it.Next()) { |
156 int32_t ch = it.Current(); | 149 int32_t ch = it.Current(); |
157 intptr_t num_bytes = Utf8::Length(ch); | 150 intptr_t num_bytes = Utf8::Length(ch); |
158 if (pos + num_bytes > len) { | 151 if (pos + num_bytes > len) { |
159 break; | 152 break; |
160 } | 153 } |
161 Utf8::Encode(ch, &dst[pos]); | 154 Utf8::Encode(ch, &dst[pos]); |
162 pos += num_bytes; | 155 pos += num_bytes; |
163 } | 156 } |
164 return pos; | 157 return pos; |
165 } | 158 } |
166 | 159 |
167 | |
168 intptr_t Utf8::Decode(const uint8_t* utf8_array, | 160 intptr_t Utf8::Decode(const uint8_t* utf8_array, |
169 intptr_t array_len, | 161 intptr_t array_len, |
170 int32_t* dst) { | 162 int32_t* dst) { |
171 uint32_t ch = utf8_array[0] & 0xFF; | 163 uint32_t ch = utf8_array[0] & 0xFF; |
172 intptr_t i = 1; | 164 intptr_t i = 1; |
173 if (ch >= 0x80) { | 165 if (ch >= 0x80) { |
174 intptr_t num_trail_bytes = kTrailBytes[ch]; | 166 intptr_t num_trail_bytes = kTrailBytes[ch]; |
175 bool is_malformed = false; | 167 bool is_malformed = false; |
176 for (; i < num_trail_bytes; ++i) { | 168 for (; i < num_trail_bytes; ++i) { |
177 if (i < array_len) { | 169 if (i < array_len) { |
178 uint8_t code_unit = utf8_array[i]; | 170 uint8_t code_unit = utf8_array[i]; |
179 is_malformed |= !IsTrailByte(code_unit); | 171 is_malformed |= !IsTrailByte(code_unit); |
180 ch = (ch << 6) + code_unit; | 172 ch = (ch << 6) + code_unit; |
181 } else { | 173 } else { |
182 *dst = -1; | 174 *dst = -1; |
183 return 0; | 175 return 0; |
184 } | 176 } |
185 } | 177 } |
186 ch -= kMagicBits[num_trail_bytes]; | 178 ch -= kMagicBits[num_trail_bytes]; |
187 if (!((is_malformed == false) && (i == num_trail_bytes) && | 179 if (!((is_malformed == false) && (i == num_trail_bytes) && |
188 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, i))) { | 180 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, i))) { |
189 *dst = -1; | 181 *dst = -1; |
190 return 0; | 182 return 0; |
191 } | 183 } |
192 } | 184 } |
193 *dst = ch; | 185 *dst = ch; |
194 return i; | 186 return i; |
195 } | 187 } |
196 | 188 |
197 | |
198 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, | 189 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, |
199 intptr_t array_len, | 190 intptr_t array_len, |
200 uint8_t* dst, | 191 uint8_t* dst, |
201 intptr_t len) { | 192 intptr_t len) { |
202 intptr_t i = 0; | 193 intptr_t i = 0; |
203 intptr_t j = 0; | 194 intptr_t j = 0; |
204 intptr_t num_bytes; | 195 intptr_t num_bytes; |
205 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 196 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
206 int32_t ch; | 197 int32_t ch; |
207 ASSERT(IsLatin1SequenceStart(utf8_array[i])); | 198 ASSERT(IsLatin1SequenceStart(utf8_array[i])); |
208 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 199 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
209 if (ch == -1) { | 200 if (ch == -1) { |
210 return false; // Invalid input. | 201 return false; // Invalid input. |
211 } | 202 } |
212 ASSERT(Utf::IsLatin1(ch)); | 203 ASSERT(Utf::IsLatin1(ch)); |
213 dst[j] = ch; | 204 dst[j] = ch; |
214 } | 205 } |
215 if ((i < array_len) && (j == len)) { | 206 if ((i < array_len) && (j == len)) { |
216 return false; // Output overflow. | 207 return false; // Output overflow. |
217 } | 208 } |
218 return true; // Success. | 209 return true; // Success. |
219 } | 210 } |
220 | 211 |
221 | |
222 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, | 212 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, |
223 intptr_t array_len, | 213 intptr_t array_len, |
224 uint16_t* dst, | 214 uint16_t* dst, |
225 intptr_t len) { | 215 intptr_t len) { |
226 intptr_t i = 0; | 216 intptr_t i = 0; |
227 intptr_t j = 0; | 217 intptr_t j = 0; |
228 intptr_t num_bytes; | 218 intptr_t num_bytes; |
229 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 219 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
230 int32_t ch; | 220 int32_t ch; |
231 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); | 221 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); |
232 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 222 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
233 if (ch == -1) { | 223 if (ch == -1) { |
234 return false; // Invalid input. | 224 return false; // Invalid input. |
235 } | 225 } |
236 if (is_supplementary) { | 226 if (is_supplementary) { |
237 Utf16::Encode(ch, &dst[j]); | 227 Utf16::Encode(ch, &dst[j]); |
238 j = j + 1; | 228 j = j + 1; |
239 } else { | 229 } else { |
240 dst[j] = ch; | 230 dst[j] = ch; |
241 } | 231 } |
242 } | 232 } |
243 if ((i < array_len) && (j == len)) { | 233 if ((i < array_len) && (j == len)) { |
244 return false; // Output overflow. | 234 return false; // Output overflow. |
245 } | 235 } |
246 return true; // Success. | 236 return true; // Success. |
247 } | 237 } |
248 | 238 |
249 | |
250 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, | 239 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, |
251 intptr_t array_len, | 240 intptr_t array_len, |
252 int32_t* dst, | 241 int32_t* dst, |
253 intptr_t len) { | 242 intptr_t len) { |
254 intptr_t i = 0; | 243 intptr_t i = 0; |
255 intptr_t j = 0; | 244 intptr_t j = 0; |
256 intptr_t num_bytes; | 245 intptr_t num_bytes; |
257 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { | 246 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { |
258 int32_t ch; | 247 int32_t ch; |
259 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); | 248 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); |
260 if (ch == -1) { | 249 if (ch == -1) { |
261 return false; // Invalid input. | 250 return false; // Invalid input. |
262 } | 251 } |
263 dst[j] = ch; | 252 dst[j] = ch; |
264 } | 253 } |
265 if ((i < array_len) && (j == len)) { | 254 if ((i < array_len) && (j == len)) { |
266 return false; // Output overflow. | 255 return false; // Output overflow. |
267 } | 256 } |
268 return true; // Success. | 257 return true; // Success. |
269 } | 258 } |
270 | 259 |
271 | |
272 bool Utf8::DecodeCStringToUTF32(const char* str, int32_t* dst, intptr_t len) { | 260 bool Utf8::DecodeCStringToUTF32(const char* str, int32_t* dst, intptr_t len) { |
273 ASSERT(str != NULL); | 261 ASSERT(str != NULL); |
274 intptr_t array_len = strlen(str); | 262 intptr_t array_len = strlen(str); |
275 const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str); | 263 const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str); |
276 return Utf8::DecodeToUTF32(utf8_array, array_len, dst, len); | 264 return Utf8::DecodeToUTF32(utf8_array, array_len, dst, len); |
277 } | 265 } |
278 | 266 |
279 | |
280 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { | 267 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { |
281 ASSERT(codepoint > Utf16::kMaxCodeUnit); | 268 ASSERT(codepoint > Utf16::kMaxCodeUnit); |
282 ASSERT(dst != NULL); | 269 ASSERT(dst != NULL); |
283 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); | 270 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); |
284 dst[1] = (0xDC00 + (codepoint & 0x3FF)); | 271 dst[1] = (0xDC00 + (codepoint & 0x3FF)); |
285 } | 272 } |
286 | 273 |
287 } // namespace dart | 274 } // namespace dart |
OLD | NEW |