Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(184)

Side by Side Diff: runtime/vm/unicode.cc

Issue 11280150: Add support for surrogates when serializing and deserializing for native ports (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« runtime/vm/snapshot_test.cc ('K') | « runtime/vm/unicode.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
11 namespace dart { 11 namespace dart {
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
52 0xFFFFFFFF 52 0xFFFFFFFF
53 }; 53 };
54 54
55 55
56 static bool IsTrailByte(uint8_t code_unit) { 56 static bool IsTrailByte(uint8_t code_unit) {
57 return (code_unit & 0xc0) == 0x80; 57 return (code_unit & 0xc0) == 0x80;
58 } 58 }
59 59
60 60
61 static bool IsLatin1SequenceStart(uint8_t code_unit) { 61 static bool IsLatin1SequenceStart(uint8_t code_unit) {
62 // Check if codepoint is <= U+00FF 62 // Check if codepoint is <= U+00FF.
63 return (code_unit <= Utf8::kMaxOneByteChar); 63 return (code_unit <= Utf8::kMaxOneByteChar);
64 } 64 }
65 65
66 66
67 static bool IsSupplementarySequenceStart(uint8_t code_unit) { 67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {
68 // Check if codepoint is >= U+10000. 68 // Check if codepoint is >= U+10000.
69 return (code_unit >= 0xF0); 69 return (code_unit >= 0xF0);
70 } 70 }
71 71
72 72
(...skipping 28 matching lines...) Expand all
101 char_type = kBMP; 101 char_type = kBMP;
102 } 102 }
103 } 103 }
104 } 104 }
105 *type = char_type; 105 *type = char_type;
106 return len; 106 return len;
107 } 107 }
108 108
109 109
110 // Returns true if the array_len bytes at utf8_array are valid UTF-8. 110 // Returns true if the array_len bytes at utf8_array are valid UTF-8.
111 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) { 111 static bool IsValidUtf8(
112 const uint8_t* utf8_array, intptr_t array_len, bool allow_surrogates) {
112 intptr_t i = 0; 113 intptr_t i = 0;
113 while (i < array_len) { 114 while (i < array_len) {
114 uint32_t ch = utf8_array[i] & 0xFF; 115 uint32_t ch = utf8_array[i] & 0xFF;
115 intptr_t j = 1; 116 intptr_t j = 1;
116 if (ch >= 0x80) { 117 if (ch >= 0x80) {
117 int8_t num_trail_bytes = kTrailBytes[ch]; 118 int8_t num_trail_bytes = kTrailBytes[ch];
118 bool is_malformed = false; 119 bool is_malformed = false;
119 for (; j < num_trail_bytes; ++j) { 120 for (; j < num_trail_bytes; ++j) {
120 if ((i + j) < array_len) { 121 if ((i + j) < array_len) {
121 uint8_t code_unit = utf8_array[i + j]; 122 uint8_t code_unit = utf8_array[i + j];
122 is_malformed |= !IsTrailByte(code_unit); 123 is_malformed |= !IsTrailByte(code_unit);
123 ch = (ch << 6) + code_unit; 124 ch = (ch << 6) + code_unit;
124 } else { 125 } else {
125 return false; 126 return false;
126 } 127 }
127 } 128 }
128 ch -= kMagicBits[num_trail_bytes]; 129 ch -= kMagicBits[num_trail_bytes];
129 if (!((is_malformed == false) && 130 if (!((is_malformed == false) &&
130 (j == num_trail_bytes) && 131 (j == num_trail_bytes) &&
131 !IsOutOfRange(ch) && 132 !IsOutOfRange(ch) &&
132 !IsNonShortestForm(ch, j) && 133 !IsNonShortestForm(ch, j) &&
133 !Utf16::IsSurrogate(ch))) { 134 (!Utf16::IsSurrogate(ch) || allow_surrogates))) {
134 return false; 135 return false;
135 } 136 }
136 } 137 }
137 i += j; 138 i += j;
138 } 139 }
139 return true; 140 return true;
140 } 141 }
141 142
142 143
144 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {
145 return IsValidUtf8(utf8_array, array_len, false);
146 }
147
148
149 bool Utf8::IsValidAllowSurrogates(
150 const uint8_t* utf8_array, intptr_t array_len) {
151 return IsValidUtf8(utf8_array, array_len, true);
152 }
153
154
143 intptr_t Utf8::Length(int32_t ch) { 155 intptr_t Utf8::Length(int32_t ch) {
144 if (ch <= kMaxOneByteChar) { 156 if (ch <= kMaxOneByteChar) {
145 return 1; 157 return 1;
146 } else if (ch <= kMaxTwoByteChar) { 158 } else if (ch <= kMaxTwoByteChar) {
147 return 2; 159 return 2;
148 } else if (ch <= kMaxThreeByteChar) { 160 } else if (ch <= kMaxThreeByteChar) {
149 return 3; 161 return 3;
150 } 162 }
151 ASSERT(ch <= kMaxFourByteChar); 163 ASSERT(ch <= kMaxFourByteChar);
152 return 4; 164 return 4;
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after
199 if (pos + num_bytes > len) { 211 if (pos + num_bytes > len) {
200 break; 212 break;
201 } 213 }
202 Utf8::Encode(ch, &dst[pos]); 214 Utf8::Encode(ch, &dst[pos]);
203 pos += num_bytes; 215 pos += num_bytes;
204 } 216 }
205 return pos; 217 return pos;
206 } 218 }
207 219
208 220
209 intptr_t Utf8::Decode(const uint8_t* utf8_array, 221 static intptr_t DecodeUTF8(const uint8_t* utf8_array,
210 intptr_t array_len, 222 intptr_t array_len,
211 int32_t* dst) { 223 int32_t* dst,
224 bool allow_surrogates) {
212 uint32_t ch = utf8_array[0] & 0xFF; 225 uint32_t ch = utf8_array[0] & 0xFF;
213 intptr_t i = 1; 226 intptr_t i = 1;
214 if (ch >= 0x80) { 227 if (ch >= 0x80) {
215 intptr_t num_trail_bytes = kTrailBytes[ch]; 228 intptr_t num_trail_bytes = kTrailBytes[ch];
216 bool is_malformed = false; 229 bool is_malformed = false;
217 for (; i < num_trail_bytes; ++i) { 230 for (; i < num_trail_bytes; ++i) {
218 if (i < array_len) { 231 if (i < array_len) {
219 uint8_t code_unit = utf8_array[i]; 232 uint8_t code_unit = utf8_array[i];
220 is_malformed |= !IsTrailByte(code_unit); 233 is_malformed |= !IsTrailByte(code_unit);
221 ch = (ch << 6) + code_unit; 234 ch = (ch << 6) + code_unit;
222 } else { 235 } else {
223 *dst = -1; 236 *dst = -1;
224 return 0; 237 return 0;
225 } 238 }
226 } 239 }
227 ch -= kMagicBits[num_trail_bytes]; 240 ch -= kMagicBits[num_trail_bytes];
228 if (!((is_malformed == false) && 241 if (!((is_malformed == false) &&
229 (i == num_trail_bytes) && 242 (i == num_trail_bytes) &&
230 !IsOutOfRange(ch) && 243 !IsOutOfRange(ch) &&
231 !IsNonShortestForm(ch, i) && 244 !IsNonShortestForm(ch, i) &&
232 !Utf16::IsSurrogate(ch))) { 245 (!Utf16::IsSurrogate(ch) || allow_surrogates))) {
233 *dst = -1; 246 *dst = -1;
234 return 0; 247 return 0;
235 } 248 }
236 } 249 }
237 *dst = ch; 250 *dst = ch;
238 return i; 251 return i;
239 } 252 }
240 253
241 254
255 intptr_t Utf8::Decode(const uint8_t* utf8_array,
256 intptr_t array_len,
257 int32_t* dst) {
258 return DecodeUTF8(utf8_array, array_len, dst, false);
259 }
260
261
262 intptr_t Utf8::DecodeAllowSurrogates(const uint8_t* utf8_array,
263 intptr_t array_len,
264 int32_t* dst) {
265 return DecodeUTF8(utf8_array, array_len, dst, true);
266 }
267
268
242 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array, 269 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array,
243 intptr_t array_len, 270 intptr_t array_len,
244 uint8_t* dst, 271 uint8_t* dst,
245 intptr_t len) { 272 intptr_t len) {
246 intptr_t i = 0; 273 intptr_t i = 0;
247 intptr_t j = 0; 274 intptr_t j = 0;
248 intptr_t num_bytes; 275 intptr_t num_bytes;
249 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 276 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
250 int32_t ch; 277 int32_t ch;
251 ASSERT(IsLatin1SequenceStart(utf8_array[i])); 278 ASSERT(IsLatin1SequenceStart(utf8_array[i]));
252 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 279 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
253 if (ch == -1) { 280 if (ch == -1) {
254 return false; // invalid input 281 return false; // Invalid input.
255 } 282 }
256 ASSERT(ch <= 0xff); 283 ASSERT(ch <= 0xff);
257 dst[j] = ch; 284 dst[j] = ch;
258 } 285 }
259 if ((i < array_len) && (j == len)) { 286 if ((i < array_len) && (j == len)) {
260 return false; // output overflow 287 return false; // Output overflow.
261 } 288 }
262 return true; // success 289 return true; // Success.
263 } 290 }
264 291
265 292
266 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, 293 bool DecodeUTF8ToUTF16(const uint8_t* utf8_array,
267 intptr_t array_len, 294 intptr_t array_len,
268 uint16_t* dst, 295 uint16_t* dst,
269 intptr_t len) { 296 intptr_t len,
297 bool allow_surrogates) {
270 intptr_t i = 0; 298 intptr_t i = 0;
271 intptr_t j = 0; 299 intptr_t j = 0;
272 intptr_t num_bytes; 300 intptr_t num_bytes;
273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
274 int32_t ch; 302 int32_t ch;
275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); 303 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);
276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 304 num_bytes = DecodeUTF8(
305 &utf8_array[i], (array_len - i), &ch, allow_surrogates);
277 if (ch == -1) { 306 if (ch == -1) {
278 return false; // invalid input 307 return false; // Invalid input.
279 } 308 }
280 if (is_supplementary) { 309 if (is_supplementary) {
281 Utf16::Encode(ch, &dst[j]); 310 Utf16::Encode(ch, &dst[j]);
282 j = j + 1; 311 j = j + 1;
283 } else { 312 } else {
284 dst[j] = ch; 313 dst[j] = ch;
285 } 314 }
286 } 315 }
287 if ((i < array_len) && (j == len)) { 316 if ((i < array_len) && (j == len)) {
288 return false; // output overflow 317 return false; // Output overflow.
289 } 318 }
290 return true; // success 319 return true; // Success.
291 } 320 }
292 321
293 322
323 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
324 intptr_t array_len,
325 uint16_t* dst,
326 intptr_t len) {
327 return DecodeUTF8ToUTF16(utf8_array, array_len, dst, len, false);
328 }
329
330
331 bool Utf8::DecodeToUTF16AllowSurrogates(const uint8_t* utf8_array,
332 intptr_t array_len,
333 uint16_t* dst,
334 intptr_t len) {
335 return DecodeUTF8ToUTF16(utf8_array, array_len, dst, len, true);
336 }
337
338
294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, 339 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
295 intptr_t array_len, 340 intptr_t array_len,
296 int32_t* dst, 341 int32_t* dst,
297 intptr_t len) { 342 intptr_t len) {
298 intptr_t i = 0; 343 intptr_t i = 0;
299 intptr_t j = 0; 344 intptr_t j = 0;
300 intptr_t num_bytes; 345 intptr_t num_bytes;
301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 346 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
302 int32_t ch; 347 int32_t ch;
303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 348 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
304 if (ch == -1) { 349 if (ch == -1) {
305 return false; // invalid input 350 return false; // Invalid input.
306 } 351 }
307 dst[j] = ch; 352 dst[j] = ch;
308 } 353 }
309 if ((i < array_len) && (j == len)) { 354 if ((i < array_len) && (j == len)) {
310 return false; // output overflow 355 return false; // Output overflow.
311 } 356 }
312 return true; // success 357 return true; // Success.
313 } 358 }
314 359
315 360
316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { 361 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {
317 ASSERT(codepoint > kMaxBmpCodepoint); 362 ASSERT(codepoint > kMaxBmpCodepoint);
318 ASSERT(dst != NULL); 363 ASSERT(dst != NULL);
319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); 364 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));
320 dst[1] = (0xDC00 + (codepoint & 0x3FF)); 365 dst[1] = (0xDC00 + (codepoint & 0x3FF));
321 } 366 }
322 367
323 } // namespace dart 368 } // namespace dart
OLDNEW
« runtime/vm/snapshot_test.cc ('K') | « runtime/vm/unicode.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698