Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(63)

Side by Side Diff: runtime/vm/unicode.cc

Issue 11280150: Add support for surrogates when serializing and deserializing for native ports (Closed) | Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Fixed long line | Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« runtime/vm/unicode.h ('K') | « runtime/vm/unicode.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 #include "vm/unicode.h" 5 #include "vm/unicode.h"
6 6
7 #include "vm/allocation.h" 7 #include "vm/allocation.h"
8 #include "vm/globals.h" 8 #include "vm/globals.h"
9 #include "vm/object.h" 9 #include "vm/object.h"
10 10
(...skipping 13 matching lines...) Expand all
24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
26 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 26 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
28 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 28 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
29 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 29 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
30 }; 30 };
31 31
32 32
33 static const uint32_t kMagicBits[7] = { 33 static const uint32_t kMagicBits[7] = {
34 0, // padding 34 0, // Padding.
35 0x00000000, 35 0x00000000,
36 0x00003080, 36 0x00003080,
37 0x000E2080, 37 0x000E2080,
38 0x03C82080, 38 0x03C82080,
39 0xFA082080, 39 0xFA082080,
40 0x82082080 40 0x82082080
41 }; 41 };
42 42
43 43
44 // Minimum values of code points used to check shortest form. 44 // Minimum values of code points used to check shortest form.
45 static const uint32_t kOverlongMinimum[7] = { 45 static const uint32_t kOverlongMinimum[7] = {
46 0, // padding 46 0, // Padding.
47 0x0, 47 0x0,
48 0x80, 48 0x80,
49 0x800, 49 0x800,
50 0x10000, 50 0x10000,
51 0xFFFFFFFF, 51 0xFFFFFFFF,
52 0xFFFFFFFF 52 0xFFFFFFFF
53 }; 53 };
54 54
55 55
56 static bool IsTrailByte(uint8_t code_unit) { 56 static bool IsTrailByte(uint8_t code_unit) {
57 return (code_unit & 0xc0) == 0x80; 57 return (code_unit & 0xc0) == 0x80;
58 } 58 }
59 59
60 60
61 static bool IsLatin1SequenceStart(uint8_t code_unit) { 61 static bool IsLatin1SequenceStart(uint8_t code_unit) {
62 // Check is codepoint is <= U+00FF 62 // Check if codepoint is <= U+00FF.
63 return (code_unit <= Utf8::kMaxOneByteChar); 63 return (code_unit <= Utf8::kMaxOneByteChar);
64 } 64 }
65 65
66 66
67 static bool IsSupplementarySequenceStart(uint8_t code_unit) { 67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {
68 // Check is codepoint is >= U+10000. 68 // Check if codepoint is >= U+10000.
69 return (code_unit >= 0xF0); 69 return (code_unit >= 0xF0);
70 } 70 }
71 71
72 72
73 // Returns true if the code point value is above Plane 17. 73 // Returns true if the code point value is above Plane 17.
74 static bool IsOutOfRange(uint32_t code_point) { 74 static bool IsOutOfRange(uint32_t code_point) {
75 return (code_point > 0x10FFFF); 75 return (code_point > 0x10FFFF);
76 } 76 }
77 77
78 78
79 // Returns true if the byte sequence is ill-formed. 79 // Returns true if the byte sequence is ill-formed.
80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) { 80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {
81 return code_point < kOverlongMinimum[num_bytes]; 81 return code_point < kOverlongMinimum[num_bytes];
82 } 82 }
83 83
84 84
85 // Returns a count of the number of UTF-8 trail bytes. 85 // Returns a count of the number of UTF-8 trail bytes.
86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array, 86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
87 intptr_t array_len, 87 intptr_t array_len,
88 Type* type) { 88 Type* type) {
89 intptr_t len = 0; 89 intptr_t len = 0;
90 Type char_type = kLatin1; 90 Type char_type = kLatin1;
91 for (intptr_t i = 0; i < array_len; i++) { 91 for (intptr_t i = 0; i < array_len; i++) {
92 uint8_t code_unit = utf8_array[i]; 92 uint8_t code_unit = utf8_array[i];
93 if (!IsTrailByte(code_unit)) { 93 if (!IsTrailByte(code_unit)) {
94 ++len; 94 ++len;
95 } 95 }
96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF 96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF.
cshapiro 2012/11/30 02:49:08 no period
Søren Gjesse 2012/11/30 12:23:07 Done.
97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000 97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000.
cshapiro 2012/11/30 02:49:08 ditto
Søren Gjesse 2012/11/30 12:23:07 Done.
98 char_type = kSupplementary; 98 char_type = kSupplementary;
99 ++len; 99 ++len;
100 } else if (char_type == kLatin1) { 100 } else if (char_type == kLatin1) {
101 char_type = kBMP; 101 char_type = kBMP;
102 } 102 }
103 } 103 }
104 } 104 }
105 *type = char_type; 105 *type = char_type;
106 return len; 106 return len;
107 } 107 }
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
158 String::CodePointIterator it(str); 158 String::CodePointIterator it(str);
159 while (it.Next()) { 159 while (it.Next()) {
160 int32_t ch = it.Current(); 160 int32_t ch = it.Current();
161 length += Utf8::Length(ch); 161 length += Utf8::Length(ch);
162 } 162 }
163 return length; 163 return length;
164 } 164 }
165 165
166 166
167 intptr_t Utf8::Encode(int32_t ch, char* dst) { 167 intptr_t Utf8::Encode(int32_t ch, char* dst) {
168 ASSERT(!Utf16::IsSurrogate(ch));
168 static const int kMask = ~(1 << 6); 169 static const int kMask = ~(1 << 6);
169 if (ch <= kMaxOneByteChar) { 170 if (ch <= kMaxOneByteChar) {
170 dst[0] = ch; 171 dst[0] = ch;
171 return 1; 172 return 1;
172 } 173 }
173 if (ch <= kMaxTwoByteChar) { 174 if (ch <= kMaxTwoByteChar) {
174 dst[0] = 0xC0 | (ch >> 6); 175 dst[0] = 0xC0 | (ch >> 6);
175 dst[1] = 0x80 | (ch & kMask); 176 dst[1] = 0x80 | (ch & kMask);
176 return 2; 177 return 2;
177 } 178 }
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
244 uint8_t* dst, 245 uint8_t* dst,
245 intptr_t len) { 246 intptr_t len) {
246 intptr_t i = 0; 247 intptr_t i = 0;
247 intptr_t j = 0; 248 intptr_t j = 0;
248 intptr_t num_bytes; 249 intptr_t num_bytes;
249 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 250 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
250 int32_t ch; 251 int32_t ch;
251 ASSERT(IsLatin1SequenceStart(utf8_array[i])); 252 ASSERT(IsLatin1SequenceStart(utf8_array[i]));
252 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 253 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
253 if (ch == -1) { 254 if (ch == -1) {
254 return false; // invalid input 255 return false; // Invalid input.
255 } 256 }
256 ASSERT(ch <= 0xff); 257 ASSERT(ch <= 0xff);
257 dst[j] = ch; 258 dst[j] = ch;
258 } 259 }
259 if ((i < array_len) && (j == len)) { 260 if ((i < array_len) && (j == len)) {
260 return false; // output overflow 261 return false; // Output overflow.
261 } 262 }
262 return true; // success 263 return true; // Success.
263 } 264 }
264 265
265 266
266 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array, 267 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
267 intptr_t array_len, 268 intptr_t array_len,
268 uint16_t* dst, 269 uint16_t* dst,
269 intptr_t len) { 270 intptr_t len) {
270 intptr_t i = 0; 271 intptr_t i = 0;
271 intptr_t j = 0; 272 intptr_t j = 0;
272 intptr_t num_bytes; 273 intptr_t num_bytes;
273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 274 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
274 int32_t ch; 275 int32_t ch;
275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]); 276 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);
276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 277 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
277 if (ch == -1) { 278 if (ch == -1) {
278 return false; // invalid input 279 return false; // Invalid input.
279 } 280 }
280 if (is_supplementary) { 281 if (is_supplementary) {
281 Utf16::Encode(ch, &dst[j]); 282 Utf16::Encode(ch, &dst[j]);
282 j = j + 1; 283 j = j + 1;
283 } else { 284 } else {
284 dst[j] = ch; 285 dst[j] = ch;
285 } 286 }
286 } 287 }
287 if ((i < array_len) && (j == len)) { 288 if ((i < array_len) && (j == len)) {
288 return false; // output overflow 289 return false; // Output overflow.
289 } 290 }
290 return true; // success 291 return true; // Success.
291 } 292 }
292 293
293 294
294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array, 295 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
295 intptr_t array_len, 296 intptr_t array_len,
296 int32_t* dst, 297 int32_t* dst,
297 intptr_t len) { 298 intptr_t len) {
298 intptr_t i = 0; 299 intptr_t i = 0;
299 intptr_t j = 0; 300 intptr_t j = 0;
300 intptr_t num_bytes; 301 intptr_t num_bytes;
301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) { 302 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
302 int32_t ch; 303 int32_t ch;
303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch); 304 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
304 if (ch == -1) { 305 if (ch == -1) {
305 return false; // invalid input 306 return false; // Invalid input.
306 } 307 }
307 dst[j] = ch; 308 dst[j] = ch;
308 } 309 }
309 if ((i < array_len) && (j == len)) { 310 if ((i < array_len) && (j == len)) {
310 return false; // output overflow 311 return false; // Output overflow.
311 } 312 }
312 return true; // success 313 return true; // Success.
313 } 314 }
314 315
315 316
316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) { 317 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {
317 ASSERT(codepoint > kMaxBmpCodepoint); 318 ASSERT(codepoint > kMaxBmpCodepoint);
318 ASSERT(dst != NULL); 319 ASSERT(dst != NULL);
319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10)); 320 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));
320 dst[1] = (0xDC00 + (codepoint & 0x3FF)); 321 dst[1] = (0xDC00 + (codepoint & 0x3FF));
321 } 322 }
322 323
324
325 bool Utf16::CodePointIterator::Next() {
326 ASSERT(index_ >= -1);
327 ASSERT(index_ < array_len_);
328 int d = Length(ch_);
329 if (index_ == (array_len_ - d)) {
330 return false;
331 }
332 index_ += d;
333 ch_ = utf16_array_[index_];
334 if (IsLeadSurrogate(ch_) && (index_ != (array_len_ - 1))) {
335 int32_t ch2 = utf16_array_[index_ + 1];
336 if (IsTrailSurrogate(ch2)) {
337 ch_ = Decode(ch_, ch2);
338 }
339 }
340 return true;
341 }
342
323 } // namespace dart 343 } // namespace dart
OLD | NEW
« runtime/vm/unicode.h ('K') | « runtime/vm/unicode.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698