runtime/vm/unicode.cc - Issue 11280150: Add support for surrogates when serializing and deserializing for native ports

Side by Side Diff: runtime/vm/unicode.cc

Issue 11280150: Add support for surrogates when serializing and deserializing for native ports (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Fixed long line Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/unicode.h"	5 #include "vm/unicode.h"

6	6

7 #include "vm/allocation.h"	7 #include "vm/allocation.h"

8 #include "vm/globals.h"	8 #include "vm/globals.h"

9 #include "vm/object.h"	9 #include "vm/object.h"

10	10

(...skipping 13 matching lines...) Expand all Loading...
24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

26 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	26 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

28 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	28 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

29 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0	29 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0

30 };	30 };

31	31

32	32

33 static const uint32_t kMagicBits[7] = {	33 static const uint32_t kMagicBits[7] = {

34 0, // padding	34 0, // Padding.

35 0x00000000,	35 0x00000000,

36 0x00003080,	36 0x00003080,

37 0x000E2080,	37 0x000E2080,

38 0x03C82080,	38 0x03C82080,

39 0xFA082080,	39 0xFA082080,

40 0x82082080	40 0x82082080

41 };	41 };

42	42

43	43

44 // Minimum values of code points used to check shortest form.	44 // Minimum values of code points used to check shortest form.

45 static const uint32_t kOverlongMinimum[7] = {	45 static const uint32_t kOverlongMinimum[7] = {

46 0, // padding	46 0, // Padding.

47 0x0,	47 0x0,

48 0x80,	48 0x80,

49 0x800,	49 0x800,

50 0x10000,	50 0x10000,

51 0xFFFFFFFF,	51 0xFFFFFFFF,

52 0xFFFFFFFF	52 0xFFFFFFFF

53 };	53 };

54	54

55	55

56 static bool IsTrailByte(uint8_t code_unit) {	56 static bool IsTrailByte(uint8_t code_unit) {

57 return (code_unit & 0xc0) == 0x80;	57 return (code_unit & 0xc0) == 0x80;

58 }	58 }

59	59

60	60

61 static bool IsLatin1SequenceStart(uint8_t code_unit) {	61 static bool IsLatin1SequenceStart(uint8_t code_unit) {

62 // Check is codepoint is <= U+00FF	62 // Check if codepoint is <= U+00FF.

63 return (code_unit <= Utf8::kMaxOneByteChar);	63 return (code_unit <= Utf8::kMaxOneByteChar);

64 }	64 }

65	65

66	66

67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {	67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {

68 // Check is codepoint is >= U+10000.	68 // Check if codepoint is >= U+10000.

69 return (code_unit >= 0xF0);	69 return (code_unit >= 0xF0);

70 }	70 }

71	71

72	72

73 // Returns true if the code point value is above Plane 17.	73 // Returns true if the code point value is above Plane 17.

74 static bool IsOutOfRange(uint32_t code_point) {	74 static bool IsOutOfRange(uint32_t code_point) {

75 return (code_point > 0x10FFFF);	75 return (code_point > 0x10FFFF);

76 }	76 }

77	77

78	78

79 // Returns true if the byte sequence is ill-formed.	79 // Returns true if the byte sequence is ill-formed.

80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {	80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {

81 return code_point < kOverlongMinimum[num_bytes];	81 return code_point < kOverlongMinimum[num_bytes];

82 }	82 }

83	83

84	84

85 // Returns a count of the number of UTF-8 trail bytes.	85 // Returns a count of the number of UTF-8 trail bytes.

86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,	86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,

87 intptr_t array_len,	87 intptr_t array_len,

88 Type* type) {	88 Type* type) {

89 intptr_t len = 0;	89 intptr_t len = 0;

90 Type char_type = kLatin1;	90 Type char_type = kLatin1;

91 for (intptr_t i = 0; i < array_len; i++) {	91 for (intptr_t i = 0; i < array_len; i++) {

92 uint8_t code_unit = utf8_array[i];	92 uint8_t code_unit = utf8_array[i];

93 if (!IsTrailByte(code_unit)) {	93 if (!IsTrailByte(code_unit)) {

94 ++len;	94 ++len;

95 }	95 }

96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF	96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF.
	cshapiro 2012/11/30 02:49:08: no period
	Søren Gjesse 2012/11/30 12:23:07: On 2012/11/30 02:49:08, cshapiro wrote: > no period — Done.
97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000	97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000.
	cshapiro 2012/11/30 02:49:08: ditto
	Søren Gjesse 2012/11/30 12:23:07: On 2012/11/30 02:49:08, cshapiro wrote: > ditto — Done.
98 char_type = kSupplementary;	98 char_type = kSupplementary;

99 ++len;	99 ++len;

100 } else if (char_type == kLatin1) {	100 } else if (char_type == kLatin1) {

101 char_type = kBMP;	101 char_type = kBMP;

102 }	102 }

103 }	103 }

104 }	104 }

105 *type = char_type;	105 *type = char_type;

106 return len;	106 return len;

107 }	107 }

(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
158 String::CodePointIterator it(str);	158 String::CodePointIterator it(str);

159 while (it.Next()) {	159 while (it.Next()) {

160 int32_t ch = it.Current();	160 int32_t ch = it.Current();

161 length += Utf8::Length(ch);	161 length += Utf8::Length(ch);

162 }	162 }

163 return length;	163 return length;

164 }	164 }

165	165

166	166

167 intptr_t Utf8::Encode(int32_t ch, char* dst) {	167 intptr_t Utf8::Encode(int32_t ch, char* dst) {

	168 ASSERT(!Utf16::IsSurrogate(ch));

168 static const int kMask = ~(1 << 6);	169 static const int kMask = ~(1 << 6);

169 if (ch <= kMaxOneByteChar) {	170 if (ch <= kMaxOneByteChar) {

170 dst[0] = ch;	171 dst[0] = ch;

171 return 1;	172 return 1;

172 }	173 }

173 if (ch <= kMaxTwoByteChar) {	174 if (ch <= kMaxTwoByteChar) {

174 dst[0] = 0xC0 | (ch >> 6);	175 dst[0] = 0xC0 | (ch >> 6);

175 dst[1] = 0x80 | (ch & kMask);	176 dst[1] = 0x80 | (ch & kMask);

176 return 2;	177 return 2;

177 }	178 }

(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
244 uint8_t* dst,	245 uint8_t* dst,

245 intptr_t len) {	246 intptr_t len) {

246 intptr_t i = 0;	247 intptr_t i = 0;

247 intptr_t j = 0;	248 intptr_t j = 0;

248 intptr_t num_bytes;	249 intptr_t num_bytes;

249 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {	250 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

250 int32_t ch;	251 int32_t ch;

251 ASSERT(IsLatin1SequenceStart(utf8_array[i]));	252 ASSERT(IsLatin1SequenceStart(utf8_array[i]));

252 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);	253 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

253 if (ch == -1) {	254 if (ch == -1) {

254 return false; // invalid input	255 return false; // Invalid input.

255 }	256 }

256 ASSERT(ch <= 0xff);	257 ASSERT(ch <= 0xff);

257 dst[j] = ch;	258 dst[j] = ch;

258 }	259 }

259 if ((i < array_len) && (j == len)) {	260 if ((i < array_len) && (j == len)) {

260 return false; // output overflow	261 return false; // Output overflow.

261 }	262 }

262 return true; // success	263 return true; // Success.

263 }	264 }

264	265

265	266

266 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,	267 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,

267 intptr_t array_len,	268 intptr_t array_len,

268 uint16_t* dst,	269 uint16_t* dst,

269 intptr_t len) {	270 intptr_t len) {

270 intptr_t i = 0;	271 intptr_t i = 0;

271 intptr_t j = 0;	272 intptr_t j = 0;

272 intptr_t num_bytes;	273 intptr_t num_bytes;

273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {	274 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

274 int32_t ch;	275 int32_t ch;

275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);	276 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);

276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);	277 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

277 if (ch == -1) {	278 if (ch == -1) {

278 return false; // invalid input	279 return false; // Invalid input.

279 }	280 }

280 if (is_supplementary) {	281 if (is_supplementary) {

281 Utf16::Encode(ch, &dst[j]);	282 Utf16::Encode(ch, &dst[j]);

282 j = j + 1;	283 j = j + 1;

283 } else {	284 } else {

284 dst[j] = ch;	285 dst[j] = ch;

285 }	286 }

286 }	287 }

287 if ((i < array_len) && (j == len)) {	288 if ((i < array_len) && (j == len)) {

288 return false; // output overflow	289 return false; // Output overflow.

289 }	290 }

290 return true; // success	291 return true; // Success.

291 }	292 }

292	293

293	294

294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,	295 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,

295 intptr_t array_len,	296 intptr_t array_len,

296 int32_t* dst,	297 int32_t* dst,

297 intptr_t len) {	298 intptr_t len) {

298 intptr_t i = 0;	299 intptr_t i = 0;

299 intptr_t j = 0;	300 intptr_t j = 0;

300 intptr_t num_bytes;	301 intptr_t num_bytes;

301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {	302 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

302 int32_t ch;	303 int32_t ch;

303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);	304 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

304 if (ch == -1) {	305 if (ch == -1) {

305 return false; // invalid input	306 return false; // Invalid input.

306 }	307 }

307 dst[j] = ch;	308 dst[j] = ch;

308 }	309 }

309 if ((i < array_len) && (j == len)) {	310 if ((i < array_len) && (j == len)) {

310 return false; // output overflow	311 return false; // Output overflow.

311 }	312 }

312 return true; // success	313 return true; // Success.

313 }	314 }

314	315

315	316

316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {	317 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {

317 ASSERT(codepoint > kMaxBmpCodepoint);	318 ASSERT(codepoint > kMaxBmpCodepoint);

318 ASSERT(dst != NULL);	319 ASSERT(dst != NULL);

319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));	320 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));

320 dst[1] = (0xDC00 + (codepoint & 0x3FF));	321 dst[1] = (0xDC00 + (codepoint & 0x3FF));

321 }	322 }

322	323

	324

	325 bool Utf16::CodePointIterator::Next() {

	326 ASSERT(index_ >= -1);

	327 ASSERT(index_ < array_len_);

	328 int d = Length(ch_);

	329 if (index_ == (array_len_ - d)) {

	330 return false;

	331 }

	332 index_ += d;

	333 ch_ = utf16_array_[index_];

	334 if (IsLeadSurrogate(ch_) && (index_ != (array_len_ - 1))) {

	335 int32_t ch2 = utf16_array_[index_ + 1];

	336 if (IsTrailSurrogate(ch2)) {

	337 ch_ = Decode(ch_, ch2);

	338 }

	339 }

	340 return true;

	341 }

	342

323 } // namespace dart	343 } // namespace dart

OLD	NEW

« runtime/vm/unicode.h ('K') | « runtime/vm/unicode.h ('k') | no next file » | no next file with comments »