runtime/vm/unicode.cc - Issue 11280150: Add support for surrogates when serializing and deserializing for native ports

Side by Side Diff: runtime/vm/unicode.cc

Issue 11280150: Add support for surrogates when serializing and deserializing for native ports (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/unicode.h"	5 #include "vm/unicode.h"

6	6

7 #include "vm/allocation.h"	7 #include "vm/allocation.h"

8 #include "vm/globals.h"	8 #include "vm/globals.h"

9 #include "vm/object.h"	9 #include "vm/object.h"

10	10

11 namespace dart {	11 namespace dart {

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
52 0xFFFFFFFF	52 0xFFFFFFFF

53 };	53 };

54	54

55	55

56 static bool IsTrailByte(uint8_t code_unit) {	56 static bool IsTrailByte(uint8_t code_unit) {

57 return (code_unit & 0xc0) == 0x80;	57 return (code_unit & 0xc0) == 0x80;

58 }	58 }

59	59

60	60

61 static bool IsLatin1SequenceStart(uint8_t code_unit) {	61 static bool IsLatin1SequenceStart(uint8_t code_unit) {

62 // Check if codepoint is <= U+00FF	62 // Check if codepoint is <= U+00FF.

63 return (code_unit <= Utf8::kMaxOneByteChar);	63 return (code_unit <= Utf8::kMaxOneByteChar);

64 }	64 }

65	65

66	66

67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {	67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {

68 // Check if codepoint is >= U+10000.	68 // Check if codepoint is >= U+10000.

69 return (code_unit >= 0xF0);	69 return (code_unit >= 0xF0);

70 }	70 }

71	71

72	72

(...skipping 28 matching lines...) Expand all Loading...
101 char_type = kBMP;	101 char_type = kBMP;

102 }	102 }

103 }	103 }

104 }	104 }

105 *type = char_type;	105 *type = char_type;

106 return len;	106 return len;

107 }	107 }

108	108

109	109

110 // Returns true if the first array_len bytes of utf8_array are valid UTF-8.	110 // Returns true if the first array_len bytes of utf8_array are valid UTF-8.

111 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {	111 static bool IsValidUtf8(

	112 const uint8_t* utf8_array, intptr_t array_len, bool allow_surrogates) {

112 intptr_t i = 0;	113 intptr_t i = 0;

113 while (i < array_len) {	114 while (i < array_len) {

114 uint32_t ch = utf8_array[i] & 0xFF;	115 uint32_t ch = utf8_array[i] & 0xFF;

115 intptr_t j = 1;	116 intptr_t j = 1;

116 if (ch >= 0x80) {	117 if (ch >= 0x80) {

117 int8_t num_trail_bytes = kTrailBytes[ch];	118 int8_t num_trail_bytes = kTrailBytes[ch];

118 bool is_malformed = false;	119 bool is_malformed = false;

119 for (; j < num_trail_bytes; ++j) {	120 for (; j < num_trail_bytes; ++j) {

120 if ((i + j) < array_len) {	121 if ((i + j) < array_len) {

121 uint8_t code_unit = utf8_array[i + j];	122 uint8_t code_unit = utf8_array[i + j];

122 is_malformed |= !IsTrailByte(code_unit);	123 is_malformed |= !IsTrailByte(code_unit);

123 ch = (ch << 6) + code_unit;	124 ch = (ch << 6) + code_unit;

124 } else {	125 } else {

125 return false;	126 return false;

126 }	127 }

127 }	128 }

128 ch -= kMagicBits[num_trail_bytes];	129 ch -= kMagicBits[num_trail_bytes];

129 if (!((is_malformed == false) &&	130 if (!((is_malformed == false) &&

130 (j == num_trail_bytes) &&	131 (j == num_trail_bytes) &&

131 !IsOutOfRange(ch) &&	132 !IsOutOfRange(ch) &&

132 !IsNonShortestForm(ch, j) &&	133 !IsNonShortestForm(ch, j) &&

133 !Utf16::IsSurrogate(ch))) {	134 (!Utf16::IsSurrogate(ch) || allow_surrogates))) {

134 return false;	135 return false;

135 }	136 }

136 }	137 }

137 i += j;	138 i += j;

138 }	139 }

139 return true;	140 return true;

140 }	141 }

141	142

142	143

	144 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {

	145 return IsValidUtf8(utf8_array, array_len, false);

	146 }

	147

	148

	149 bool Utf8::IsValidAllowSurrogates(

	150 const uint8_t* utf8_array, intptr_t array_len) {

	151 return IsValidUtf8(utf8_array, array_len, true);

	152 }

	153

	154

143 intptr_t Utf8::Length(int32_t ch) {	155 intptr_t Utf8::Length(int32_t ch) {

144 if (ch <= kMaxOneByteChar) {	156 if (ch <= kMaxOneByteChar) {

145 return 1;	157 return 1;

146 } else if (ch <= kMaxTwoByteChar) {	158 } else if (ch <= kMaxTwoByteChar) {

147 return 2;	159 return 2;

148 } else if (ch <= kMaxThreeByteChar) {	160 } else if (ch <= kMaxThreeByteChar) {

149 return 3;	161 return 3;

150 }	162 }

151 ASSERT(ch <= kMaxFourByteChar);	163 ASSERT(ch <= kMaxFourByteChar);

152 return 4;	164 return 4;

(...skipping 46 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
199 if (pos + num_bytes > len) {	211 if (pos + num_bytes > len) {

200 break;	212 break;

201 }	213 }

202 Utf8::Encode(ch, &dst[pos]);	214 Utf8::Encode(ch, &dst[pos]);

203 pos += num_bytes;	215 pos += num_bytes;

204 }	216 }

205 return pos;	217 return pos;

206 }	218 }

207	219

208	220

209 intptr_t Utf8::Decode(const uint8_t* utf8_array,	221 static intptr_t DecodeUTF8(const uint8_t* utf8_array,

210 intptr_t array_len,	222 intptr_t array_len,

211 int32_t* dst) {	223 int32_t* dst,

	224 bool allow_surrogates) {

212 uint32_t ch = utf8_array[0] & 0xFF;	225 uint32_t ch = utf8_array[0] & 0xFF;

213 intptr_t i = 1;	226 intptr_t i = 1;

214 if (ch >= 0x80) {	227 if (ch >= 0x80) {

215 intptr_t num_trail_bytes = kTrailBytes[ch];	228 intptr_t num_trail_bytes = kTrailBytes[ch];

216 bool is_malformed = false;	229 bool is_malformed = false;

217 for (; i < num_trail_bytes; ++i) {	230 for (; i < num_trail_bytes; ++i) {

218 if (i < array_len) {	231 if (i < array_len) {

219 uint8_t code_unit = utf8_array[i];	232 uint8_t code_unit = utf8_array[i];

220 is_malformed |= !IsTrailByte(code_unit);	233 is_malformed |= !IsTrailByte(code_unit);

221 ch = (ch << 6) + code_unit;	234 ch = (ch << 6) + code_unit;

222 } else {	235 } else {

223 *dst = -1;	236 *dst = -1;

224 return 0;	237 return 0;

225 }	238 }

226 }	239 }

227 ch -= kMagicBits[num_trail_bytes];	240 ch -= kMagicBits[num_trail_bytes];

228 if (!((is_malformed == false) &&	241 if (!((is_malformed == false) &&

229 (i == num_trail_bytes) &&	242 (i == num_trail_bytes) &&

230 !IsOutOfRange(ch) &&	243 !IsOutOfRange(ch) &&

231 !IsNonShortestForm(ch, i) &&	244 !IsNonShortestForm(ch, i) &&

232 !Utf16::IsSurrogate(ch))) {	245 (!Utf16::IsSurrogate(ch) || allow_surrogates))) {

233 *dst = -1;	246 *dst = -1;

234 return 0;	247 return 0;

235 }	248 }

236 }	249 }

237 *dst = ch;	250 *dst = ch;

238 return i;	251 return i;

239 }	252 }

240	253

241	254

	255 intptr_t Utf8::Decode(const uint8_t* utf8_array,

	256 intptr_t array_len,

	257 int32_t* dst) {

	258 return DecodeUTF8(utf8_array, array_len, dst, false);

	259 }

	260

	261

	262 intptr_t Utf8::DecodeAllowSurrogates(const uint8_t* utf8_array,

	263 intptr_t array_len,

	264 int32_t* dst) {

	265 return DecodeUTF8(utf8_array, array_len, dst, true);

	266 }

	267

	268

242 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array,	269 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array,

243 intptr_t array_len,	270 intptr_t array_len,

244 uint8_t* dst,	271 uint8_t* dst,

245 intptr_t len) {	272 intptr_t len) {

246 intptr_t i = 0;	273 intptr_t i = 0;

247 intptr_t j = 0;	274 intptr_t j = 0;

248 intptr_t num_bytes;	275 intptr_t num_bytes;

249 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {	276 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

250 int32_t ch;	277 int32_t ch;

251 ASSERT(IsLatin1SequenceStart(utf8_array[i]));	278 ASSERT(IsLatin1SequenceStart(utf8_array[i]));

252 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);	279 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

253 if (ch == -1) {	280 if (ch == -1) {

254 return false; // invalid input	281 return false; // Invalid input.

255 }	282 }

256 ASSERT(ch <= 0xff);	283 ASSERT(ch <= 0xff);

257 dst[j] = ch;	284 dst[j] = ch;

258 }	285 }

259 if ((i < array_len) && (j == len)) {	286 if ((i < array_len) && (j == len)) {

260 return false; // output overflow	287 return false; // Output overflow.

261 }	288 }

262 return true; // success	289 return true; // Success.

263 }	290 }

264	291

265	292

266 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,	293 bool DecodeUTF8ToUTF16(const uint8_t* utf8_array,

267 intptr_t array_len,	294 intptr_t array_len,

268 uint16_t* dst,	295 uint16_t* dst,

269 intptr_t len) {	296 intptr_t len,

	297 bool allow_surrogates) {

270 intptr_t i = 0;	298 intptr_t i = 0;

271 intptr_t j = 0;	299 intptr_t j = 0;

272 intptr_t num_bytes;	300 intptr_t num_bytes;

273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {	301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

274 int32_t ch;	302 int32_t ch;

275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);	303 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);

276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);	304 num_bytes = DecodeUTF8(

	305 &utf8_array[i], (array_len - i), &ch, allow_surrogates);

277 if (ch == -1) {	306 if (ch == -1) {

278 return false; // invalid input	307 return false; // Invalid input.

279 }	308 }

280 if (is_supplementary) {	309 if (is_supplementary) {

281 Utf16::Encode(ch, &dst[j]);	310 Utf16::Encode(ch, &dst[j]);

282 j = j + 1;	311 j = j + 1;

283 } else {	312 } else {

284 dst[j] = ch;	313 dst[j] = ch;

285 }	314 }

286 }	315 }

287 if ((i < array_len) && (j == len)) {	316 if ((i < array_len) && (j == len)) {

288 return false; // output overflow	317 return false; // Output overflow.

289 }	318 }

290 return true; // success	319 return true; // Success.

291 }	320 }

292	321

293	322

	323 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,

	324 intptr_t array_len,

	325 uint16_t* dst,

	326 intptr_t len) {

	327 return DecodeUTF8ToUTF16(utf8_array, array_len, dst, len, false);

	328 }

	329

	330

	331 bool Utf8::DecodeToUTF16AllowSurrogates(const uint8_t* utf8_array,

	332 intptr_t array_len,

	333 uint16_t* dst,

	334 intptr_t len) {

	335 return DecodeUTF8ToUTF16(utf8_array, array_len, dst, len, true);

	336 }

	337

	338

294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,	339 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,

295 intptr_t array_len,	340 intptr_t array_len,

296 int32_t* dst,	341 int32_t* dst,

297 intptr_t len) {	342 intptr_t len) {

298 intptr_t i = 0;	343 intptr_t i = 0;

299 intptr_t j = 0;	344 intptr_t j = 0;

300 intptr_t num_bytes;	345 intptr_t num_bytes;

301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {	346 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

302 int32_t ch;	347 int32_t ch;

303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);	348 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

304 if (ch == -1) {	349 if (ch == -1) {

305 return false; // invalid input	350 return false; // Invalid input.

306 }	351 }

307 dst[j] = ch;	352 dst[j] = ch;

308 }	353 }

309 if ((i < array_len) && (j == len)) {	354 if ((i < array_len) && (j == len)) {

310 return false; // output overflow	355 return false; // Output overflow.

311 }	356 }

312 return true; // success	357 return true; // Success.

313 }	358 }

314	359

315	360

316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {	361 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {

317 ASSERT(codepoint > kMaxBmpCodepoint);	362 ASSERT(codepoint > kMaxBmpCodepoint);

318 ASSERT(dst != NULL);	363 ASSERT(dst != NULL);

319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));	364 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));

320 dst[1] = (0xDC00 + (codepoint & 0x3FF));	365 dst[1] = (0xDC00 + (codepoint & 0x3FF));

321 }	366 }

322	367

323 } // namespace dart	368 } // namespace dart

OLD	NEW

« runtime/vm/snapshot_test.cc ('K') | « runtime/vm/unicode.h ('k') | no next file » | no next file with comments »