runtime/vm/unicode.cc - Issue 11318018: - Represent strings internally in UTF-16 format, this makes it

Side by Side Diff: runtime/vm/unicode.cc

Issue 11318018: - Represent strings internally in UTF-16 format, this makes it (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/

Patch Set: Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/unicode.h"	5 #include "vm/unicode.h"

6	6

7 #include "vm/allocation.h"	7 #include "vm/allocation.h"

8 #include "vm/globals.h"	8 #include "vm/globals.h"

9 #include "vm/object.h"	9 #include "vm/object.h"

10	10

11 namespace dart {	11 namespace dart {

12	12

13 static const uint8_t kTrailBytes[256] = {	13 static const int8_t kTrailBytes[256] = {

14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	14 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	22 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

(...skipping 27 matching lines...) Expand all Loading...
51 0xFFFFFFFF,	51 0xFFFFFFFF,

52 0xFFFFFFFF	52 0xFFFFFFFF

53 };	53 };

54	54

55	55

56 static bool IsTrailByte(uint8_t code_unit) {	56 static bool IsTrailByte(uint8_t code_unit) {

57 return (code_unit & 0xc0) == 0x80;	57 return (code_unit & 0xc0) == 0x80;

58 }	58 }

59	59

60	60

	61 static bool IsAsciiSequenceStart(uint8_t code_unit) {

	62 // Check if codepoint is <= U+007F

	63 return (code_unit <= Utf8::kMaxOneByteChar);

	64 }

	65

	66

	67 static bool IsSmpSequenceStart(uint8_t code_unit) {

	68 // Check if codepoint is >= U+10000.

	69 return (code_unit >= 0xF0);

	70 }

	71

	72

61 // Returns true if the code point is a high- or low-surrogate.	73 // Returns true if the code point is a high- or low-surrogate.

62 static bool IsSurrogate(uint32_t code_point) {	74 static bool IsSurrogate(uint32_t code_point) {

63 return (code_point & 0xfffff800) == 0xd800;	75 return (code_point & 0xfffff800) == 0xd800;

64 }	76 }

65	77

66	78

67 // Returns true if the code point value is above Plane 17.	79 // Returns true if the code point value is above Plane 17.

68 static bool IsOutOfRange(uint32_t code_point) {	80 static bool IsOutOfRange(uint32_t code_point) {

69 return code_point > 0x10FFFF;	81 return (code_point > 0x10FFFF);

70 }	82 }

71	83

72	84

73 // Returns true if the byte sequence is ill-formed.	85 // Returns true if the byte sequence is ill-formed.

74 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {	86 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {

75 return code_point < kOverlongMinimum[num_bytes];	87 return code_point < kOverlongMinimum[num_bytes];

76 }	88 }

77	89

78	90

	91 void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) {

	92 ASSERT(codepoint > kMaxBmpCodepoint);

	93 ASSERT(dst != NULL);

	94 dst[0] = (Utf8::kLeadOffset + (codepoint >> 10));

	95 dst[1] = (0xDC00 + (codepoint & 0x3FF));

	96 }

	97

	98

79 // Returns a count of the number of UTF-8 trail bytes.	99 // Returns a count of the number of UTF-8 trail bytes.

80 intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) {	100 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,

81 bool is_two_byte_string = false;	101 intptr_t array_len,

82 bool is_four_byte_string = false;	102 Type* type) {

83 intptr_t len = 0;	103 intptr_t len = 0;

84 for (; *str != '\0'; ++str) {	104 Type char_type = kAscii;

85 uint8_t code_unit = *str;	105 for (intptr_t i = 0; i < array_len; i++) {

	106 uint8_t code_unit = utf8_array[i];

86 if (!IsTrailByte(code_unit)) {	107 if (!IsTrailByte(code_unit)) {

87 ++len;	108 ++len;

88 }	109 }

89 if (code_unit > 0xC3) { // > U+00FF	110 if (!IsAsciiSequenceStart(code_unit)) { // > U+007F

90 if (code_unit < 0xF0) { // < U+10000	111 if (IsSmpSequenceStart(code_unit)) { // >= U+10000

91 is_two_byte_string = true;	112 char_type = kSMP;

92 } else {	113 ++len;

93 is_four_byte_string = true;	114 } else if (char_type == kAscii) {

	115 char_type = kBMP;

94 }	116 }

95 }	117 }

96 }	118 }

97 if (is_four_byte_string) {	119 *type = char_type;

98 *width = 4;

99 } else if (is_two_byte_string) {

100 *width = 2;

101 } else {

102 *width = 1;

103 }

104 return len;	120 return len;

105 }	121 }

106	122

107	123

108 // Returns true if str is a valid NUL-terminated UTF-8 string.	124 // Returns true if str is a valid NUL-terminated UTF-8 string.

109 bool Utf8::IsValid(const char* str) {	125 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {

110 intptr_t i = 0;	126 intptr_t i = 0;

111 while (str[i] != '\0') {	127 while (i < array_len) {

112 uint32_t ch = str[i] & 0xFF;	128 uint32_t ch = utf8_array[i] & 0xFF;

113 intptr_t j = 1;	129 intptr_t j = 1;

114 if (ch >= 0x80) {	130 if (ch >= 0x80) {

115 uint8_t num_trail_bytes = kTrailBytes[ch];	131 int8_t num_trail_bytes = kTrailBytes[ch];

116 bool is_malformed = false;	132 bool is_malformed = false;

117 for (; j < num_trail_bytes; ++j) {	133 for (; j < num_trail_bytes; ++j) {

118 if (str[i + j] != '\0') {	134 if ((i + j) < array_len) {

119 uint8_t code_unit = str[i + j];	135 uint8_t code_unit = utf8_array[i + j];

120 is_malformed |= !IsTrailByte(code_unit);	136 is_malformed |= !IsTrailByte(code_unit);

121 ch = (ch << 6) + code_unit;	137 ch = (ch << 6) + code_unit;

122 } else {	138 } else {

123 return false;	139 return false;

124 }	140 }

125 }	141 }

126 ch -= kMagicBits[num_trail_bytes];	142 ch -= kMagicBits[num_trail_bytes];

127 if (!((is_malformed == false) &&	143 if (!((is_malformed == false) &&

128 (j == num_trail_bytes) &&	144 (j == num_trail_bytes) &&

129 !IsOutOfRange(ch) &&	145 !IsOutOfRange(ch) &&

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
195 if (pos + num_bytes > len) {	211 if (pos + num_bytes > len) {

196 break;	212 break;

197 }	213 }

198 Utf8::Encode(ch, &dst[pos]);	214 Utf8::Encode(ch, &dst[pos]);

199 pos += num_bytes;	215 pos += num_bytes;

200 }	216 }

201 return pos;	217 return pos;

202 }	218 }

203	219

204	220

205 intptr_t Utf8::Decode(const char* src, int32_t* dst) {	221 intptr_t Utf8::Decode(const uint8_t* utf8_array,

206 uint32_t ch = src[0] & 0xFF;	222 intptr_t array_len,

207 uint32_t i = 1;	223 int32_t* dst) {

	224 uint32_t ch = utf8_array[0] & 0xFF;

	225 intptr_t i = 1;

208 if (ch >= 0x80) {	226 if (ch >= 0x80) {

209 uint32_t num_trail_bytes = kTrailBytes[ch];	227 int32_t num_trail_bytes = kTrailBytes[ch];

210 bool is_malformed = false;	228 bool is_malformed = false;

211 for (; i < num_trail_bytes; ++i) {	229 for (; i < num_trail_bytes; ++i) {

212 if (src[i] != '\0') {	230 if (i < array_len) {

213 uint8_t code_unit = src[i];	231 uint8_t code_unit = utf8_array[i];

214 is_malformed |= !IsTrailByte(code_unit);	232 is_malformed |= !IsTrailByte(code_unit);

215 ch = (ch << 6) + code_unit;	233 ch = (ch << 6) + code_unit;

216 } else {	234 } else {

217 *dst = -1;	235 *dst = -1;

218 return 0;	236 return 0;

219 }	237 }

220 }	238 }

221 ch -= kMagicBits[num_trail_bytes];	239 ch -= kMagicBits[num_trail_bytes];

222 if (!((is_malformed == false) &&	240 if (!((is_malformed == false) &&

223 (i == num_trail_bytes) &&	241 (i == num_trail_bytes) &&

224 !IsOutOfRange(ch) &&	242 !IsOutOfRange(ch) &&

225 !IsNonShortestForm(ch, i) &&	243 !IsNonShortestForm(ch, i) &&

226 !IsSurrogate(ch))) {	244 !IsSurrogate(ch))) {

227 *dst = -1;	245 *dst = -1;

228 return 0;	246 return 0;

229 }	247 }

230 }	248 }

231 *dst = ch;	249 *dst = ch;

232 return i;	250 return i;

233 }	251 }

234	252

235	253

236 template<typename T>	254 bool Utf8::DecodeToAscii(const uint8_t* utf8_array,

237 static bool DecodeImpl(const char* src, T* dst, intptr_t len) {	255 intptr_t array_len,

	256 uint8_t* dst,

	257 intptr_t len) {

	258 if (len < array_len) {

	259 return false; // output overflow

	260 }

	261 #ifdef DEBUG

	262 for (intptr_t i = 0; i < array_len; i++) {

	263 ASSERT(IsAsciiSequenceStart(utf8_array[i]));

	264 }

	265 #endif

	266 memmove(dst, utf8_array, array_len);

	267 return true; // success

	268 }

	269

	270

	271 bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,

	272 intptr_t array_len,

	273 uint16_t* dst,

	274 intptr_t len) {

238 intptr_t i = 0;	275 intptr_t i = 0;

239 intptr_t j = 0;	276 intptr_t j = 0;

240 intptr_t num_bytes;	277 intptr_t num_bytes;

241 for (; src[i] != '\0' && j < len; i += num_bytes, ++j) {	278 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

242 int32_t ch;	279 int32_t ch;

243 num_bytes = Utf8::Decode(&src[i], &ch);	280 bool is_smp = IsSmpSequenceStart(utf8_array[i]);

	281 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

244 if (ch == -1) {	282 if (ch == -1) {

245 return false; // invalid input	283 return false; // invalid input

246 }	284 }

247 dst[j] = ch;	285 if (is_smp) {

	286 ConvertUTF32ToUTF16(ch, &(dst[j]));

	287 j = j + 1;

	288 } else {

	289 dst[j] = ch;

	290 }

248 }	291 }

249 if (src[i] != '\0' && j == len) {	292 if ((i < array_len) && (j == len)) {

250 return false; // output overflow	293 return false; // output overflow

251 }	294 }

252 return true; // success	295 return true; // success

253 }	296 }

254	297

255	298

256 bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) {	299 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,

257 return DecodeImpl(src, dst, len);	300 intptr_t array_len,

258 }	301 uint32_t* dst,

259	302 intptr_t len) {

260	303 intptr_t i = 0;

261 bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) {	304 intptr_t j = 0;

262 return DecodeImpl(src, dst, len);	305 intptr_t num_bytes;

263 }	306 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

264	307 int32_t ch;

265	308 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

266 bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) {	309 if (ch == -1) {

267 return DecodeImpl(src, dst, len);	310 return false; // invalid input

	311 }

	312 dst[j] = ch;

	313 }

	314 if ((i < array_len) && (j == len)) {

	315 return false; // output overflow

	316 }

	317 return true; // success

268 }	318 }

269	319

270 } // namespace dart	320 } // namespace dart

OLD	NEW

« runtime/vm/dart_api_impl.cc ('K') | « runtime/vm/unicode.h ('k') | runtime/vm/unicode_test.cc » ('j') | no next file with comments »