runtime/vm/unicode.cc - Issue 11411092: Revert "Add some support for the code-point code-unit distinction."

Side by Side Diff: runtime/vm/unicode.cc

Issue 11411092: Revert "Add some support for the code-point code-unit distinction." (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/unicode.h"	5 #include "vm/unicode.h"

6	6

7 #include "vm/allocation.h"	7 #include "vm/allocation.h"

8 #include "vm/globals.h"	8 #include "vm/globals.h"

9 #include "vm/object.h"	9 #include "vm/object.h"

10	10

(...skipping 30 matching lines...) Expand all Loading...
41 };	41 };

42	42

43	43

44 // Minimum values of code points used to check shortest form.	44 // Minimum values of code points used to check shortest form.

45 static const uint32_t kOverlongMinimum[7] = {	45 static const uint32_t kOverlongMinimum[7] = {

46 0, // padding	46 0, // padding

47 0x0,	47 0x0,

48 0x80,	48 0x80,

49 0x800,	49 0x800,

50 0x10000,	50 0x10000,

51 0xFFFFFFFF, // We never allow 5 byte sequences.	51 0xFFFFFFFF,

52 0xFFFFFFFF // We never allow 6 byte sequences.	52 0xFFFFFFFF

53 };	53 };

54	54

55	55

56 static bool IsTrailByte(uint8_t code_unit) {	56 static bool IsTrailByte(uint8_t code_unit) {

57 return (code_unit & 0xc0) == 0x80;	57 return (code_unit & 0xc0) == 0x80;

58 }	58 }

59	59

60	60

61 static bool IsLatin1SequenceStart(uint8_t code_unit) {	61 static bool IsLatin1SequenceStart(uint8_t code_unit) {

62 // Check if codepoint is <= U+00FF	62 // Check if codepoint is <= U+00FF

63 return (code_unit <= Utf8::kMaxOneByteChar);	63 return (code_unit <= Utf8::kMaxOneByteChar);

64 }	64 }

65	65

66	66

67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {	67 static bool IsSupplementarySequenceStart(uint8_t code_unit) {

68 // Check the UTF-8 code unit to determine if it is a sequence start for a	68 // Check if codepoint is >= U+10000.

69 // code point >= U+10000.

70 return (code_unit >= 0xF0);	69 return (code_unit >= 0xF0);

71 }	70 }

72	71

73	72

74 // Returns true if the code point value is above Plane 17.	73 // Returns true if the code point value is above Plane 17.

75 static bool IsOutOfRange(int32_t code_point) {	74 static bool IsOutOfRange(uint32_t code_point) {

76 return (code_point > Utf16::kMaxCodePoint);	75 return (code_point > 0x10FFFF);

77 }	76 }

78	77

79	78

80 // Returns true if the byte sequence is ill-formed.	79 // Returns true if the byte sequence is ill-formed.

81 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {	80 static bool IsNonShortestForm(uint32_t code_point, size_t num_bytes) {

82 return code_point < kOverlongMinimum[num_bytes];	81 return code_point < kOverlongMinimum[num_bytes];

83 }	82 }

84	83

85	84

86 // Returns a count of the number of UTF-16 code units represented by this UTF-8	85 // Returns a count of the number of UTF-8 trail bytes.

87 // array. Type is kASCII for 7-bit-only. If there are surrogate pairs then	86 intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,

88 // the type is kSupplementary. Otherwise it is kBMP.	87 intptr_t array_len,

89 intptr_t Utf8::CodeUnitCount(const uint8_t* utf8_array,	88 Type* type) {

90 intptr_t array_len,

91 Type* type) {

92 intptr_t len = 0;	89 intptr_t len = 0;

93 Type char_type = kLatin1;	90 Type char_type = kLatin1;

94 for (intptr_t i = 0; i < array_len; i++) {	91 for (intptr_t i = 0; i < array_len; i++) {

95 uint8_t code_unit = utf8_array[i];	92 uint8_t code_unit = utf8_array[i];

96 if (!IsTrailByte(code_unit)) {	93 if (!IsTrailByte(code_unit)) {

97 ++len;	94 ++len;

98 }	95 }

99 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF	96 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF

100 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000	97 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000

101 char_type = kSupplementary;	98 char_type = kSupplementary;

102 ++len; // Surrogate pair in the UTF-16 encoding.	99 ++len;

103 } else if (char_type == kLatin1) {	100 } else if (char_type == kLatin1) {

104 char_type = kBMP;	101 char_type = kBMP;

105 }	102 }

106 }	103 }

107 }	104 }

108 *type = char_type;	105 *type = char_type;

109 return len;	106 return len;

110 }	107 }

111	108

112	109

113 // Returns true if str is a valid UTF-8 string.	110 // Returns true if str is a valid NUL-terminated UTF-8 string.

114 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {	111 bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {

115 intptr_t i = 0;	112 intptr_t i = 0;

116 while (i < array_len) {	113 while (i < array_len) {

117 uint32_t ch = utf8_array[i] & 0xFF;	114 uint32_t ch = utf8_array[i] & 0xFF;

118 intptr_t j = 1;	115 intptr_t j = 1;

119 if (ch >= 0x80) {	116 if (ch >= 0x80) {

120 int8_t num_trail_bytes = kTrailBytes[ch];	117 int8_t num_trail_bytes = kTrailBytes[ch];

121 bool is_malformed = false;	118 bool is_malformed = false;

122 for (; j < num_trail_bytes; ++j) {	119 for (; j < num_trail_bytes; ++j) {

123 if ((i + j) < array_len) {	120 if ((i + j) < array_len) {

(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
205 Utf8::Encode(ch, &dst[pos]);	202 Utf8::Encode(ch, &dst[pos]);

206 pos += num_bytes;	203 pos += num_bytes;

207 }	204 }

208 return pos;	205 return pos;

209 }	206 }

210	207

211	208

212 intptr_t Utf8::Decode(const uint8_t* utf8_array,	209 intptr_t Utf8::Decode(const uint8_t* utf8_array,

213 intptr_t array_len,	210 intptr_t array_len,

214 int32_t* dst) {	211 int32_t* dst) {

215 int32_t ch = utf8_array[0] & 0xFF;	212 uint32_t ch = utf8_array[0] & 0xFF;

216 intptr_t i = 1;	213 intptr_t i = 1;

217 if (ch >= 0x80) {	214 if (ch >= 0x80) {

218 intptr_t num_trail_bytes = kTrailBytes[ch];	215 intptr_t num_trail_bytes = kTrailBytes[ch];

219 bool is_malformed = false;	216 bool is_malformed = false;

220 for (; i < num_trail_bytes; ++i) {	217 for (; i < num_trail_bytes; ++i) {

221 if (i < array_len) {	218 if (i < array_len) {

222 uint8_t code_unit = utf8_array[i];	219 uint8_t code_unit = utf8_array[i];

223 is_malformed |= !IsTrailByte(code_unit);	220 is_malformed |= !IsTrailByte(code_unit);

224 ch = (ch << 6) + code_unit;	221 ch = (ch << 6) + code_unit;

225 } else {	222 } else {

226 *dst = kInvalidCodePoint;	223 *dst = -1;

227 return 0;	224 return 0;

228 }	225 }

229 }	226 }

230 ch -= kMagicBits[num_trail_bytes];	227 ch -= kMagicBits[num_trail_bytes];

231 if (!((is_malformed == false) &&	228 if (!((is_malformed == false) &&

232 (i == num_trail_bytes) &&	229 (i == num_trail_bytes) &&

233 !IsOutOfRange(ch) &&	230 !IsOutOfRange(ch) &&

234 !IsNonShortestForm(ch, i) &&	231 !IsNonShortestForm(ch, i) &&

235 !Utf16::IsSurrogate(ch))) {	232 !Utf16::IsSurrogate(ch))) {

236 *dst = kInvalidCodePoint;	233 *dst = -1;

237 return 0;	234 return 0;

238 }	235 }

239 }	236 }

240 *dst = ch;	237 *dst = ch;

241 return i;	238 return i;

242 }	239 }

243	240

244	241

245 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array,	242 bool Utf8::DecodeToLatin1(const uint8_t* utf8_array,

246 intptr_t array_len,	243 intptr_t array_len,

(...skipping 23 matching lines...) Expand all Loading...
270 intptr_t array_len,	267 intptr_t array_len,

271 uint16_t* dst,	268 uint16_t* dst,

272 intptr_t len) {	269 intptr_t len) {

273 intptr_t i = 0;	270 intptr_t i = 0;

274 intptr_t j = 0;	271 intptr_t j = 0;

275 intptr_t num_bytes;	272 intptr_t num_bytes;

276 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {	273 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

277 int32_t ch;	274 int32_t ch;

278 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);	275 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);

279 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);	276 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

280 if (ch == kInvalidCodePoint) {	277 if (ch == -1) {

281 return false; // invalid input	278 return false; // invalid input

282 }	279 }

283 if (is_supplementary) {	280 if (is_supplementary) {

284 Utf16::Encode(ch, &dst[j]);	281 Utf16::Encode(ch, &dst[j]);

285 j = j + 1;	282 j = j + 1;

286 } else {	283 } else {

287 dst[j] = ch;	284 dst[j] = ch;

288 }	285 }

289 }	286 }

290 if ((i < array_len) && (j == len)) {	287 if ((i < array_len) && (j == len)) {

291 return false; // output overflow	288 return false; // output overflow

292 }	289 }

293 return true; // success	290 return true; // success

294 }	291 }

295	292

296	293

297 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,	294 bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,

298 intptr_t array_len,	295 intptr_t array_len,

299 int32_t* dst,	296 uint32_t* dst,

300 intptr_t len) {	297 intptr_t len) {

301 intptr_t i = 0;	298 intptr_t i = 0;

302 intptr_t j = 0;	299 intptr_t j = 0;

303 intptr_t num_bytes;	300 intptr_t num_bytes;

304 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {	301 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {

305 int32_t ch;	302 int32_t ch;

306 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);	303 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);

307 if (ch == kInvalidCodePoint) {	304 if (ch == -1) {

308 return false; // invalid input	305 return false; // invalid input

309 }	306 }

310 dst[j] = ch;	307 dst[j] = ch;

311 }	308 }

312 if ((i < array_len) && (j == len)) {	309 if ((i < array_len) && (j == len)) {

313 return false; // output overflow	310 return false; // output overflow

314 }	311 }

315 return true; // success	312 return true; // success

316 }	313 }

317	314

318	315

319 int32_t Utf16::CodePointAt(const String& str, int index) {	316 void Utf16::Encode(int32_t codepoint, uint16_t* dst) {

320 int32_t code = str.CharAt(index);	317 ASSERT(codepoint > kMaxBmpCodepoint);

321 if (!IsLeadSurrogate(code)) return code;

322 if (index + 1 == str.Length()) return code;

323 int32_t trail = str.CharAt(index + 1);

324 if (!IsTrailSurrogate(trail)) return code;

325 return Decode(code, trail);

326 }

327

328

329 void Utf16::Encode(int32_t codePoint, uint16_t* dst) {

330 ASSERT(codePoint > kMaxBmpCodepoint);

331 ASSERT(dst != NULL);	318 ASSERT(dst != NULL);

332 dst[0] = LeadFromCodePoint(codePoint);	319 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));

333 dst[1] = TrailFromCodePoint(codePoint);	320 dst[1] = (0xDC00 + (codepoint & 0x3FF));

334 }	321 }

335	322

336 } // namespace dart	323 } // namespace dart

OLD	NEW

« no previous file with comments | « runtime/vm/unicode.h ('k') | runtime/vm/unicode_test.cc » ('j') | no next file with comments »