vm/unicode.h - Issue 11419259: Fix bug in Utf8::CodePointCount which was causing some strings with latin1

Side by Side Diff: vm/unicode.h

Issue 11419259: Fix bug in Utf8::CodePointCount which was causing some strings with latin1 (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/runtime/

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #ifndef VM_UNICODE_H_	5 #ifndef VM_UNICODE_H_

6 #define VM_UNICODE_H_	6 #define VM_UNICODE_H_

7	7

8 #include "vm/allocation.h"	8 #include "vm/allocation.h"

9 #include "vm/globals.h"	9 #include "vm/globals.h"

10	10

(...skipping 25 matching lines...) Expand all Loading...
36	36

37	37

38 class Utf8 : AllStatic {	38 class Utf8 : AllStatic {

39 public:	39 public:

40 enum Type {	40 enum Type {

41 kLatin1 = 0, // Latin-1 code point [U+0000, U+00FF].	41 kLatin1 = 0, // Latin-1 code point [U+0000, U+00FF].

42 kBMP, // Basic Multilingual Plane code point [U+0000, U+FFFF].	42 kBMP, // Basic Multilingual Plane code point [U+0000, U+FFFF].

43 kSupplementary, // Supplementary code point [U+010000, U+10FFFF].	43 kSupplementary, // Supplementary code point [U+010000, U+10FFFF].

44 };	44 };

45	45

46 static intptr_t CodePointCount(const uint8_t* utf8_array,	46 // Returns a count of the number of UTF-16 code units needed to represent the
	cshapiro 2012/11/30 21:32:26: This is not strictly true, right? This returns the minimal number of code units as well as the most restricted coding form.
	siva 2012/11/30 21:47:19: Changed the comment to: Returns the most restricted coding form in which the sequence of utf8 characters in 'utf8_array' can be represented in and the number of code units needed in that form. (In reply to cshapiro, 2012/11/30 21:32:26: > This is not strictly true, right? > > This returns the minimal number of code units as well as the most restricted > coding form.)
47 intptr_t array_len,	47 // sequence of utf8 characters in 'utf8_array'.

48 Type* type);	48 static intptr_t CodeUnitCount(const uint8_t* utf8_array,

	49 intptr_t array_len,

	50 Type* type);

49	51

50 // Returns true if 'utf8_array' is a valid UTF-8 string.	52 // Returns true if 'utf8_array' is a valid UTF-8 string.

51 static bool IsValid(const uint8_t* utf8_array, intptr_t array_len);	53 static bool IsValid(const uint8_t* utf8_array, intptr_t array_len);

52	54

53 static intptr_t Length(int32_t ch);	55 static intptr_t Length(int32_t ch);

54 static intptr_t Length(const String& str);	56 static intptr_t Length(const String& str);

55	57

56 static intptr_t Encode(int32_t ch, char* dst);	58 static intptr_t Encode(int32_t ch, char* dst);

57 static intptr_t Encode(const String& src, char* dst, intptr_t len);	59 static intptr_t Encode(const String& src, char* dst, intptr_t len);

58	60

(...skipping 16 matching lines...) Expand all Loading...
75 static bool DecodeCStringToUTF32(const char* str,	77 static bool DecodeCStringToUTF32(const char* str,

76 int32_t* dst,	78 int32_t* dst,

77 intptr_t len);	79 intptr_t len);

78	80

79 private:	81 private:

80 static const int32_t kMaxOneByteChar = 0x7F;	82 static const int32_t kMaxOneByteChar = 0x7F;

81 static const int32_t kMaxTwoByteChar = 0x7FF;	83 static const int32_t kMaxTwoByteChar = 0x7FF;

82 static const int32_t kMaxThreeByteChar = 0xFFFF;	84 static const int32_t kMaxThreeByteChar = 0xFFFF;

83 static const int32_t kMaxFourByteChar = Utf::kMaxCodePoint;	85 static const int32_t kMaxFourByteChar = Utf::kMaxCodePoint;

84	86

85 static bool IsTrailByte(uint8_t code_unit) {	87 static bool IsTrailByte(uint8_t utf8_byte) {
	cshapiro 2012/11/30 21:32:26: the utf-8 spec removed all mention of "byte" and replaced it with code unit. it should be okay to keep the old name since this is in the utf-8 class now.
	siva 2012/11/30 21:47:19: Done. (In reply to cshapiro, 2012/11/30 21:32:26: > the utf-8 spec removed all mention of "byte" and replaced it with code unit. > > it should be okay to keep the old name since this is in the utf-8 class now.)
86 return (code_unit & 0xc0) == 0x80;	88 return (utf8_byte & 0xC0) == 0x80;

87 }	89 }

88	90

89 static bool IsNonShortestForm(uint32_t code_point, size_t num_code_units) {	91 static bool IsNonShortestForm(uint32_t code_point, size_t num_code_units) {

90 return code_point < kOverlongMinimum[num_code_units];	92 return code_point < kOverlongMinimum[num_code_units];

91 }	93 }

92	94

93 static bool IsLatin1SequenceStart(uint8_t code_unit) {	95 static bool IsLatin1SequenceStart(uint8_t utf8_byte) {

94 // Check is codepoint is <= U+00FF	96 // Check if utf8 sequence is start of a codepoint <= U+00FF

95 return (code_unit <= Utf8::kMaxOneByteChar);	97 return (utf8_byte <= 0xC3);

96 }	98 }

97	99

98 static bool IsSupplementarySequenceStart(uint8_t code_unit) {	100 static bool IsSupplementarySequenceStart(uint8_t utf8_byte) {

99 // Check is codepoint is >= U+10000.	101 // Check if utf8 sequence is start of a codepoint >= U+10000.

100 return (code_unit >= 0xF0);	102 return (utf8_byte >= 0xF0);

101 }	103 }

102	104

103 static const int8_t kTrailBytes[];	105 static const int8_t kTrailBytes[];

104 static const uint32_t kMagicBits[];	106 static const uint32_t kMagicBits[];

105 static const uint32_t kOverlongMinimum[];	107 static const uint32_t kOverlongMinimum[];

106 };	108 };

107	109

108	110

109 class Utf16 : AllStatic {	111 class Utf16 : AllStatic {

110 public:	112 public:

(...skipping 108 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
219 // Data for small code points with one mapping	221 // Data for small code points with one mapping

220 static const int16_t stage2_[];	222 static const int16_t stage2_[];

221	223

222 // Data for large code points or code points with both mappings.	224 // Data for large code points or code points with both mappings.

223 static const int32_t stage2_exception_[][2];	225 static const int32_t stage2_exception_[][2];

224 };	226 };

225	227

226 } // namespace dart	228 } // namespace dart

227	229

228 #endif // VM_UNICODE_H_	230 #endif // VM_UNICODE_H_

OLD	NEW

« no previous file with comments | « vm/symbols.cc ('k') | vm/unicode.cc » ('j') | no next file with comments »