vm/unicode.h - Issue 11419259: Fix bug in Utf8::CodePointCount which was causing some strings with latin1

Unified Diff: vm/unicode.h

Issue 11419259: Fix bug in Utf8::CodePointCount which was causing some strings with latin1 (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/runtime/

Patch Set: Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: vm/unicode.h

===================================================================

--- vm/unicode.h (revision 15591)

+++ vm/unicode.h (working copy)

@@ -43,9 +43,11 @@

kSupplementary, // Supplementary code point [U+010000, U+10FFFF].

};

- static intptr_t CodePointCount(const uint8_t* utf8_array,

- intptr_t array_len,

- Type* type);

+ // Returns a count of the number of UTF-16 code units needed to represent the

cshapiro 2012/11/30 21:32:26 This is not strictly true, right? This returns th… [comment truncated]

siva 2012/11/30 21:47:19 Changed the comment to: Returns the most restricte… [comment truncated]

+ // sequence of utf8 characters in 'utf8_array'.

+ static intptr_t CodeUnitCount(const uint8_t* utf8_array,

+ intptr_t array_len,

+ Type* type);

// Returns true if 'utf8_array' is a valid UTF-8 string.

static bool IsValid(const uint8_t* utf8_array, intptr_t array_len);

@@ -82,22 +84,22 @@

static const int32_t kMaxThreeByteChar = 0xFFFF;

static const int32_t kMaxFourByteChar = Utf::kMaxCodePoint;

- static bool IsTrailByte(uint8_t code_unit) {

- return (code_unit & 0xc0) == 0x80;

+ static bool IsTrailByte(uint8_t utf8_byte) {

cshapiro 2012/11/30 21:32:26 the utf-8 spec removed all mention of "byte" and r… [comment truncated]

siva 2012/11/30 21:47:19 Done.

+ return (utf8_byte & 0xC0) == 0x80;

}

static bool IsNonShortestForm(uint32_t code_point, size_t num_code_units) {

return code_point < kOverlongMinimum[num_code_units];

}

- static bool IsLatin1SequenceStart(uint8_t code_unit) {

- // Check is codepoint is <= U+00FF

- return (code_unit <= Utf8::kMaxOneByteChar);

+ static bool IsLatin1SequenceStart(uint8_t utf8_byte) {

+ // Check if utf8 sequence is start of a codepoint <= U+00FF

+ return (utf8_byte <= 0xC3);

}

- static bool IsSupplementarySequenceStart(uint8_t code_unit) {

- // Check is codepoint is >= U+10000.

- return (code_unit >= 0xF0);

+ static bool IsSupplementarySequenceStart(uint8_t utf8_byte) {

+ // Check if utf8 sequence is start of a codepoint >= U+10000.

+ return (utf8_byte >= 0xF0);

}

static const int8_t kTrailBytes[];

« no previous file with comments | « vm/symbols.cc ('k') | vm/unicode.cc » ('j') | no next file with comments »