runtime/vm/unicode.h - Issue 11368138: Add some support for the code-point code-unit distinction.

Unified Diff: runtime/vm/unicode.h

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: New version integrates feedback, adds less to standard String class. Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: runtime/vm/unicode.h

diff --git a/runtime/vm/unicode.h b/runtime/vm/unicode.h

index ce572dc633e32d5e00418ffa7252670899f9ccef..fdde8e16da3dbc83faf29a84aa6bfb652499dff1 100644

--- a/runtime/vm/unicode.h

+++ b/runtime/vm/unicode.h

@@ -12,6 +12,55 @@ namespace dart {

class String;

+// Static helpers for UTF-16 code units and for the surrogate-pair encoding of
+// code points that do not fit in a single code unit (those above
+// kMaxCodeUnit).
+class Utf16 : AllStatic {
+ public:
+  // Largest value representable in a single 16-bit code unit.
+  static const uint32_t kMaxCodeUnit = 0xffff;
+  // Largest code point handled by these helpers.
+  static const uint32_t kMaxCodePoint = 0x10ffff;
+
+  // Returns true if c is a lead surrogate, i.e. lies in the inclusive range
+  // [kLeadBase, kLeadEnd].
+  static bool IsLeadSurrogate(uint32_t c) {
+    // kLeadEnd (0xdbff) is itself a lead surrogate, so the upper bound must be
+    // inclusive; this keeps the result consistent with IsSurrogate().
+    return c >= kLeadBase && c <= kLeadEnd;
+  }
+
+  // Returns true if c is a trail surrogate, i.e. lies in the inclusive range
+  // [kTrailBase, kTrailEnd].
+  static bool IsTrailSurrogate(uint32_t c) {
+    // kTrailEnd (0xdfff) is itself a trail surrogate, so the upper bound must
+    // be inclusive; this keeps the result consistent with IsSurrogate().
+    return c >= kTrailBase && c <= kTrailEnd;
+  }
+
+  // Returns true if c is any surrogate (lead or trail). Masking off the low
+  // 11 bits leaves 0xd800 exactly for the values 0xd800..0xdfff.
+  static bool IsSurrogate(uint32_t c) {
+    return (c & 0xfffff800u) == 0xd800u;
+  }
+
+  // Combines a lead and a trail surrogate into the code point they encode.
+  // Only the low 10 bits of each argument contribute to the result.
+  static int32_t CodePointFromCodeUnits(int32_t lead, int32_t trail) {
+    return kSurrogateEncodingBase +
+        ((lead & kSurrogateMask) << 10) + (trail & kSurrogateMask);
+  }
+
+  // Returns the lead surrogate of the pair that encodes code_point.
+  // code_point must be at least kSurrogateEncodingBase (0x10000).
+  static int32_t LeadFromCodePoint(uint32_t code_point) {
+    ASSERT(code_point >= kSurrogateEncodingBase);
+    return kLeadBase +
+        (((code_point - kSurrogateEncodingBase) >> 10) & kSurrogateMask);
+  }
+
+  // Returns the trail surrogate of the pair that encodes code_point.
+  // code_point must be at least kSurrogateEncodingBase (0x10000); the low 10
+  // bits of kSurrogateEncodingBase are zero, so no subtraction is needed
+  // before masking.
+  static int32_t TrailFromCodePoint(uint32_t code_point) {
+    ASSERT(code_point >= kSurrogateEncodingBase);
+    return kTrailBase + (code_point & kSurrogateMask);
+  }
+
+  // Gets the 21 bit Unicode code point at the given index in a string. If the
+  // returned value is greater than kMaxCodeUnit then the next position of the
+  // string encodes a trail surrogate and should be skipped on iteration. May
+  // return individual surrogate values if they are not part of a pair.
+  static uint32_t CodePointAt(const String& str, int index);
+
+ private:
+  // First and last lead surrogate (both bounds inclusive).
+  static const uint32_t kLeadBase = 0xd800;
+  static const uint32_t kLeadEnd = 0xdbff;
+  // First and last trail surrogate (both bounds inclusive).
+  static const uint32_t kTrailBase = 0xdc00;
+  static const uint32_t kTrailEnd = 0xdfff;
+  // Each surrogate carries 10 payload bits.
+  static const uint32_t kSurrogateMask = 0x3ff;
+  // Smallest code point that is encoded as a surrogate pair.
+  static const uint32_t kSurrogateEncodingBase = 0x10000;
+};

class Utf8 : AllStatic {

public:

enum Type {

@@ -24,14 +73,12 @@ class Utf8 : AllStatic {

static const intptr_t kMaxTwoByteChar = 0x7FF;

static const intptr_t kMaxThreeByteChar = 0xFFFF;

static const intptr_t kMaxFourByteChar = 0x10FFFF;

- static const intptr_t kMaxBmpCodepoint = 0xffff;

- static const int32_t kLeadOffset = (0xD800 - (0x10000 >> 10));

- static const int32_t kSurrogateOffset = (0x10000 - (0xD800 << 10) - 0xDC00);

- static void ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst);

- static intptr_t CodePointCount(const uint8_t* utf8_array,

- intptr_t array_len,

- Type* type);

+ static const uint32_t kInvalidCodePoint = 0xffffffffu;

cshapiro 2012/11/15 20:14:51 can we keep the casing of these values consistent?

erikcorry 2012/11/15 23:47:05 Done.

+ static intptr_t CodeUnitCount(const uint8_t* utf8_array,

+ intptr_t array_len,

+ Type* type);

// Returns true if 'utf8_array' is a valid UTF-8 string.

static bool IsValid(const uint8_t* utf8_array, intptr_t array_len);

@@ -44,7 +91,7 @@ class Utf8 : AllStatic {

static intptr_t Decode(const uint8_t* utf8_array,

intptr_t array_len,

- int32_t* ch);

+ uint32_t* ch);

static bool DecodeToAscii(const uint8_t* utf8_array,

intptr_t array_len,

« runtime/vm/object.cc ('K') | « runtime/vm/symbols.cc ('k') | runtime/vm/unicode.cc » ('j') | no next file with comments »