Index: runtime/vm/unicode.h |
diff --git a/runtime/vm/unicode.h b/runtime/vm/unicode.h |
index ce572dc633e32d5e00418ffa7252670899f9ccef..fdde8e16da3dbc83faf29a84aa6bfb652499dff1 100644 |
--- a/runtime/vm/unicode.h |
+++ b/runtime/vm/unicode.h |
@@ -12,6 +12,55 @@ namespace dart { |
class String; |
+class Utf16 : AllStatic { |
+ public: |
+ static const uint32_t kMaxCodeUnit = 0xffff; |
+ static const uint32_t kMaxCodePoint = 0x10ffff; |
+ |
+ static bool IsLeadSurrogate(uint32_t c) { |
+ return c >= kLeadBase && c < kLeadEnd; |
+ } |
+ |
+ static bool IsTrailSurrogate(uint32_t c) { |
+ return c >= kTrailBase && c < kTrailEnd; |
+ } |
+ |
+ static bool IsSurrogate(uint32_t c) { |
+ return (c & 0xfffff800u) == 0xd800u; |
+ } |
+ |
+ static int32_t CodePointFromCodeUnits(int32_t lead, int32_t trail) { |
+ return kSurrogateEncodingBase + |
+ ((lead & kSurrogateMask) << 10) + (trail & kSurrogateMask); |
+ } |
+ |
+ static int32_t LeadFromCodePoint(uint32_t code_point) { |
+ ASSERT(code_point >= kSurrogateEncodingBase); |
+ return kLeadBase + |
+ (((code_point - kSurrogateEncodingBase) >> 10) & kSurrogateMask); |
+ } |
+ |
+ static int32_t TrailFromCodePoint(uint32_t code_point) { |
+ ASSERT(code_point >= kSurrogateEncodingBase); |
+ return kTrailBase + (code_point & kSurrogateMask); |
+ } |
+ |
+ // Gets the 21 bit Unicode code point at the given index in a string. If the |
+ // returned value is greater than kMaxCodePoint then the next position of the |
+ // string encodes a trail surrogate and should be skipped on iteration. May |
+ // return individual surrogate values if they are not part of a pair. |
+ static uint32_t CodePointAt(const String& str, int index); |
+ |
+ private: |
+ static const uint32_t kLeadBase = 0xd800; |
+ static const uint32_t kLeadEnd = 0xdbff; |
+ static const uint32_t kTrailBase = 0xdc00; |
+ static const uint32_t kTrailEnd = 0xdfff; |
+ static const uint32_t kSurrogateMask = 0x3ff; |
+ static const uint32_t kSurrogateEncodingBase = 0x10000; |
+}; |
+ |
+ |
class Utf8 : AllStatic { |
public: |
enum Type { |
@@ -24,14 +73,12 @@ class Utf8 : AllStatic { |
static const intptr_t kMaxTwoByteChar = 0x7FF; |
static const intptr_t kMaxThreeByteChar = 0xFFFF; |
static const intptr_t kMaxFourByteChar = 0x10FFFF; |
- static const intptr_t kMaxBmpCodepoint = 0xffff; |
- static const int32_t kLeadOffset = (0xD800 - (0x10000 >> 10)); |
- static const int32_t kSurrogateOffset = (0x10000 - (0xD800 << 10) - 0xDC00); |
- static void ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst); |
- static intptr_t CodePointCount(const uint8_t* utf8_array, |
- intptr_t array_len, |
- Type* type); |
+ static const uint32_t kInvalidCodePoint = 0xffffffffu; |
cshapiro
2012/11/15 20:14:51
can we keep the casing of these values consistent?
erikcorry
2012/11/15 23:47:05
Done.
|
+ |
+ static intptr_t CodeUnitCount(const uint8_t* utf8_array, |
+ intptr_t array_len, |
+ Type* type); |
// Returns true if 'utf8_array' is a valid UTF-8 string. |
static bool IsValid(const uint8_t* utf8_array, intptr_t array_len); |
@@ -44,7 +91,7 @@ class Utf8 : AllStatic { |
static intptr_t Decode(const uint8_t* utf8_array, |
intptr_t array_len, |
- int32_t* ch); |
+ uint32_t* ch); |
static bool DecodeToAscii(const uint8_t* utf8_array, |
intptr_t array_len, |