Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(401)

Unified Diff: runtime/vm/unicode.cc

Issue 11318018: - Represent strings internally in UTF-16 format, this makes it (Closed) Base URL: http://dart.googlecode.com/svn/branches/bleeding_edge/dart/
Patch Set: Created 8 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: runtime/vm/unicode.cc
===================================================================
--- runtime/vm/unicode.cc (revision 14314)
+++ runtime/vm/unicode.cc (working copy)
@@ -10,7 +10,7 @@
namespace dart {
-static const uint8_t kTrailBytes[256] = {
+static const int8_t kTrailBytes[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -58,6 +58,18 @@
}
+static bool IsAsciiSequenceStart(uint8_t code_unit) {
+ // Check if the code unit encodes a code point <= U+007F.
+ return (code_unit <= Utf8::kMaxOneByteChar);
+}
+
+
+static bool IsSmpSequenceStart(uint8_t code_unit) {
+ // Check if the code unit starts a sequence for a code point >= U+10000.
+ return (code_unit >= 0xF0);
+}
+
+
// Returns true if the code point is a high- or low-surrogate.
static bool IsSurrogate(uint32_t code_point) {
return (code_point & 0xfffff800) == 0xd800;
@@ -66,7 +78,7 @@
// Returns true if the code point value is above Plane 17.
static bool IsOutOfRange(uint32_t code_point) {
- return code_point > 0x10FFFF;
+ return (code_point > 0x10FFFF);
}
@@ -76,47 +88,51 @@
}
+void Utf8::ConvertUTF32ToUTF16(int32_t codepoint, uint16_t* dst) {
+ ASSERT(codepoint > kMaxBmpCodepoint);
+ ASSERT(dst != NULL);
+ dst[0] = (Utf8::kLeadOffset + (codepoint >> 10));
+ dst[1] = (0xDC00 + (codepoint & 0x3FF));
+}
+
+
// Returns the number of non-trail UTF-8 bytes, counting each 4-byte lead twice.
-intptr_t Utf8::CodePointCount(const char* str, intptr_t* width) {
- bool is_two_byte_string = false;
- bool is_four_byte_string = false;
+intptr_t Utf8::CodePointCount(const uint8_t* utf8_array,
+ intptr_t array_len,
+ Type* type) {
intptr_t len = 0;
- for (; *str != '\0'; ++str) {
- uint8_t code_unit = *str;
+ Type char_type = kAscii;
+ for (intptr_t i = 0; i < array_len; i++) {
+ uint8_t code_unit = utf8_array[i];
if (!IsTrailByte(code_unit)) {
++len;
}
- if (code_unit > 0xC3) { // > U+00FF
- if (code_unit < 0xF0) { // < U+10000
- is_two_byte_string = true;
- } else {
- is_four_byte_string = true;
+ if (!IsAsciiSequenceStart(code_unit)) { // > U+007F
+ if (IsSmpSequenceStart(code_unit)) { // >= U+10000
+ char_type = kSMP;
+ ++len;
+ } else if (char_type == kAscii) {
+ char_type = kBMP;
}
}
}
- if (is_four_byte_string) {
- *width = 4;
- } else if (is_two_byte_string) {
- *width = 2;
- } else {
- *width = 1;
- }
+ *type = char_type;
return len;
}
// Returns true if the first array_len bytes of utf8_array are valid UTF-8.
-bool Utf8::IsValid(const char* str) {
+bool Utf8::IsValid(const uint8_t* utf8_array, intptr_t array_len) {
intptr_t i = 0;
- while (str[i] != '\0') {
- uint32_t ch = str[i] & 0xFF;
+ while (i < array_len) {
+ uint32_t ch = utf8_array[i] & 0xFF;
intptr_t j = 1;
if (ch >= 0x80) {
- uint8_t num_trail_bytes = kTrailBytes[ch];
+ int8_t num_trail_bytes = kTrailBytes[ch];
bool is_malformed = false;
for (; j < num_trail_bytes; ++j) {
- if (str[i + j] != '\0') {
- uint8_t code_unit = str[i + j];
+ if ((i + j) < array_len) {
+ uint8_t code_unit = utf8_array[i + j];
is_malformed |= !IsTrailByte(code_unit);
ch = (ch << 6) + code_unit;
} else {
@@ -202,15 +218,17 @@
}
-intptr_t Utf8::Decode(const char* src, int32_t* dst) {
- uint32_t ch = src[0] & 0xFF;
- uint32_t i = 1;
+intptr_t Utf8::Decode(const uint8_t* utf8_array,
+ intptr_t array_len,
+ int32_t* dst) {
+ uint32_t ch = utf8_array[0] & 0xFF;
+ intptr_t i = 1;
if (ch >= 0x80) {
- uint32_t num_trail_bytes = kTrailBytes[ch];
+ int32_t num_trail_bytes = kTrailBytes[ch];
bool is_malformed = false;
for (; i < num_trail_bytes; ++i) {
- if (src[i] != '\0') {
- uint8_t code_unit = src[i];
+ if (i < array_len) {
+ uint8_t code_unit = utf8_array[i];
is_malformed |= !IsTrailByte(code_unit);
ch = (ch << 6) + code_unit;
} else {
@@ -233,38 +251,70 @@
}
-template<typename T>
-static bool DecodeImpl(const char* src, T* dst, intptr_t len) {
+bool Utf8::DecodeToAscii(const uint8_t* utf8_array,
+ intptr_t array_len,
+ uint8_t* dst,
+ intptr_t len) {
+ if (len < array_len) {
+ return false; // output overflow
+ }
+#ifdef DEBUG
+ for (intptr_t i = 0; i < array_len; i++) {
+ ASSERT(IsAsciiSequenceStart(utf8_array[i]));
+ }
+#endif
+ memmove(dst, utf8_array, array_len);
+ return true; // success
+}
+
+
+bool Utf8::DecodeToUTF16(const uint8_t* utf8_array,
+ intptr_t array_len,
+ uint16_t* dst,
+ intptr_t len) {
intptr_t i = 0;
intptr_t j = 0;
intptr_t num_bytes;
- for (; src[i] != '\0' && j < len; i += num_bytes, ++j) {
+ for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
int32_t ch;
- num_bytes = Utf8::Decode(&src[i], &ch);
+ bool is_smp = IsSmpSequenceStart(utf8_array[i]);
+ num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
if (ch == -1) {
return false; // invalid input
}
- dst[j] = ch;
+ if (is_smp) {
+ ConvertUTF32ToUTF16(ch, &(dst[j]));
+ j = j + 1;
+ } else {
+ dst[j] = ch;
+ }
}
- if (src[i] != '\0' && j == len) {
+ if ((i < array_len) && (j == len)) {
return false; // output overflow
}
return true; // success
}
-bool Utf8::Decode(const char* src, uint8_t* dst, intptr_t len) {
- return DecodeImpl(src, dst, len);
+bool Utf8::DecodeToUTF32(const uint8_t* utf8_array,
+ intptr_t array_len,
+ uint32_t* dst,
+ intptr_t len) {
+ intptr_t i = 0;
+ intptr_t j = 0;
+ intptr_t num_bytes;
+ for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
+ int32_t ch;
+ num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
+ if (ch == -1) {
+ return false; // invalid input
+ }
+ dst[j] = ch;
+ }
+ if ((i < array_len) && (j == len)) {
+ return false; // output overflow
+ }
+ return true; // success
}
-
-bool Utf8::Decode(const char* src, uint16_t* dst, intptr_t len) {
- return DecodeImpl(src, dst, len);
-}
-
-
-bool Utf8::Decode(const char* src, uint32_t* dst, intptr_t len) {
- return DecodeImpl(src, dst, len);
-}
-
} // namespace dart

Powered by Google App Engine
This is Rietveld 408576698