Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1489)

Unified Diff: runtime/vm/object.cc

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Implemented feedback from patch set 2. Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: runtime/vm/object.cc
diff --git a/runtime/vm/object.cc b/runtime/vm/object.cc
index 8e1ad649c9101ed396520087d9e4d5edd0e3d118..cea047f4d87ca273c2e84ec6596fd0932d576984 100644
--- a/runtime/vm/object.cc
+++ b/runtime/vm/object.cc
@@ -2187,7 +2187,7 @@ static bool MatchesAccessorName(const String& name,
return false;
}
for (intptr_t i = 0; i < prefix_length; i++) {
- if (name.CharAt(i) != prefix[i]) {
+ if (name.CharAt(i) != static_cast<int32_t>(prefix[i])) {
return false;
}
}
@@ -9882,7 +9882,7 @@ intptr_t String::Hash(const uint16_t* characters, intptr_t len) {
}
-intptr_t String::Hash(const uint32_t* characters, intptr_t len) {
+intptr_t String::Hash(const int32_t* characters, intptr_t len) {
return HashImpl(characters, len);
}
@@ -9892,16 +9892,16 @@ int32_t String::CharAt(intptr_t index) const {
ASSERT(RawObject::IsStringClassId(class_id));
NoGCScope no_gc;
if (class_id == kOneByteStringCid) {
- return *OneByteString::CharAddr(*this, index);
+ return OneByteString::CharAt(*this, index);
cshapiro 2012/11/17 02:25:27 Why was this changed? I think the code on the rig […]
erikcorry 2012/11/19 12:40:41 Reverted.
}
if (class_id == kTwoByteStringCid) {
- return *TwoByteString::CharAddr(*this, index);
+ return TwoByteString::CharAt(*this, index);
}
if (class_id == kExternalOneByteStringCid) {
- return *ExternalOneByteString::CharAddr(*this, index);
+ return ExternalOneByteString::CharAt(*this, index);
}
ASSERT(class_id == kExternalTwoByteStringCid);
- return *ExternalTwoByteString::CharAddr(*this, index);
+ return ExternalTwoByteString::CharAt(*this, index);
}
@@ -9957,8 +9957,13 @@ bool String::Equals(const char* str) const {
intptr_t consumed = Utf8::Decode(reinterpret_cast<const uint8_t*>(str),
len,
&ch);
- if (consumed == 0 || this->CharAt(i) != ch) {
- return false;
+ if (consumed == 0) return false;
+
+ if (ch <= Utf16::kMaxCodeUnit) {
+ if (this->CharAt(i) != ch) return false;
+ } else {
+ if (Utf16::CodePointAt(*this, i) != ch) return false;
+ i++;
}
str += consumed;
len -= consumed;
@@ -9997,16 +10002,18 @@ bool String::Equals(const uint16_t* characters, intptr_t len) const {
}
-bool String::Equals(const uint32_t* characters, intptr_t len) const {
+bool String::Equals(const int32_t* characters, intptr_t len) const {
if (len != this->Length()) {
// Lengths don't match.
return false;
}
for (intptr_t i = 0; i < len; i++) {
- if (this->CharAt(i) != static_cast<int32_t>(characters[i])) {
+ int32_t c = this->CharAt(i);
cshapiro 2012/11/17 02:25:27 This also looks like noise.
erikcorry 2012/11/19 12:40:41 This function was an attempt to compare a UTF-32 s […]
+ if (c != characters[i]) {
return false;
}
+ if (c > Utf16::kMaxCodeUnit) i++;
cshapiro 2012/11/17 02:25:27 I am confused about this line. A comment would be […]
erikcorry 2012/11/19 12:40:41 Ditto
}
siva 2012/11/16 22:32:04 I don't quite get this implementation.
erikcorry 2012/11/19 12:40:41 Ditto.
return true;
}
@@ -10016,13 +10023,15 @@ intptr_t String::CompareTo(const String& other) const {
const intptr_t this_len = this->Length();
const intptr_t other_len = other.IsNull() ? 0 : other.Length();
const intptr_t len = (this_len < other_len) ? this_len : other_len;
+ // UTF-16 has the high surrogate before the low surrogate so we can compare
+ // one code unit at a time for efficiency and still get the right ordering.
for (intptr_t i = 0; i < len; i++) {
- int32_t this_code_point = this->CharAt(i);
- int32_t other_code_point = other.CharAt(i);
- if (this_code_point < other_code_point) {
+ int32_t this_code_unit = this->CharAt(i);
+ int32_t other_code_unit = other.CharAt(i);
+ if (this_code_unit < other_code_unit) {
return -1;
}
- if (this_code_point > other_code_point) {
+ if (this_code_unit > other_code_unit) {
return 1;
}
}
@@ -10066,7 +10075,7 @@ RawString* String::New(const uint8_t* utf8_array,
intptr_t array_len,
Heap::Space space) {
Utf8::Type type;
- intptr_t len = Utf8::CodePointCount(utf8_array, array_len, &type);
+ intptr_t len = Utf8::CodeUnitCount(utf8_array, array_len, &type);
if (type == Utf8::kAscii) {
const String& strobj = String::Handle(OneByteString::New(len, space));
if (len > 0) {
@@ -10102,7 +10111,7 @@ RawString* String::New(const uint16_t* utf16_array,
}
-RawString* String::New(const uint32_t* utf32_array,
+RawString* String::New(const int32_t* utf32_array,
intptr_t array_len,
Heap::Space space) {
bool is_one_byte_string = true;
@@ -10391,23 +10400,37 @@ RawString* String::Transform(int32_t (*mapping)(int32_t ch),
bool has_mapping = false;
int32_t dst_max = 0;
intptr_t len = str.Length();
+ intptr_t out_len = 0;
// TODO(cshapiro): assume a transform is required, rollback if not.
- for (intptr_t i = 0; i < len; ++i) {
+ intptr_t i = 0;
+ for (; i < len; ++i) {
int32_t src = str.CharAt(i);
+ if (Utf16::IsSurrogate(src)) break;
int32_t dst = mapping(src);
if (src != dst) {
has_mapping = true;
}
dst_max = Utils::Maximum(dst_max, dst);
+ out_len += dst > Utf16::kMaxCodeUnit ? 2 : 1;
+ }
+ for (; i < len; ++i) {
+ int32_t src = Utf16::CodePointAt(str, i);
+ int32_t dst = mapping(src);
+ if (src != dst) {
+ has_mapping = true;
+ }
+ dst_max = Utils::Maximum(dst_max, dst);
+ out_len += dst > Utf16::kMaxCodeUnit ? 2 : 1;
+ if (src > Utf16::kMaxCodeUnit) ++i;
}
if (!has_mapping) {
return str.raw();
}
if (dst_max <= 0x7F) {
- return OneByteString::Transform(mapping, str, space);
+ return OneByteString::Transform(mapping, str, out_len, space);
}
ASSERT(dst_max > 0x7F);
- return TwoByteString::Transform(mapping, str, space);
+ return TwoByteString::Transform(mapping, str, out_len, space);
}
@@ -10561,7 +10584,7 @@ RawOneByteString* OneByteString::New(const uint16_t* characters,
}
-RawOneByteString* OneByteString::New(const uint32_t* characters,
+RawOneByteString* OneByteString::New(const int32_t* characters,
intptr_t len,
Heap::Space space) {
const String& result = String::Handle(OneByteString::New(len, space));
@@ -10614,14 +10637,18 @@ RawOneByteString* OneByteString::ConcatAll(const Array& strings,
RawOneByteString* OneByteString::Transform(int32_t (*mapping)(int32_t ch),
const String& str,
+ int out_length,
Heap::Space space) {
ASSERT(!str.IsNull());
intptr_t len = str.Length();
- const String& result = String::Handle(OneByteString::New(len, space));
- for (intptr_t i = 0; i < len; ++i) {
- int32_t ch = mapping(str.CharAt(i));
- ASSERT(ch >= 0 && ch <= 0x7F);
- *CharAddr(result, i) = ch;
+ const String& result =
+ String::Handle(OneByteString::New(out_length, space));
+ for (intptr_t i = 0, j = 0; i < len; ++i, j++) {
+ int32_t old_ch = str.CharAt(i);
+ if (old_ch > Utf16::kMaxCodeUnit) i++;
+ int32_t ch = mapping(old_ch);
+ ASSERT(ch <= 0x7F);
+ *CharAddr(result, j) = ch;
}
return OneByteString::raw(result);
}
@@ -10696,7 +10723,7 @@ RawTwoByteString* TwoByteString::New(const uint16_t* utf16_array,
RawTwoByteString* TwoByteString::New(intptr_t utf16_len,
- const uint32_t* utf32_array,
+ const int32_t* utf32_array,
intptr_t array_len,
Heap::Space space) {
ASSERT((array_len > 0) && (utf16_len >= array_len));
@@ -10705,9 +10732,11 @@ RawTwoByteString* TwoByteString::New(intptr_t utf16_len,
NoGCScope no_gc;
intptr_t j = 0;
for (intptr_t i = 0; i < array_len; ++i) {
- if (utf32_array[i] > 0xffff) {
+ int32_t code_point = utf32_array[i];
+ if (code_point > Utf16::kMaxCodeUnit) {
ASSERT(j < (utf16_len - 1));
- Utf8::ConvertUTF32ToUTF16(utf32_array[i], CharAddr(result, j));
+ *CharAddr(result, j) = Utf16::LeadFromCodePoint(code_point);
+ *CharAddr(result, j + 1) = Utf16::TrailFromCodePoint(code_point);
j += 2;
} else {
ASSERT(j < utf16_len);
@@ -10761,14 +10790,25 @@ RawTwoByteString* TwoByteString::ConcatAll(const Array& strings,
RawTwoByteString* TwoByteString::Transform(int32_t (*mapping)(int32_t ch),
const String& str,
+ int out_length,
Heap::Space space) {
ASSERT(!str.IsNull());
intptr_t len = str.Length();
- const String& result = String::Handle(TwoByteString::New(len, space));
- for (intptr_t i = 0; i < len; ++i) {
- int32_t ch = mapping(str.CharAt(i));
- ASSERT(ch >= 0 && ch <= 0xFFFF);
- *CharAddr(result, i) = ch;
+ const String& result =
+ String::Handle(TwoByteString::New(out_length, space));
+ for (intptr_t i = 0, j = 0; i < len; ++i, j++) {
+ int32_t old_ch = Utf16::CodePointAt(str, i);
+ if (old_ch > Utf16::kMaxCodeUnit) i++;
+ int32_t ch = mapping(old_ch);
+ ASSERT(ch <= Utf16::kMaxCodePoint);
siva 2012/11/16 22:32:04 We assert here that ch <= Utf16::kMaxCodePoint but […]
erikcorry 2012/11/19 12:40:41 No, the assert checks against kMaxCodePoint (0x10f […]
+ if (ch <= Utf16::kMaxCodeUnit) {
+ *CharAddr(result, j) = ch;
+ } else {
+ *CharAddr(result, j) = Utf16::LeadFromCodePoint(ch);
+ *CharAddr(result, j + 1) = Utf16::TrailFromCodePoint(ch);
+ ++j;
+ }
+ ASSERT(j <= out_length);
}
return TwoByteString::raw(result);
}

Powered by Google App Engine
This is Rietveld 408576698