runtime/vm/object.cc - Issue 11368138: Add some support for the code-point code-unit distinction.

Unified Diff: runtime/vm/object.cc

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) — Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Implemented feedback from patch set 3. Created 8 years, 1 month ago.

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: runtime/vm/object.cc

diff --git a/runtime/vm/object.cc b/runtime/vm/object.cc

index 5dc09dd7e5c5e4afedd5e9abae89ad005309f24a..796024d9e07f7189e476060de1891c515b01756f 100644

--- a/runtime/vm/object.cc

+++ b/runtime/vm/object.cc

@@ -2235,7 +2235,7 @@ static bool MatchesAccessorName(const String& name,

return false;

}

for (intptr_t i = 0; i < prefix_length; i++) {

- if (name.CharAt(i) != prefix[i]) {

+ if (name.CharAt(i) != static_cast<int32_t>(prefix[i])) {

return false;

}

@@ -9163,8 +9163,7 @@ const char* Integer::ToCString() const {

RawInteger* Integer::New(const String& str, Heap::Space space) {

- // We are not supposed to have integers represented as two byte or

- // four byte strings.

+ // We are not supposed to have integers represented as two byte strings.

ASSERT(str.IsOneByteString());

int64_t value;

if (!OS::StringToInt64(str.ToCString(), &value)) {

@@ -9927,7 +9926,7 @@ intptr_t String::Hash(const uint16_t* characters, intptr_t len) {

}

-intptr_t String::Hash(const uint32_t* characters, intptr_t len) {

+intptr_t String::Hash(const int32_t* characters, intptr_t len) {

return HashImpl(characters, len);

}

@@ -10002,8 +10001,13 @@ bool String::Equals(const char* str) const {

intptr_t consumed = Utf8::Decode(reinterpret_cast<const uint8_t*>(str),

len,

&ch);

- if (consumed == 0 || this->CharAt(i) != ch) {

- return false;

+ if (consumed == 0) return false;

+ if (ch <= Utf16::kMaxCodeUnit) {

+ if (this->CharAt(i) != ch) return false;

+ } else {

+ if (Utf16::CodePointAt(*this, i) != ch) return false;

+ i++;

}

str += consumed;

len -= consumed;

@@ -10042,32 +10046,19 @@ bool String::Equals(const uint16_t* characters, intptr_t len) const {

}

-bool String::Equals(const uint32_t* characters, intptr_t len) const {

- if (len != this->Length()) {

- // Lengths don't match.

- return false;

- }

- for (intptr_t i = 0; i < len; i++) {

- if (this->CharAt(i) != static_cast<int32_t>(characters[i])) {

- return false;

- }

- return true;

intptr_t String::CompareTo(const String& other) const {

const intptr_t this_len = this->Length();

const intptr_t other_len = other.IsNull() ? 0 : other.Length();

const intptr_t len = (this_len < other_len) ? this_len : other_len;

+ // UTF-16 has the high surrogate before the low surrogate so we can compare

+ // one code unit at a time for efficiency and still get the right ordering.

for (intptr_t i = 0; i < len; i++) {

- int32_t this_code_point = this->CharAt(i);

- int32_t other_code_point = other.CharAt(i);

- if (this_code_point < other_code_point) {

+ int32_t this_code_unit = this->CharAt(i);

+ int32_t other_code_unit = other.CharAt(i);

+ if (this_code_unit < other_code_unit) {

return -1;

}

- if (this_code_point > other_code_point) {

+ if (this_code_unit > other_code_unit) {

return 1;

}

@@ -10111,7 +10102,7 @@ RawString* String::New(const uint8_t* utf8_array,

intptr_t array_len,

Heap::Space space) {

Utf8::Type type;

- intptr_t len = Utf8::CodePointCount(utf8_array, array_len, &type);

+ intptr_t len = Utf8::CodeUnitCount(utf8_array, array_len, &type);

if (type == Utf8::kLatin1) {

const String& strobj = String::Handle(OneByteString::New(len, space));

if (len > 0) {

@@ -10147,7 +10138,7 @@ RawString* String::New(const uint16_t* utf16_array,

}

-RawString* String::New(const uint32_t* utf32_array,

+RawString* String::New(const int32_t* utf32_array,

intptr_t array_len,

Heap::Space space) {

bool is_one_byte_string = true;

@@ -10513,23 +10504,37 @@ RawString* String::Transform(int32_t (*mapping)(int32_t ch),

bool has_mapping = false;

int32_t dst_max = 0;

intptr_t len = str.Length();

+ intptr_t out_len = 0;

// TODO(cshapiro): assume a transform is required, rollback if not.

- for (intptr_t i = 0; i < len; ++i) {

+ intptr_t i = 0;

+ for (; i < len; ++i) {

int32_t src = str.CharAt(i);

+ if (Utf16::IsSurrogate(src)) break;

+ int32_t dst = mapping(src);

+ if (src != dst) {

+ has_mapping = true;

+ }

+ dst_max = Utils::Maximum(dst_max, dst);

+ out_len += dst > Utf16::kMaxCodeUnit ? 2 : 1;

+ }

+ for (; i < len; ++i) {

+ int32_t src = Utf16::CodePointAt(str, i);

int32_t dst = mapping(src);

if (src != dst) {

has_mapping = true;

}

dst_max = Utils::Maximum(dst_max, dst);

+ out_len += dst > Utf16::kMaxCodeUnit ? 2 : 1;

+ if (src > Utf16::kMaxCodeUnit) ++i;

}

if (!has_mapping) {

return str.raw();

}

if (dst_max <= 0xFF) {

- return OneByteString::Transform(mapping, str, space);

+ return OneByteString::Transform(mapping, str, out_len, space);

}

ASSERT(dst_max > 0xFF);

- return TwoByteString::Transform(mapping, str, space);

+ return TwoByteString::Transform(mapping, str, out_len, space);

}

@@ -10683,7 +10688,7 @@ RawOneByteString* OneByteString::New(const uint16_t* characters,

}

-RawOneByteString* OneByteString::New(const uint32_t* characters,

+RawOneByteString* OneByteString::New(const int32_t* characters,

intptr_t len,

Heap::Space space) {

const String& result = String::Handle(OneByteString::New(len, space));

@@ -10736,14 +10741,18 @@ RawOneByteString* OneByteString::ConcatAll(const Array& strings,

RawOneByteString* OneByteString::Transform(int32_t (*mapping)(int32_t ch),

const String& str,

+ intptr_t out_length,

Heap::Space space) {

ASSERT(!str.IsNull());

intptr_t len = str.Length();

- const String& result = String::Handle(OneByteString::New(len, space));

- for (intptr_t i = 0; i < len; ++i) {

- int32_t ch = mapping(str.CharAt(i));

- ASSERT(ch >= 0 && ch <= 0xFF);

- *CharAddr(result, i) = ch;

+ const String& result =

+ String::Handle(OneByteString::New(out_length, space));

+ for (intptr_t i = 0, j = 0; i < len; ++i, j++) {

+ int32_t old_ch = str.CharAt(i);

+ if (old_ch > Utf16::kMaxCodeUnit) i++;

+ int32_t ch = mapping(old_ch);

+ ASSERT(ch <= 0xFF);

+ *CharAddr(result, j) = ch;

}

return OneByteString::raw(result);

}

@@ -10818,7 +10827,7 @@ RawTwoByteString* TwoByteString::New(const uint16_t* utf16_array,

RawTwoByteString* TwoByteString::New(intptr_t utf16_len,

- const uint32_t* utf32_array,

+ const int32_t* utf32_array,

intptr_t array_len,

Heap::Space space) {

ASSERT((array_len > 0) && (utf16_len >= array_len));

@@ -10827,9 +10836,11 @@ RawTwoByteString* TwoByteString::New(intptr_t utf16_len,

NoGCScope no_gc;

intptr_t j = 0;

for (intptr_t i = 0; i < array_len; ++i) {

- if (utf32_array[i] > 0xffff) {

+ int32_t code_point = utf32_array[i];

+ if (code_point > Utf16::kMaxCodeUnit) {

ASSERT(j < (utf16_len - 1));

- Utf8::ConvertUTF32ToUTF16(utf32_array[i], CharAddr(result, j));

+ *CharAddr(result, j) = Utf16::LeadFromCodePoint(code_point);

+ *CharAddr(result, j + 1) = Utf16::TrailFromCodePoint(code_point);

j += 2;

} else {

ASSERT(j < utf16_len);

@@ -10883,14 +10894,25 @@ RawTwoByteString* TwoByteString::ConcatAll(const Array& strings,

RawTwoByteString* TwoByteString::Transform(int32_t (*mapping)(int32_t ch),

const String& str,

+ intptr_t out_length,

Heap::Space space) {

ASSERT(!str.IsNull());

intptr_t len = str.Length();

- const String& result = String::Handle(TwoByteString::New(len, space));

- for (intptr_t i = 0; i < len; ++i) {

- int32_t ch = mapping(str.CharAt(i));

- ASSERT(ch >= 0 && ch <= 0xFFFF);

- *CharAddr(result, i) = ch;

+ const String& result =

+ String::Handle(TwoByteString::New(out_length, space));

+ for (intptr_t i = 0, j = 0; i < len; ++i, j++) {

+ int32_t old_ch = Utf16::CodePointAt(str, i);

+ if (old_ch > Utf16::kMaxCodeUnit) i++;

+ int32_t ch = mapping(old_ch);

+ ASSERT(ch <= Utf16::kMaxCodePoint);

+ if (ch <= Utf16::kMaxCodeUnit) {

+ *CharAddr(result, j) = ch;

+ } else {

+ *CharAddr(result, j) = Utf16::LeadFromCodePoint(ch);

+ *CharAddr(result, j + 1) = Utf16::TrailFromCodePoint(ch);

+ ++j;

+ }

+ ASSERT(j <= out_length);

}

return TwoByteString::raw(result);

}

« no previous file with comments | « runtime/vm/object.h ('k') | runtime/vm/object_test.cc » ('j') | runtime/vm/scanner.cc » ('J')