runtime/vm/object.cc - Issue 11368138: Add some support for the code-point code-unit distinction.

Unified Diff: runtime/vm/object.cc

Issue 11368138: Add some support for the code-point code-unit distinction. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: New version integrates feedback, adds less to standard String class. Created 8 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: runtime/vm/object.cc

diff --git a/runtime/vm/object.cc b/runtime/vm/object.cc

index 8e1ad649c9101ed396520087d9e4d5edd0e3d118..0758bcae27db294f864701b9bacaaedff7a00119 100644

--- a/runtime/vm/object.cc

+++ b/runtime/vm/object.cc

@@ -141,16 +141,16 @@ static RawString* IdentifierPrettyName(const String& name) {

bool is_setter = false;

for (int i = 0; i < name.Length(); i++) {

- if (name.CharAt(i) == ':') {

+ if (name.CodeUnitAt(i) == ':') {

cshapiro 2012/11/15 20:14:51 This file is an example of why we would like the r… [comment preview truncated]

erikcorry 2012/11/15 23:47:05 Done.

ASSERT(start == 0);

- if (name.CharAt(0) == 's') {

+ if (name.CodeUnitAt(0) == 's') {

is_setter = true;

}

start = i + 1;

- } else if (name.CharAt(i) == '@') {

+ } else if (name.CodeUnitAt(i) == '@') {

ASSERT(at_pos == len);

at_pos = i;

- } else if (name.CharAt(i) == '.') {

+ } else if (name.CodeUnitAt(i) == '.') {

dot_pos = i;

break;

}

@@ -168,7 +168,7 @@ static RawString* IdentifierPrettyName(const String& name) {

// "_ReceivePortImpl@6be832b._internal@6be832b".

at_pos = len;

for (int i = dot_pos; i < name.Length(); i++) {

- if (name.CharAt(i) == '@') {

+ if (name.CodeUnitAt(i) == '@') {

ASSERT(at_pos == len);

at_pos = i;

}

@@ -489,7 +489,7 @@ void Object::RegisterClass(const Class& cls,

const String& name,

const Library& lib) {

ASSERT(name.Length() > 0);

- ASSERT(name.CharAt(0) != '_');

+ ASSERT(name.CodeUnitAt(0) != '_');

cls.set_name(name);

lib.AddClass(cls);

}

@@ -499,7 +499,7 @@ void Object::RegisterPrivateClass(const Class& cls,

const String& public_class_name,

const Library& lib) {

ASSERT(public_class_name.Length() > 0);

- ASSERT(public_class_name.CharAt(0) == '_');

+ ASSERT(public_class_name.CodeUnitAt(0) == '_');

String& str = String::Handle();

str = lib.PrivateName(public_class_name);

cls.set_name(str);

@@ -2187,12 +2187,12 @@ static bool MatchesAccessorName(const String& name,

return false;

}

for (intptr_t i = 0; i < prefix_length; i++) {

- if (name.CharAt(i) != prefix[i]) {

+ if (name.CodeUnitAt(i) != static_cast<uint32_t>(prefix[i])) {

return false;

}

for (intptr_t i = 0, j = prefix_length; i < accessor_name_len; i++, j++) {

- if (name.CharAt(j) != accessor_name.CharAt(i)) {

+ if (name.CodeUnitAt(j) != accessor_name.CodeUnitAt(i)) {

return false;

}

@@ -4497,13 +4497,13 @@ RawString* TokenStream::GenerateSource() const {

bool is_raw_string = false;

bool escape_characters = false;

for (intptr_t i = 0; i < literal.Length(); i++) {

- if (IsSpecialCharacter(literal.CharAt(i))) {

+ if (IsSpecialCharacter(literal.CodeUnitAt(i))) {

escape_characters = true;

}

// TODO(4995): Temp solution for raw strings, this will break

// if we saw a string that is not a raw string but has back slashes

// in it.

- if ((literal.CharAt(i) == '\\')) {

+ if ((literal.CodeUnitAt(i) == '\\')) {

if ((next != Token::kINTERPOL_VAR) &&

(next != Token::kINTERPOL_START) &&

(prev != Token::kINTERPOL_VAR) &&

@@ -4531,12 +4531,12 @@ RawString* TokenStream::GenerateSource() const {

}

} else if (curr == Token::kINTERPOL_VAR) {

literals.Add(dollar);

- if (literal.CharAt(0) == Scanner::kPrivateIdentifierStart) {

+ if (literal.CodeUnitAt(0) == Scanner::kPrivateIdentifierStart) {

literal = String::SubString(literal, 0, literal.Length() - private_len);

}

literals.Add(literal);

} else if (curr == Token::kIDENT) {

- if (literal.CharAt(0) == Scanner::kPrivateIdentifierStart) {

+ if (literal.CodeUnitAt(0) == Scanner::kPrivateIdentifierStart) {

literal = String::SubString(literal, 0, literal.Length() - private_len);

}

literals.Add(literal);

@@ -5080,10 +5080,10 @@ RawString* Script::GetLine(intptr_t line_number) const {

if ((current_line == line_number) && (line_start < 0)) {

line_start = ix;

}

- if (src.CharAt(ix) == '\n') {

+ if (src.CodeUnitAt(ix) == '\n') {

current_line++;

- } else if (src.CharAt(ix) == '\r') {

- if ((ix + 1 != src.Length()) && (src.CharAt(ix + 1) != '\n')) {

+ } else if (src.CodeUnitAt(ix) == '\r') {

+ if ((ix + 1 != src.Length()) && (src.CodeUnitAt(ix + 1) != '\n')) {

current_line++;

}

} else {

@@ -5110,7 +5110,7 @@ RawString* Script::GetSnippet(intptr_t from_line,

intptr_t lookahead = 0;

intptr_t snippet_start = -1;

intptr_t snippet_end = -1;

- char c = src.CharAt(lookahead);

+ char c = src.CodeUnitAt(lookahead);

while (lookahead != length) {

if (snippet_start == -1) {

if ((line == from_line) && (column == from_column)) {

@@ -5128,13 +5128,13 @@ RawString* Script::GetSnippet(intptr_t from_line,

lookahead++;

if (lookahead != length) {

// Replace '\r' with '\n' and a sequence of '\r' '\n' with a single '\n'.

- if (src.CharAt(lookahead) == '\r') {

+ if (src.CodeUnitAt(lookahead) == '\r') {

c = '\n';

- if (lookahead + 1 != length && src.CharAt(lookahead) == '\n') {

+ if (lookahead + 1 != length && src.CodeUnitAt(lookahead) == '\n') {

lookahead++;

}

} else {

- c = src.CharAt(lookahead);

+ c = src.CodeUnitAt(lookahead);

}

@@ -5534,13 +5534,13 @@ RawObject* Library::LookupLocalObject(const String& name) const {

static bool ShouldBePrivate(const String& name) {

return

(name.Length() >= 1 &&

- name.CharAt(0) == '_') ||

+ name.CodeUnitAt(0) == '_') ||

(name.Length() >= 5 &&

- (name.CharAt(4) == '_' &&

- (name.CharAt(0) == 'g' || name.CharAt(0) == 's') &&

- name.CharAt(1) == 'e' &&

- name.CharAt(2) == 't' &&

- name.CharAt(3) == ':'));

+ (name.CodeUnitAt(4) == '_' &&

+ (name.CodeUnitAt(0) == 'g' || name.CodeUnitAt(0) == 's') &&

+ name.CodeUnitAt(1) == 'e' &&

+ name.CodeUnitAt(2) == 't' &&

+ name.CodeUnitAt(3) == ':'));

}

@@ -9855,7 +9855,7 @@ intptr_t String::Hash(const String& str, intptr_t begin_index, intptr_t len) {

ASSERT((begin_index + len) <= str.Length());

StringHasher hasher;

for (intptr_t i = 0; i < len; i++) {

- hasher.Add(str.CharAt(begin_index + i));

+ hasher.Add(str.CodeUnitAt(begin_index + i));

}

return hasher.Finalize(String::kHashBits);

}

@@ -9887,21 +9887,21 @@ intptr_t String::Hash(const uint32_t* characters, intptr_t len) {

}

-int32_t String::CharAt(intptr_t index) const {

+uint32_t String::CodeUnitAt(intptr_t index) const {

intptr_t class_id = raw()->GetClassId();

ASSERT(RawObject::IsStringClassId(class_id));

NoGCScope no_gc;

if (class_id == kOneByteStringCid) {

- return *OneByteString::CharAddr(*this, index);

+ return OneByteString::CodeUnitAt(*this, index);

}

if (class_id == kTwoByteStringCid) {

- return *TwoByteString::CharAddr(*this, index);

+ return TwoByteString::CodeUnitAt(*this, index);

}

if (class_id == kExternalOneByteStringCid) {

- return *ExternalOneByteString::CharAddr(*this, index);

+ return ExternalOneByteString::CodeUnitAt(*this, index);

}

ASSERT(class_id == kExternalTwoByteStringCid);

- return *ExternalTwoByteString::CharAddr(*this, index);

+ return ExternalTwoByteString::CodeUnitAt(*this, index);

}

@@ -9953,12 +9953,17 @@ bool String::Equals(const char* str) const {

// Lengths don't match.

return false;

}

- int32_t ch;

+ uint32_t ch;

intptr_t consumed = Utf8::Decode(reinterpret_cast<const uint8_t*>(str),

len,

&ch);

- if (consumed == 0 || this->CharAt(i) != ch) {

- return false;

+ if (consumed == 0) return false;

+ if (ch <= Utf16::kMaxCodeUnit) {

+ if (this->CodeUnitAt(i) != ch) return false;

+ } else {

+ if (Utf16::CodePointAt(*this, i) != ch) return false;

+ i++;

}

str += consumed;

len -= consumed;

@@ -9974,7 +9979,7 @@ bool String::Equals(const uint8_t* characters, intptr_t len) const {

}

for (intptr_t i = 0; i < len; i++) {

- if (this->CharAt(i) != characters[i]) {

+ if (this->CodeUnitAt(i) != characters[i]) {

return false;

}

@@ -9989,7 +9994,7 @@ bool String::Equals(const uint16_t* characters, intptr_t len) const {

}

for (intptr_t i = 0; i < len; i++) {

- if (this->CharAt(i) != characters[i]) {

+ if (this->CodeUnitAt(i) != characters[i]) {

return false;

}

@@ -10004,9 +10009,11 @@ bool String::Equals(const uint32_t* characters, intptr_t len) const {

}

for (intptr_t i = 0; i < len; i++) {

- if (this->CharAt(i) != static_cast<int32_t>(characters[i])) {

+ uint32_t c = this->CodeUnitAt(i);

+ if (c != characters[i]) {

return false;

}

+ if (c > Utf16::kMaxCodeUnit) i++;

}

return true;

}

@@ -10016,13 +10023,15 @@ intptr_t String::CompareTo(const String& other) const {

const intptr_t this_len = this->Length();

const intptr_t other_len = other.IsNull() ? 0 : other.Length();

const intptr_t len = (this_len < other_len) ? this_len : other_len;

+ // UTF-16 has the high surrogate before the low surrogate so we can compare

+ // one code unit at a time for efficiency and still get the right ordering.

for (intptr_t i = 0; i < len; i++) {

- int32_t this_code_point = this->CharAt(i);

- int32_t other_code_point = other.CharAt(i);

- if (this_code_point < other_code_point) {

+ int32_t this_code_unit = this->CodeUnitAt(i);

+ int32_t other_code_unit = other.CodeUnitAt(i);

+ if (this_code_unit < other_code_unit) {

return -1;

}

- if (this_code_point > other_code_point) {

+ if (this_code_unit > other_code_unit) {

return 1;

}

@@ -10038,7 +10047,7 @@ bool String::StartsWith(const String& other) const {

}

intptr_t slen = other.Length();

for (int i = 0; i < slen; i++) {

- if (this->CharAt(i) != other.CharAt(i)) {

+ if (this->CodeUnitAt(i) != other.CodeUnitAt(i)) {

return false;

}

@@ -10066,7 +10075,7 @@ RawString* String::New(const uint8_t* utf8_array,

intptr_t array_len,

Heap::Space space) {

Utf8::Type type;

- intptr_t len = Utf8::CodePointCount(utf8_array, array_len, &type);

+ intptr_t len = Utf8::CodeUnitCount(utf8_array, array_len, &type);

if (type == Utf8::kAscii) {

const String& strobj = String::Handle(OneByteString::New(len, space));

if (len > 0) {

@@ -10344,7 +10353,7 @@ RawString* String::SubString(const String& str,

intptr_t char_size = str.CharSize();

if (char_size == kTwoByteChar) {

for (intptr_t i = begin_index; i < begin_index + length; ++i) {

- if (str.CharAt(i) > 0x7F) {

+ if (str.CodeUnitAt(i) > 0x7F) {

is_one_byte_string = false;

break;

}

@@ -10389,25 +10398,39 @@ RawString* String::Transform(int32_t (*mapping)(int32_t ch),

Heap::Space space) {

ASSERT(!str.IsNull());

bool has_mapping = false;

- int32_t dst_max = 0;

+ uint32_t dst_max = 0;

intptr_t len = str.Length();

+ intptr_t out_len = 0;

// TODO(cshapiro): assume a transform is required, rollback if not.

- for (intptr_t i = 0; i < len; ++i) {

- int32_t src = str.CharAt(i);

- int32_t dst = mapping(src);

+ intptr_t i = 0;

+ for (; i < len; ++i) {

+ uint32_t src = str.CodeUnitAt(i);

+ if (Utf16::IsSurrogate(src)) break;

+ uint32_t dst = mapping(src);

+ if (src != dst) {

+ has_mapping = true;

+ }

+ dst_max = Utils::Maximum(dst_max, dst);

+ out_len += dst > Utf16::kMaxCodeUnit ? 2 : 1;

+ }

+ for (; i < len; ++i) {

+ uint32_t src = Utf16::CodePointAt(str, i);

+ uint32_t dst = mapping(src);

if (src != dst) {

has_mapping = true;

}

dst_max = Utils::Maximum(dst_max, dst);

+ out_len += dst > Utf16::kMaxCodeUnit ? 2 : 1;

+ if (src > Utf16::kMaxCodeUnit) ++i;

}

if (!has_mapping) {

return str.raw();

}

if (dst_max <= 0x7F) {

- return OneByteString::Transform(mapping, str, space);

+ return OneByteString::Transform(mapping, str, out_len, space);

}

ASSERT(dst_max > 0x7F);

- return TwoByteString::Transform(mapping, str, space);

+ return TwoByteString::Transform(mapping, str, out_len, space);

}

@@ -10614,14 +10637,18 @@ RawOneByteString* OneByteString::ConcatAll(const Array& strings,

RawOneByteString* OneByteString::Transform(int32_t (*mapping)(int32_t ch),

const String& str,

+ int out_length,

Heap::Space space) {

ASSERT(!str.IsNull());

intptr_t len = str.Length();

- const String& result = String::Handle(OneByteString::New(len, space));

- for (intptr_t i = 0; i < len; ++i) {

- int32_t ch = mapping(str.CharAt(i));

- ASSERT(ch >= 0 && ch <= 0x7F);

- *CharAddr(result, i) = ch;

+ const String& result =

+ String::Handle(OneByteString::New(out_length, space));

+ for (intptr_t i = 0, j = 0; i < len; ++i, j++) {

+ uint32_t old_ch = str.CodeUnitAt(i);

+ if (old_ch > Utf16::kMaxCodeUnit) i++;

+ uint32_t ch = mapping(old_ch);

+ ASSERT(ch <= 0x7Fu);

+ *CharAddr(result, j) = ch;

}

return OneByteString::raw(result);

}

@@ -10705,9 +10732,11 @@ RawTwoByteString* TwoByteString::New(intptr_t utf16_len,

NoGCScope no_gc;

intptr_t j = 0;

for (intptr_t i = 0; i < array_len; ++i) {

- if (utf32_array[i] > 0xffff) {

+ uint32_t code_point = utf32_array[i];

+ if (code_point > Utf16::kMaxCodeUnit) {

ASSERT(j < (utf16_len - 1));

- Utf8::ConvertUTF32ToUTF16(utf32_array[i], CharAddr(result, j));

+ *CharAddr(result, j) = Utf16::LeadFromCodePoint(code_point);

+ *CharAddr(result, j + 1) = Utf16::TrailFromCodePoint(code_point);

j += 2;

} else {

ASSERT(j < utf16_len);

@@ -10761,14 +10790,25 @@ RawTwoByteString* TwoByteString::ConcatAll(const Array& strings,

RawTwoByteString* TwoByteString::Transform(int32_t (*mapping)(int32_t ch),

const String& str,

+ int out_length,

Heap::Space space) {

ASSERT(!str.IsNull());

intptr_t len = str.Length();

- const String& result = String::Handle(TwoByteString::New(len, space));

- for (intptr_t i = 0; i < len; ++i) {

- int32_t ch = mapping(str.CharAt(i));

- ASSERT(ch >= 0 && ch <= 0xFFFF);

- *CharAddr(result, i) = ch;

+ const String& result =

+ String::Handle(TwoByteString::New(out_length, space));

+ for (intptr_t i = 0, j = 0; i < len; ++i, j++) {

+ uint32_t old_ch = Utf16::CodePointAt(str, i);

+ if (old_ch > Utf16::kMaxCodeUnit) i++;

+ uint32_t ch = mapping(old_ch);

+ ASSERT(ch <= Utf16::kMaxCodePoint);

+ if (ch <= Utf16::kMaxCodeUnit) {

+ *CharAddr(result, j) = ch;

+ } else {

+ *CharAddr(result, j) = Utf16::LeadFromCodePoint(ch);

+ *CharAddr(result, j + 1) = Utf16::TrailFromCodePoint(ch);

+ ++j;

+ }

+ ASSERT(j <= out_length);

}

return TwoByteString::raw(result);

}

« runtime/vm/object.h ('K') | « runtime/vm/object.h ('k') | runtime/vm/object_test.cc » ('j') | runtime/vm/unicode.h » ('J')