src/runtime.cc - Issue 59853006: Fix y-umlaut to uppercase.

Unified Diff: src/runtime.cc

Issue 59853006: Fix y-umlaut to uppercase. (Closed) Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge

Patch Set: Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/runtime.cc

diff --git a/src/runtime.cc b/src/runtime.cc

index 107297c944701f3e3570145168d11b66b30bc3a7..7486181b00f19f1c6e994712f88ff1cf3c6750ad 100644

--- a/src/runtime.cc

+++ b/src/runtime.cc

@@ -6194,6 +6194,7 @@ template <class Converter>

MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(

Isolate* isolate,

String* s,

+ String::Encoding result_encoding,

int length,

int input_string_length,

unibrow::Mapping<Converter, 128>* mapping) {

@@ -6209,7 +6210,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(

// might break in the future if we implement more context and locale

// dependent upper/lower conversions.

Object* o;

- { MaybeObject* maybe_o = s->IsOneByteRepresentation()

+ { MaybeObject* maybe_o = result_encoding == String::ONE_BYTE_ENCODING

? isolate->heap()->AllocateRawOneByteString(length)

: isolate->heap()->AllocateRawTwoByteString(length);

if (!maybe_o->ToObject(&o)) return maybe_o;

@@ -6217,6 +6218,8 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(

String* result = String::cast(o);

bool has_changed_character = false;

+ DisallowHeapAllocation no_gc;

// Convert all characters to upper case, assuming that they will fit

// in the buffer

Access<ConsStringIteratorOp> op(

@@ -6225,6 +6228,10 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(

unibrow::uchar chars[Converter::kMaxWidth];

// We can assume that the string is not empty

uc32 current = stream.GetNext();

+ // y with umlauts is the only character that stops fitting into one-byte

+ // when converting to uppercase.

+ static const uc32 yuml_code = 0xff;

Sven Panne 2013/11/07 07:07:24 Hmmm, is this really the only case? Look e.g. at i

dcarney 2013/11/07 07:13:33 This code does not take locale into account. It's

+ bool ignore_yuml = result->IsSeqTwoByteString() || Converter::kIsToLower;

for (int i = 0; i < length;) {

bool has_next = stream.HasMore();

uc32 next = has_next ? stream.GetNext() : 0;

@@ -6233,13 +6240,14 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(

// The case conversion of this character is the character itself.

result->Set(i, current);

i++;

- } else if (char_length == 1) {

+ } else if (char_length == 1 && (ignore_yuml || current != yuml_code)) {

// Common case: converting the letter resulted in one character.

ASSERT(static_cast<uc32>(chars[0]) != current);

result->Set(i, chars[0]);

has_changed_character = true;

i++;

} else if (length == input_string_length) {

+ bool found_yuml = (current == yuml_code);

// We've assumed that the result would be as long as the

// input but here is a character that converts to several

// characters. No matter, we calculate the exact length

@@ -6259,6 +6267,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(

int current_length = i + char_length + next_length;

while (stream.HasMore()) {

current = stream.GetNext();

+ found_yuml |= (current == yuml_code);

// NOTE: we use 0 as the next character here because, while

// the next character may affect what a character converts to,

// it does not in any case affect the length of what it convert

@@ -6271,8 +6280,10 @@ MUST_USE_RESULT static MaybeObject* ConvertCaseHelper(

return Failure::OutOfMemoryException(0x13);

}

- // Try again with the real length.

- return Smi::FromInt(current_length);

+ // Try again with the real length. Return signed if we need

+ // to allocate a two-byte string for y-umlaut to uppercase.

+ return (found_yuml && !ignore_yuml) ? Smi::FromInt(-current_length)

+ : Smi::FromInt(current_length);

} else {

for (int j = 0; j < char_length; j++) {

result->Set(i, chars[j]);

@@ -6318,121 +6329,107 @@ static inline uintptr_t AsciiRangeMask(uintptr_t w, char m, char n) {

}

-enum AsciiCaseConversion {

- ASCII_TO_LOWER,

- ASCII_TO_UPPER

-};

-template <AsciiCaseConversion dir>

-struct FastAsciiConverter {

- static bool Convert(char* dst, char* src, int length, bool* changed_out) {

+template<class Converter>

+static bool FastAsciiConvert(char* dst,

+ char* src,

+ int length,

+ bool* changed_out) {

#ifdef DEBUG

char* saved_dst = dst;

char* saved_src = src;

#endif

- // We rely on the distance between upper and lower case letters

- // being a known power of 2.

- ASSERT('a' - 'A' == (1 << 5));

- // Boundaries for the range of input characters than require conversion.

- const char lo = (dir == ASCII_TO_LOWER) ? 'A' - 1 : 'a' - 1;

- const char hi = (dir == ASCII_TO_LOWER) ? 'Z' + 1 : 'z' + 1;

- bool changed = false;

- uintptr_t or_acc = 0;

- char* const limit = src + length;

+ DisallowHeapAllocation no_gc;

+ // We rely on the distance between upper and lower case letters

+ // being a known power of 2.

+ ASSERT('a' - 'A' == (1 << 5));

+ // Boundaries for the range of input characters than require conversion.

+ static const char lo = Converter::kIsToLower ? 'A' - 1 : 'a' - 1;

+ static const char hi = Converter::kIsToLower ? 'Z' + 1 : 'z' + 1;

+ bool changed = false;

+ uintptr_t or_acc = 0;

+ char* const limit = src + length;

#ifdef V8_HOST_CAN_READ_UNALIGNED

- // Process the prefix of the input that requires no conversion one

- // (machine) word at a time.

- while (src <= limit - sizeof(uintptr_t)) {

- uintptr_t w = *reinterpret_cast<uintptr_t*>(src);

- or_acc |= w;

- if (AsciiRangeMask(w, lo, hi) != 0) {

- changed = true;

- break;

- }

- *reinterpret_cast<uintptr_t*>(dst) = w;

- src += sizeof(uintptr_t);

- dst += sizeof(uintptr_t);

- }

- // Process the remainder of the input performing conversion when

- // required one word at a time.

- while (src <= limit - sizeof(uintptr_t)) {

- uintptr_t w = *reinterpret_cast<uintptr_t*>(src);

- or_acc |= w;

- uintptr_t m = AsciiRangeMask(w, lo, hi);

- // The mask has high (7th) bit set in every byte that needs

- // conversion and we know that the distance between cases is

- // 1 << 5.

- *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);

- src += sizeof(uintptr_t);

- dst += sizeof(uintptr_t);

- }

-#endif

- // Process the last few bytes of the input (or the whole input if

- // unaligned access is not supported).

- while (src < limit) {

- char c = *src;

- or_acc |= c;

- if (lo < c && c < hi) {

- c ^= (1 << 5);

- changed = true;

- }

- *dst = c;

- ++src;

- ++dst;

- }

- if ((or_acc & kAsciiMask) != 0) {

- return false;

+ // Process the prefix of the input that requires no conversion one

+ // (machine) word at a time.

+ while (src <= limit - sizeof(uintptr_t)) {

+ uintptr_t w = *reinterpret_cast<uintptr_t*>(src);

+ or_acc |= w;

+ if (AsciiRangeMask(w, lo, hi) != 0) {

+ changed = true;

+ break;

}

-#ifdef DEBUG

- CheckConvert(saved_dst, saved_src, length, changed);

+ *reinterpret_cast<uintptr_t*>(dst) = w;

+ src += sizeof(uintptr_t);

+ dst += sizeof(uintptr_t);

+ }

+ // Process the remainder of the input performing conversion when

+ // required one word at a time.

+ while (src <= limit - sizeof(uintptr_t)) {

+ uintptr_t w = *reinterpret_cast<uintptr_t*>(src);

+ or_acc |= w;

+ uintptr_t m = AsciiRangeMask(w, lo, hi);

+ // The mask has high (7th) bit set in every byte that needs

+ // conversion and we know that the distance between cases is

+ // 1 << 5.

+ *reinterpret_cast<uintptr_t*>(dst) = w ^ (m >> 2);

+ src += sizeof(uintptr_t);

+ dst += sizeof(uintptr_t);

+ }

#endif

- *changed_out = changed;

- return true;

+ // Process the last few bytes of the input (or the whole input if

+ // unaligned access is not supported).

+ while (src < limit) {

+ char c = *src;

+ or_acc |= c;

+ if (lo < c && c < hi) {

+ c ^= (1 << 5);

+ changed = true;

+ }

+ *dst = c;

+ ++src;

+ ++dst;

+ }

+ if ((or_acc & kAsciiMask) != 0) {

+ return false;

}

+ ASSERT(CheckFastAsciiConvert(

+ saved_dst, saved_src, length, changed, Converter::kIsToLower));

+ *changed_out = changed;

+ return true;

#ifdef DEBUG

- static void CheckConvert(char* dst, char* src, int length, bool changed) {

- bool expected_changed = false;

- for (int i = 0; i < length; i++) {

- if (dst[i] == src[i]) continue;

- expected_changed = true;

- if (dir == ASCII_TO_LOWER) {

- ASSERT('A' <= src[i] && src[i] <= 'Z');

- ASSERT(dst[i] == src[i] + ('a' - 'A'));

- } else {

- ASSERT(dir == ASCII_TO_UPPER);

- ASSERT('a' <= src[i] && src[i] <= 'z');

- ASSERT(dst[i] == src[i] - ('a' - 'A'));

- }

+static bool CheckFastAsciiConvert(char* dst,

+ char* src,

+ int length,

+ bool changed,

+ bool is_to_lower) {

+ bool expected_changed = false;

+ for (int i = 0; i < length; i++) {

+ if (dst[i] == src[i]) continue;

+ expected_changed = true;

+ if (is_to_lower) {

+ ASSERT('A' <= src[i] && src[i] <= 'Z');

+ ASSERT(dst[i] == src[i] + ('a' - 'A'));

+ } else {

+ ASSERT('a' <= src[i] && src[i] <= 'z');

+ ASSERT(dst[i] == src[i] - ('a' - 'A'));

}

- ASSERT(expected_changed == changed);

}

+ return (expected_changed == changed);

#endif

-};

-struct ToLowerTraits {

- typedef unibrow::ToLowercase UnibrowConverter;

- typedef FastAsciiConverter<ASCII_TO_LOWER> AsciiConverter;

-};

-struct ToUpperTraits {

- typedef unibrow::ToUppercase UnibrowConverter;

- typedef FastAsciiConverter<ASCII_TO_UPPER> AsciiConverter;

-};

} // namespace

-template <typename ConvertTraits>

+template <class Converter>

MUST_USE_RESULT static MaybeObject* ConvertCase(

Arguments args,

Isolate* isolate,

- unibrow::Mapping<typename ConvertTraits::UnibrowConverter, 128>* mapping) {

+ unibrow::Mapping<Converter, 128>* mapping) {

SealHandleScope shs(isolate);

CONVERT_ARG_CHECKED(String, s, 0);

s = s->TryFlattenGetString();

@@ -6454,7 +6451,7 @@ MUST_USE_RESULT static MaybeObject* ConvertCase(

}

SeqOneByteString* result = SeqOneByteString::cast(o);

bool has_changed_character;

- bool is_ascii = ConvertTraits::AsciiConverter::Convert(

+ bool is_ascii = FastAsciiConvert<Converter>(

reinterpret_cast<char*>(result->GetChars()),

reinterpret_cast<char*>(SeqOneByteString::cast(s)->GetChars()),

length,

@@ -6465,31 +6462,35 @@ MUST_USE_RESULT static MaybeObject* ConvertCase(

}

+ String::Encoding result_encoding = s->IsOneByteRepresentationUnderneath()

+ ? String::ONE_BYTE_ENCODING : String::TWO_BYTE_ENCODING;

Object* answer;

- { MaybeObject* maybe_answer =

- ConvertCaseHelper(isolate, s, length, length, mapping);

+ { MaybeObject* maybe_answer = ConvertCaseHelper(

+ isolate, s, result_encoding, length, length, mapping);

if (!maybe_answer->ToObject(&answer)) return maybe_answer;

}

if (answer->IsSmi()) {

- // Retry with correct length.

- { MaybeObject* maybe_answer =

- ConvertCaseHelper(isolate,

- s, Smi::cast(answer)->value(), length, mapping);

- if (!maybe_answer->ToObject(&answer)) return maybe_answer;

+ int new_length = Smi::cast(answer)->value();

+ if (new_length < 0) {

+ result_encoding = String::TWO_BYTE_ENCODING;

+ new_length = -new_length;

}

+ MaybeObject* maybe_answer = ConvertCaseHelper(

+ isolate, s, result_encoding, new_length, length, mapping);

+ if (!maybe_answer->ToObject(&answer)) return maybe_answer;

}

return answer;

}

RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToLowerCase) {

- return ConvertCase<ToLowerTraits>(

+ return ConvertCase(

args, isolate, isolate->runtime_state()->to_lower_mapping());

}

RUNTIME_FUNCTION(MaybeObject*, Runtime_StringToUpperCase) {

- return ConvertCase<ToUpperTraits>(

+ return ConvertCase(

args, isolate, isolate->runtime_state()->to_upper_mapping());

}

« no previous file with comments | « src/heap.cc ('k') | src/unicode.h » ('j') | no next file with comments »