| Index: src/runtime/runtime-i18n.cc
|
| diff --git a/src/runtime/runtime-i18n.cc b/src/runtime/runtime-i18n.cc
|
| index 75e0952581b3723aea67e4e9b108534d926fec86..5ea524840e3699d6e0d939f41d5c750ee3c1967f 100644
|
| --- a/src/runtime/runtime-i18n.cc
|
| +++ b/src/runtime/runtime-i18n.cc
|
| @@ -8,13 +8,15 @@
|
|
|
| #include <memory>
|
|
|
| -#include "src/api.h"
|
| #include "src/api-natives.h"
|
| +#include "src/api.h"
|
| #include "src/arguments.h"
|
| #include "src/factory.h"
|
| #include "src/i18n.h"
|
| #include "src/isolate-inl.h"
|
| #include "src/messages.h"
|
| +#include "src/string-case.h"
|
| +#include "src/utils.h"
|
|
|
| #include "unicode/brkiter.h"
|
| #include "unicode/calendar.h"
|
| @@ -1041,15 +1043,14 @@ bool ToUpperFastASCII(const Vector<const Char>& src,
|
| const uint16_t sharp_s = 0xDF;
|
|
|
| template <typename Char>
|
| -bool ToUpperOneByte(const Vector<const Char>& src,
|
| - Handle<SeqOneByteString> result, int* sharp_s_count) {
|
| +bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,
|
| + int* sharp_s_count) {
|
| // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
|
|
|
| // There are two special cases.
|
| // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
|
| // 2. Lower case sharp-S converts to "SS" (two characters)
|
| *sharp_s_count = 0;
|
| - int32_t index = 0;
|
| for (auto it = src.begin(); it != src.end(); ++it) {
|
| uint16_t ch = static_cast<uint16_t>(*it);
|
| if (V8_UNLIKELY(ch == sharp_s)) {
|
| @@ -1061,7 +1062,7 @@ bool ToUpperOneByte(const Vector<const Char>& src,
|
| // need to take the 16-bit path.
|
| return false;
|
| }
|
| - result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));
|
| + *dest++ = ToLatin1Upper(ch);
|
| }
|
|
|
| return true;
|
| @@ -1082,6 +1083,16 @@ void ToUpperWithSharpS(const Vector<const Char>& src,
|
| }
|
| }
|
|
|
| +// Index of the first unit of s that is IsASCIIUpper() or > 0x7F, else length.
|
| +inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {
|
| +  for (int pos = 0; pos < length; ++pos) {
|
| +    const uint16_t unit = s->Get(pos);
|
| +    const bool needs_work = IsASCIIUpper(unit) || (unit & ~0x7F) != 0;
|
| +    if (V8_UNLIKELY(needs_work)) return pos;
|
| +  }
|
| +  return length;  // Nothing matched in the first |length| units.
|
| +}
|
| +
|
| } // namespace
|
|
|
| RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
|
| @@ -1091,60 +1102,65 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
|
|
|
| int length = s->length();
|
| s = String::Flatten(s);
|
| - // First scan the string for uppercase and non-ASCII characters:
|
| - if (s->HasOnlyOneByteChars()) {
|
| - int first_index_to_lower = length;
|
| - for (int index = 0; index < length; ++index) {
|
| - // Blink specializes this path for one-byte strings, so it
|
| - // does not need to do a generic get, but can do the equivalent
|
| - // of SeqOneByteStringGet.
|
| - uint16_t ch = s->Get(index);
|
| - if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
|
| - first_index_to_lower = index;
|
| - break;
|
| - }
|
| - }
|
|
|
| + if (!s->HasOnlyOneByteChars()) {
|
| + // Use a slower implementation for strings with characters beyond U+00FF.
|
| + return LocaleConvertCase(s, isolate, false, "");
|
| + }
|
| +
|
| + // We depend here on the invariant that the length of a Latin1
|
| + // string is invariant under ToLowerCase, and the result always
|
| + // fits in the Latin1 range in the *root locale*. It does not hold
|
| + // for ToUpperCase even in the root locale.
|
| +
|
| + // Scan the string for uppercase and non-ASCII characters for strings
|
| + // shorter than a machine-word without any memory allocation overhead.
|
| + // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()
|
| + // to two parts, one for scanning the prefix with no change and the other for
|
| + // handling ASCII-only characters.
|
| + int index_to_first_unprocessed = length;
|
| + const bool is_short = length < static_cast<int>(sizeof(uintptr_t));
|
| + if (is_short) {
|
| + index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
|
| // Nothing to do if the string is all ASCII with no uppercase.
|
| - if (first_index_to_lower == length) return *s;
|
| + if (index_to_first_unprocessed == length) return *s;
|
| + }
|
|
|
| - // We depend here on the invariant that the length of a Latin1
|
| - // string is invariant under ToLowerCase, and the result always
|
| - // fits in the Latin1 range in the *root locale*. It does not hold
|
| - // for ToUpperCase even in the root locale.
|
| - Handle<SeqOneByteString> result;
|
| - ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
|
| - isolate, result, isolate->factory()->NewRawOneByteString(length));
|
| + Handle<SeqOneByteString> result =
|
| + isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
|
|
|
| - DisallowHeapAllocation no_gc;
|
| - String::FlatContent flat = s->GetFlatContent();
|
| - if (flat.IsOneByte()) {
|
| - const uint8_t* src = flat.ToOneByteVector().start();
|
| - CopyChars(result->GetChars(), src,
|
| - static_cast<size_t>(first_index_to_lower));
|
| - for (int index = first_index_to_lower; index < length; ++index) {
|
| - uint16_t ch = static_cast<uint16_t>(src[index]);
|
| - result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
|
| - }
|
| - } else {
|
| - const uint16_t* src = flat.ToUC16Vector().start();
|
| - CopyChars(result->GetChars(), src,
|
| - static_cast<size_t>(first_index_to_lower));
|
| - for (int index = first_index_to_lower; index < length; ++index) {
|
| - uint16_t ch = src[index];
|
| - result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
|
| - }
|
| + DisallowHeapAllocation no_gc;
|
| + String::FlatContent flat = s->GetFlatContent();
|
| + uint8_t* dest = result->GetChars();
|
| + if (flat.IsOneByte()) {
|
| + const uint8_t* src = flat.ToOneByteVector().start();
|
| + bool has_changed_character = false;
|
| + index_to_first_unprocessed = FastAsciiConvert<true>(
|
| + reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),
|
| + length, &has_changed_character);
|
| + // If not ASCII, we keep the result up to index_to_first_unprocessed and
|
| + // process the rest.
|
| + if (index_to_first_unprocessed == length)
|
| + return has_changed_character ? *result : *s;
|
| +
|
| + for (int index = index_to_first_unprocessed; index < length; ++index) {
|
| + dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
|
| + }
|
| + } else {
|
| + if (index_to_first_unprocessed == length) {
|
| + DCHECK(!is_short);
|
| + index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
|
| + }
|
| + // Nothing to do if the string is all ASCII with no uppercase.
|
| + if (index_to_first_unprocessed == length) return *s;
|
| + const uint16_t* src = flat.ToUC16Vector().start();
|
| + CopyChars(dest, src, index_to_first_unprocessed);
|
| + for (int index = index_to_first_unprocessed; index < length; ++index) {
|
| + dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
|
| }
|
| -
|
| - return *result;
|
| }
|
|
|
| - // Blink had an additional case here for ASCII 2-byte strings, but
|
| - // that is subsumed by the above code (assuming there isn't a false
|
| - // negative for HasOnlyOneByteChars).
|
| -
|
| - // Do a slower implementation for cases that include non-ASCII characters.
|
| - return LocaleConvertCase(s, isolate, false, "");
|
| + return *result;
|
| }
|
|
|
| RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
|
| @@ -1152,35 +1168,38 @@ RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
|
| DCHECK_EQ(args.length(), 1);
|
| CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
|
|
|
| - // This function could be optimized for no-op cases the way lowercase
|
| - // counterpart is, but in empirical testing, few actual calls to upper()
|
| - // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
|
| -
|
| int32_t length = s->length();
|
| s = String::Flatten(s);
|
|
|
| if (s->HasOnlyOneByteChars()) {
|
| - Handle<SeqOneByteString> result;
|
| - ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
|
| - isolate, result, isolate->factory()->NewRawOneByteString(length));
|
| + Handle<SeqOneByteString> result =
|
| + isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
|
|
|
| int sharp_s_count;
|
| bool is_result_single_byte;
|
| {
|
| DisallowHeapAllocation no_gc;
|
| String::FlatContent flat = s->GetFlatContent();
|
| - // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII
|
| - // could be removed because ToUpperOneByte is pretty fast now (it
|
| - // does not call ICU API any more.).
|
| + uint8_t* dest = result->GetChars();
|
| if (flat.IsOneByte()) {
|
| Vector<const uint8_t> src = flat.ToOneByteVector();
|
| - if (ToUpperFastASCII(src, result)) return *result;
|
| - is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
|
| + bool has_changed_character = false;
|
| + int index_to_first_unprocessed =
|
| + FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),
|
| + reinterpret_cast<const char*>(src.start()),
|
| + length, &has_changed_character);
|
| + if (index_to_first_unprocessed == length)
|
| + return has_changed_character ? *result : *s;
|
| + // If not ASCII, we keep the result up to index_to_first_unprocessed and
|
| + // process the rest.
|
| + is_result_single_byte =
|
| + ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),
|
| + dest + index_to_first_unprocessed, &sharp_s_count);
|
| } else {
|
| DCHECK(flat.IsTwoByte());
|
| Vector<const uint16_t> src = flat.ToUC16Vector();
|
| if (ToUpperFastASCII(src, result)) return *result;
|
| - is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
|
| + is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);
|
| }
|
| }
|
|
|
|
|