src/runtime/runtime-i18n.cc - Issue 2533033003: Fix the uppercasing of U+00E7(ç) and U+00F7(÷)

Unified Diff: src/runtime/runtime-i18n.cc

Issue 2533033003: Fix the uppercasing of U+00E7(ç) and U+00F7(÷) (Closed)

Patch Set: "a bit more tweaks + tests" (created 4 years ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/runtime/runtime-i18n.cc

diff --git a/src/runtime/runtime-i18n.cc b/src/runtime/runtime-i18n.cc

index 75e0952581b3723aea67e4e9b108534d926fec86..e17cb231468fa91651aadecab1c66df3ccba862f 100644

--- a/src/runtime/runtime-i18n.cc

+++ b/src/runtime/runtime-i18n.cc

@@ -8,13 +8,14 @@

#include <memory>

-#include "src/api.h"

#include "src/api-natives.h"

+#include "src/api.h"

#include "src/arguments.h"

#include "src/factory.h"

#include "src/i18n.h"

#include "src/isolate-inl.h"

#include "src/messages.h"

+#include "src/utils.h"

#include "unicode/brkiter.h"

#include "unicode/calendar.h"

@@ -1093,21 +1094,6 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

s = String::Flatten(s);

// First scan the string for uppercase and non-ASCII characters:

if (s->HasOnlyOneByteChars()) {

- int first_index_to_lower = length;

- for (int index = 0; index < length; ++index) {

- // Blink specializes this path for one-byte strings, so it

- // does not need to do a generic get, but can do the equivalent

- // of SeqOneByteStringGet.

- uint16_t ch = s->Get(index);

- if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {

- first_index_to_lower = index;

- break;

- }

- // Nothing to do if the string is all ASCII with no uppercase.

- if (first_index_to_lower == length) return *s;

// We depend here on the invariant that the length of a Latin1

// string is invariant under ToLowerCase, and the result always

// fits in the Latin1 range in the *root locale*. It does not hold

@@ -1118,19 +1104,42 @@ RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

DisallowHeapAllocation no_gc;

String::FlatContent flat = s->GetFlatContent();

+ uint8_t* dest = result->GetChars();

+ const uint8_t* src = flat.ToOneByteVector().start();

+ if (flat.IsOneByte() && static_cast<size_t>(length) >= sizeof(uintptr_t)) {

+ bool has_changed_character = false;

+ bool is_ascii = FastAsciiConvert<true>(reinterpret_cast<char*>(dest),

+ reinterpret_cast<const char*>(src),

+ length, &has_changed_character);

+ // If not ASCII, we discard the result and start anew.

+ if (is_ascii) return has_changed_character ? *result : *s;

+ }

+ int index_to_first_upper = 0;

+ for (int index = 0; index < length; ++index) {

+ uint16_t ch = s->Get(index);

+ if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {

+ index_to_first_upper = index;

+ break;

+ }

+ // An ASCII input without any uppercase characters is already handled by

+ // FastAsciiConvert as long as the input is a machine-word or longer.

+ DCHECK(index_to_first_upper < length ||

+ static_cast<size_t>(length) < sizeof(uintptr_t));

+ // Nothing to do if the string is all ASCII with no uppercase.

+ if (index_to_first_upper == length) return *s;

if (flat.IsOneByte()) {

- const uint8_t* src = flat.ToOneByteVector().start();

- CopyChars(result->GetChars(), src,

- static_cast<size_t>(first_index_to_lower));

- for (int index = first_index_to_lower; index < length; ++index) {

+ CopyChars(dest, src, static_cast<size_t>(index_to_first_upper));

+ for (int index = index_to_first_upper; index < length; ++index) {

uint16_t ch = static_cast<uint16_t>(src[index]);

result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

}

} else {

const uint16_t* src = flat.ToUC16Vector().start();

- CopyChars(result->GetChars(), src,

- static_cast<size_t>(first_index_to_lower));

- for (int index = first_index_to_lower; index < length; ++index) {

+ CopyChars(dest, src, static_cast<size_t>(index_to_first_upper));

+ for (int index = index_to_first_upper; index < length; ++index) {

uint16_t ch = src[index];

result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

}

@@ -1152,10 +1161,6 @@ RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

DCHECK_EQ(args.length(), 1);

CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

- // This function could be optimized for no-op cases the way lowercase

- // counterpart is, but in empirical testing, few actual calls to upper()

- // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

int32_t length = s->length();

s = String::Flatten(s);

@@ -1169,12 +1174,15 @@ RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

{

DisallowHeapAllocation no_gc;

String::FlatContent flat = s->GetFlatContent();

- // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII

- // could be removed because ToUpperOneByte is pretty fast now (it

- // does not call ICU API any more.).

if (flat.IsOneByte()) {

Vector<const uint8_t> src = flat.ToOneByteVector();

- if (ToUpperFastASCII(src, result)) return *result;

+ bool has_changed_character = false;

+ bool is_ascii =

+ FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),

+ reinterpret_cast<const char*>(src.start()),

+ length, &has_changed_character);

+ // If not ASCII, we discard the result and use the table for Latin1.

+ if (is_ascii) return has_changed_character ? *result : *s;

is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

} else {

DCHECK(flat.IsTwoByte());

« no previous file with comments | « BUILD.gn ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »