src/runtime/runtime-i18n.cc - Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag

Unified Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: trivial change: unnecessary line dropped Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/runtime/runtime-i18n.cc

diff --git a/src/runtime/runtime-i18n.cc b/src/runtime/runtime-i18n.cc

index 27f970bdb4b17392cee21a375e7a04a49c4985aa..09a4e6f2a42fce042a96ce002cd332f0874c3337 100644

--- a/src/runtime/runtime-i18n.cc

+++ b/src/runtime/runtime-i18n.cc

@@ -29,10 +29,12 @@

#include "unicode/rbbi.h"

#include "unicode/smpdtfmt.h"

#include "unicode/timezone.h"

+#include "unicode/translit.h"

#include "unicode/uchar.h"

#include "unicode/ucol.h"

#include "unicode/ucurr.h"

#include "unicode/uloc.h"

+#include "unicode/unistr.h"

#include "unicode/unum.h"

#include "unicode/uversion.h"

@@ -749,6 +751,315 @@ RUNTIME_FUNCTION(Runtime_BreakIteratorBreakType) {

return *isolate->factory()->NewStringFromStaticChars("unknown");

}

+namespace {

+void ConvertCaseWithTransliterator(icu::UnicodeString* input,

+ const char* transliterator_id) {

+ UErrorCode status = U_ZERO_ERROR;

+ base::SmartPointer<icu::Transliterator> translit(

+ icu::Transliterator::createInstance(

+ icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,

+ status));

+ if (U_FAILURE(status)) return;

+ translit->transliterate(*input);

// Converts |s| to upper case (|is_to_upper| true) or lower case (false) with
// ICU, passing |lang| as the locale argument. Greek upper-casing goes through
// the el-Upper transliterator; every other case calls u_strToUpper or
// u_strToLower, re-allocating once with the length the converter returned if
// it reports a buffer overflow. Falls back to returning |s| itself when the
// final ICU status is not a success.
// NOTE(review): the closing brace of this function is not visible in this
// listing - confirm against the real patch.
+MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,

+ bool is_to_upper, const char* lang) {

+ int32_t src_length = s->length();

+ const UChar* src = nullptr;

+ base::SmartArrayPointer<uc16> sap;

// One-byte input is widened into a temporary UTF-16 copy owned by |sap|;
// otherwise |src| stays null here and is filled in from the flat content.
+ if (s->IsOneByteRepresentationUnderneath()) {

+ sap = s->ToWideCString();

+ src = reinterpret_cast<const UChar*>(sap.get());

+ }

+ // Greek uppercasing has to be done via transliteration.

+ // TODO(jshin): Drop this special-casing once ICU's regular case conversion

+ // API supports Greek uppercasing. See

+ // http://bugs.icu-project.org/trac/ticket/10582 .

+ // ICU's C API for transliteration is nasty and we just use C++ API.

+ if (V8_UNLIKELY(!strncmp(lang, "el", 2) && is_to_upper)) {

+ icu::UnicodeString converted;

+ {

+ DisallowHeapAllocation no_gc;

+ String::FlatContent flat = s->GetFlatContent();

Yang 2016/04/27 08:06:45 I don't see the string being flattened anywhere le

jungshik at Google 2016/04/28 10:50:09 Thank you for pointing this out. Actually, two of

Yang 2016/04/28 12:52:26 I did check the description of fastCopyFrom. In ei

I did check the description of fastCopyFrom. In either case, the unicode string must not alias the original string buffer (which is what would make it fast), since that would change the original string (src points to the orignal backing of the two-byte string). So we just might as well make a copy explicitly and not rely on ICU to do it correctly implicitly.

+ if (src == nullptr) {

+ DCHECK(flat.IsTwoByte());

+ src = reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

+ }

+ // Starts with the source string and will be replaced by the converted

+ // result.

+ converted.fastCopyFrom(icu::UnicodeString(false, src, src_length));

+ ConvertCaseWithTransliterator(&converted, "el-Upper");

+ }

+ Handle<String> result;

+ ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

+ isolate, result,

+ isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(

Yang 2016/04/27 08:06:45 This means that any case conversion with "problema

jungshik at Google 2016/04/28 10:50:10 That's a good point. If toLocale{L,U}Case(<no argu

+ reinterpret_cast<const uint16_t*>(converted.getBuffer()),

+ converted.length())));

+ return *result;

+ }

+ auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;

+ int32_t dest_length = src_length;

+ UErrorCode error;

+ Handle<SeqTwoByteString> result;

+ // This is not a real loop. It'll be executed only once (no overflow) or

+ // twice (overflow).

+ for (int i = 0; i < 2; ++i) {

+ result =

+ isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();

+ DisallowHeapAllocation no_gc;

+ String::FlatContent flat = s->GetFlatContent();

Yang 2016/04/27 08:06:45 We need to make sure s is flattened at this point

jungshik at Google 2016/04/28 10:50:09 Yup. See the comment above to the same question.

+ // For OneByteString, |src| is already obtained with |sap| outside the loop.

+ if (flat.IsTwoByte())

Yang 2016/04/27 08:06:45 add brackets here.

jungshik at Google 2016/04/28 10:50:10 Done.

+ src = reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

+ error = U_ZERO_ERROR;

+ dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),

+ dest_length, src, src_length, lang, &error);

+ if (error != U_BUFFER_OVERFLOW_ERROR) break;

+ }

+ // In most cases, the output will fill the destination buffer completely

+ // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).

+ // Only in rare cases, it'll be shorter than the destination buffer and

+ // |result| has to be truncated.

+ DCHECK(U_SUCCESS(error));

+ // dest_length == result->length()

Yang 2016/04/27 08:06:45 Can we make this a DCHECK?

jungshik at Google 2016/04/28 10:50:10 Done.

+ if (V8_LIKELY(error == U_STRING_NOT_TERMINATED_WARNING)) return *result;

+ if (U_SUCCESS(error)) {

+ // dest_length < result->length()

Yang 2016/04/27 08:06:45 Also make this a DCHECK.

jungshik at Google 2016/04/28 10:50:10 Done.

+ return *Handle<SeqTwoByteString>::cast(

+ SeqString::Truncate(result, dest_length));

+ }

+ return *s;

// Returns true only for code units in the contiguous ASCII range 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  // A single unsigned comparison: values below 'A' wrap around to a large
  // number and are rejected together with values above 'Z'.
  return static_cast<unsigned>(ch - 'A') <= static_cast<unsigned>('Z' - 'A');
}

// Lower-cases a single Latin-1 code unit. Upper-case letters (ASCII 'A'..'Z'
// and U+00C0..U+00DE except U+00D7, the multiplication sign) differ from
// their lower-case forms only in bit 5, so the conversion just sets that
// bit. Every other value is returned unchanged.
inline uint16_t ToLatin1Lower(uint16_t ch) {
  const bool is_ascii_upper = ch >= 'A' && ch <= 'Z';
  const bool is_latin1_upper = ch >= 0xC0 && ch <= 0xDE && ch != 0xD7;
  if (is_ascii_upper || is_latin1_upper) {
    return static_cast<uint16_t>(ch | 0x20);
  }
  return ch;
}

// Upper-cases a single code unit if it is an ASCII lower-case letter
// ('a'..'z') by clearing bit 5; any other value is returned unchanged.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  return is_ascii_lower ? static_cast<uint16_t>(ch & ~0x20) : ch;
}

// Upper-cases a single Latin-1 code unit by clearing bit 5 of lower-case
// letters: ASCII 'a'..'z' and U+00E0..U+00FE except U+00F7 (division sign),
// which is not a letter and must stay as it is.
// Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF: their
// upper-case forms are not a single Latin-1 code unit, so callers have to
// filter them out first.
inline uint16_t ToLatin1Upper(uint16_t ch) {
  DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  // The excluded code point is U+00F7, the lower-row counterpart of U+00D7.
  // Excluding U+00E7 instead would leave c-cedilla in lower case and turn
  // the division sign into the multiplication sign.
  const bool is_latin1_lower = ((ch & 0xE0) == 0xE0) && ch != 0xF7;
  if (is_ascii_lower || is_latin1_lower) {
    return static_cast<uint16_t>(ch & ~0x20);
  }
  return ch;
}

+template <typename Char>

+bool ToUpperFastASCII(const Vector<const Char>& src,

+ Handle<SeqOneByteString> result) {

+ // Do a faster loop for the case where all the characters are ASCII.

+ uint16_t ored = 0;

+ int32_t index = 0;

+ for (auto it = src.begin(); it != src.end(); ++it) {

+ uint16_t ch = static_cast<uint16_t>(*it);

+ ored |= ch;

+ result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));

+ }

+ return !(ored & ~0x7F);

// U+00DF, the lower case sharp-s. The upper-casing helpers below treat it
// separately because it converts to two characters.
+const uint16_t sharp_s = 0xDF;

+template <typename Char>

+bool ToUpperOneByte(const Vector<const Char>& src,

+ Handle<SeqOneByteString> result, int* sharp_s_count) {

+ // Still pretty-fast path for the input with non-ASCII Latin-1 characters.

+ // There are two special cases.

+ // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.

+ // 2. Lower case sharp-S converts to "SS" (two characters)

+ *sharp_s_count = 0;

+ int32_t index = 0;

+ for (auto it = src.begin(); it != src.end(); ++it) {

+ uint16_t ch = static_cast<uint16_t>(*it);

+ if (V8_UNLIKELY(ch == sharp_s)) {

+ ++(*sharp_s_count);

+ continue;

+ }

+ if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {

+ // Since this upper-cased character does not fit in an 8-bit string, we

+ // need to take the 16-bit path.

+ return false;

+ }

+ result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));

+ }

+ return true;

+template <typename Char>

+void ToUpperWithSharpS(const Vector<const Char>& src,

+ Handle<SeqOneByteString> result) {

+ int32_t dest_index = 0;

+ for (auto it = src.begin(); it != src.end(); ++it) {

+ uint16_t ch = static_cast<uint16_t>(*it);

+ if (ch == sharp_s) {

+ result->SeqOneByteStringSet(dest_index++, 'S');

+ } else {

+ result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));

+ }

+} // namespace

+RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

+ HandleScope scope(isolate);

+ DCHECK_EQ(args.length(), 1);

+ CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

+ int length = s->length();

+ s = String::Flatten(s);

+ // First scan the string for uppercase and non-ASCII characters:

+ if (s->HasOnlyOneByteChars()) {

+ unsigned first_index_to_lower = length;

+ for (int index = 0; index < length; ++index) {

+ // Blink specializes this path for one-byte strings, so it

+ // does not need to do a generic get, but can do the equivalent

+ // of SeqOneByteStringGet.

+ uint16_t ch = s->Get(index);

+ if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {

+ first_index_to_lower = index;

+ break;

+ }

+ // Nothing to do if the string is all ASCII with no uppercase.

+ if (first_index_to_lower == length) return *s;

+ // We depend here on the invariant that the length of a Latin1

+ // string is invariant under ToLowerCase, and the result always

+ // fits in the Latin1 range in the *root locale*. It does not hold

+ // for ToUpperCase even in the root locale.

+ Handle<SeqOneByteString> result;

+ ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

+ isolate, result, isolate->factory()->NewRawOneByteString(length));

+ DisallowHeapAllocation no_gc;

+ String::FlatContent flat = s->GetFlatContent();

+ if (flat.IsOneByte()) {

+ const uint8_t* src = flat.ToOneByteVector().start();

+ CopyChars(result->GetChars(), src, first_index_to_lower);

+ for (int index = first_index_to_lower; index < length; ++index) {

+ uint16_t ch = static_cast<uint16_t>(src[index]);

+ result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

+ }

+ } else {

+ const uint16_t* src = flat.ToUC16Vector().start();

+ CopyChars(result->GetChars(), src, first_index_to_lower);

+ for (int index = first_index_to_lower; index < length; ++index) {

+ uint16_t ch = src[index];

+ result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

+ }

+ return *result;

+ }

+ // Blink had an additional case here for ASCII 2-byte strings, but

+ // that is subsumed by the above code (assuming there isn't a false

+ // negative for HasOnlyOneByteChars).

+ // Do a slower implementation for cases that include non-ASCII characters.

+ return LocaleConvertCase(s, isolate, false, "");

// Runtime entry that upper-cases its single String argument. Strings with
// only one-byte characters are converted without ICU as long as the result
// still fits in a one-byte string; everything else is handed to
// LocaleConvertCase with an empty locale.
// NOTE(review): the closing brace of this function is not visible in this
// listing - confirm against the real patch.
+RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

+ HandleScope scope(isolate);

+ DCHECK_EQ(args.length(), 1);

+ CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

+ // This function could be optimized for no-op cases the way lowercase

+ // counterpart is, but in empirical testing, few actual calls to upper()

+ // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

+ int32_t length = s->length();

+ s = String::Flatten(s);

+ if (s->HasOnlyOneByteChars()) {

+ Handle<SeqOneByteString> result;

+ ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

+ isolate, result, isolate->factory()->NewRawOneByteString(length));

+ int sharp_s_count;

+ bool is_result_single_byte;

// NOTE(review): the brace that closes the scope opened on the next line
// seems to be missing from this listing (no_gc and flat are declared a
// second time further down) - confirm against the real patch.
+ {

+ DisallowHeapAllocation no_gc;

+ String::FlatContent flat = s->GetFlatContent();

+ // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII

+ // could be removed because ToUpperOneByte is pretty fast now (it

+ // does not call ICU API any more.).

// Both branches try the ASCII-only pass first and return right away when it
// succeeds; otherwise ToUpperOneByte overwrites |result| from the start.
+ if (flat.IsOneByte()) {

+ Vector<const uint8_t> src = flat.ToOneByteVector();

+ if (ToUpperFastASCII(src, result)) return *result;

+ is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

+ } else {

+ DCHECK(flat.IsTwoByte());

+ Vector<const uint16_t> src = flat.ToUC16Vector();

+ if (ToUpperFastASCII(src, result)) return *result;

+ is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

+ }

+ // Go to the full Unicode path if there are characters whose uppercase

+ // is beyond the Latin-1 range (cannot be represented in OneByteString).

+ if (V8_UNLIKELY(!is_result_single_byte))

Yang 2016/04/27 08:06:45 add brackets please.

jungshik at Google 2016/04/28 10:50:10 Done.

+ return LocaleConvertCase(s, isolate, true, "");

+ if (sharp_s_count == 0) return *result;

+ // We have sharp_s_count sharp-s characters, but the result is still

+ // in the Latin-1 range.

// The first |result| is dropped here; a longer string with one extra slot
// per sharp-s is allocated and filled by a second pass over the input.
+ ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

+ isolate, result,

+ isolate->factory()->NewRawOneByteString(length + sharp_s_count));

+ DisallowHeapAllocation no_gc;

+ String::FlatContent flat = s->GetFlatContent();

+ if (flat.IsOneByte())

Yang 2016/04/27 08:06:45 brackets

jungshik at Google 2016/04/28 10:50:10 Done.

+ ToUpperWithSharpS(flat.ToOneByteVector(), result);

+ else

+ ToUpperWithSharpS(flat.ToUC16Vector(), result);

+ return *result;

+ }

+ return LocaleConvertCase(s, isolate, true, "");

+RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {

+ HandleScope scope(isolate);

+ DCHECK_EQ(args.length(), 3);

+ CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

+ CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);

+ CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);

+ // All the languages requiring special handling ("az", "el", "lt", "tr")

+ // have a 2-letter language code.

+ DCHECK(lang->length() == 2);

+ uint8_t lang_str[3];

+ memcpy(lang_str, lang->GetChars(), 2);

+ lang_str[2] = 0;

+ return LocaleConvertCase(s, isolate, is_upper,

+ reinterpret_cast<const char*>(lang_str));

} // namespace internal

} // namespace v8

« src/js/i18n.js ('K') | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »