src/runtime/runtime-i18n.cc - Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: use FlatContent for uppercase; add 3 templatized helpers Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include "src/api.h"	9 #include "src/api.h"

10 #include "src/api-natives.h"	10 #include "src/api-natives.h"

(...skipping 11 matching lines...) Expand all Loading...
22 #include "unicode/dcfmtsym.h"	22 #include "unicode/dcfmtsym.h"

23 #include "unicode/decimfmt.h"	23 #include "unicode/decimfmt.h"

24 #include "unicode/dtfmtsym.h"	24 #include "unicode/dtfmtsym.h"

25 #include "unicode/dtptngen.h"	25 #include "unicode/dtptngen.h"

26 #include "unicode/locid.h"	26 #include "unicode/locid.h"

27 #include "unicode/numfmt.h"	27 #include "unicode/numfmt.h"

28 #include "unicode/numsys.h"	28 #include "unicode/numsys.h"

29 #include "unicode/rbbi.h"	29 #include "unicode/rbbi.h"

30 #include "unicode/smpdtfmt.h"	30 #include "unicode/smpdtfmt.h"

31 #include "unicode/timezone.h"	31 #include "unicode/timezone.h"

	32 #include "unicode/translit.h"

32 #include "unicode/uchar.h"	33 #include "unicode/uchar.h"

33 #include "unicode/ucol.h"	34 #include "unicode/ucol.h"

34 #include "unicode/ucurr.h"	35 #include "unicode/ucurr.h"

35 #include "unicode/uloc.h"	36 #include "unicode/uloc.h"

	37 #include "unicode/unistr.h"

36 #include "unicode/unum.h"	38 #include "unicode/unum.h"

37 #include "unicode/uversion.h"	39 #include "unicode/uversion.h"

38	40

39	41

40 namespace v8 {	42 namespace v8 {

41 namespace internal {	43 namespace internal {

42	44

43 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {	45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {

44 HandleScope scope(isolate);	46 HandleScope scope(isolate);

45 Factory* factory = isolate->factory();	47 Factory* factory = isolate->factory();

(...skipping 696 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
742 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {	744 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {

743 return *isolate->factory()->NewStringFromStaticChars("letter");	745 return *isolate->factory()->NewStringFromStaticChars("letter");

744 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {	746 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {

745 return *isolate->factory()->NewStringFromStaticChars("kana");	747 return *isolate->factory()->NewStringFromStaticChars("kana");

746 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {	748 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {

747 return *isolate->factory()->NewStringFromStaticChars("ideo");	749 return *isolate->factory()->NewStringFromStaticChars("ideo");

748 } else {	750 } else {

749 return *isolate->factory()->NewStringFromStaticChars("unknown");	751 return *isolate->factory()->NewStringFromStaticChars("unknown");

750 }	752 }

751 }	753 }

	754

	755 namespace {

	756 void ConvertCaseWithTransliterator(icu::UnicodeString* input,

	757 const char* transliterator_id) {

	758 UErrorCode status = U_ZERO_ERROR;

	759 base::SmartPointer<icu::Transliterator> translit(

	760 icu::Transliterator::createInstance(

	761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,

	762 status));

	763 if (U_FAILURE(status)) return;

	764 translit->transliterate(*input);

	765 }

	766

	767 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,

	768 bool is_to_upper, int locale_id) {

	769 static const char* conversion_locales[] = {

	770 "az", "el", "lt", "tr",

	771 };

	772 RUNTIME_ASSERT(locale_id >= -1 &&

	773 locale_id < static_cast<int>(arraysize(conversion_locales)));

	774 int32_t src_length = s->length();

	775 const UChar* src = nullptr;

	776

	777 base::SmartArrayPointer<uc16> sap;

	778 if (s->IsOneByteRepresentationUnderneath()) {

	779 sap = s->ToWideCString();

	780 src = reinterpret_cast<const UChar*>(sap.get());

	781 }

	782

	783 // Greek (id == 1) uppercasing has to be done via transliteration.

	784 // TODO(jshin): Drop this special-casing once ICU's regular case conversion

	785 // API supports Greek uppercasing. See

	786 // http://bugs.icu-project.org/trac/ticket/10582 .

	787 // ICU's C API for transliteration is nasty and we just use C++ API.

	788 if (V8_UNLIKELY(locale_id == 1 && is_to_upper)) {
	Dan Ehrenberg 2016/04/20 22:01:30 Maybe make an enum for these ids, like enum Local Maybe make an enum for these ids, like enum LocaleID { ROOT = -1, AZERI = 0, GREEK = 1, ... }; static const char* conversion_locales[] = { ..., [GREEK] = "el", ... }; jungshik at Google 2016/04/21 20:39:17 Is it ok to use a C99 feature? I have to add '-Wn Show quoted text On 2016/04/20 22:01:30, Dan Ehrenberg wrote: > Maybe make an enum for these ids, like > > enum LocaleID { > ROOT = -1, > AZERI = 0, > GREEK = 1, > ... > }; > > static const char* conversion_locales[] = { > ..., > [GREEK] = "el", > ... > }; Is it ok to use a C99 feature? I have to add '-Wno-c99-extensions' to gyp file. This is related to your comment in i18n.js. If a string is used for locale, I don't have to worry about this. As I replied there, I have a perf concern (maybe I have to measure).
	789 icu::UnicodeString converted;

	790 {

	791 DisallowHeapAllocation no_gc;

	792 String::FlatContent flat = s->GetFlatContent();

	793 if (src == nullptr) {

	794 DCHECK(flat.IsTwoByte());

	795 src = reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

	796 }

	797 // Starts with the source string and will be replaced by the converted

	798 // result.

	799 converted.fastCopyFrom(icu::UnicodeString(false, src, src_length));

	800 ConvertCaseWithTransliterator(&converted, "el-Upper");

	801 }

	802 Handle<String> result;

	803 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	804 isolate, result,

	805 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(

	806 reinterpret_cast<const uint16_t*>(converted.getBuffer()),

	807 converted.length())));

	808 return *result;

	809 }

	810

	811 typedef int32_t (*case_conversion_fn)(

	812 UChar * dest, int32_t destCapacity, const UChar* src, int32_t srcLength,

	813 const char* locale, UErrorCode* pErrorCode);

	814 case_conversion_fn fn = is_to_upper ? u_strToUpper : u_strToLower;
	Dan Ehrenberg 2016/04/20 22:01:30 Nit: Since this is just used locally, maybe a good Nit: Since this is just used locally, maybe a good case for auto? jungshik at Google 2016/04/21 20:39:17 Thanks. Done Show quoted text On 2016/04/20 22:01:30, Dan Ehrenberg wrote: > Nit: Since this is just used locally, maybe a good case for auto? Thanks. Done
	815 const char* locale = locale_id == -1 ? "" : conversion_locales[locale_id];
	Dan Ehrenberg 2016/04/20 22:01:30 The use of the root locale seems appropriate here The use of the root locale seems appropriate here for String.prototype.toUpperCase. However, if the default locale is different from the root locale with respect to case mapping, do we know that it will be included in the set of four languages which is included in this code? jungshik at Google 2016/04/21 20:39:17 Again, "" means root locale and NULL means the def Show quoted text On 2016/04/20 22:01:30, Dan Ehrenberg wrote: > The use of the root locale seems appropriate here for > String.prototype.toUpperCase. However, if the default locale is different from > the root locale with respect to case mapping, do we know that it will be > included in the set of four languages which is included in this code? Again, "" means root locale and NULL means the default locale. :-) As toLocale{U,L}Case without an argument, it's the default locale. And, if the default locale happens to be one of 4, that's already taken care in JS code.
	816

	817 int32_t dest_length = src_length;

	818 UErrorCode error;

	819 Handle<SeqTwoByteString> result;

	820

	821 // This is not a real loop. It'll be executed only once (no overflow) or

	822 // twice (overflow).
	Dan Ehrenberg 2016/04/20 22:01:30 Any way we could include a DCHECK to ensure it doe Any way we could include a DCHECK to ensure it doesn't run more than twice? jungshik at Google 2016/04/21 20:39:17 This is the second time I got that question in a m Show quoted text On 2016/04/20 22:01:30, Dan Ehrenberg wrote: > Any way we could include a DCHECK to ensure it doesn't run more than twice? This is the second time I got that question in a month :-). (the first one was for a sqlite CL). This is a pretty common pattern used multiple times in both Chromium and ICU itself. Anyway, I'm turning do-while to for loop (i=0 ; i < 2; ++i).
	823 do {

	824 result =

	825 isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();

	826 base::SmartArrayPointer<uc16> sap;

	827 DisallowHeapAllocation no_gc;

	828 String::FlatContent flat = s->GetFlatContent();

	829 // For OneByteString, \|src\| is already obtained with \|sap\| outside the loop.

	830 if (flat.IsTwoByte())

	831 src = reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

	832 error = U_ZERO_ERROR;

	833 dest_length = fn(reinterpret_cast<UChar*>(result->GetChars()), dest_length,

	834 src, src_length, locale, &error);

	835 } while (error == U_BUFFER_OVERFLOW_ERROR);

	836

	837 // In most cases, the output will fill the destination buffer completely

	838 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).

	839 // Only in rare cases, it'll be shorter than the destination buffer and

	840 // \|result\| has to be truncated.

	841 DCHECK(U_SUCCESS(error));

	842 // dest_length == result->length()

	843 if (V8_LIKELY(error == U_STRING_NOT_TERMINATED_WARNING)) return *result;

	844 if (U_SUCCESS(error)) {

	845 // dest_length < result->length()

	846 return *Handle<SeqTwoByteString>::cast(

	847 SeqString::Truncate(result, dest_length));
	Dan Ehrenberg 2016/04/20 22:01:29 Do you have a test which hits this case? Do you have a test which hits this case? jungshik at Google 2016/04/21 20:39:17 intl/general/case-mapping.js has several (Greek an Show quoted text On 2016/04/20 22:01:29, Dan Ehrenberg wrote: > Do you have a test which hits this case? intl/general/case-mapping.js has several (Greek and Turkic drop diacritic marks). test262/intl402/Strings/* has some, too.
	848 }

	849 return *s;

	850 }

	851

	// True exactly when |ch| is one of the ASCII capital letters 'A'..'Z'.
	inline bool IsASCIIUpper(uint16_t ch) {
	  // A single unsigned comparison: values below 'A' wrap to a large number.
	  return static_cast<unsigned>(ch - 'A') <= static_cast<unsigned>('Z' - 'A');
	}

	853

	// Lowercases a Latin-1 code unit. ASCII 'A'..'Z' and U+00C0..U+00DE (except
	// U+00D7) get bit 5 set; every other value is returned unchanged.
	inline uint16_t ToLatin1Lower(uint16_t ch) {
	  const bool is_ascii_upper = ch >= 'A' && ch <= 'Z';
	  const bool is_latin1_upper = ch >= 0xC0 && ch <= 0xDE && ch != 0xD7;
	  if (is_ascii_upper || is_latin1_upper) {
	    return static_cast<uint16_t>(ch | 0x20);
	  }
	  return ch;
	}

	859

	// Uppercases ASCII 'a'..'z'; every other value is returned unchanged.
	inline uint16_t ToASCIIUpper(uint16_t ch) {
	  if (ch < 'a' || ch > 'z') return ch;
	  // Lower- and upper-case ASCII letters differ only in bit 5 (0x20).
	  return static_cast<uint16_t>(ch - ('a' - 'A'));
	}

	863

	// Uppercases a Latin-1 code unit whose uppercase form is also in Latin-1.
	//
	// ASCII 'a'..'z' and U+00E0..U+00FE have bit 5 cleared, with the exception of
	// U+00F7 (division sign), which has no case. Every other value is returned
	// unchanged.
	//
	// Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF: callers have
	// to handle those three before calling (checked by the DCHECK).
	inline uint16_t ToLatin1Upper(uint16_t ch) {
	  DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
	  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
	  // U+00E7 (c with cedilla) is a regular letter and must be converted; the
	  // caseless code point in this range is U+00F7.
	  const bool is_latin1_lower = ch >= 0xE0 && ch <= 0xFE && ch != 0xF7;
	  return ch & ~((is_ascii_lower || is_latin1_lower) << 5);
	}

	871

	872 template <typename Char>

	873 bool ToUpperFastASCII(const Vector<const Char>& src,

	874 Handle<SeqOneByteString> result) {

	875 // Do a faster loop for the case where all the characters are ASCII.

	876 uint16_t ored = 0;

	877 int32_t index = 0;

	878 for (auto it = src.begin(); it != src.end(); ++it) {

	879 uint16_t ch = static_cast<uint16_t>(*it);

	880 ored \|= ch;

	881 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));

	882 }

	883 return !(ored & ~0x7F);

	884 }

	885

	// U+00DF, the lowercase sharp-s; its uppercase form is the two letters "SS".
	886 const uint16_t sharp_s = 0xDF;

	887

	888 template <typename Char>

	889 bool ToUpperOneByte(const Vector<const Char>& src,

	890 Handle<SeqOneByteString> result, int* sharp_s_count) {

	891 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.

	892

	893 // There are two special cases.

	894 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.

	895 // 2. Lower case sharp-S converts to "SS" (two characters)

	896 *sharp_s_count = 0;

	897 int32_t index = 0;

	898 for (auto it = src.begin(); it != src.end(); ++it) {

	899 uint16_t ch = static_cast<uint16_t>(*it);

	900 if (V8_UNLIKELY(ch == sharp_s)) {

	901 ++(*sharp_s_count);

	902 continue;

	903 }

	904 if (V8_UNLIKELY(ch == 0xB5 \|\| ch == 0xFF)) {

	905 // Since this upper-cased character does not fit in an 8-bit string, we

	906 // need to take the 16-bit path.

	907 return false;

	908 }

	909 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));

	910 }

	911

	912 return true;

	913 }

	914

	915 template <typename Char>

	916 void ToUpperWithSharpS(const Vector<const Char>& src,

	917 Handle<SeqOneByteString> result) {

	918 int32_t dest_index = 0;

	919 for (auto it = src.begin(); it != src.end(); ++it) {

	920 uint16_t ch = static_cast<uint16_t>(*it);

	921 if (ch == sharp_s) {

	922 result->SeqOneByteStringSet(dest_index++, 'S');

	923 result->SeqOneByteStringSet(dest_index++, 'S');

	924 } else {

	925 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));

	926 }

	927 }

	928 }

	929

	930 } // namespace

	931

	932 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

	933 HandleScope scope(isolate);

	934 DCHECK_EQ(args.length(), 1);

	935 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	936

	937 int length = s->length();

	938 s = String::Flatten(s);

	939 // First scan the string for uppercase and non-ASCII characters:

	940 if (s->HasOnlyOneByteChars()) {

	941 unsigned first_index_to_lower = length;

	942 for (int index = 0; index < length; ++index) {

	943 // Blink specializes this path for one-byte strings, so it

	944 // does not need to do a generic get, but can do the equivalent

	945 // of SeqOneByteStringGet.

	946 uint16_t ch = s->Get(index);

	947 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

	948 first_index_to_lower = index;

	949 break;

	950 }

	951 }

	952

	953 // Nothing to do if the string is all ASCII with no uppercase.

	954 if (first_index_to_lower == length) return *s;

	955

	956 // We depend here on the invariant that the length of a Latin1

	957 // string is invariant under ToLowerCase, and the result always

	958 // fits in the Latin1 range (untrue for ToUpperCase, and might

	959 // be untrue in some locales, but this is the root locale)

	960 Handle<SeqOneByteString> result;

	961 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	962 isolate, result, isolate->factory()->NewRawOneByteString(length));

	963

	964 DisallowHeapAllocation no_gc;

	965 String::FlatContent flat = s->GetFlatContent();

	966 if (flat.IsOneByte()) {

	967 const uint8_t* src = flat.ToOneByteVector().start();

	968 CopyChars(result->GetChars(), src, first_index_to_lower);

	969 for (int index = first_index_to_lower; index < length; ++index) {

	970 uint16_t ch = static_cast<uint16_t>(src[index]);

	971 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

	972 }

	973 } else {

	974 const uint16_t* src = flat.ToUC16Vector().start();

	975 CopyChars(result->GetChars(), src, first_index_to_lower);

	976 for (int index = first_index_to_lower; index < length; ++index) {

	977 uint16_t ch = src[index];

	978 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

	979 }

	980 }

	981

	982 return *result;

	983 }

	984

	985 // Blink had an additional case here for ASCII 2-byte strings, but

	986 // that is subsumed by the above code (assuming there isn't a false

	987 // negative for HasOnlyOneByteChars).

	988

	989 // Do a slower implementation for cases that include non-ASCII characters.

	990 return LocaleConvertCase(s, isolate, false, -1);

	991 }

	992

	993 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

	994 HandleScope scope(isolate);

	995 DCHECK_EQ(args.length(), 1);

	996 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	997

	998 // This function could be optimized for no-op cases the way lowercase

	999 // counterpart is, but in empirical testing, few actual calls to upper()

	1000 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

	1001

	1002 int32_t length = s->length();

	1003 s = String::Flatten(s);

	1004

	1005 if (s->HasOnlyOneByteChars()) {

	1006 Handle<SeqOneByteString> result;

	1007 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1008 isolate, result, isolate->factory()->NewRawOneByteString(length));

	1009

	1010 int sharp_s_count;

	1011 bool is_result_single_byte;

	1012 {

	1013 DisallowHeapAllocation no_gc;

	1014 String::FlatContent flat = s->GetFlatContent();

	1015 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII

	1016 // could be removed because ToUpperOneByte is pretty fast now (it

	1017 // does not call ICU API any more.).

	1018 if (flat.IsOneByte()) {

	1019 Vector<const uint8_t> src = flat.ToOneByteVector();

	1020 if (ToUpperFastASCII(src, result)) return *result;

	1021 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

	1022 } else {

	1023 DCHECK(flat.IsTwoByte());

	1024 Vector<const uint16_t> src = flat.ToUC16Vector();

	1025 if (ToUpperFastASCII(src, result)) return *result;

	1026 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

	1027 }

	1028 }

	1029

	1030 // Go to the full Unicode path if there are characters whose uppercase

	1031 // is beyond the Latin-1 range (cannot be represented in OneByteString).

	1032 if (V8_UNLIKELY(!is_result_single_byte))

	1033 return LocaleConvertCase(s, isolate, true, -1);

	1034

	1035 if (sharp_s_count == 0) return *result;

	1036

	1037 // We have sharp_s_count sharp-s characters, but the result is still

	1038 // in the Latin-1 range.

	1039 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1040 isolate, result,

	1041 isolate->factory()->NewRawOneByteString(length + sharp_s_count));

	1042 DisallowHeapAllocation no_gc;

	1043 String::FlatContent flat = s->GetFlatContent();

	1044 if (flat.IsOneByte())

	1045 ToUpperWithSharpS(flat.ToOneByteVector(), result);

	1046 else

	1047 ToUpperWithSharpS(flat.ToUC16Vector(), result);

	1048

	1049 return *result;

	1050 }

	1051

	1052 return LocaleConvertCase(s, isolate, true, -1);

	1053 }

	1054

	1055 RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {

	1056 HandleScope scope(isolate);

	1057 DCHECK_EQ(args.length(), 3);

	1058 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	1059 CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);

	1060 CONVERT_NUMBER_CHECKED(int, lang_id, Int32, args[2]);

	1061

	1062 return LocaleConvertCase(s, isolate, is_upper, lang_id);

	1063 }

	1064

752 } // namespace internal	1065 } // namespace internal

753 } // namespace v8	1066 } // namespace v8

754	1067

755 #endif // V8_I18N_SUPPORT	1068 #endif // V8_I18N_SUPPORT

OLD	NEW

« src/js/i18n.js ('K') | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »