src/runtime/runtime-i18n.cc - Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Yang's comment addressed - return right away for no-change Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include "src/api.h"	9 #include "src/api.h"

10 #include "src/api-natives.h"	10 #include "src/api-natives.h"

(...skipping 11 matching lines...) Expand all Loading...
22 #include "unicode/dcfmtsym.h"	22 #include "unicode/dcfmtsym.h"

23 #include "unicode/decimfmt.h"	23 #include "unicode/decimfmt.h"

24 #include "unicode/dtfmtsym.h"	24 #include "unicode/dtfmtsym.h"

25 #include "unicode/dtptngen.h"	25 #include "unicode/dtptngen.h"

26 #include "unicode/locid.h"	26 #include "unicode/locid.h"

27 #include "unicode/numfmt.h"	27 #include "unicode/numfmt.h"

28 #include "unicode/numsys.h"	28 #include "unicode/numsys.h"

29 #include "unicode/rbbi.h"	29 #include "unicode/rbbi.h"

30 #include "unicode/smpdtfmt.h"	30 #include "unicode/smpdtfmt.h"

31 #include "unicode/timezone.h"	31 #include "unicode/timezone.h"

	32 #include "unicode/translit.h"

32 #include "unicode/uchar.h"	33 #include "unicode/uchar.h"

33 #include "unicode/ucol.h"	34 #include "unicode/ucol.h"

34 #include "unicode/ucurr.h"	35 #include "unicode/ucurr.h"

35 #include "unicode/uloc.h"	36 #include "unicode/uloc.h"

	37 #include "unicode/unistr.h"

36 #include "unicode/unum.h"	38 #include "unicode/unum.h"

37 #include "unicode/uversion.h"	39 #include "unicode/uversion.h"

38	40

39	41

40 namespace v8 {	42 namespace v8 {

41 namespace internal {	43 namespace internal {

42	44

43 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {	45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {

44 HandleScope scope(isolate);	46 HandleScope scope(isolate);

45 Factory* factory = isolate->factory();	47 Factory* factory = isolate->factory();

(...skipping 696 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
742 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {	744 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {

743 return *isolate->factory()->NewStringFromStaticChars("letter");	745 return *isolate->factory()->NewStringFromStaticChars("letter");

744 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {	746 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {

745 return *isolate->factory()->NewStringFromStaticChars("kana");	747 return *isolate->factory()->NewStringFromStaticChars("kana");

746 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {	748 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {

747 return *isolate->factory()->NewStringFromStaticChars("ideo");	749 return *isolate->factory()->NewStringFromStaticChars("ideo");

748 } else {	750 } else {

749 return *isolate->factory()->NewStringFromStaticChars("unknown");	751 return *isolate->factory()->NewStringFromStaticChars("unknown");

750 }	752 }

751 }	753 }

	754
	srl295 2016/07/27 18:53:39 Filed ICU bug http://bugs.icu-project.org/trac/ticket/12647 to pull this fastpath into ICU.
	755 namespace {

	756 void ConvertCaseWithTransliterator(icu::UnicodeString* input,

	757 const char* transliterator_id) {

	758 UErrorCode status = U_ZERO_ERROR;

	759 base::SmartPointer<icu::Transliterator> translit(

	760 icu::Transliterator::createInstance(

	761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,

	762 status));

	763 if (U_FAILURE(status)) return;

	764 translit->transliterate(*input);

	765 }

	766

	767 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,

	768 base::SmartArrayPointer<uc16>* dest,

	769 int32_t length) {

	770 DCHECK(flat.IsFlat());

	771 if (flat.IsOneByte()) {

	772 if (dest->is_empty()) {

	773 dest->Reset(NewArray<uc16>(length));

	774 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);

	775 }

	776 return reinterpret_cast<const UChar*>(dest->get());

	777 } else {

	778 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

	779 }

	780 }

	781

	782 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,

	783 bool is_to_upper, const char* lang) {

	784 int32_t src_length = s->length();

	785

	786 // Greek uppercasing has to be done via transliteration.

	787 // TODO(jshin): Drop this special-casing once ICU's regular case conversion

	788 // API supports Greek uppercasing. See

	789 // http://bugs.icu-project.org/trac/ticket/10582 .

	790 // In the meantime, if there's no Greek character in \|s\|, call this

	791 // function again with the root locale (lang="").

	792 // ICU's C API for transliteration is nasty and we just use C++ API.

	793 if (V8_UNLIKELY(is_to_upper && lang[0] == 'e' && lang[1] == 'l')) {

	794 icu::UnicodeString converted;

	795 base::SmartArrayPointer<uc16> sap;

	796 {

	797 DisallowHeapAllocation no_gc;

	798 String::FlatContent flat = s->GetFlatContent();

	799 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);

	800 // Starts with the source string (read-only alias with copy-on-write

	801 // semantics) and will be modified to contain the converted result.

	802 // Using read-only alias at first saves one copy operation if

	803 // transliteration does not change the input, which is rather rare.

	804 // Moreover, transliteration takes rather long so that saving one copy

	805 // helps only a little bit.

	806 converted.setTo(false, src, src_length);

	807 ConvertCaseWithTransliterator(&converted, "el-Upper");

	808 // If no change is made, just return \|s\|.

	809 if (converted.getBuffer() == src) return *s;

	810 }

	811 Handle<String> result;

	812 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	813 isolate, result,

	814 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(

	815 reinterpret_cast<const uint16_t*>(converted.getBuffer()),

	816 converted.length())));

	817 return *result;

	818 }

	819

	820 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;

	821

	822 int32_t dest_length = src_length;

	823 UErrorCode status;

	824 Handle<SeqTwoByteString> result;

	825 base::SmartArrayPointer<uc16> sap;

	826

	827 // This is not a real loop. It'll be executed only once (no overflow) or

	828 // twice (overflow).

	829 for (int i = 0; i < 2; ++i) {

	830 result =

	831 isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();

	832 DisallowHeapAllocation no_gc;

	833 String::FlatContent flat = s->GetFlatContent();

	834 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);

	835 status = U_ZERO_ERROR;

	836 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),

	837 dest_length, src, src_length, lang, &status);

	838 if (status != U_BUFFER_OVERFLOW_ERROR) break;

	839 }

	840

	841 // In most cases, the output will fill the destination buffer completely

	842 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).

	843 // Only in rare cases, it'll be shorter than the destination buffer and

	844 // \|result\| has to be truncated.

	845 DCHECK(U_SUCCESS(status));

	846 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {

	847 DCHECK(dest_length == result->length());

	848 return *result;

	849 }

	850 if (U_SUCCESS(status)) {

	851 DCHECK(dest_length < result->length());

	852 return *Handle<SeqTwoByteString>::cast(

	853 SeqString::Truncate(result, dest_length));

	854 }

	855 return *s;

	856 }

	857

	858 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }

	859

	860 const uint8_t kToLower[256] = {

	861 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,

	862 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,

	863 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,

	864 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,

	865 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,

	866 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,

	867 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,

	868 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,

	869 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,

	870 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,

	871 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,

	872 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,

	873 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,

	874 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,

	875 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,

	876 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,

	877 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,

	878 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,

	879 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,

	880 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,

	881 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,

	882 0xFC, 0xFD, 0xFE, 0xFF,

	883 };

	884

	885 inline uint16_t ToLatin1Lower(uint16_t ch) {

	886 return static_cast<uint16_t>(kToLower[ch]);

	887 }

	888

	889 inline uint16_t ToASCIIUpper(uint16_t ch) {

	890 return ch & ~((ch >= 'a' && ch <= 'z') << 5);

	891 }

	892

	893 // Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF.

	894 inline uint16_t ToLatin1Upper(uint16_t ch) {

	895 DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);

	896 return ch &

	897 ~(((ch >= 'a' && ch <= 'z') \|\| (((ch & 0xE0) == 0xE0) && ch != 0xE7))

	898 << 5);

	899 }

	900

	901 template <typename Char>

	902 bool ToUpperFastASCII(const Vector<const Char>& src,

	903 Handle<SeqOneByteString> result) {

	904 // Do a faster loop for the case where all the characters are ASCII.

	905 uint16_t ored = 0;

	906 int32_t index = 0;

	907 for (auto it = src.begin(); it != src.end(); ++it) {

	908 uint16_t ch = static_cast<uint16_t>(*it);

	909 ored \|= ch;

	910 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));

	911 }

	912 return !(ored & ~0x7F);

	913 }

	914

	915 const uint16_t sharp_s = 0xDF;

	916

	917 template <typename Char>

	918 bool ToUpperOneByte(const Vector<const Char>& src,

	919 Handle<SeqOneByteString> result, int* sharp_s_count) {

	920 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.

	921

	922 // There are two special cases.

	923 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.

	924 // 2. Lower case sharp-S converts to "SS" (two characters)

	925 *sharp_s_count = 0;

	926 int32_t index = 0;

	927 for (auto it = src.begin(); it != src.end(); ++it) {

	928 uint16_t ch = static_cast<uint16_t>(*it);

	929 if (V8_UNLIKELY(ch == sharp_s)) {

	930 ++(*sharp_s_count);

	931 continue;

	932 }

	933 if (V8_UNLIKELY(ch == 0xB5 \|\| ch == 0xFF)) {

	934 // Since this upper-cased character does not fit in an 8-bit string, we

	935 // need to take the 16-bit path.

	936 return false;

	937 }

	938 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));

	939 }

	940

	941 return true;

	942 }

	943

	944 template <typename Char>

	945 void ToUpperWithSharpS(const Vector<const Char>& src,

	946 Handle<SeqOneByteString> result) {

	947 int32_t dest_index = 0;

	948 for (auto it = src.begin(); it != src.end(); ++it) {

	949 uint16_t ch = static_cast<uint16_t>(*it);

	950 if (ch == sharp_s) {

	951 result->SeqOneByteStringSet(dest_index++, 'S');

	952 result->SeqOneByteStringSet(dest_index++, 'S');

	953 } else {

	954 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));

	955 }

	956 }

	957 }

	958

	959 } // namespace

	960

	961 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

	962 HandleScope scope(isolate);

	963 DCHECK_EQ(args.length(), 1);

	964 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	965

	966 int length = s->length();

	967 s = String::Flatten(s);

	968 // First scan the string for uppercase and non-ASCII characters:

	969 if (s->HasOnlyOneByteChars()) {

	970 unsigned first_index_to_lower = length;

	971 for (int index = 0; index < length; ++index) {

	972 // Blink specializes this path for one-byte strings, so it

	973 // does not need to do a generic get, but can do the equivalent

	974 // of SeqOneByteStringGet.

	975 uint16_t ch = s->Get(index);

	976 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

	977 first_index_to_lower = index;

	978 break;

	979 }

	980 }

	981

	982 // Nothing to do if the string is all ASCII with no uppercase.

	983 if (first_index_to_lower == length) return *s;

	984

	985 // We depend here on the invariant that the length of a Latin1

	986 // string is invariant under ToLowerCase, and the result always

	987 // fits in the Latin1 range in the root locale. It does not hold

	988 // for ToUpperCase even in the root locale.

	989 Handle<SeqOneByteString> result;

	990 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	991 isolate, result, isolate->factory()->NewRawOneByteString(length));

	992

	993 DisallowHeapAllocation no_gc;

	994 String::FlatContent flat = s->GetFlatContent();

	995 if (flat.IsOneByte()) {

	996 const uint8_t* src = flat.ToOneByteVector().start();

	997 CopyChars(result->GetChars(), src, first_index_to_lower);

	998 for (int index = first_index_to_lower; index < length; ++index) {

	999 uint16_t ch = static_cast<uint16_t>(src[index]);

	1000 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

	1001 }

	1002 } else {

	1003 const uint16_t* src = flat.ToUC16Vector().start();

	1004 CopyChars(result->GetChars(), src, first_index_to_lower);

	1005 for (int index = first_index_to_lower; index < length; ++index) {

	1006 uint16_t ch = src[index];

	1007 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

	1008 }

	1009 }

	1010

	1011 return *result;

	1012 }

	1013

	1014 // Blink had an additional case here for ASCII 2-byte strings, but

	1015 // that is subsumed by the above code (assuming there isn't a false

	1016 // negative for HasOnlyOneByteChars).

	1017

	1018 // Do a slower implementation for cases that include non-ASCII characters.

	1019 return LocaleConvertCase(s, isolate, false, "");

	1020 }

	1021

	1022 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

	1023 HandleScope scope(isolate);

	1024 DCHECK_EQ(args.length(), 1);

	1025 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	1026

	1027 // This function could be optimized for no-op cases the way lowercase

	1028 // counterpart is, but in empirical testing, few actual calls to upper()

	1029 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

	1030

	1031 int32_t length = s->length();

	1032 s = String::Flatten(s);

	1033

	1034 if (s->HasOnlyOneByteChars()) {

	1035 Handle<SeqOneByteString> result;

	1036 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1037 isolate, result, isolate->factory()->NewRawOneByteString(length));

	1038

	1039 int sharp_s_count;

	1040 bool is_result_single_byte;

	1041 {

	1042 DisallowHeapAllocation no_gc;

	1043 String::FlatContent flat = s->GetFlatContent();

	1044 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII

	1045 // could be removed because ToUpperOneByte is pretty fast now (it

	1046 // does not call ICU API any more.).

	1047 if (flat.IsOneByte()) {

	1048 Vector<const uint8_t> src = flat.ToOneByteVector();

	1049 if (ToUpperFastASCII(src, result)) return *result;

	1050 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

	1051 } else {

	1052 DCHECK(flat.IsTwoByte());

	1053 Vector<const uint16_t> src = flat.ToUC16Vector();

	1054 if (ToUpperFastASCII(src, result)) return *result;

	1055 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

	1056 }

	1057 }

	1058

	1059 // Go to the full Unicode path if there are characters whose uppercase

	1060 // is beyond the Latin-1 range (cannot be represented in OneByteString).

	1061 if (V8_UNLIKELY(!is_result_single_byte)) {

	1062 return LocaleConvertCase(s, isolate, true, "");

	1063 }

	1064

	1065 if (sharp_s_count == 0) return *result;

	1066

	1067 // We have sharp_s_count sharp-s characters, but the result is still

	1068 // in the Latin-1 range.

	1069 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1070 isolate, result,

	1071 isolate->factory()->NewRawOneByteString(length + sharp_s_count));

	1072 DisallowHeapAllocation no_gc;

	1073 String::FlatContent flat = s->GetFlatContent();

	1074 if (flat.IsOneByte()) {

	1075 ToUpperWithSharpS(flat.ToOneByteVector(), result);

	1076 } else {

	1077 ToUpperWithSharpS(flat.ToUC16Vector(), result);

	1078 }

	1079

	1080 return *result;

	1081 }

	1082

	1083 return LocaleConvertCase(s, isolate, true, "");

	1084 }

	1085

	1086 RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {

	1087 HandleScope scope(isolate);

	1088 DCHECK_EQ(args.length(), 3);

	1089 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	1090 CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);

	1091 CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);

	1092

	1093 // All the languages requiring special handling ("az", "el", "lt", "tr")

	1094 // have a 2-letter language code.

	1095 DCHECK(lang->length() == 2);

	1096 uint8_t lang_str[3];

	1097 memcpy(lang_str, lang->GetChars(), 2);

	1098 lang_str[2] = 0;

	1099 s = String::Flatten(s);

	1100 // TODO(jshin): Consider adding a fast path for ASCII or Latin-1. The fastpath

	1101 // in the root locale needs to be adjusted for az, lt and tr because even case

	1102 // mapping of ASCII range characters are different in those locales.

	1103 // Greek (el) does not require any adjustment, though.

	1104 return LocaleConvertCase(s, isolate, is_upper,

	1105 reinterpret_cast<const char*>(lang_str));

	1106 }

	1107

752 } // namespace internal	1108 } // namespace internal

753 } // namespace v8	1109 } // namespace v8

754	1110

755 #endif // V8_I18N_SUPPORT	1111 #endif // V8_I18N_SUPPORT

OLD	NEW

« src/js/i18n.js ('K') | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »