src/runtime/runtime-i18n.cc - Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Address Yang's comment Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include "src/api.h"	9 #include "src/api.h"

10 #include "src/api-natives.h"	10 #include "src/api-natives.h"

(...skipping 11 matching lines...) Expand all Loading...
22 #include "unicode/dcfmtsym.h"	22 #include "unicode/dcfmtsym.h"

23 #include "unicode/decimfmt.h"	23 #include "unicode/decimfmt.h"

24 #include "unicode/dtfmtsym.h"	24 #include "unicode/dtfmtsym.h"

25 #include "unicode/dtptngen.h"	25 #include "unicode/dtptngen.h"

26 #include "unicode/locid.h"	26 #include "unicode/locid.h"

27 #include "unicode/numfmt.h"	27 #include "unicode/numfmt.h"

28 #include "unicode/numsys.h"	28 #include "unicode/numsys.h"

29 #include "unicode/rbbi.h"	29 #include "unicode/rbbi.h"

30 #include "unicode/smpdtfmt.h"	30 #include "unicode/smpdtfmt.h"

31 #include "unicode/timezone.h"	31 #include "unicode/timezone.h"

	32 #include "unicode/translit.h"

32 #include "unicode/uchar.h"	33 #include "unicode/uchar.h"

33 #include "unicode/ucol.h"	34 #include "unicode/ucol.h"

34 #include "unicode/ucurr.h"	35 #include "unicode/ucurr.h"

35 #include "unicode/uloc.h"	36 #include "unicode/uloc.h"

	37 #include "unicode/unistr.h"

36 #include "unicode/unum.h"	38 #include "unicode/unum.h"

37 #include "unicode/uversion.h"	39 #include "unicode/uversion.h"

38	40

39	41

40 namespace v8 {	42 namespace v8 {

41 namespace internal {	43 namespace internal {

42	44

43 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {	45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {

44 HandleScope scope(isolate);	46 HandleScope scope(isolate);

45 Factory* factory = isolate->factory();	47 Factory* factory = isolate->factory();

(...skipping 696 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
742 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {	744 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {

743 return *isolate->factory()->NewStringFromStaticChars("letter");	745 return *isolate->factory()->NewStringFromStaticChars("letter");

744 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {	746 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {

745 return *isolate->factory()->NewStringFromStaticChars("kana");	747 return *isolate->factory()->NewStringFromStaticChars("kana");

746 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {	748 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {

747 return *isolate->factory()->NewStringFromStaticChars("ideo");	749 return *isolate->factory()->NewStringFromStaticChars("ideo");

748 } else {	750 } else {

749 return *isolate->factory()->NewStringFromStaticChars("unknown");	751 return *isolate->factory()->NewStringFromStaticChars("unknown");

750 }	752 }

751 }	753 }

	754

	755 namespace {

	756 void ConvertCaseWithTransliterator(icu::UnicodeString* input,

	757 const char* transliterator_id) {

	758 UErrorCode status = U_ZERO_ERROR;

	759 base::SmartPointer<icu::Transliterator> translit(

	760 icu::Transliterator::createInstance(

	761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,

	762 status));

	763 if (U_FAILURE(status)) return;

	764 translit->transliterate(*input);

	765 }

	766

	767 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,

	768 base::SmartArrayPointer<uc16>* dest,

	769 int32_t length) {

	770 DCHECK(flat.IsFlat());

	771 if (flat.IsOneByte()) {

	772 if (dest->is_empty()) {

	773 dest->Reset(NewArray<uc16>(length));

	774 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);

	775 }

	776 return reinterpret_cast<const UChar*>(dest->get());

	777 } else {

	778 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

	779 }

	780 }

	781

	782 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,

	783 bool is_to_upper, const char* lang) {

	784 int32_t src_length = s->length();

	785

	786 // Greek uppercasing has to be done via transliteration.

	787 // TODO(jshin): Drop this special-casing once ICU's regular case conversion

	788 // API supports Greek uppercasing. See

	789 // http://bugs.icu-project.org/trac/ticket/10582 .

	790 // In the meantime, if there's no Greek character in |s|, call this

	791 // function again with the root locale (lang="").

	792 // ICU's C API for transliteration is nasty and we just use C++ API.

	793 if (V8_UNLIKELY(is_to_upper && lang[0] == 'e' && lang[1] == 'l')) {

	794 icu::UnicodeString converted;

	795 base::SmartArrayPointer<uc16> sap;

	796 {

	797 DisallowHeapAllocation no_gc;

	798 String::FlatContent flat = s->GetFlatContent();

	799 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);

	800 // Starts with the source string (read-only alias with copy-on-write

	801 // semantics) and will be modified to contain the converted result.

	802 // Using read-only alias at first saves one copy operation if

	803 // transliteration does not change the input, which is rather rare.

	804 // Moreover, transliteration takes rather long so that saving one copy

	805 // helps only a little bit.

	806 converted.setTo(false, src, src_length);

	807 ConvertCaseWithTransliterator(&converted, "el-Upper");
	Yang 2016/05/11 08:42:30: So... if ConvertCaseWithTransliterator does not change the input, 'converted' should still alias 'src', right? Can we simply return 's' if converted.getBuffer() == src? That doesn't seem to be expensive.
	jungshik at Google 2016/05/11 18:10:08: On 2016/05/11 08:42:30, Yang wrote: > So... if ConvertCaseWithTransliterator does not change the input, 'converted' should still alias 'src', right? Can we simply return 's' if converted.getBuffer() == src? That doesn't seem to be expensive. Done.
	808 }

	809 Handle<String> result;

	810 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	811 isolate, result,

	812 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(

	813 reinterpret_cast<const uint16_t*>(converted.getBuffer()),

	814 converted.length())));

	815 return *result;

	816 }

	817

	818 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;

	819

	820 int32_t dest_length = src_length;

	821 UErrorCode status;

	822 Handle<SeqTwoByteString> result;

	823 base::SmartArrayPointer<uc16> sap;

	824

	825 // This is not a real loop. It'll be executed only once (no overflow) or

	826 // twice (overflow).

	827 for (int i = 0; i < 2; ++i) {

	828 result =

	829 isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();

	830 DisallowHeapAllocation no_gc;

	831 String::FlatContent flat = s->GetFlatContent();

	832 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);

	833 status = U_ZERO_ERROR;

	834 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),

	835 dest_length, src, src_length, lang, &status);

	836 if (status != U_BUFFER_OVERFLOW_ERROR) break;

	837 }

	838

	839 // In most cases, the output will fill the destination buffer completely

	840 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).

	841 // Only in rare cases, it'll be shorter than the destination buffer and

	842 // |result| has to be truncated.

	843 DCHECK(U_SUCCESS(status));

	844 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {

	845 DCHECK(dest_length == result->length());

	846 return *result;

	847 }

	848 if (U_SUCCESS(status)) {

	849 DCHECK(dest_length < result->length());

	850 return *Handle<SeqTwoByteString>::cast(

	851 SeqString::Truncate(result, dest_length));

	852 }

	853 return *s;

	854 }

	855

	856 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }

	857

	858 const uint8_t kToLower[256] = {

	859 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,

	860 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,

	861 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,

	862 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,

	863 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,

	864 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,

	865 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,

	866 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,

	867 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,

	868 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,

	869 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,

	870 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,

	871 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,

	872 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,

	873 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,

	874 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,

	875 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,

	876 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,

	877 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,

	878 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,

	879 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,

	880 0xFC, 0xFD, 0xFE, 0xFF,

	881 };

	882

	883 inline uint16_t ToLatin1Lower(uint16_t ch) {

	884 return static_cast<uint16_t>(kToLower[ch]);

	885 }

	886

	887 inline uint16_t ToASCIIUpper(uint16_t ch) {

	888 return ch & ~((ch >= 'a' && ch <= 'z') << 5);

	889 }

	890

	891 // Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF.

	892 inline uint16_t ToLatin1Upper(uint16_t ch) {

	893 DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);

	894 return ch &

	895 ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xE7))

	896 << 5);

	897 }

	898

	899 template <typename Char>

	900 bool ToUpperFastASCII(const Vector<const Char>& src,

	901 Handle<SeqOneByteString> result) {

	902 // Do a faster loop for the case where all the characters are ASCII.

	903 uint16_t ored = 0;

	904 int32_t index = 0;

	905 for (auto it = src.begin(); it != src.end(); ++it) {

	906 uint16_t ch = static_cast<uint16_t>(*it);

	907 ored |= ch;

	908 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));

	909 }

	910 return !(ored & ~0x7F);

	911 }

	912

	913 const uint16_t sharp_s = 0xDF;

	914

	915 template <typename Char>

	916 bool ToUpperOneByte(const Vector<const Char>& src,

	917 Handle<SeqOneByteString> result, int* sharp_s_count) {

	918 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.

	919

	920 // There are two special cases.

	921 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.

	922 // 2. Lower case sharp-S converts to "SS" (two characters)

	923 *sharp_s_count = 0;

	924 int32_t index = 0;

	925 for (auto it = src.begin(); it != src.end(); ++it) {

	926 uint16_t ch = static_cast<uint16_t>(*it);

	927 if (V8_UNLIKELY(ch == sharp_s)) {

	928 ++(*sharp_s_count);

	929 continue;

	930 }

	931 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {

	932 // Since this upper-cased character does not fit in an 8-bit string, we

	933 // need to take the 16-bit path.

	934 return false;

	935 }

	936 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));

	937 }

	938

	939 return true;

	940 }

	941

	942 template <typename Char>

	943 void ToUpperWithSharpS(const Vector<const Char>& src,

	944 Handle<SeqOneByteString> result) {

	945 int32_t dest_index = 0;

	946 for (auto it = src.begin(); it != src.end(); ++it) {

	947 uint16_t ch = static_cast<uint16_t>(*it);

	948 if (ch == sharp_s) {

	949 result->SeqOneByteStringSet(dest_index++, 'S');

	950 result->SeqOneByteStringSet(dest_index++, 'S');

	951 } else {

	952 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));

	953 }

	954 }

	955 }

	956

	957 } // namespace

	958

	959 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

	960 HandleScope scope(isolate);

	961 DCHECK_EQ(args.length(), 1);

	962 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	963

	964 int length = s->length();

	965 s = String::Flatten(s);

	966 // First scan the string for uppercase and non-ASCII characters:

	967 if (s->HasOnlyOneByteChars()) {

	968 unsigned first_index_to_lower = length;

	969 for (int index = 0; index < length; ++index) {

	970 // Blink specializes this path for one-byte strings, so it

	971 // does not need to do a generic get, but can do the equivalent

	972 // of SeqOneByteStringGet.

	973 uint16_t ch = s->Get(index);

	974 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {

	975 first_index_to_lower = index;

	976 break;

	977 }

	978 }

	979

	980 // Nothing to do if the string is all ASCII with no uppercase.

	981 if (first_index_to_lower == length) return *s;

	982

	983 // We depend here on the invariant that the length of a Latin1

	984 // string is invariant under ToLowerCase, and the result always

	985 // fits in the Latin1 range in the root locale. It does not hold

	986 // for ToUpperCase even in the root locale.

	987 Handle<SeqOneByteString> result;

	988 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	989 isolate, result, isolate->factory()->NewRawOneByteString(length));

	990

	991 DisallowHeapAllocation no_gc;

	992 String::FlatContent flat = s->GetFlatContent();

	993 if (flat.IsOneByte()) {

	994 const uint8_t* src = flat.ToOneByteVector().start();

	995 CopyChars(result->GetChars(), src, first_index_to_lower);

	996 for (int index = first_index_to_lower; index < length; ++index) {

	997 uint16_t ch = static_cast<uint16_t>(src[index]);

	998 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

	999 }

	1000 } else {

	1001 const uint16_t* src = flat.ToUC16Vector().start();

	1002 CopyChars(result->GetChars(), src, first_index_to_lower);

	1003 for (int index = first_index_to_lower; index < length; ++index) {

	1004 uint16_t ch = src[index];

	1005 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

	1006 }

	1007 }

	1008

	1009 return *result;

	1010 }

	1011

	1012 // Blink had an additional case here for ASCII 2-byte strings, but

	1013 // that is subsumed by the above code (assuming there isn't a false

	1014 // negative for HasOnlyOneByteChars).

	1015

	1016 // Do a slower implementation for cases that include non-ASCII characters.

	1017 return LocaleConvertCase(s, isolate, false, "");

	1018 }

	1019

	1020 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

	1021 HandleScope scope(isolate);

	1022 DCHECK_EQ(args.length(), 1);

	1023 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	1024

	1025 // This function could be optimized for no-op cases the way lowercase

	1026 // counterpart is, but in empirical testing, few actual calls to upper()

	1027 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

	1028

	1029 int32_t length = s->length();

	1030 s = String::Flatten(s);

	1031

	1032 if (s->HasOnlyOneByteChars()) {

	1033 Handle<SeqOneByteString> result;

	1034 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1035 isolate, result, isolate->factory()->NewRawOneByteString(length));

	1036

	1037 int sharp_s_count;

	1038 bool is_result_single_byte;

	1039 {

	1040 DisallowHeapAllocation no_gc;

	1041 String::FlatContent flat = s->GetFlatContent();

	1042 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII

	1043 // could be removed because ToUpperOneByte is pretty fast now (it

	1044 // does not call ICU API any more.).

	1045 if (flat.IsOneByte()) {

	1046 Vector<const uint8_t> src = flat.ToOneByteVector();

	1047 if (ToUpperFastASCII(src, result)) return *result;

	1048 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

	1049 } else {

	1050 DCHECK(flat.IsTwoByte());

	1051 Vector<const uint16_t> src = flat.ToUC16Vector();

	1052 if (ToUpperFastASCII(src, result)) return *result;

	1053 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

	1054 }

	1055 }

	1056

	1057 // Go to the full Unicode path if there are characters whose uppercase

	1058 // is beyond the Latin-1 range (cannot be represented in OneByteString).

	1059 if (V8_UNLIKELY(!is_result_single_byte)) {

	1060 return LocaleConvertCase(s, isolate, true, "");

	1061 }

	1062

	1063 if (sharp_s_count == 0) return *result;

	1064

	1065 // We have sharp_s_count sharp-s characters, but the result is still

	1066 // in the Latin-1 range.

	1067 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1068 isolate, result,

	1069 isolate->factory()->NewRawOneByteString(length + sharp_s_count));

	1070 DisallowHeapAllocation no_gc;

	1071 String::FlatContent flat = s->GetFlatContent();

	1072 if (flat.IsOneByte()) {

	1073 ToUpperWithSharpS(flat.ToOneByteVector(), result);

	1074 } else {

	1075 ToUpperWithSharpS(flat.ToUC16Vector(), result);

	1076 }

	1077

	1078 return *result;

	1079 }

	1080

	1081 return LocaleConvertCase(s, isolate, true, "");

	1082 }

	1083

	1084 RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {

	1085 HandleScope scope(isolate);

	1086 DCHECK_EQ(args.length(), 3);

	1087 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	1088 CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);

	1089 CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);

	1090

	1091 // All the languages requiring special handling ("az", "el", "lt", "tr")

	1092 // have a 2-letter language code.

	1093 DCHECK(lang->length() == 2);

	1094 uint8_t lang_str[3];

	1095 memcpy(lang_str, lang->GetChars(), 2);

	1096 lang_str[2] = 0;

	1097 s = String::Flatten(s);

	1098 // TODO(jshin): Consider adding a fast path for ASCII or Latin-1. The fastpath

	1099 // in the root locale needs to be adjusted for az, lt and tr because even case

	1100 // mapping of ASCII range characters are different in those locales.

	1101 // Greek (el) does not require any adjustment, though.

	1102 return LocaleConvertCase(s, isolate, is_upper,

	1103 reinterpret_cast<const char*>(lang_str));

	1104 }

	1105

752 } // namespace internal	1106 } // namespace internal

753 } // namespace v8	1107 } // namespace v8

754	1108

755 #endif // V8_I18N_SUPPORT	1109 #endif // V8_I18N_SUPPORT

OLD	NEW

« no previous file with comments | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »