src/runtime/runtime-i18n.cc - Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: working with icu-case-mapping.js manually compiled in; gyp change not picking it up Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include "src/api.h"	9 #include "src/api.h"

10 #include "src/api-natives.h"	10 #include "src/api-natives.h"

(...skipping 11 matching lines...) Expand all Loading...
22 #include "unicode/dcfmtsym.h"	22 #include "unicode/dcfmtsym.h"

23 #include "unicode/decimfmt.h"	23 #include "unicode/decimfmt.h"

24 #include "unicode/dtfmtsym.h"	24 #include "unicode/dtfmtsym.h"

25 #include "unicode/dtptngen.h"	25 #include "unicode/dtptngen.h"

26 #include "unicode/locid.h"	26 #include "unicode/locid.h"

27 #include "unicode/numfmt.h"	27 #include "unicode/numfmt.h"

28 #include "unicode/numsys.h"	28 #include "unicode/numsys.h"

29 #include "unicode/rbbi.h"	29 #include "unicode/rbbi.h"

30 #include "unicode/smpdtfmt.h"	30 #include "unicode/smpdtfmt.h"

31 #include "unicode/timezone.h"	31 #include "unicode/timezone.h"

	32 #include "unicode/translit.h"

32 #include "unicode/uchar.h"	33 #include "unicode/uchar.h"

33 #include "unicode/ucol.h"	34 #include "unicode/ucol.h"

34 #include "unicode/ucurr.h"	35 #include "unicode/ucurr.h"

35 #include "unicode/uloc.h"	36 #include "unicode/uloc.h"

	37 #include "unicode/unistr.h"

36 #include "unicode/unum.h"	38 #include "unicode/unum.h"

37 #include "unicode/uversion.h"	39 #include "unicode/uversion.h"

38	40

39	41

40 namespace v8 {	42 namespace v8 {

41 namespace internal {	43 namespace internal {

42	44

43 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {	45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {

44 HandleScope scope(isolate);	46 HandleScope scope(isolate);

45 Factory* factory = isolate->factory();	47 Factory* factory = isolate->factory();

(...skipping 696 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
742 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {	744 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {

743 return *isolate->factory()->NewStringFromStaticChars("letter");	745 return *isolate->factory()->NewStringFromStaticChars("letter");

744 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {	746 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {

745 return *isolate->factory()->NewStringFromStaticChars("kana");	747 return *isolate->factory()->NewStringFromStaticChars("kana");

746 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {	748 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {

747 return *isolate->factory()->NewStringFromStaticChars("ideo");	749 return *isolate->factory()->NewStringFromStaticChars("ideo");

748 } else {	750 } else {

749 return *isolate->factory()->NewStringFromStaticChars("unknown");	751 return *isolate->factory()->NewStringFromStaticChars("unknown");

750 }	752 }

751 }	753 }

	754

	755 namespace {

	756 void ConvertCaseWithTransliterator(icu::UnicodeString* input,

	757 const char* transliterator_id) {

	758 UErrorCode status = U_ZERO_ERROR;

	759 base::SmartPointer<icu::Transliterator> translit(

	760 icu::Transliterator::createInstance(

	761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,

	762 status));

	763 if (U_FAILURE(status)) return;

	764 translit->transliterate(*input);

	765 }

	766

	767 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat, uc16* dest,

	768 int32_t length) {

	769 DCHECK(flat.IsFlat());

	770 if (flat.IsOneByte()) {

	771 CopyChars(dest, flat.ToOneByteVector().start(), length);

	772 return static_cast<const UChar*>(dest);

	773 } else {

	774 return static_cast<const UChar*>(flat.ToUC16Vector().start());

	775 }

	776 }

	777

	778 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,

	779 bool is_to_upper, const char* lang) {

	780 int32_t src_length = s->length();

	781

	782 // Greek uppercasing has to be done via transliteration.

	783 // TODO(jshin): Drop this special-casing once ICU's regular case conversion

	784 // API supports Greek uppercasing. See

	785 // http://bugs.icu-project.org/trac/ticket/10582 .

	786 // In the meantime, if there's no Greek character in \|s\|, call this

	787 // function again with the root locale (lang="").

	788 // ICU's C API for transliteration is nasty and we just use C++ API.

	789 if (V8_UNLIKELY(is_to_upper && lang[0] == 'e' && lang[1] == 'l')) {

	790 icu::UnicodeString converted;

	791 {

	792 DisallowHeapAllocation no_gc;

	793 String::FlatContent flat = s->GetFlatContent();

	794 base::SmartArrayPointer<uc16> sap(NewArray<uc16>(src_length));
	Yang 2016/04/29 07:38:47 This seems wrong. In case of two-byte string, we d This seems wrong. In case of two-byte string, we do not copy, and 'converted' aliases the original string. Can we have test cases that test that the original string did not change? jungshik at Google 2016/04/29 18:03:23 UnicodeString's \|setTo(src, src_length\| does copy Show quoted text On 2016/04/29 07:38:47, Yang wrote: > This seems wrong. In case of two-byte string, we do not copy, and 'converted' > aliases the original string. Can we have test cases that test that the original > string did not change? UnicodeString's \|setTo(src, src_length\| does copy the buffer pointed to by \|src\| before doing anything. Previously, I used \|setTo(false, src, src_length)\| which aliases the buffer until it's time to write/modify. So, in case of a SingleByteString, the widened buffer is stored in \|sap\| and then copied to UnicodeString. In case of TwoByteString, its uc16 buffer is extracted (flatbuffer) and copied to UnicodeString upfront. Anyway, I added a couple of tests (toLocale{L,U}Case("el") that do not change the input.
	795 const UChar* src = GetUCharBufferFromFlat(flat, sap.get(), src_length);

	796 // Starts with the source string (copied) and will be modified to contain

	797 // the converted result. Note that there's no benefit in using

	798 // read-aliasing \|setTo\| (3 argument version) because the buffer is copied

	799 // anyway upon transliteration.

	800 converted.setTo(src, src_length);

	801 ConvertCaseWithTransliterator(&converted, "el-Upper");

	802 }

	803 Handle<String> result;

	804 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	805 isolate, result,

	806 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(

	807 reinterpret_cast<const uint16_t*>(converted.getBuffer()),

	808 converted.length())));

	809 return *result;

	810 }

	811

	812 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;

	813

	814 int32_t dest_length = src_length;

	815 UErrorCode status;

	816 Handle<SeqTwoByteString> result;

	817 base::SmartArrayPointer<uc16> sap(NewArray<uc16>(src_length));

	818

	819 // This is not a real loop. It'll be executed only once (no overflow) or

	820 // twice (overflow).

	821 for (int i = 0; i < 2; ++i) {

	822 result =

	823 isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();

	824 DisallowHeapAllocation no_gc;

	825 String::FlatContent flat = s->GetFlatContent();

	826 const UChar* src = GetUCharBufferFromFlat(flat, sap.get(), src_length);

	827 status = U_ZERO_ERROR;

	828 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),

	829 dest_length, src, src_length, lang, &status);

	830 if (status != U_BUFFER_OVERFLOW_ERROR) break;

	831 }

	832

	833 // In most cases, the output will fill the destination buffer completely

	834 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).

	835 // Only in rare cases, it'll be shorter than the destination buffer and

	836 // \|result\| has to be truncated.

	837 DCHECK(U_SUCCESS(status));

	838 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {

	839 DCHECK(dest_length == result->length());

	840 return *result;

	841 }

	842 if (U_SUCCESS(status)) {

	843 DCHECK(dest_length < result->length());

	844 return *Handle<SeqTwoByteString>::cast(

	845 SeqString::Truncate(result, dest_length));

	846 }

	847 return *s;

	848 }

	849

	// True iff |ch| is one of the ASCII capital letters 'A'..'Z'.
	inline bool IsASCIIUpper(uint16_t ch) {
	  // Single unsigned range check: values below 'A' wrap around to large
	  // numbers and therefore fail the comparison.
	  return static_cast<uint16_t>(ch - 'A') <= static_cast<uint16_t>('Z' - 'A');
	}

	851

	// Lower-casing table for the Latin-1 range, indexed by character code. Each
	// row below holds 16 entries; the row's first index is given on the right.
	// 'A'..'Z' and 0xC0..0xDE map to the value 0x20 higher, except 0xD7, which
	// like every other entry maps to itself.
	const uint8_t kToLower[256] = {
	    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,  // 0x00
	    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
	    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,  // 0x10
	    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
	    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,  // 0x20
	    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
	    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,  // 0x30
	    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
	    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,  // 0x40
	    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
	    0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,  // 0x50
	    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
	    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,  // 0x60
	    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
	    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,  // 0x70
	    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
	    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,  // 0x80
	    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
	    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,  // 0x90
	    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
	    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,  // 0xA0
	    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
	    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,  // 0xB0
	    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
	    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,  // 0xC0
	    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
	    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,  // 0xD0
	    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
	    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,  // 0xE0
	    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
	    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,  // 0xF0
	};

	876

	877 inline uint16_t ToLatin1Lower(uint16_t ch) {

	878 return static_cast<uint16_t>(kToLower[ch]);

	879 }

	880

	// Upper-cases |ch| when it is an ASCII lower-case letter ('a'..'z'); any
	// other value is returned unchanged.
	inline uint16_t ToASCIIUpper(uint16_t ch) {
	  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
	  // 'a' - 'A' == 0x20, i.e. exactly the bit that separates the two cases.
	  return is_ascii_lower ? static_cast<uint16_t>(ch - ('a' - 'A')) : ch;
	}

	884

	// Returns the upper-case form of the Latin-1 character |ch|.
	// Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF: their upper
	// case is either two characters long or lies outside the Latin-1 range, so
	// callers must handle them before calling this function.
	//
	// Characters with an upper-case form in Latin-1 are 'a'..'z' and
	// 0xE0..0xFE, with the exception of U+00F7 (division sign), which has no case
	// and must stay as it is. U+00E7 (c with cedilla) does have an upper case,
	// U+00C7. For all of them the upper case is the same code with bit 5 (0x20)
	// cleared.
	inline uint16_t ToLatin1Upper(uint16_t ch) {
	  DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
	  return ch &
	         ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7))
	           << 5);
	}

	892

	893 template <typename Char>

	894 bool ToUpperFastASCII(const Vector<const Char>& src,

	895 Handle<SeqOneByteString> result) {

	896 // Do a faster loop for the case where all the characters are ASCII.

	897 uint16_t ored = 0;

	898 int32_t index = 0;

	899 for (auto it = src.begin(); it != src.end(); ++it) {

	900 uint16_t ch = static_cast<uint16_t>(*it);

	901 ored \|= ch;

	902 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));

	903 }

	904 return !(ored & ~0x7F);

	905 }

	906

	// U+00DF, LATIN SMALL LETTER SHARP S; its upper case is the two letters "SS".
	const uint16_t sharp_s = 0x00DF;

	908

	909 template <typename Char>

	910 bool ToUpperOneByte(const Vector<const Char>& src,

	911 Handle<SeqOneByteString> result, int* sharp_s_count) {

	912 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.

	913

	914 // There are two special cases.

	915 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.

	916 // 2. Lower case sharp-S converts to "SS" (two characters)

	917 *sharp_s_count = 0;

	918 int32_t index = 0;

	919 for (auto it = src.begin(); it != src.end(); ++it) {

	920 uint16_t ch = static_cast<uint16_t>(*it);

	921 if (V8_UNLIKELY(ch == sharp_s)) {

	922 ++(*sharp_s_count);

	923 continue;

	924 }

	925 if (V8_UNLIKELY(ch == 0xB5 \|\| ch == 0xFF)) {

	926 // Since this upper-cased character does not fit in an 8-bit string, we

	927 // need to take the 16-bit path.

	928 return false;

	929 }

	930 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));

	931 }

	932

	933 return true;

	934 }

	935

	936 template <typename Char>

	937 void ToUpperWithSharpS(const Vector<const Char>& src,

	938 Handle<SeqOneByteString> result) {

	939 int32_t dest_index = 0;

	940 for (auto it = src.begin(); it != src.end(); ++it) {

	941 uint16_t ch = static_cast<uint16_t>(*it);

	942 if (ch == sharp_s) {

	943 result->SeqOneByteStringSet(dest_index++, 'S');

	944 result->SeqOneByteStringSet(dest_index++, 'S');

	945 } else {

	946 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));

	947 }

	948 }

	949 }

	950

	951 } // namespace

	952

	953 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

	954 HandleScope scope(isolate);

	955 DCHECK_EQ(args.length(), 1);

	956 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	957

	958 int length = s->length();

	959 s = String::Flatten(s);

	960 // First scan the string for uppercase and non-ASCII characters:

	961 if (s->HasOnlyOneByteChars()) {

	962 unsigned first_index_to_lower = length;

	963 for (int index = 0; index < length; ++index) {

	964 // Blink specializes this path for one-byte strings, so it

	965 // does not need to do a generic get, but can do the equivalent

	966 // of SeqOneByteStringGet.

	967 uint16_t ch = s->Get(index);

	968 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

	969 first_index_to_lower = index;

	970 break;

	971 }

	972 }

	973

	974 // Nothing to do if the string is all ASCII with no uppercase.

	975 if (first_index_to_lower == length) return *s;

	976

	977 // We depend here on the invariant that the length of a Latin1

	978 // string is invariant under ToLowerCase, and the result always

	979 // fits in the Latin1 range in the root locale. It does not hold

	980 // for ToUpperCase even in the root locale.

	981 Handle<SeqOneByteString> result;

	982 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	983 isolate, result, isolate->factory()->NewRawOneByteString(length));

	984

	985 DisallowHeapAllocation no_gc;

	986 String::FlatContent flat = s->GetFlatContent();

	987 if (flat.IsOneByte()) {

	988 const uint8_t* src = flat.ToOneByteVector().start();

	989 CopyChars(result->GetChars(), src, first_index_to_lower);

	990 for (int index = first_index_to_lower; index < length; ++index) {

	991 uint16_t ch = static_cast<uint16_t>(src[index]);

	992 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

	993 }

	994 } else {

	995 const uint16_t* src = flat.ToUC16Vector().start();

	996 CopyChars(result->GetChars(), src, first_index_to_lower);

	997 for (int index = first_index_to_lower; index < length; ++index) {

	998 uint16_t ch = src[index];

	999 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

	1000 }

	1001 }

	1002

	1003 return *result;

	1004 }

	1005

	1006 // Blink had an additional case here for ASCII 2-byte strings, but

	1007 // that is subsumed by the above code (assuming there isn't a false

	1008 // negative for HasOnlyOneByteChars).

	1009

	1010 // Do a slower implementation for cases that include non-ASCII characters.

	1011 return LocaleConvertCase(s, isolate, false, "");

	1012 }

	1013

	1014 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

	1015 HandleScope scope(isolate);

	1016 DCHECK_EQ(args.length(), 1);

	1017 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	1018

	1019 // This function could be optimized for no-op cases the way lowercase

	1020 // counterpart is, but in empirical testing, few actual calls to upper()

	1021 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

	1022

	1023 int32_t length = s->length();

	1024 s = String::Flatten(s);

	1025

	1026 if (s->HasOnlyOneByteChars()) {

	1027 Handle<SeqOneByteString> result;

	1028 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1029 isolate, result, isolate->factory()->NewRawOneByteString(length));

	1030

	1031 int sharp_s_count;

	1032 bool is_result_single_byte;

	1033 {

	1034 DisallowHeapAllocation no_gc;

	1035 String::FlatContent flat = s->GetFlatContent();

	1036 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII

	1037 // could be removed because ToUpperOneByte is pretty fast now (it

	1038 // does not call ICU API any more.).

	1039 if (flat.IsOneByte()) {

	1040 Vector<const uint8_t> src = flat.ToOneByteVector();

	1041 if (ToUpperFastASCII(src, result)) return *result;

	1042 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

	1043 } else {

	1044 DCHECK(flat.IsTwoByte());

	1045 Vector<const uint16_t> src = flat.ToUC16Vector();

	1046 if (ToUpperFastASCII(src, result)) return *result;

	1047 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

	1048 }

	1049 }

	1050

	1051 // Go to the full Unicode path if there are characters whose uppercase

	1052 // is beyond the Latin-1 range (cannot be represented in OneByteString).

	1053 if (V8_UNLIKELY(!is_result_single_byte)) {

	1054 return LocaleConvertCase(s, isolate, true, "");

	1055 }

	1056

	1057 if (sharp_s_count == 0) return *result;

	1058

	1059 // We have sharp_s_count sharp-s characters, but the result is still

	1060 // in the Latin-1 range.

	1061 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1062 isolate, result,

	1063 isolate->factory()->NewRawOneByteString(length + sharp_s_count));

	1064 DisallowHeapAllocation no_gc;

	1065 String::FlatContent flat = s->GetFlatContent();

	1066 if (flat.IsOneByte()) {

	1067 ToUpperWithSharpS(flat.ToOneByteVector(), result);

	1068 } else {

	1069 ToUpperWithSharpS(flat.ToUC16Vector(), result);

	1070 }

	1071

	1072 return *result;

	1073 }

	1074

	1075 return LocaleConvertCase(s, isolate, true, "");

	1076 }

	1077

	1078 RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {

	1079 HandleScope scope(isolate);

	1080 DCHECK_EQ(args.length(), 3);

	1081 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

	1082 CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);

	1083 CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);

	1084

	1085 // All the languages requiring special handling ("az", "el", "lt", "tr")

	1086 // have a 2-letter language code.

	1087 DCHECK(lang->length() == 2);

	1088 uint8_t lang_str[3];

	1089 memcpy(lang_str, lang->GetChars(), 2);

	1090 lang_str[2] = 0;

	1091 s = String::Flatten(s);

	1092 // TODO(jshin): Consider adding a fast path for ASCII or Latin-1. The fastpath

	1093 // in the root locale needs to be adjusted for az, lt and tr because even case

	1094 // mapping of ASCII range characters are different in those locales.

	1095 // Greek (el) does not require any adjustment, though.

	1096 return LocaleConvertCase(s, isolate, is_upper,

	1097 reinterpret_cast<const char*>(lang_str));

	1098 }

	1099

752 } // namespace internal	1100 } // namespace internal

753 } // namespace v8	1101 } // namespace v8

754	1102

755 #endif // V8_I18N_SUPPORT	1103 #endif // V8_I18N_SUPPORT

OLD	NEW

« src/js/i18n.js ('K') | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | test/intl/general/case-mapping.js » ('J')