src/runtime/runtime-strings.cc - Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag

Side by Side Diff: src/runtime/runtime-strings.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Use CharCopy(); GetFlatContent still crashes Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/runtime/runtime-utils.h"	5 #include "src/runtime/runtime-utils.h"

6	6

7 #include "src/arguments.h"	7 #include "src/arguments.h"

8 #include "src/conversions-inl.h"	8 #include "src/conversions-inl.h"

9 #include "src/isolate-inl.h"	9 #include "src/isolate-inl.h"

10 #include "src/regexp/jsregexp-inl.h"	10 #include "src/regexp/jsregexp-inl.h"

11 #include "src/regexp/jsregexp.h"	11 #include "src/regexp/jsregexp.h"

12 #include "src/string-builder.h"	12 #include "src/string-builder.h"

13 #include "src/string-search.h"	13 #include "src/string-search.h"

14	14

	15 #ifdef V8_I18N_SUPPORT

	16 #include "unicode/locid.h"

	17 #include "unicode/uchar.h"

	18 #include "unicode/unistr.h"

	19 #endif

	20

15 namespace v8 {	21 namespace v8 {

16 namespace internal {	22 namespace internal {

17	23

18	24

19 // Perform string match of pattern on subject, starting at start index.	25 // Perform string match of pattern on subject, starting at start index.

20 // Caller must ensure that 0 <= start_index <= sub->length(),	26 // Caller must ensure that 0 <= start_index <= sub->length(),

21 // and should check that pat->length() + start_index <= sub->length().	27 // and should check that pat->length() + start_index <= sub->length().

22 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,	28 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,

23 int start_index) {	29 int start_index) {

24 DCHECK(0 <= start_index);	30 DCHECK(0 <= start_index);

(...skipping 1045 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1070 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1076 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1071 isolate, result, isolate->factory()->NewRawOneByteString(length));	1077 isolate, result, isolate->factory()->NewRawOneByteString(length));

1072 } else {	1078 } else {

1073 if (length < 0) length = -length;	1079 if (length < 0) length = -length;

1074 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1080 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1075 isolate, result, isolate->factory()->NewRawTwoByteString(length));	1081 isolate, result, isolate->factory()->NewRawTwoByteString(length));

1076 }	1082 }

1077 return ConvertCaseHelper(isolate, s, result, length, mapping);	1083 return ConvertCaseHelper(isolate, s, result, length, mapping);

1078 }	1084 }

1079	1085

	1086 #ifdef V8_I18N_SUPPORT

	1087 namespace {

	1088

	1089 MUST_USE_RESULT static Handle<String> ConvertCaseICU(Handle<String> s,

	1090 Isolate* isolate,

	1091 bool is_to_upper) {

	1092 DCHECK(s->IsFlat());

	1093 // Handle<String> flattened = String::Flatten(s);

	1094 String::FlatContent flat = s->GetFlatContent();
	jungshik at Google 2016/04/07 18:57:11 This leads to an assertion failure in objects.cc: # Fatal error in ../../src/objects.cc, line 11065 # Check failed: !AllowHeapAllocation::IsAllowed(). I have little idea what I'm doing wrong (I found a similar usage in this file and other places).
	1095

	1096 const UChar* src;

	1097 if (flat.IsOneByte()) {

	1098 base::SmartArrayPointer<uc16> sap = s->ToWideCString();

	1099 src = reinterpret_cast<const UChar*>(sap.get());

	1100 } else {

	1101 src = reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

	1102 }

	1103

	1104 int32_t length = s->length();

	1105

	1106 // This UnicodeString ctor has copy-on-write semantics. It starts as a

	1107 // read-only alias but the buffer is copied when it's written to.

	1108 icu::UnicodeString converted(0, src, length);

	1109 const icu::Locale& root_locale = icu::Locale::getRoot();

	1110 if (is_to_upper)

	1111 converted.toUpper(root_locale);

	1112 else

	1113 converted.toLower(root_locale);

	1114

	1115 return isolate->factory()

	1116 ->NewStringFromTwoByte(Vector<const uint16_t>(

	1117 reinterpret_cast<const uint16_t*>(converted.getBuffer()),

	1118 converted.length()))

	1119 .ToHandleChecked();

	1120 }

	1121

	1122 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }

	1123

	1124 inline uint16_t ToASCIILower(uint16_t ch) {

	1125 return ch | ((ch >= 'A' && ch <= 'Z') << 5);

	1126 }

	1127

	1128 inline uint16_t ToASCIIUpper(uint16_t ch) {

	1129 return ch & ~((ch >= 'a' && ch <= 'z') << 5);

	1130 }

	1131

	1132 MUST_USE_RESULT Handle<String> StringToLowerCase(Handle<String> s,

	1133 Isolate* isolate) {

	1134 // Note: This is a hot function in the Dromaeo benchmark, specifically the

	1135 // no-op code path up through the first 'return' statement.

	1136

	1137 int length = s->length();

	1138 s = String::Flatten(s);

	1139 // First scan the string for uppercase and non-ASCII characters:

	1140 if (s->HasOnlyOneByteChars()) {

	1141 unsigned first_index_to_lower = length;

	1142 for (int index = 0; index < length; ++index) {

	1143 // Blink specializes this path for one-byte strings, so it

	1144 // does not need to do a generic get, but can do the equivalent

	1145 // of SeqOneByteStringGet.

	1146 uint16_t ch = s->Get(index);

	1147 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {

	1148 first_index_to_lower = index;

	1149 break;

	1150 }

	1151 }

	1152

	1153 // Nothing to do if the string is all ASCII with no uppercase.

	1154 if (first_index_to_lower == length) return s;

	1155

	1156 // We depend here on the invariant that the length of a Latin1

	1157 // string is invariant under ToLowerCase, and the result always

	1158 // fits in the Latin1 range (untrue for ToUpperCase, and might

	1159 // be untrue in some locales, but this is the root locale)

	1160 Handle<SeqOneByteString> result =

	1161 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1162 if (s->IsSeqOneByteString()) {

	1163 SeqOneByteString* source = SeqOneByteString::cast(*s);

	1164 CopyChars(result->GetChars(), source->GetChars(), first_index_to_lower);

	1165 } else {

	1166 // Do we have to worry about External{One,Two}ByteString?

	1167 DCHECK(s->IsSeqTwoByteString());

	1168 SeqTwoByteString* source = SeqTwoByteString::cast(*s);

	1169 CopyChars(result->GetChars(), source->GetChars(), first_index_to_lower);

	1170 }

	1171

	1172 for (int index = first_index_to_lower; index < length; ++index) {

	1173 uint16_t ch = s->Get(index);

	1174 result->SeqOneByteStringSet(

	1175 index, V8_UNLIKELY(ch & ~0x7F) ? static_cast<uint16_t>(u_tolower(ch))

	1176 : ToASCIILower(ch));

	1177 }

	1178

	1179 return Handle<String>(*result);

	1180 }

	1181

	1182 // Blink had an additional case here for ASCII 2-byte strings, but

	1183 // that is subsumed by the above code (assuming there isn't a false

	1184 // negative for HasOnlyOneByteChars).

	1185

	1186 // Do a slower implementation for cases that include non-ASCII characters.

	1187 return ConvertCaseICU(s, isolate, false);

	1188 }

	1189

	1190 const uint16_t sharp_s = 0x00DFu;

	1191

	1192 MUST_USE_RESULT Handle<String> StringToUpperCase(Handle<String> s,

	1193 Isolate* isolate) {

	1194 // This function could be optimized for no-op cases the way lower() is,

	1195 // but in empirical testing, few actual calls to upper() are no-ops, so

	1196 // it wouldn't be worth the extra time for pre-scanning.

	1197

	1198 int32_t length = s->length();

	1199 s = String::Flatten(s);

	1200

	1201 if (s->HasOnlyOneByteChars()) {

	1202 Handle<SeqOneByteString> result =

	1203 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1204

	1205 // Do a faster loop for the case where all the characters are ASCII.

	1206 uint16_t ored = 0;

	1207 for (int index = 0; index < length; ++index) {

	1208 uint16_t ch = s->Get(index);

	1209 ored |= ch;

	1210 result->SeqOneByteStringSet(index, ToASCIIUpper(ch));

	1211 }

	1212 if (!(ored & ~0x7F)) return Handle<String>(*result);

	1213

	1214 // Do a slower implementation for cases that include non-ASCII Latin-1

	1215 // characters.

	1216 int sharp_s_count = 0;

	1217

	1218 // There are two special cases.

	1219 // 1. latin-1 characters when converted to upper case are 16 bit

	1220 // characters.

	1221 // 2. Lower case sharp-S converts to "SS" (two characters)

	1222 for (int32_t index = 0; index < length; ++index) {

	1223 uint16_t ch = s->Get(index);

	1224 if (V8_UNLIKELY(ch == sharp_s)) {

	1225 ++sharp_s_count;

	1226 continue;

	1227 }

	1228 uint16_t upper = static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));

	1229 if (V8_UNLIKELY(upper > 0xff)) {

	1230 // Since this upper-cased character does not fit in an 8-bit string, we

	1231 // need to take the 16-bit path.

	1232 goto upconvert;

	1233 }

	1234 result->SeqOneByteStringSet(index, upper);

	1235 }

	1236

	1237 if (sharp_s_count == 0) return Handle<String>(*result);

	1238

	1239 // We have sharp_s_count sharp-s characters, but none of the other special

	1240 // characters.

	1241 result = isolate->factory()

	1242 ->NewRawOneByteString(length + sharp_s_count)

	1243 .ToHandleChecked();

	1244 for (int32_t index = 0, dest_index = 0; index < length; ++index) {

	1245 uint16_t ch = s->Get(index);

	1246 if (ch == sharp_s) {

	1247 result->SeqOneByteStringSet(dest_index++, 'S');

	1248 result->SeqOneByteStringSet(dest_index++, 'S');

	1249 } else {

	1250 uint16_t upper =

	1251 static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));

	1252 result->SeqOneByteStringSet(dest_index++, upper);

	1253 }

	1254 }

	1255

	1256 return Handle<String>(*result);

	1257 }

	1258

	1259 upconvert:

	1260 return ConvertCaseICU(s, isolate, true);

	1261 }

	1262

	1263 } // namespace

	1264 #endif

1080	1265

1081 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {	1266 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {

1082 HandleScope scope(isolate);	1267 HandleScope scope(isolate);

1083 DCHECK(args.length() == 1);	1268 DCHECK_EQ(args.length(), 1);

1084 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1269 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1085 return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping());	1270 #ifdef V8_I18N_SUPPORT

	1271 if (FLAG_icu_case_mapping)

	1272 return *StringToLowerCase(s, isolate);

	1273 else

	1274 #endif

	1275 return ConvertCase(s, isolate,

	1276 isolate->runtime_state()->to_lower_mapping());

1086 }	1277 }

1087	1278

1088	1279

1089 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {	1280 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {

1090 HandleScope scope(isolate);	1281 HandleScope scope(isolate);

1091 DCHECK(args.length() == 1);	1282 DCHECK_EQ(args.length(), 1);

1092 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1283 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1093 return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping());	1284 #ifdef V8_I18N_SUPPORT

1094 }	1285 if (FLAG_icu_case_mapping)

1095	1286 return *StringToUpperCase(s, isolate);

1096	1287 else

	1288 #endif

	1289 return ConvertCase(s, isolate,

	1290 isolate->runtime_state()->to_upper_mapping());

	1291 }

	1292

	1293

1097 RUNTIME_FUNCTION(Runtime_StringTrim) {	1294 RUNTIME_FUNCTION(Runtime_StringTrim) {

1098 HandleScope scope(isolate);	1295 HandleScope scope(isolate);

1099 DCHECK(args.length() == 3);	1296 DCHECK(args.length() == 3);

1100	1297

1101 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);	1298 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);

1102 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);	1299 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);

1103 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);	1300 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);

1104	1301

1105 string = String::Flatten(string);	1302 string = String::Flatten(string);

1106 int length = string->length();	1303 int length = string->length();

(...skipping 211 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1318 SealHandleScope shs(isolate);	1515 SealHandleScope shs(isolate);

1319 DCHECK(args.length() == 2);	1516 DCHECK(args.length() == 2);

1320 if (!args[0]->IsString()) return isolate->heap()->undefined_value();	1517 if (!args[0]->IsString()) return isolate->heap()->undefined_value();

1321 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();	1518 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();

1322 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();	1519 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();

1323 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);	1520 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);

1324 }	1521 }

1325	1522

1326 } // namespace internal	1523 } // namespace internal

1327 } // namespace v8	1524 } // namespace v8

OLD	NEW

« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | no next file with comments »