src/regexp/regexp-macro-assembler.cc - Issue 1599303002: [regexp] implement case-insensitive unicode regexps.

Side by Side Diff: src/regexp/regexp-macro-assembler.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass

Patch Set: fixes — Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-macro-assembler.h"	5 #include "src/regexp/regexp-macro-assembler.h"

6	6

7 #include "src/assembler.h"	7 #include "src/assembler.h"

8 #include "src/isolate-inl.h"	8 #include "src/isolate-inl.h"

9 #include "src/regexp/regexp-stack.h"	9 #include "src/regexp/regexp-stack.h"

10 #include "src/simulator.h"	10 #include "src/simulator.h"

11	11

	12 #ifdef V8_I18N_SUPPORT

	13 #include "unicode/uchar.h"

	14 #endif // V8_I18N_SUPPORT

	15

12 namespace v8 {	16 namespace v8 {

13 namespace internal {	17 namespace internal {

14	18

15 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)	19 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)

16 : slow_safe_compiler_(false),	20 : slow_safe_compiler_(false),

17 global_mode_(NOT_GLOBAL),	21 global_mode_(NOT_GLOBAL),

18 isolate_(isolate),	22 isolate_(isolate),

19 zone_(zone) {}	23 zone_(zone) {}

20	24

21	25

22 RegExpMacroAssembler::~RegExpMacroAssembler() {	26 RegExpMacroAssembler::~RegExpMacroAssembler() {

23 }	27 }

24	28

25	29

	30 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,

	31 Address byte_offset2,

	32 size_t byte_length,

	33 Isolate* isolate) {

	34 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

	35 isolate->regexp_macro_assembler_canonicalize();

	36 // This function is not allowed to cause a garbage collection.

	37 // A GC might move the calling generated code and invalidate the

	38 // return address on the stack.

	39 DCHECK(byte_length % 2 == 0);

	40 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);

	41 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);

	42 size_t length = byte_length >> 1;

	43

	44 #ifdef V8_I18N_SUPPORT

	45 if (isolate == nullptr) {

	46 for (size_t i = 0; i < length; i++) {

	47 uc32 c1 = substring1[i];

	48 uc32 c2 = substring2[i];

	49 if (unibrow::Utf16::IsLeadSurrogate(c1)) {

	50 // Non-BMP characters do not have case-equivalents in the BMP.

	51 // Both have to be non-BMP for them to be able to match.

	52 if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;

	53 if (i + 1 < length) {

	54 uc16 c1t = substring1[i + 1];

	55 uc16 c2t = substring2[i + 1];

	56 if (unibrow::Utf16::IsTrailSurrogate(c1t) &&

	57 unibrow::Utf16::IsTrailSurrogate(c2t)) {

	58 c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);

	59 c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);

	60 i++;

	61 }

	62 }

	63 }

	64 c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);

	65 c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);

	66 if (c1 != c2) return 0;

	67 }

	68 return 1;

	69 }

	70 #endif // V8_I18N_SUPPORT

	71 DCHECK_NOT_NULL(isolate);

	72 for (size_t i = 0; i < length; i++) {

	73 unibrow::uchar c1 = substring1[i];

	74 unibrow::uchar c2 = substring2[i];

	75 if (c1 != c2) {

	76 unibrow::uchar s1[1] = {c1};

	77 canonicalize->get(c1, '\0', s1);

	78 if (s1[0] != c2) {

	79 unibrow::uchar s2[1] = {c2};

	80 canonicalize->get(c2, '\0', s2);

	81 if (s1[0] != s2[0]) {

	82 return 0;

	83 }

	84 }

	85 }

	86 }

	87 return 1;

	88 }

	89

	90

26 #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.	91 #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.

27	92

28 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,	93 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,

29 Zone* zone)	94 Zone* zone)

30 : RegExpMacroAssembler(isolate, zone) {}	95 : RegExpMacroAssembler(isolate, zone) {}

31	96

32	97

33 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {	98 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {

34 }	99 }

35	100

(...skipping 202 matching lines...) Expand 10 before | Expand all | Expand 10 after
238 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,	303 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

239 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,	304 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

240	305

241 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,	306 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

242 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,	307 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

243 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,	308 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

244 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,	309 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

245 };	310 };

246	311

247	312

248 int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(

249 Address byte_offset1,

250 Address byte_offset2,

251 size_t byte_length,

252 Isolate* isolate) {

253 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

254 isolate->regexp_macro_assembler_canonicalize();

255 // This function is not allowed to cause a garbage collection.

256 // A GC might move the calling generated code and invalidate the

257 // return address on the stack.

258 DCHECK(byte_length % 2 == 0);

259 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);

260 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);

261 size_t length = byte_length >> 1;

262

263 for (size_t i = 0; i < length; i++) {

264 unibrow::uchar c1 = substring1[i];

265 unibrow::uchar c2 = substring2[i];

266 if (c1 != c2) {

267 unibrow::uchar s1[1] = { c1 };

268 canonicalize->get(c1, '\0', s1);

269 if (s1[0] != c2) {

270 unibrow::uchar s2[1] = { c2 };

271 canonicalize->get(c2, '\0', s2);

272 if (s1[0] != s2[0]) {

273 return 0;

274 }

275 }

276 }

277 }

278 return 1;

279 }

280

281

282 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,	313 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,

283 Address* stack_base,	314 Address* stack_base,

284 Isolate* isolate) {	315 Isolate* isolate) {

285 RegExpStack* regexp_stack = isolate->regexp_stack();	316 RegExpStack* regexp_stack = isolate->regexp_stack();

286 size_t size = regexp_stack->stack_capacity();	317 size_t size = regexp_stack->stack_capacity();

287 Address old_stack_base = regexp_stack->stack_base();	318 Address old_stack_base = regexp_stack->stack_base();

288 DCHECK(old_stack_base == *stack_base);	319 DCHECK(old_stack_base == *stack_base);

289 DCHECK(stack_pointer <= old_stack_base);	320 DCHECK(stack_pointer <= old_stack_base);

290 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);	321 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);

291 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);	322 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);

292 if (new_stack_base == NULL) {	323 if (new_stack_base == NULL) {

293 return NULL;	324 return NULL;

294 }	325 }

295 *stack_base = new_stack_base;	326 *stack_base = new_stack_base;

296 intptr_t stack_content_size = old_stack_base - stack_pointer;	327 intptr_t stack_content_size = old_stack_base - stack_pointer;

297 return new_stack_base - stack_content_size;	328 return new_stack_base - stack_content_size;

298 }	329 }

299	330

300 #endif // V8_INTERPRETED_REGEXP	331 #endif // V8_INTERPRETED_REGEXP

301	332

302 } // namespace internal	333 } // namespace internal

303 } // namespace v8	334 } // namespace v8

OLD	NEW

« src/regexp/jsregexp.cc ('K') | « src/regexp/regexp-macro-assembler.h ('k') | src/regexp/regexp-macro-assembler-irregexp.h » ('j') | no next file with comments »