src/regexp/interpreter-irregexp.cc - Issue 1599303002: [regexp] implement case-insensitive unicode regexps.

Side by Side Diff: src/regexp/interpreter-irregexp.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass

Patch Set: fixes Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2011 the V8 project authors. All rights reserved.	1 // Copyright 2011 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // A simple interpreter for the Irregexp byte code.	5 // A simple interpreter for the Irregexp byte code.

6	6

7 #ifdef V8_INTERPRETED_REGEXP	7 #ifdef V8_INTERPRETED_REGEXP

8	8

9 #include "src/regexp/interpreter-irregexp.h"	9 #include "src/regexp/interpreter-irregexp.h"

10	10

11 #include "src/ast/ast.h"	11 #include "src/ast/ast.h"

12 #include "src/regexp/bytecodes-irregexp.h"	12 #include "src/regexp/bytecodes-irregexp.h"

13 #include "src/regexp/jsregexp.h"	13 #include "src/regexp/jsregexp.h"

14 #include "src/regexp/regexp-macro-assembler.h"	14 #include "src/regexp/regexp-macro-assembler.h"

15 #include "src/unicode.h"	15 #include "src/unicode.h"

16 #include "src/utils.h"	16 #include "src/utils.h"

17	17

	18 #ifdef V8_I18N_SUPPORT

	19 #include "unicode/uchar.h"

	20 #endif // V8_I18N_SUPPORT

	21

18 namespace v8 {	22 namespace v8 {

19 namespace internal {	23 namespace internal {

20	24

21 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;	25 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;

22	26

23 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,	27 static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,

24 int from,	28 int len, Vector<const uc16> subject,

25 int current,	29 bool unicode) {

26 int len,	30 Address offset_a =

27 Vector<const uc16> subject) {	31 reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));

28 for (int i = 0; i < len; i++) {	32 Address offset_b =

29 unibrow::uchar old_char = subject[from++];	33 reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));

30 unibrow::uchar new_char = subject[current++];	34 size_t length = len * kUC16Size;

31 if (old_char == new_char) continue;	35 return RegExpMacroAssembler::CaseInsensitiveCompareUC16(

32 unibrow::uchar old_string[1] = { old_char };	36 offset_a, offset_b, length, unicode ? nullptr : isolate) == 1;

33 unibrow::uchar new_string[1] = { new_char };

34 interp_canonicalize->get(old_char, '\0', old_string);

35 interp_canonicalize->get(new_char, '\0', new_string);

36 if (old_string[0] != new_string[0]) {

37 return false;

38 }

39 }

40 return true;

41 }	37 }

42	38

43	39

44 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,	40 static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,

45 int from,	41 int len, Vector<const uint8_t> subject,

46 int current,	42 bool unicode) {

47 int len,	43 // For Latin1 characters the unicode flag makes no difference.

48 Vector<const uint8_t> subject) {

49 for (int i = 0; i < len; i++) {	44 for (int i = 0; i < len; i++) {

50 unsigned int old_char = subject[from++];	45 unsigned int old_char = subject[from++];

51 unsigned int new_char = subject[current++];	46 unsigned int new_char = subject[current++];

52 if (old_char == new_char) continue;	47 if (old_char == new_char) continue;

53 // Convert both characters to lower case.	48 // Convert both characters to lower case.

54 old_char |= 0x20;	49 old_char |= 0x20;

55 new_char |= 0x20;	50 new_char |= 0x20;

56 if (old_char != new_char) return false;	51 if (old_char != new_char) return false;

57 // Not letters in the ASCII range and Latin-1 range.	52 // Not letters in the ASCII range and Latin-1 range.

58 if (!(old_char - 'a' <= 'z' - 'a') &&	53 if (!(old_char - 'a' <= 'z' - 'a') &&

(...skipping 457 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
516 if (current - len < 0 ||	511 if (current - len < 0 ||

517 CompareChars(&subject[from], &subject[current - len], len) != 0) {	512 CompareChars(&subject[from], &subject[current - len], len) != 0) {

518 pc = code_base + Load32Aligned(pc + 4);	513 pc = code_base + Load32Aligned(pc + 4);

519 break;	514 break;

520 }	515 }

521 current -= len;	516 current -= len;

522 }	517 }

523 pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;	518 pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;

524 break;	519 break;

525 }	520 }

	521 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE)

526 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {	522 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {

	523 bool unicode =

	524 (insn & BYTECODE_MASK) == BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE;

527 int from = registers[insn >> BYTECODE_SHIFT];	525 int from = registers[insn >> BYTECODE_SHIFT];

528 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;	526 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;

529 if (from >= 0 && len > 0) {	527 if (from >= 0 && len > 0) {

530 if (current + len > subject.length() ||	528 if (current + len > subject.length() ||

531 !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),	529 !BackRefMatchesNoCase(isolate, from, current, len, subject,

532 from, current, len, subject)) {	530 unicode)) {

533 pc = code_base + Load32Aligned(pc + 4);	531 pc = code_base + Load32Aligned(pc + 4);

534 break;	532 break;

535 }	533 }

536 current += len;	534 current += len;

537 }	535 }

538 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;	536 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;

539 break;	537 break;

540 }	538 }

	539 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD)

541 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {	540 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {

	541 bool unicode = (insn & BYTECODE_MASK) ==

	542 BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD;

542 int from = registers[insn >> BYTECODE_SHIFT];	543 int from = registers[insn >> BYTECODE_SHIFT];

543 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;	544 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;

544 if (from >= 0 && len > 0) {	545 if (from >= 0 && len > 0) {

545 if (current - len < 0 ||	546 if (current - len < 0 ||

546 !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),	547 !BackRefMatchesNoCase(isolate, from, current - len, len, subject,

547 from, current - len, len, subject)) {	548 unicode)) {

548 pc = code_base + Load32Aligned(pc + 4);	549 pc = code_base + Load32Aligned(pc + 4);

549 break;	550 break;

550 }	551 }

551 current -= len;	552 current -= len;

552 }	553 }

553 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH;	554 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH;

554 break;	555 break;

555 }	556 }

556 BYTECODE(CHECK_AT_START)	557 BYTECODE(CHECK_AT_START)

557 if (current == 0) {	558 if (current == 0) {

(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
615 registers,	616 registers,

616 start_position,	617 start_position,

617 previous_char);	618 previous_char);

618 }	619 }

619 }	620 }

620	621

621 } // namespace internal	622 } // namespace internal

622 } // namespace v8	623 } // namespace v8

623	624

624 #endif // V8_INTERPRETED_REGEXP	625 #endif // V8_INTERPRETED_REGEXP

OLD	NEW

« no previous file with comments | « src/regexp/ia32/regexp-macro-assembler-ia32.cc ('k') | src/regexp/jsregexp.h » ('j') | src/regexp/jsregexp.cc » ('J')