Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(420)

Side by Side Diff: src/regexp/interpreter-irregexp.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass
Patch Set: fixes Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // A simple interpreter for the Irregexp byte code. 5 // A simple interpreter for the Irregexp byte code.
6 6
7 #ifdef V8_INTERPRETED_REGEXP 7 #ifdef V8_INTERPRETED_REGEXP
8 8
9 #include "src/regexp/interpreter-irregexp.h" 9 #include "src/regexp/interpreter-irregexp.h"
10 10
11 #include "src/ast/ast.h" 11 #include "src/ast/ast.h"
12 #include "src/regexp/bytecodes-irregexp.h" 12 #include "src/regexp/bytecodes-irregexp.h"
13 #include "src/regexp/jsregexp.h" 13 #include "src/regexp/jsregexp.h"
14 #include "src/regexp/regexp-macro-assembler.h" 14 #include "src/regexp/regexp-macro-assembler.h"
15 #include "src/unicode.h" 15 #include "src/unicode.h"
16 #include "src/utils.h" 16 #include "src/utils.h"
17 17
18 #ifdef V8_I18N_SUPPORT
19 #include "unicode/uchar.h"
20 #endif // V8_I18N_SUPPORT
21
18 namespace v8 { 22 namespace v8 {
19 namespace internal { 23 namespace internal {
20 24
21 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize; 25 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
22 26
23 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, 27 static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
24 int from, 28 int len, Vector<const uc16> subject,
25 int current, 29 bool unicode) {
26 int len, 30 Address offset_a =
27 Vector<const uc16> subject) { 31 reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
28 for (int i = 0; i < len; i++) { 32 Address offset_b =
29 unibrow::uchar old_char = subject[from++]; 33 reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
30 unibrow::uchar new_char = subject[current++]; 34 size_t length = len * kUC16Size;
31 if (old_char == new_char) continue; 35 return RegExpMacroAssembler::CaseInsensitiveCompareUC16(
32 unibrow::uchar old_string[1] = { old_char }; 36 offset_a, offset_b, length, unicode ? nullptr : isolate) == 1;
33 unibrow::uchar new_string[1] = { new_char };
34 interp_canonicalize->get(old_char, '\0', old_string);
35 interp_canonicalize->get(new_char, '\0', new_string);
36 if (old_string[0] != new_string[0]) {
37 return false;
38 }
39 }
40 return true;
41 } 37 }
42 38
43 39
44 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, 40 static bool BackRefMatchesNoCase(Isolate* isolate, int from, int current,
45 int from, 41 int len, Vector<const uint8_t> subject,
46 int current, 42 bool unicode) {
47 int len, 43 // For Latin1 characters the unicode flag makes no difference.
48 Vector<const uint8_t> subject) {
49 for (int i = 0; i < len; i++) { 44 for (int i = 0; i < len; i++) {
50 unsigned int old_char = subject[from++]; 45 unsigned int old_char = subject[from++];
51 unsigned int new_char = subject[current++]; 46 unsigned int new_char = subject[current++];
52 if (old_char == new_char) continue; 47 if (old_char == new_char) continue;
53 // Convert both characters to lower case. 48 // Convert both characters to lower case.
54 old_char |= 0x20; 49 old_char |= 0x20;
55 new_char |= 0x20; 50 new_char |= 0x20;
56 if (old_char != new_char) return false; 51 if (old_char != new_char) return false;
57 // Not letters in the ASCII range and Latin-1 range. 52 // Not letters in the ASCII range and Latin-1 range.
58 if (!(old_char - 'a' <= 'z' - 'a') && 53 if (!(old_char - 'a' <= 'z' - 'a') &&
(...skipping 457 matching lines...) Expand 10 before | Expand all | Expand 10 after
516 if (current - len < 0 || 511 if (current - len < 0 ||
517 CompareChars(&subject[from], &subject[current - len], len) != 0) { 512 CompareChars(&subject[from], &subject[current - len], len) != 0) {
518 pc = code_base + Load32Aligned(pc + 4); 513 pc = code_base + Load32Aligned(pc + 4);
519 break; 514 break;
520 } 515 }
521 current -= len; 516 current -= len;
522 } 517 }
523 pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH; 518 pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;
524 break; 519 break;
525 } 520 }
521 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE)
526 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { 522 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
523 bool unicode =
524 (insn & BYTECODE_MASK) == BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE;
527 int from = registers[insn >> BYTECODE_SHIFT]; 525 int from = registers[insn >> BYTECODE_SHIFT];
528 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; 526 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
529 if (from >= 0 && len > 0) { 527 if (from >= 0 && len > 0) {
530 if (current + len > subject.length() || 528 if (current + len > subject.length() ||
531 !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(), 529 !BackRefMatchesNoCase(isolate, from, current, len, subject,
532 from, current, len, subject)) { 530 unicode)) {
533 pc = code_base + Load32Aligned(pc + 4); 531 pc = code_base + Load32Aligned(pc + 4);
534 break; 532 break;
535 } 533 }
536 current += len; 534 current += len;
537 } 535 }
538 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; 536 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
539 break; 537 break;
540 } 538 }
539 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD)
541 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { 540 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
541 bool unicode = (insn & BYTECODE_MASK) ==
542 BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD;
542 int from = registers[insn >> BYTECODE_SHIFT]; 543 int from = registers[insn >> BYTECODE_SHIFT];
543 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; 544 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
544 if (from >= 0 && len > 0) { 545 if (from >= 0 && len > 0) {
545 if (current - len < 0 || 546 if (current - len < 0 ||
546 !BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(), 547 !BackRefMatchesNoCase(isolate, from, current - len, len, subject,
547 from, current - len, len, subject)) { 548 unicode)) {
548 pc = code_base + Load32Aligned(pc + 4); 549 pc = code_base + Load32Aligned(pc + 4);
549 break; 550 break;
550 } 551 }
551 current -= len; 552 current -= len;
552 } 553 }
553 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH; 554 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH;
554 break; 555 break;
555 } 556 }
556 BYTECODE(CHECK_AT_START) 557 BYTECODE(CHECK_AT_START)
557 if (current == 0) { 558 if (current == 0) {
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
615 registers, 616 registers,
616 start_position, 617 start_position,
617 previous_char); 618 previous_char);
618 } 619 }
619 } 620 }
620 621
621 } // namespace internal 622 } // namespace internal
622 } // namespace v8 623 } // namespace v8
623 624
624 #endif // V8_INTERPRETED_REGEXP 625 #endif // V8_INTERPRETED_REGEXP
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698