src/jsregexp.cc - Issue 1188793004: RegExp: Remove bogus assumptions about case independence and Latin1

Side by Side Diff: src/jsregexp.cc

Issue 1188793004: RegExp: Remove bogus assumptions about case independence and Latin1 (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/v8.h"	5 #include "src/v8.h"

6	6

7 #include "src/ast.h"	7 #include "src/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 1548 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1559 DCHECK(!trace->mentions_reg(guard->reg()));	1559 DCHECK(!trace->mentions_reg(guard->reg()));

1560 macro_assembler->IfRegisterLT(guard->reg(),	1560 macro_assembler->IfRegisterLT(guard->reg(),

1561 guard->value(),	1561 guard->value(),

1562 trace->backtrack());	1562 trace->backtrack());

1563 break;	1563 break;

1564 }	1564 }

1565 }	1565 }

1566	1566

1567	1567

1568 // Returns the number of characters in the equivalence class, omitting those	1568 // Returns the number of characters in the equivalence class, omitting those

1569 // that cannot occur in the source string because it is ASCII.	1569 // that cannot occur in the source string because it is Latin1.

1570 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,	1570 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,

1571 bool one_byte_subject,	1571 bool one_byte_subject,

1572 unibrow::uchar* letters) {	1572 unibrow::uchar* letters) {

1573 int length =	1573 int length =

1574 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);	1574 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);

1575 // Unibrow returns 0 or 1 for characters where case independence is	1575 // Unibrow returns 0 or 1 for characters where case independence is

1576 // trivial.	1576 // trivial.

1577 if (length == 0) {	1577 if (length == 0) {

1578 letters[0] = character;	1578 letters[0] = character;

1579 length = 1;	1579 length = 1;

1580 }	1580 }

1581 if (!one_byte_subject || character <= String::kMaxOneByteCharCode) {	1581

1582 return length;	1582 if (one_byte_subject) {

	1583 int new_length = 0;

	1584 for (int i = 0; i < length; i++) {

	1585 if (letters[i] <= String::kMaxOneByteCharCode) {

	1586 letters[new_length++] = letters[i];

	1587 }

	1588 }

	1589 length = new_length;

1583 }	1590 }

1584	1591

1585 // The standard requires that non-ASCII characters cannot have ASCII	1592 return length;

1586 // character codes in their equivalence class.

1587 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,

1588 // is it? For example, \u00C5 is equivalent to \u212B.

1589 return 0;

1590 }	1593 }

1591	1594

1592	1595

1593 static inline bool EmitSimpleCharacter(Isolate* isolate,	1596 static inline bool EmitSimpleCharacter(Isolate* isolate,

1594 RegExpCompiler* compiler,	1597 RegExpCompiler* compiler,

1595 uc16 c,	1598 uc16 c,

1596 Label* on_failure,	1599 Label* on_failure,

1597 int cp_offset,	1600 int cp_offset,

1598 bool check,	1601 bool check,

1599 bool preloaded) {	1602 bool preloaded) {

(...skipping 918 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2518 char_mask = String::kMaxUtf16CodeUnit;	2521 char_mask = String::kMaxUtf16CodeUnit;

2519 }	2522 }

2520 for (int k = 0; k < elms_->length(); k++) {	2523 for (int k = 0; k < elms_->length(); k++) {

2521 TextElement elm = elms_->at(k);	2524 TextElement elm = elms_->at(k);

2522 if (elm.text_type() == TextElement::ATOM) {	2525 if (elm.text_type() == TextElement::ATOM) {

2523 Vector<const uc16> quarks = elm.atom()->data();	2526 Vector<const uc16> quarks = elm.atom()->data();

2524 for (int i = 0; i < characters && i < quarks.length(); i++) {	2527 for (int i = 0; i < characters && i < quarks.length(); i++) {

2525 QuickCheckDetails::Position* pos =	2528 QuickCheckDetails::Position* pos =

2526 details->positions(characters_filled_in);	2529 details->positions(characters_filled_in);

2527 uc16 c = quarks[i];	2530 uc16 c = quarks[i];

2528 if (c > char_mask) {

2529 // If we expect a non-Latin1 character from an one-byte string,

2530 // there is no way we can match. Not even case-independent

2531 // matching can turn an Latin1 character into non-Latin1 or

2532 // vice versa.

2533 // TODO(dcarney): issue 3550. Verify that this works as expected.

2534 // For example, \u0178 is uppercase of \u00ff (y-umlaut).

2535 details->set_cannot_match();

2536 pos->determines_perfectly = false;

2537 return;

2538 }

2539 if (compiler->ignore_case()) {	2531 if (compiler->ignore_case()) {

2540 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	2532 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

2541 int length = GetCaseIndependentLetters(isolate, c,	2533 int length = GetCaseIndependentLetters(isolate, c,

2542 compiler->one_byte(), chars);	2534 compiler->one_byte(), chars);

2543 DCHECK(length != 0); // Can only happen if c > char_mask (see above).	2535 if (length == 0) {

	2536 // This can happen because all case variants are non-Latin1, but we

	2537 // know the input is Latin1.

	2538 details->set_cannot_match();

	2539 pos->determines_perfectly = false;

	2540 return;

	2541 }

2544 if (length == 1) {	2542 if (length == 1) {

2545 // This letter has no case equivalents, so it's nice and simple	2543 // This letter has no case equivalents, so it's nice and simple

2546 // and the mask-compare will determine definitely whether we have	2544 // and the mask-compare will determine definitely whether we have

2547 // a match at this character position.	2545 // a match at this character position.

2548 pos->mask = char_mask;	2546 pos->mask = char_mask;

2549 pos->value = c;	2547 pos->value = c;

2550 pos->determines_perfectly = true;	2548 pos->determines_perfectly = true;

2551 } else {	2549 } else {

2552 uint32_t common_bits = char_mask;	2550 uint32_t common_bits = char_mask;

2553 uint32_t bits = chars[0];	2551 uint32_t bits = chars[0];

(...skipping 10 matching lines...) Expand all Loading...
2564 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {	2562 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {

2565 pos->determines_perfectly = true;	2563 pos->determines_perfectly = true;

2566 }	2564 }

2567 pos->mask = common_bits;	2565 pos->mask = common_bits;

2568 pos->value = bits;	2566 pos->value = bits;

2569 }	2567 }

2570 } else {	2568 } else {

2571 // Don't ignore case. Nice simple case where the mask-compare will	2569 // Don't ignore case. Nice simple case where the mask-compare will

2572 // determine definitely whether we have a match at this character	2570 // determine definitely whether we have a match at this character

2573 // position.	2571 // position.

	2572 if (c > char_mask) {

	2573 details->set_cannot_match();

	2574 pos->determines_perfectly = false;

	2575 return;

	2576 }

2574 pos->mask = char_mask;	2577 pos->mask = char_mask;

2575 pos->value = c;	2578 pos->value = c;

2576 pos->determines_perfectly = true;	2579 pos->determines_perfectly = true;

2577 }	2580 }

2578 characters_filled_in++;	2581 characters_filled_in++;

2579 DCHECK(characters_filled_in <= details->characters());	2582 DCHECK(characters_filled_in <= details->characters());

2580 if (characters_filled_in == details->characters()) {	2583 if (characters_filled_in == details->characters()) {

2581 return;	2584 return;

2582 }	2585 }

2583 }	2586 }

(...skipping 3762 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;	6349 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;

6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&	6350 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&

6348 heap->isolate()->memory_allocator()->SizeExecutable() >	6351 heap->isolate()->memory_allocator()->SizeExecutable() >

6349 RegExpImpl::kRegExpExecutableMemoryLimit) {	6352 RegExpImpl::kRegExpExecutableMemoryLimit) {

6350 too_much = true;	6353 too_much = true;

6351 }	6354 }

6352 return too_much;	6355 return too_much;

6353 }	6356 }

6354 } // namespace internal	6357 } // namespace internal

6355 } // namespace v8	6358 } // namespace v8

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »