Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/jsregexp.cc

Issue 1188793004: RegExp: Remove bogus assumptions about case independence and Latin1 (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/v8.h" 5 #include "src/v8.h"
6 6
7 #include "src/ast.h" 7 #include "src/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 1548 matching lines...) Expand 10 before | Expand all | Expand 10 after
1559 DCHECK(!trace->mentions_reg(guard->reg())); 1559 DCHECK(!trace->mentions_reg(guard->reg()));
1560 macro_assembler->IfRegisterLT(guard->reg(), 1560 macro_assembler->IfRegisterLT(guard->reg(),
1561 guard->value(), 1561 guard->value(),
1562 trace->backtrack()); 1562 trace->backtrack());
1563 break; 1563 break;
1564 } 1564 }
1565 } 1565 }
1566 1566
1567 1567
1568 // Returns the number of characters in the equivalence class, omitting those 1568 // Returns the number of characters in the equivalence class, omitting those
1569 // that cannot occur in the source string because it is ASCII. 1569 // that cannot occur in the source string because it is Latin1.
1570 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1570 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1571 bool one_byte_subject, 1571 bool one_byte_subject,
1572 unibrow::uchar* letters) { 1572 unibrow::uchar* letters) {
1573 int length = 1573 int length =
1574 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1574 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1575 // Unibrow returns 0 or 1 for characters where case independence is 1575 // Unibrow returns 0 or 1 for characters where case independence is
1576 // trivial. 1576 // trivial.
1577 if (length == 0) { 1577 if (length == 0) {
1578 letters[0] = character; 1578 letters[0] = character;
1579 length = 1; 1579 length = 1;
1580 } 1580 }
1581 if (!one_byte_subject || character <= String::kMaxOneByteCharCode) { 1581
1582 return length; 1582 if (one_byte_subject) {
1583 int new_length = 0;
1584 for (int i = 0; i < length; i++) {
1585 if (letters[i] <= String::kMaxOneByteCharCode) {
1586 letters[new_length++] = letters[i];
1587 }
1588 }
1589 length = new_length;
1583 } 1590 }
1584 1591
1585 // The standard requires that non-ASCII characters cannot have ASCII 1592 return length;
1586 // character codes in their equivalence class.
1587 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
1588 // is it? For example, \u00C5 is equivalent to \u212B.
1589 return 0;
1590 } 1593 }
1591 1594
1592 1595
1593 static inline bool EmitSimpleCharacter(Isolate* isolate, 1596 static inline bool EmitSimpleCharacter(Isolate* isolate,
1594 RegExpCompiler* compiler, 1597 RegExpCompiler* compiler,
1595 uc16 c, 1598 uc16 c,
1596 Label* on_failure, 1599 Label* on_failure,
1597 int cp_offset, 1600 int cp_offset,
1598 bool check, 1601 bool check,
1599 bool preloaded) { 1602 bool preloaded) {
(...skipping 918 matching lines...) Expand 10 before | Expand all | Expand 10 after
2518 char_mask = String::kMaxUtf16CodeUnit; 2521 char_mask = String::kMaxUtf16CodeUnit;
2519 } 2522 }
2520 for (int k = 0; k < elms_->length(); k++) { 2523 for (int k = 0; k < elms_->length(); k++) {
2521 TextElement elm = elms_->at(k); 2524 TextElement elm = elms_->at(k);
2522 if (elm.text_type() == TextElement::ATOM) { 2525 if (elm.text_type() == TextElement::ATOM) {
2523 Vector<const uc16> quarks = elm.atom()->data(); 2526 Vector<const uc16> quarks = elm.atom()->data();
2524 for (int i = 0; i < characters && i < quarks.length(); i++) { 2527 for (int i = 0; i < characters && i < quarks.length(); i++) {
2525 QuickCheckDetails::Position* pos = 2528 QuickCheckDetails::Position* pos =
2526 details->positions(characters_filled_in); 2529 details->positions(characters_filled_in);
2527 uc16 c = quarks[i]; 2530 uc16 c = quarks[i];
2528 if (c > char_mask) {
2529 // If we expect a non-Latin1 character from an one-byte string,
2530 // there is no way we can match. Not even case-independent
2531 // matching can turn an Latin1 character into non-Latin1 or
2532 // vice versa.
2533 // TODO(dcarney): issue 3550. Verify that this works as expected.
2534 // For example, \u0178 is uppercase of \u00ff (y-umlaut).
2535 details->set_cannot_match();
2536 pos->determines_perfectly = false;
2537 return;
2538 }
2539 if (compiler->ignore_case()) { 2531 if (compiler->ignore_case()) {
2540 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2532 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2541 int length = GetCaseIndependentLetters(isolate, c, 2533 int length = GetCaseIndependentLetters(isolate, c,
2542 compiler->one_byte(), chars); 2534 compiler->one_byte(), chars);
2543 DCHECK(length != 0); // Can only happen if c > char_mask (see above). 2535 if (length == 0) {
2536 // This can happen because all case variants are non-Latin1, but we
2537 // know the input is Latin1.
2538 details->set_cannot_match();
2539 pos->determines_perfectly = false;
2540 return;
2541 }
2544 if (length == 1) { 2542 if (length == 1) {
2545 // This letter has no case equivalents, so it's nice and simple 2543 // This letter has no case equivalents, so it's nice and simple
2546 // and the mask-compare will determine definitely whether we have 2544 // and the mask-compare will determine definitely whether we have
2547 // a match at this character position. 2545 // a match at this character position.
2548 pos->mask = char_mask; 2546 pos->mask = char_mask;
2549 pos->value = c; 2547 pos->value = c;
2550 pos->determines_perfectly = true; 2548 pos->determines_perfectly = true;
2551 } else { 2549 } else {
2552 uint32_t common_bits = char_mask; 2550 uint32_t common_bits = char_mask;
2553 uint32_t bits = chars[0]; 2551 uint32_t bits = chars[0];
(...skipping 10 matching lines...) Expand all
2564 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { 2562 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2565 pos->determines_perfectly = true; 2563 pos->determines_perfectly = true;
2566 } 2564 }
2567 pos->mask = common_bits; 2565 pos->mask = common_bits;
2568 pos->value = bits; 2566 pos->value = bits;
2569 } 2567 }
2570 } else { 2568 } else {
2571 // Don't ignore case. Nice simple case where the mask-compare will 2569 // Don't ignore case. Nice simple case where the mask-compare will
2572 // determine definitely whether we have a match at this character 2570 // determine definitely whether we have a match at this character
2573 // position. 2571 // position.
2572 if (c > char_mask) {
2573 details->set_cannot_match();
2574 pos->determines_perfectly = false;
2575 return;
2576 }
2574 pos->mask = char_mask; 2577 pos->mask = char_mask;
2575 pos->value = c; 2578 pos->value = c;
2576 pos->determines_perfectly = true; 2579 pos->determines_perfectly = true;
2577 } 2580 }
2578 characters_filled_in++; 2581 characters_filled_in++;
2579 DCHECK(characters_filled_in <= details->characters()); 2582 DCHECK(characters_filled_in <= details->characters());
2580 if (characters_filled_in == details->characters()) { 2583 if (characters_filled_in == details->characters()) {
2581 return; 2584 return;
2582 } 2585 }
2583 } 2586 }
(...skipping 3762 matching lines...) Expand 10 before | Expand all | Expand 10 after
6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6349 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6350 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6348 heap->isolate()->memory_allocator()->SizeExecutable() > 6351 heap->isolate()->memory_allocator()->SizeExecutable() >
6349 RegExpImpl::kRegExpExecutableMemoryLimit) { 6352 RegExpImpl::kRegExpExecutableMemoryLimit) {
6350 too_much = true; 6353 too_much = true;
6351 } 6354 }
6352 return too_much; 6355 return too_much;
6353 } 6356 }
6354 } // namespace internal 6357 } // namespace internal
6355 } // namespace v8 6358 } // namespace v8
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698