OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/v8.h" | 5 #include "src/v8.h" |
6 | 6 |
7 #include "src/ast.h" | 7 #include "src/ast.h" |
8 #include "src/base/platform/platform.h" | 8 #include "src/base/platform/platform.h" |
9 #include "src/compilation-cache.h" | 9 #include "src/compilation-cache.h" |
10 #include "src/compiler.h" | 10 #include "src/compiler.h" |
(...skipping 1548 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1559 DCHECK(!trace->mentions_reg(guard->reg())); | 1559 DCHECK(!trace->mentions_reg(guard->reg())); |
1560 macro_assembler->IfRegisterLT(guard->reg(), | 1560 macro_assembler->IfRegisterLT(guard->reg(), |
1561 guard->value(), | 1561 guard->value(), |
1562 trace->backtrack()); | 1562 trace->backtrack()); |
1563 break; | 1563 break; |
1564 } | 1564 } |
1565 } | 1565 } |
1566 | 1566 |
1567 | 1567 |
1568 // Returns the number of characters in the equivalence class, omitting those | 1568 // Returns the number of characters in the equivalence class, omitting those |
1569 // that cannot occur in the source string because it is ASCII. | 1569 // that cannot occur in the source string because it is Latin1. |
1570 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, | 1570 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, |
1571 bool one_byte_subject, | 1571 bool one_byte_subject, |
1572 unibrow::uchar* letters) { | 1572 unibrow::uchar* letters) { |
1573 int length = | 1573 int length = |
1574 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); | 1574 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); |
1575 // Unibrow returns 0 or 1 for characters where case independence is | 1575 // Unibrow returns 0 or 1 for characters where case independence is |
1576 // trivial. | 1576 // trivial. |
1577 if (length == 0) { | 1577 if (length == 0) { |
1578 letters[0] = character; | 1578 letters[0] = character; |
1579 length = 1; | 1579 length = 1; |
1580 } | 1580 } |
1581 if (!one_byte_subject || character <= String::kMaxOneByteCharCode) { | 1581 |
1582 return length; | 1582 if (one_byte_subject) { |
| 1583 int new_length = 0; |
| 1584 for (int i = 0; i < length; i++) { |
| 1585 if (letters[i] <= String::kMaxOneByteCharCode) { |
| 1586 letters[new_length++] = letters[i]; |
| 1587 } |
| 1588 } |
| 1589 length = new_length; |
1583 } | 1590 } |
1584 | 1591 |
1585 // The standard requires that non-ASCII characters cannot have ASCII | 1592 return length; |
1586 // character codes in their equivalence class. | |
1587 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore, | |
1588 // is it? For example, \u00C5 is equivalent to \u212B. | |
1589 return 0; | |
1590 } | 1593 } |
1591 | 1594 |
1592 | 1595 |
1593 static inline bool EmitSimpleCharacter(Isolate* isolate, | 1596 static inline bool EmitSimpleCharacter(Isolate* isolate, |
1594 RegExpCompiler* compiler, | 1597 RegExpCompiler* compiler, |
1595 uc16 c, | 1598 uc16 c, |
1596 Label* on_failure, | 1599 Label* on_failure, |
1597 int cp_offset, | 1600 int cp_offset, |
1598 bool check, | 1601 bool check, |
1599 bool preloaded) { | 1602 bool preloaded) { |
(...skipping 918 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2518 char_mask = String::kMaxUtf16CodeUnit; | 2521 char_mask = String::kMaxUtf16CodeUnit; |
2519 } | 2522 } |
2520 for (int k = 0; k < elms_->length(); k++) { | 2523 for (int k = 0; k < elms_->length(); k++) { |
2521 TextElement elm = elms_->at(k); | 2524 TextElement elm = elms_->at(k); |
2522 if (elm.text_type() == TextElement::ATOM) { | 2525 if (elm.text_type() == TextElement::ATOM) { |
2523 Vector<const uc16> quarks = elm.atom()->data(); | 2526 Vector<const uc16> quarks = elm.atom()->data(); |
2524 for (int i = 0; i < characters && i < quarks.length(); i++) { | 2527 for (int i = 0; i < characters && i < quarks.length(); i++) { |
2525 QuickCheckDetails::Position* pos = | 2528 QuickCheckDetails::Position* pos = |
2526 details->positions(characters_filled_in); | 2529 details->positions(characters_filled_in); |
2527 uc16 c = quarks[i]; | 2530 uc16 c = quarks[i]; |
2528 if (c > char_mask) { | |
2529 // If we expect a non-Latin1 character from an one-byte string, | |
2530 // there is no way we can match. Not even case-independent | |
2531 // matching can turn an Latin1 character into non-Latin1 or | |
2532 // vice versa. | |
2533 // TODO(dcarney): issue 3550. Verify that this works as expected. | |
2534 // For example, \u0178 is uppercase of \u00ff (y-umlaut). | |
2535 details->set_cannot_match(); | |
2536 pos->determines_perfectly = false; | |
2537 return; | |
2538 } | |
2539 if (compiler->ignore_case()) { | 2531 if (compiler->ignore_case()) { |
2540 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 2532 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
2541 int length = GetCaseIndependentLetters(isolate, c, | 2533 int length = GetCaseIndependentLetters(isolate, c, |
2542 compiler->one_byte(), chars); | 2534 compiler->one_byte(), chars); |
2543 DCHECK(length != 0); // Can only happen if c > char_mask (see above). | 2535 if (length == 0) { |
| 2536 // This can happen because all case variants are non-Latin1, but we |
| 2537 // know the input is Latin1. |
| 2538 details->set_cannot_match(); |
| 2539 pos->determines_perfectly = false; |
| 2540 return; |
| 2541 } |
2544 if (length == 1) { | 2542 if (length == 1) { |
2545 // This letter has no case equivalents, so it's nice and simple | 2543 // This letter has no case equivalents, so it's nice and simple |
2546 // and the mask-compare will determine definitely whether we have | 2544 // and the mask-compare will determine definitely whether we have |
2547 // a match at this character position. | 2545 // a match at this character position. |
2548 pos->mask = char_mask; | 2546 pos->mask = char_mask; |
2549 pos->value = c; | 2547 pos->value = c; |
2550 pos->determines_perfectly = true; | 2548 pos->determines_perfectly = true; |
2551 } else { | 2549 } else { |
2552 uint32_t common_bits = char_mask; | 2550 uint32_t common_bits = char_mask; |
2553 uint32_t bits = chars[0]; | 2551 uint32_t bits = chars[0]; |
(...skipping 10 matching lines...) Expand all Loading... |
2564 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { | 2562 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) { |
2565 pos->determines_perfectly = true; | 2563 pos->determines_perfectly = true; |
2566 } | 2564 } |
2567 pos->mask = common_bits; | 2565 pos->mask = common_bits; |
2568 pos->value = bits; | 2566 pos->value = bits; |
2569 } | 2567 } |
2570 } else { | 2568 } else { |
2571 // Don't ignore case. Nice simple case where the mask-compare will | 2569 // Don't ignore case. Nice simple case where the mask-compare will |
2572 // determine definitely whether we have a match at this character | 2570 // determine definitely whether we have a match at this character |
2573 // position. | 2571 // position. |
| 2572 if (c > char_mask) { |
| 2573 details->set_cannot_match(); |
| 2574 pos->determines_perfectly = false; |
| 2575 return; |
| 2576 } |
2574 pos->mask = char_mask; | 2577 pos->mask = char_mask; |
2575 pos->value = c; | 2578 pos->value = c; |
2576 pos->determines_perfectly = true; | 2579 pos->determines_perfectly = true; |
2577 } | 2580 } |
2578 characters_filled_in++; | 2581 characters_filled_in++; |
2579 DCHECK(characters_filled_in <= details->characters()); | 2582 DCHECK(characters_filled_in <= details->characters()); |
2580 if (characters_filled_in == details->characters()) { | 2583 if (characters_filled_in == details->characters()) { |
2581 return; | 2584 return; |
2582 } | 2585 } |
2583 } | 2586 } |
(...skipping 3762 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; | 6349 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; |
6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && | 6350 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && |
6348 heap->isolate()->memory_allocator()->SizeExecutable() > | 6351 heap->isolate()->memory_allocator()->SizeExecutable() > |
6349 RegExpImpl::kRegExpExecutableMemoryLimit) { | 6352 RegExpImpl::kRegExpExecutableMemoryLimit) { |
6350 too_much = true; | 6353 too_much = true; |
6351 } | 6354 } |
6352 return too_much; | 6355 return too_much; |
6353 } | 6356 } |
6354 } // namespace internal | 6357 } // namespace internal |
6355 } // namespace v8 | 6358 } // namespace v8 |
OLD | NEW |