Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(199)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1661483002: Revert of [regexp] implement /ui to mirror the implementation for /i. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: add cause of revert as test case Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include "src/ast/ast.h" 7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 1580 matching lines...) Expand 10 before | Expand all | Expand 10 after
1591 macro_assembler->IfRegisterLT(guard->reg(), 1591 macro_assembler->IfRegisterLT(guard->reg(),
1592 guard->value(), 1592 guard->value(),
1593 trace->backtrack()); 1593 trace->backtrack());
1594 break; 1594 break;
1595 } 1595 }
1596 } 1596 }
1597 1597
1598 1598
1599 // Returns the number of characters in the equivalence class, omitting those 1599 // Returns the number of characters in the equivalence class, omitting those
1600 // that cannot occur in the source string because it is Latin1. 1600 // that cannot occur in the source string because it is Latin1.
1601 static int GetCaseIndependentLetters(RegExpCompiler* compiler, uc16 character, 1601 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1602 uc32* letters) { 1602 bool one_byte_subject,
1603 int length; 1603 unibrow::uchar* letters) {
1604 #ifdef V8_I18N_SUPPORT 1604 int length =
1605 if (compiler->unicode()) { 1605 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1606 USet* set = uset_open(character, character); 1606 // Unibrow returns 0 or 1 for characters where case independence is
1607 uset_closeOver(set, USET_CASE_INSENSITIVE); 1607 // trivial.
1608 uset_removeAllStrings(set); 1608 if (length == 0) {
1609 length = uset_size(set); 1609 letters[0] = character;
1610 for (int i = 0; i < length; i++) { 1610 length = 1;
1611 letters[i] = uset_charAt(set, i);
1612 }
1613 uset_close(set);
1614 } else // NOLINT
1615 // Fallback in case ICU is not included.
1616 #endif // V8_I18N_SUPPORT
1617 {
1618 length = compiler->isolate()->jsregexp_uncanonicalize()->get(character,
1619 '\0', letters);
1620 // Unibrow returns 0 or 1 for characters where case independence is
1621 // trivial.
1622 if (length == 0) {
1623 letters[0] = character;
1624 length = 1;
1625 }
1626 } 1611 }
1627 1612
1628 if (compiler->one_byte()) { 1613 if (one_byte_subject) {
1629 int new_length = 0; 1614 int new_length = 0;
1630 for (int i = 0; i < length; i++) { 1615 for (int i = 0; i < length; i++) {
1631 if (letters[i] <= String::kMaxOneByteCharCode) { 1616 if (letters[i] <= String::kMaxOneByteCharCode) {
1632 letters[new_length++] = letters[i]; 1617 letters[new_length++] = letters[i];
1633 } 1618 }
1634 } 1619 }
1635 length = new_length; 1620 length = new_length;
1636 } 1621 }
1637 1622
1638 return length; 1623 return length;
1639 } 1624 }
1640 1625
1641 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, uc16 c, 1626
1642 Label* on_failure, int cp_offset, 1627 static inline bool EmitSimpleCharacter(Isolate* isolate,
1643 bool check, bool preloaded) { 1628 RegExpCompiler* compiler,
1629 uc16 c,
1630 Label* on_failure,
1631 int cp_offset,
1632 bool check,
1633 bool preloaded) {
1644 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1634 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1645 bool bound_checked = false; 1635 bool bound_checked = false;
1646 if (!preloaded) { 1636 if (!preloaded) {
1647 assembler->LoadCurrentCharacter( 1637 assembler->LoadCurrentCharacter(
1648 cp_offset, 1638 cp_offset,
1649 on_failure, 1639 on_failure,
1650 check); 1640 check);
1651 bound_checked = true; 1641 bound_checked = true;
1652 } 1642 }
1653 assembler->CheckNotCharacter(c, on_failure); 1643 assembler->CheckNotCharacter(c, on_failure);
1654 return bound_checked; 1644 return bound_checked;
1655 } 1645 }
1656 1646
1657 1647
1658 // Only emits non-letters (things that don't have case). Only used for case 1648 // Only emits non-letters (things that don't have case). Only used for case
1659 // independent matches. 1649 // independent matches.
1660 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, uc16 c, 1650 static inline bool EmitAtomNonLetter(Isolate* isolate,
1661 Label* on_failure, int cp_offset, 1651 RegExpCompiler* compiler,
1662 bool check, bool preloaded) { 1652 uc16 c,
1653 Label* on_failure,
1654 int cp_offset,
1655 bool check,
1656 bool preloaded) {
1663 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1657 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1658 bool one_byte = compiler->one_byte();
1664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1659 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1665 int length = GetCaseIndependentLetters(compiler, c, chars); 1660 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1666 if (length < 1) { 1661 if (length < 1) {
1667 // This can't match. Must be an one-byte subject and a non-one-byte 1662 // This can't match. Must be an one-byte subject and a non-one-byte
1668 // character. We do not need to do anything since the one-byte pass 1663 // character. We do not need to do anything since the one-byte pass
1669 // already handled this. 1664 // already handled this.
1670 return false; // Bounds not checked. 1665 return false; // Bounds not checked.
1671 } 1666 }
1672 bool checked = false; 1667 bool checked = false;
1673 // We handle the length > 1 case in a later pass. 1668 // We handle the length > 1 case in a later pass.
1674 if (length == 1) { 1669 if (length == 1) {
1675 if (compiler->one_byte() && c > String::kMaxOneByteCharCodeU) { 1670 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1676 // This cannot match. 1671 // Can't match - see above.
1677 return false; // Bounds not checked. 1672 return false; // Bounds not checked.
1678 } 1673 }
1679 if (!preloaded) { 1674 if (!preloaded) {
1680 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1675 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1681 checked = check; 1676 checked = check;
1682 } 1677 }
1683 macro_assembler->CheckNotCharacter(c, on_failure); 1678 macro_assembler->CheckNotCharacter(c, on_failure);
1684 } 1679 }
1685 return checked; 1680 return checked;
1686 } 1681 }
(...skipping 28 matching lines...) Expand all
1715 uc16 mask = char_mask ^ diff; 1710 uc16 mask = char_mask ^ diff;
1716 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1711 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1717 diff, 1712 diff,
1718 mask, 1713 mask,
1719 on_failure); 1714 on_failure);
1720 return true; 1715 return true;
1721 } 1716 }
1722 return false; 1717 return false;
1723 } 1718 }
1724 1719
1725 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, uc16 c, 1720
1726 Label* on_failure, int cp_offset, bool check, 1721 typedef bool EmitCharacterFunction(Isolate* isolate,
1722 RegExpCompiler* compiler,
1723 uc16 c,
1724 Label* on_failure,
1725 int cp_offset,
1726 bool check,
1727 bool preloaded); 1727 bool preloaded);
1728 1728
1729 // Only emits letters (things that have case). Only used for case independent 1729 // Only emits letters (things that have case). Only used for case independent
1730 // matches. 1730 // matches.
1731 static inline bool EmitAtomLetter(RegExpCompiler* compiler, uc16 c, 1731 static inline bool EmitAtomLetter(Isolate* isolate,
1732 Label* on_failure, int cp_offset, bool check, 1732 RegExpCompiler* compiler,
1733 uc16 c,
1734 Label* on_failure,
1735 int cp_offset,
1736 bool check,
1733 bool preloaded) { 1737 bool preloaded) {
1734 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1738 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1739 bool one_byte = compiler->one_byte();
1735 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1740 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1736 int length = GetCaseIndependentLetters(compiler, c, chars); 1741 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1737 if (length <= 1) return false; 1742 if (length <= 1) return false;
1738 // We may not need to check against the end of the input string 1743 // We may not need to check against the end of the input string
1739 // if this character lies before a character that matched. 1744 // if this character lies before a character that matched.
1740 if (!preloaded) { 1745 if (!preloaded) {
1741 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1746 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1742 } 1747 }
1743 Label ok; 1748 Label ok;
1744 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1749 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1745 switch (length) { 1750 switch (length) {
1746 case 2: { 1751 case 2: {
1747 if (ShortCutEmitCharacterPair(macro_assembler, compiler->one_byte(), 1752 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1748 chars[0], chars[1], on_failure)) { 1753 chars[1], on_failure)) {
1749 } else { 1754 } else {
1750 macro_assembler->CheckCharacter(chars[0], &ok); 1755 macro_assembler->CheckCharacter(chars[0], &ok);
1751 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1756 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1752 macro_assembler->Bind(&ok); 1757 macro_assembler->Bind(&ok);
1753 } 1758 }
1754 break; 1759 break;
1755 } 1760 }
1756 case 4: 1761 case 4:
1757 macro_assembler->CheckCharacter(chars[3], &ok); 1762 macro_assembler->CheckCharacter(chars[3], &ok);
1758 // Fall through! 1763 // Fall through!
(...skipping 516 matching lines...) Expand 10 before | Expand all | Expand 10 after
2275 int ActionNode::EatsAtLeast(int still_to_find, 2280 int ActionNode::EatsAtLeast(int still_to_find,
2276 int budget, 2281 int budget,
2277 bool not_at_start) { 2282 bool not_at_start) {
2278 if (budget <= 0) return 0; 2283 if (budget <= 0) return 0;
2279 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2284 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2280 return on_success()->EatsAtLeast(still_to_find, 2285 return on_success()->EatsAtLeast(still_to_find,
2281 budget - 1, 2286 budget - 1,
2282 not_at_start); 2287 not_at_start);
2283 } 2288 }
2284 2289
2285 void ActionNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget, 2290
2291 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2286 BoyerMooreLookahead* bm, bool not_at_start) { 2292 BoyerMooreLookahead* bm, bool not_at_start) {
2287 if (action_type_ == BEGIN_SUBMATCH) { 2293 if (action_type_ == BEGIN_SUBMATCH) {
2288 bm->SetRest(offset); 2294 bm->SetRest(offset);
2289 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2295 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2290 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start); 2296 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2291 } 2297 }
2292 SaveBMInfo(bm, not_at_start, offset); 2298 SaveBMInfo(bm, not_at_start, offset);
2293 } 2299 }
2294 2300
2295 2301
2296 int AssertionNode::EatsAtLeast(int still_to_find, 2302 int AssertionNode::EatsAtLeast(int still_to_find,
2297 int budget, 2303 int budget,
2298 bool not_at_start) { 2304 bool not_at_start) {
2299 if (budget <= 0) return 0; 2305 if (budget <= 0) return 0;
2300 // If we know we are not at the start and we are asked "how many characters 2306 // If we know we are not at the start and we are asked "how many characters
2301 // will you match if you succeed?" then we can answer anything since false 2307 // will you match if you succeed?" then we can answer anything since false
2302 // implies false. So lets just return the max answer (still_to_find) since 2308 // implies false. So lets just return the max answer (still_to_find) since
2303 // that won't prevent us from preloading a lot of characters for the other 2309 // that won't prevent us from preloading a lot of characters for the other
2304 // branches in the node graph. 2310 // branches in the node graph.
2305 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2311 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2306 return on_success()->EatsAtLeast(still_to_find, 2312 return on_success()->EatsAtLeast(still_to_find,
2307 budget - 1, 2313 budget - 1,
2308 not_at_start); 2314 not_at_start);
2309 } 2315 }
2310 2316
2311 void AssertionNode::FillInBMInfo(RegExpCompiler* compiler, int offset, 2317
2312 int budget, BoyerMooreLookahead* bm, 2318 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2313 bool not_at_start) { 2319 BoyerMooreLookahead* bm, bool not_at_start) {
2314 // Match the behaviour of EatsAtLeast on this node. 2320 // Match the behaviour of EatsAtLeast on this node.
2315 if (assertion_type() == AT_START && not_at_start) return; 2321 if (assertion_type() == AT_START && not_at_start) return;
2316 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start); 2322 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2317 SaveBMInfo(bm, not_at_start, offset); 2323 SaveBMInfo(bm, not_at_start, offset);
2318 } 2324 }
2319 2325
2320 2326
2321 int BackReferenceNode::EatsAtLeast(int still_to_find, 2327 int BackReferenceNode::EatsAtLeast(int still_to_find,
2322 int budget, 2328 int budget,
2323 bool not_at_start) { 2329 bool not_at_start) {
2324 if (read_backward()) return 0; 2330 if (read_backward()) return 0;
2325 if (budget <= 0) return 0; 2331 if (budget <= 0) return 0;
2326 return on_success()->EatsAtLeast(still_to_find, 2332 return on_success()->EatsAtLeast(still_to_find,
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after
2520 // The masks and values for the positions will be combined into a single 2526 // The masks and values for the positions will be combined into a single
2521 // machine word for the current character width in order to be used in 2527 // machine word for the current character width in order to be used in
2522 // generating a quick check. 2528 // generating a quick check.
2523 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2529 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2524 RegExpCompiler* compiler, 2530 RegExpCompiler* compiler,
2525 int characters_filled_in, 2531 int characters_filled_in,
2526 bool not_at_start) { 2532 bool not_at_start) {
2527 // Do not collect any quick check details if the text node reads backward, 2533 // Do not collect any quick check details if the text node reads backward,
2528 // since it reads in the opposite direction than we use for quick checks. 2534 // since it reads in the opposite direction than we use for quick checks.
2529 if (read_backward()) return; 2535 if (read_backward()) return;
2536 Isolate* isolate = compiler->macro_assembler()->isolate();
2530 DCHECK(characters_filled_in < details->characters()); 2537 DCHECK(characters_filled_in < details->characters());
2531 int characters = details->characters(); 2538 int characters = details->characters();
2532 int char_mask; 2539 int char_mask;
2533 if (compiler->one_byte()) { 2540 if (compiler->one_byte()) {
2534 char_mask = String::kMaxOneByteCharCode; 2541 char_mask = String::kMaxOneByteCharCode;
2535 } else { 2542 } else {
2536 char_mask = String::kMaxUtf16CodeUnit; 2543 char_mask = String::kMaxUtf16CodeUnit;
2537 } 2544 }
2538 for (int k = 0; k < elements()->length(); k++) { 2545 for (int k = 0; k < elements()->length(); k++) {
2539 TextElement elm = elements()->at(k); 2546 TextElement elm = elements()->at(k);
2540 if (elm.text_type() == TextElement::ATOM) { 2547 if (elm.text_type() == TextElement::ATOM) {
2541 Vector<const uc16> quarks = elm.atom()->data(); 2548 Vector<const uc16> quarks = elm.atom()->data();
2542 for (int i = 0; i < characters && i < quarks.length(); i++) { 2549 for (int i = 0; i < characters && i < quarks.length(); i++) {
2543 QuickCheckDetails::Position* pos = 2550 QuickCheckDetails::Position* pos =
2544 details->positions(characters_filled_in); 2551 details->positions(characters_filled_in);
2545 uc16 c = quarks[i]; 2552 uc16 c = quarks[i];
2546 if (compiler->ignore_case()) { 2553 if (compiler->ignore_case()) {
2547 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2554 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2548 int length = GetCaseIndependentLetters(compiler, c, chars); 2555 int length = GetCaseIndependentLetters(isolate, c,
2556 compiler->one_byte(), chars);
2549 if (length == 0) { 2557 if (length == 0) {
2550 // This can happen because all case variants are non-Latin1, but we 2558 // This can happen because all case variants are non-Latin1, but we
2551 // know the input is Latin1. 2559 // know the input is Latin1.
2552 details->set_cannot_match(); 2560 details->set_cannot_match();
2553 pos->determines_perfectly = false; 2561 pos->determines_perfectly = false;
2554 return; 2562 return;
2555 } 2563 }
2556 if (length == 1) { 2564 if (length == 1) {
2557 // This letter has no case equivalents, so it's nice and simple 2565 // This letter has no case equivalents, so it's nice and simple
2558 // and the mask-compare will determine definitely whether we have 2566 // and the mask-compare will determine definitely whether we have
(...skipping 184 matching lines...) Expand 10 before | Expand all | Expand 10 after
2743 DCHECK(!info->visited); 2751 DCHECK(!info->visited);
2744 info->visited = true; 2752 info->visited = true;
2745 } 2753 }
2746 ~VisitMarker() { 2754 ~VisitMarker() {
2747 info_->visited = false; 2755 info_->visited = false;
2748 } 2756 }
2749 private: 2757 private:
2750 NodeInfo* info_; 2758 NodeInfo* info_;
2751 }; 2759 };
2752 2760
2753 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) { 2761
2762 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2754 if (info()->replacement_calculated) return replacement(); 2763 if (info()->replacement_calculated) return replacement();
2755 if (depth < 0) return this; 2764 if (depth < 0) return this;
2756 DCHECK(!info()->visited); 2765 DCHECK(!info()->visited);
2757 VisitMarker marker(info()); 2766 VisitMarker marker(info());
2758 return FilterSuccessor(depth - 1, compiler); 2767 return FilterSuccessor(depth - 1, ignore_case);
2759 } 2768 }
2760 2769
2761 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, 2770
2762 RegExpCompiler* compiler) { 2771 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2763 RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler); 2772 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2764 if (next == NULL) return set_replacement(NULL); 2773 if (next == NULL) return set_replacement(NULL);
2765 on_success_ = next; 2774 on_success_ = next;
2766 return set_replacement(this); 2775 return set_replacement(this);
2767 } 2776 }
2768 2777
2769 2778
2770 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2779 // We need to check for the following characters: 0x39c 0x3bc 0x178.
2771 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2780 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2772 // TODO(dcarney): this could be a lot more efficient. 2781 // TODO(dcarney): this could be a lot more efficient.
2773 return range.Contains(0x39c) || 2782 return range.Contains(0x39c) ||
2774 range.Contains(0x3bc) || range.Contains(0x178); 2783 range.Contains(0x3bc) || range.Contains(0x178);
2775 } 2784 }
2776 2785
2777 2786
2778 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2787 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2779 for (int i = 0; i < ranges->length(); i++) { 2788 for (int i = 0; i < ranges->length(); i++) {
2780 // TODO(dcarney): this could be a lot more efficient. 2789 // TODO(dcarney): this could be a lot more efficient.
2781 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2790 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2782 } 2791 }
2783 return false; 2792 return false;
2784 } 2793 }
2785 2794
2786 static uc16 ConvertNonLatin1ToEquivalentLatin1(bool unicode, uc16 c) {
2787 #ifdef V8_I18N_SUPPORT
2788 if (unicode) {
2789 USet* set = uset_open(c, c);
2790 uset_closeOver(set, USET_CASE_INSENSITIVE);
2791 uset_removeAllStrings(set);
2792 int length = uset_size(set);
2793 uc16 result = 0;
2794 for (int i = 0; i < length; i++) {
2795 uc32 c = uset_charAt(set, i);
2796 if (c <= String::kMaxOneByteCharCode) {
2797 result = static_cast<uc16>(c);
2798 break;
2799 }
2800 }
2801 uset_close(set);
2802 return result;
2803 }
2804 // Fallback to unibrow if ICU is not included.
2805 #endif // V8_I18N_SUPPORT
2806 return unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2807 }
2808 2795
2809 RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) { 2796 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2810 if (info()->replacement_calculated) return replacement(); 2797 if (info()->replacement_calculated) return replacement();
2811 if (depth < 0) return this; 2798 if (depth < 0) return this;
2812 DCHECK(!info()->visited); 2799 DCHECK(!info()->visited);
2813 VisitMarker marker(info()); 2800 VisitMarker marker(info());
2814 int element_count = elements()->length(); 2801 int element_count = elements()->length();
2815 for (int i = 0; i < element_count; i++) { 2802 for (int i = 0; i < element_count; i++) {
2816 TextElement elm = elements()->at(i); 2803 TextElement elm = elements()->at(i);
2817 if (elm.text_type() == TextElement::ATOM) { 2804 if (elm.text_type() == TextElement::ATOM) {
2818 Vector<const uc16> quarks = elm.atom()->data(); 2805 Vector<const uc16> quarks = elm.atom()->data();
2819 for (int j = 0; j < quarks.length(); j++) { 2806 for (int j = 0; j < quarks.length(); j++) {
2820 uc16 c = quarks[j]; 2807 uint16_t c = quarks[j];
2821 if (c <= String::kMaxOneByteCharCode) continue; 2808 if (c <= String::kMaxOneByteCharCode) continue;
2822 if (!compiler->ignore_case()) return set_replacement(NULL); 2809 if (!ignore_case) return set_replacement(NULL);
2823 // Here, we need to check for characters whose upper and lower cases 2810 // Here, we need to check for characters whose upper and lower cases
2824 // are outside the Latin-1 range. 2811 // are outside the Latin-1 range.
2825 uc16 converted = 2812 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2826 ConvertNonLatin1ToEquivalentLatin1(compiler->unicode(), c);
2827 // Character is outside Latin-1 completely 2813 // Character is outside Latin-1 completely
2828 if (converted == 0) return set_replacement(NULL); 2814 if (converted == 0) return set_replacement(NULL);
2829 // Convert quark to Latin-1 in place. 2815 // Convert quark to Latin-1 in place.
2830 uc16* copy = const_cast<uc16*>(quarks.start()); 2816 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2831 copy[j] = converted; 2817 copy[j] = converted;
2832 } 2818 }
2833 } else { 2819 } else {
2834 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2820 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2835 RegExpCharacterClass* cc = elm.char_class(); 2821 RegExpCharacterClass* cc = elm.char_class();
2836 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2822 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2837 CharacterRange::Canonicalize(ranges); 2823 CharacterRange::Canonicalize(ranges);
2838 // Now they are in order so we only need to look at the first. 2824 // Now they are in order so we only need to look at the first.
2839 int range_count = ranges->length(); 2825 int range_count = ranges->length();
2840 if (cc->is_negated()) { 2826 if (cc->is_negated()) {
2841 if (range_count != 0 && 2827 if (range_count != 0 &&
2842 ranges->at(0).from() == 0 && 2828 ranges->at(0).from() == 0 &&
2843 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2829 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2844 // This will be handled in a later filter. 2830 // This will be handled in a later filter.
2845 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges)) 2831 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2846 continue;
2847 return set_replacement(NULL); 2832 return set_replacement(NULL);
2848 } 2833 }
2849 } else { 2834 } else {
2850 if (range_count == 0 || 2835 if (range_count == 0 ||
2851 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2836 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2852 // This will be handled in a later filter. 2837 // This will be handled in a later filter.
2853 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges)) 2838 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2854 continue;
2855 return set_replacement(NULL); 2839 return set_replacement(NULL);
2856 } 2840 }
2857 } 2841 }
2858 } 2842 }
2859 } 2843 }
2860 return FilterSuccessor(depth - 1, compiler); 2844 return FilterSuccessor(depth - 1, ignore_case);
2861 } 2845 }
2862 2846
2863 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) { 2847
2848 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2864 if (info()->replacement_calculated) return replacement(); 2849 if (info()->replacement_calculated) return replacement();
2865 if (depth < 0) return this; 2850 if (depth < 0) return this;
2866 if (info()->visited) return this; 2851 if (info()->visited) return this;
2867 { 2852 {
2868 VisitMarker marker(info()); 2853 VisitMarker marker(info());
2869 2854
2870 RegExpNode* continue_replacement = 2855 RegExpNode* continue_replacement =
2871 continue_node_->FilterOneByte(depth - 1, compiler); 2856 continue_node_->FilterOneByte(depth - 1, ignore_case);
2872 // If we can't continue after the loop then there is no sense in doing the 2857 // If we can't continue after the loop then there is no sense in doing the
2873 // loop. 2858 // loop.
2874 if (continue_replacement == NULL) return set_replacement(NULL); 2859 if (continue_replacement == NULL) return set_replacement(NULL);
2875 } 2860 }
2876 2861
2877 return ChoiceNode::FilterOneByte(depth - 1, compiler); 2862 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2878 } 2863 }
2879 2864
2880 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) { 2865
2866 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2881 if (info()->replacement_calculated) return replacement(); 2867 if (info()->replacement_calculated) return replacement();
2882 if (depth < 0) return this; 2868 if (depth < 0) return this;
2883 if (info()->visited) return this; 2869 if (info()->visited) return this;
2884 VisitMarker marker(info()); 2870 VisitMarker marker(info());
2885 int choice_count = alternatives_->length(); 2871 int choice_count = alternatives_->length();
2886 2872
2887 for (int i = 0; i < choice_count; i++) { 2873 for (int i = 0; i < choice_count; i++) {
2888 GuardedAlternative alternative = alternatives_->at(i); 2874 GuardedAlternative alternative = alternatives_->at(i);
2889 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2875 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2890 set_replacement(this); 2876 set_replacement(this);
2891 return this; 2877 return this;
2892 } 2878 }
2893 } 2879 }
2894 2880
2895 int surviving = 0; 2881 int surviving = 0;
2896 RegExpNode* survivor = NULL; 2882 RegExpNode* survivor = NULL;
2897 for (int i = 0; i < choice_count; i++) { 2883 for (int i = 0; i < choice_count; i++) {
2898 GuardedAlternative alternative = alternatives_->at(i); 2884 GuardedAlternative alternative = alternatives_->at(i);
2899 RegExpNode* replacement = 2885 RegExpNode* replacement =
2900 alternative.node()->FilterOneByte(depth - 1, compiler); 2886 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2901 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2887 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2902 if (replacement != NULL) { 2888 if (replacement != NULL) {
2903 alternatives_->at(i).set_node(replacement); 2889 alternatives_->at(i).set_node(replacement);
2904 surviving++; 2890 surviving++;
2905 survivor = replacement; 2891 survivor = replacement;
2906 } 2892 }
2907 } 2893 }
2908 if (surviving < 2) return set_replacement(survivor); 2894 if (surviving < 2) return set_replacement(survivor);
2909 2895
2910 set_replacement(this); 2896 set_replacement(this);
2911 if (surviving == choice_count) { 2897 if (surviving == choice_count) {
2912 return this; 2898 return this;
2913 } 2899 }
2914 // Only some of the nodes survived the filtering. We need to rebuild the 2900 // Only some of the nodes survived the filtering. We need to rebuild the
2915 // alternatives list. 2901 // alternatives list.
2916 ZoneList<GuardedAlternative>* new_alternatives = 2902 ZoneList<GuardedAlternative>* new_alternatives =
2917 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2903 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2918 for (int i = 0; i < choice_count; i++) { 2904 for (int i = 0; i < choice_count; i++) {
2919 RegExpNode* replacement = 2905 RegExpNode* replacement =
2920 alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler); 2906 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2921 if (replacement != NULL) { 2907 if (replacement != NULL) {
2922 alternatives_->at(i).set_node(replacement); 2908 alternatives_->at(i).set_node(replacement);
2923 new_alternatives->Add(alternatives_->at(i), zone()); 2909 new_alternatives->Add(alternatives_->at(i), zone());
2924 } 2910 }
2925 } 2911 }
2926 alternatives_ = new_alternatives; 2912 alternatives_ = new_alternatives;
2927 return this; 2913 return this;
2928 } 2914 }
2929 2915
2930 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte( 2916
2931 int depth, RegExpCompiler* compiler) { 2917 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2918 bool ignore_case) {
2932 if (info()->replacement_calculated) return replacement(); 2919 if (info()->replacement_calculated) return replacement();
2933 if (depth < 0) return this; 2920 if (depth < 0) return this;
2934 if (info()->visited) return this; 2921 if (info()->visited) return this;
2935 VisitMarker marker(info()); 2922 VisitMarker marker(info());
2936 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2923 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2937 // afterwards. 2924 // afterwards.
2938 RegExpNode* node = alternatives_->at(1).node(); 2925 RegExpNode* node = alternatives_->at(1).node();
2939 RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler); 2926 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2940 if (replacement == NULL) return set_replacement(NULL); 2927 if (replacement == NULL) return set_replacement(NULL);
2941 alternatives_->at(1).set_node(replacement); 2928 alternatives_->at(1).set_node(replacement);
2942 2929
2943 RegExpNode* neg_node = alternatives_->at(0).node(); 2930 RegExpNode* neg_node = alternatives_->at(0).node();
2944 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler); 2931 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2945 // If the negative lookahead is always going to fail then 2932 // If the negative lookahead is always going to fail then
2946 // we don't need to check it. 2933 // we don't need to check it.
2947 if (neg_replacement == NULL) return set_replacement(replacement); 2934 if (neg_replacement == NULL) return set_replacement(replacement);
2948 alternatives_->at(0).set_node(neg_replacement); 2935 alternatives_->at(0).set_node(neg_replacement);
2949 return set_replacement(this); 2936 return set_replacement(this);
2950 } 2937 }
2951 2938
2952 2939
2953 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2940 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2954 RegExpCompiler* compiler, 2941 RegExpCompiler* compiler,
2955 int characters_filled_in, 2942 int characters_filled_in,
2956 bool not_at_start) { 2943 bool not_at_start) {
2957 if (body_can_be_zero_length_ || info()->visited) return; 2944 if (body_can_be_zero_length_ || info()->visited) return;
2958 VisitMarker marker(info()); 2945 VisitMarker marker(info());
2959 return ChoiceNode::GetQuickCheckDetails(details, 2946 return ChoiceNode::GetQuickCheckDetails(details,
2960 compiler, 2947 compiler,
2961 characters_filled_in, 2948 characters_filled_in,
2962 not_at_start); 2949 not_at_start);
2963 } 2950 }
2964 2951
2965 void LoopChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, 2952
2966 int budget, BoyerMooreLookahead* bm, 2953 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2967 bool not_at_start) { 2954 BoyerMooreLookahead* bm, bool not_at_start) {
2968 if (body_can_be_zero_length_ || budget <= 0) { 2955 if (body_can_be_zero_length_ || budget <= 0) {
2969 bm->SetRest(offset); 2956 bm->SetRest(offset);
2970 SaveBMInfo(bm, not_at_start, offset); 2957 SaveBMInfo(bm, not_at_start, offset);
2971 return; 2958 return;
2972 } 2959 }
2973 ChoiceNode::FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start); 2960 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2974 SaveBMInfo(bm, not_at_start, offset); 2961 SaveBMInfo(bm, not_at_start, offset);
2975 } 2962 }
2976 2963
2977 2964
2978 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2965 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2979 RegExpCompiler* compiler, 2966 RegExpCompiler* compiler,
2980 int characters_filled_in, 2967 int characters_filled_in,
2981 bool not_at_start) { 2968 bool not_at_start) {
2982 not_at_start = (not_at_start || not_at_start_); 2969 not_at_start = (not_at_start || not_at_start_);
2983 int choice_count = alternatives_->length(); 2970 int choice_count = alternatives_->length();
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
3055 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3042 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3056 } 3043 }
3057 assembler->Bind(&ok); 3044 assembler->Bind(&ok);
3058 on_success->Emit(compiler, &new_trace); 3045 on_success->Emit(compiler, &new_trace);
3059 } 3046 }
3060 3047
3061 3048
3062 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3049 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3063 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3050 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3064 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3051 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3052 Isolate* isolate = assembler->isolate();
3065 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3053 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3066 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3054 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3067 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3055 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3068 if (lookahead == NULL) { 3056 if (lookahead == NULL) {
3069 int eats_at_least = 3057 int eats_at_least =
3070 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3058 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3071 kRecursionBudget, 3059 kRecursionBudget,
3072 not_at_start)); 3060 not_at_start));
3073 if (eats_at_least >= 1) { 3061 if (eats_at_least >= 1) {
3074 BoyerMooreLookahead* bm = 3062 BoyerMooreLookahead* bm =
3075 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3063 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3076 FillInBMInfo(compiler, 0, kRecursionBudget, bm, not_at_start); 3064 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3077 if (bm->at(0)->is_non_word()) 3065 if (bm->at(0)->is_non_word())
3078 next_is_word_character = Trace::FALSE_VALUE; 3066 next_is_word_character = Trace::FALSE_VALUE;
3079 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3067 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3080 } 3068 }
3081 } else { 3069 } else {
3082 if (lookahead->at(0)->is_non_word()) 3070 if (lookahead->at(0)->is_non_word())
3083 next_is_word_character = Trace::FALSE_VALUE; 3071 next_is_word_character = Trace::FALSE_VALUE;
3084 if (lookahead->at(0)->is_word()) 3072 if (lookahead->at(0)->is_word())
3085 next_is_word_character = Trace::TRUE_VALUE; 3073 next_is_word_character = Trace::TRUE_VALUE;
3086 } 3074 }
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
3238 // up to the limit the quick check already checked. In addition the quick 3226 // up to the limit the quick check already checked. In addition the quick
3239 // check can have involved a mask and compare operation which may simplify 3227 // check can have involved a mask and compare operation which may simplify
3240 // or obviate the need for further checks at some character positions. 3228 // or obviate the need for further checks at some character positions.
3241 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3229 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3242 TextEmitPassType pass, 3230 TextEmitPassType pass,
3243 bool preloaded, 3231 bool preloaded,
3244 Trace* trace, 3232 Trace* trace,
3245 bool first_element_checked, 3233 bool first_element_checked,
3246 int* checked_up_to) { 3234 int* checked_up_to) {
3247 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3235 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3236 Isolate* isolate = assembler->isolate();
3248 bool one_byte = compiler->one_byte(); 3237 bool one_byte = compiler->one_byte();
3249 Label* backtrack = trace->backtrack(); 3238 Label* backtrack = trace->backtrack();
3250 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3239 QuickCheckDetails* quick_check = trace->quick_check_performed();
3251 int element_count = elements()->length(); 3240 int element_count = elements()->length();
3252 int backward_offset = read_backward() ? -Length() : 0; 3241 int backward_offset = read_backward() ? -Length() : 0;
3253 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3242 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3254 TextElement elm = elements()->at(i); 3243 TextElement elm = elements()->at(i);
3255 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 3244 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3256 if (elm.text_type() == TextElement::ATOM) { 3245 if (elm.text_type() == TextElement::ATOM) {
3257 Vector<const uc16> quarks = elm.atom()->data(); 3246 Vector<const uc16> quarks = elm.atom()->data();
3258 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3247 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3259 if (first_element_checked && i == 0 && j == 0) continue; 3248 if (first_element_checked && i == 0 && j == 0) continue;
3260 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3249 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3261 EmitCharacterFunction* emit_function = NULL; 3250 EmitCharacterFunction* emit_function = NULL;
3262 switch (pass) { 3251 switch (pass) {
3263 case NON_LATIN1_MATCH: 3252 case NON_LATIN1_MATCH:
3264 DCHECK(one_byte); 3253 DCHECK(one_byte);
3265 DCHECK(!(compiler->unicode() && compiler->ignore_case()));
3266 if (quarks[j] > String::kMaxOneByteCharCode) { 3254 if (quarks[j] > String::kMaxOneByteCharCode) {
3267 assembler->GoTo(backtrack); 3255 assembler->GoTo(backtrack);
3268 return; 3256 return;
3269 } 3257 }
3270 break; 3258 break;
3271 case NON_LETTER_CHARACTER_MATCH: 3259 case NON_LETTER_CHARACTER_MATCH:
3272 emit_function = &EmitAtomNonLetter; 3260 emit_function = &EmitAtomNonLetter;
3273 break; 3261 break;
3274 case SIMPLE_CHARACTER_MATCH: 3262 case SIMPLE_CHARACTER_MATCH:
3275 emit_function = &EmitSimpleCharacter; 3263 emit_function = &EmitSimpleCharacter;
3276 break; 3264 break;
3277 case CASE_CHARACTER_MATCH: 3265 case CASE_CHARACTER_MATCH:
3278 emit_function = &EmitAtomLetter; 3266 emit_function = &EmitAtomLetter;
3279 break; 3267 break;
3280 default: 3268 default:
3281 break; 3269 break;
3282 } 3270 }
3283 if (emit_function != NULL) { 3271 if (emit_function != NULL) {
3284 bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); 3272 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3285 bool bound_checked = 3273 bool bound_checked =
3286 emit_function(compiler, quarks[j], backtrack, cp_offset + j, 3274 emit_function(isolate, compiler, quarks[j], backtrack,
3287 bounds_check, preloaded); 3275 cp_offset + j, bounds_check, preloaded);
3288 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3276 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3289 } 3277 }
3290 } 3278 }
3291 } else { 3279 } else {
3292 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3280 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3293 if (pass == CHARACTER_CLASS_MATCH) { 3281 if (pass == CHARACTER_CLASS_MATCH) {
3294 if (first_element_checked && i == 0) continue; 3282 if (first_element_checked && i == 0) continue;
3295 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3283 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3296 RegExpCharacterClass* cc = elm.char_class(); 3284 RegExpCharacterClass* cc = elm.char_class();
3297 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 3285 bool bounds_check = *checked_up_to < cp_offset || read_backward();
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
3360 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3348 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3361 LimitResult limit_result = LimitVersions(compiler, trace); 3349 LimitResult limit_result = LimitVersions(compiler, trace);
3362 if (limit_result == DONE) return; 3350 if (limit_result == DONE) return;
3363 DCHECK(limit_result == CONTINUE); 3351 DCHECK(limit_result == CONTINUE);
3364 3352
3365 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3353 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3366 compiler->SetRegExpTooBig(); 3354 compiler->SetRegExpTooBig();
3367 return; 3355 return;
3368 } 3356 }
3369 3357
3370 if (compiler->one_byte() && 3358 if (compiler->one_byte()) {
3371 !(compiler->unicode() && compiler->ignore_case())) {
3372 // If any character within the text node is outside the Latin1 range, it
3373 // cannot possibly match anything in a one-byte string. This still holds
3374 // for case-insensitive non-unicode regexp patterns. However, for
3375 // case-insensitive unicode regexp patterns, this is no longer true, e.g.
3376 // /\u212b/ui matches "\u00c5".
3377 int dummy = 0; 3359 int dummy = 0;
3378 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3360 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3379 } 3361 }
3380 3362
3381 bool first_elt_done = false; 3363 bool first_elt_done = false;
3382 int bound_checked_to = trace->cp_offset() - 1; 3364 int bound_checked_to = trace->cp_offset() - 1;
3383 bound_checked_to += trace->bound_checked_up_to(); 3365 bound_checked_to += trace->bound_checked_up_to();
3384 3366
3385 // If a character is preloaded into the current character register then 3367 // If a character is preloaded into the current character register then
3386 // check that now. 3368 // check that now.
(...skipping 731 matching lines...) Expand 10 before | Expand all | Expand 10 after
4118 4100
4119 // Really we should be creating a new trace when we execute this function, 4101 // Really we should be creating a new trace when we execute this function,
4120 // but there is no need, because the code it generates cannot backtrack, and 4102 // but there is no need, because the code it generates cannot backtrack, and
4121 // we always arrive here with a trivial trace (since it's the entry to a 4103 // we always arrive here with a trivial trace (since it's the entry to a
4122 // loop. That also implies that there are no preloaded characters, which is 4104 // loop. That also implies that there are no preloaded characters, which is
4123 // good, because it means we won't be violating any assumptions by 4105 // good, because it means we won't be violating any assumptions by
4124 // overwriting those characters with new load instructions. 4106 // overwriting those characters with new load instructions.
4125 DCHECK(trace->is_trivial()); 4107 DCHECK(trace->is_trivial());
4126 4108
4127 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4109 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4110 Isolate* isolate = macro_assembler->isolate();
4128 // At this point we know that we are at a non-greedy loop that will eat 4111 // At this point we know that we are at a non-greedy loop that will eat
4129 // any character one at a time. Any non-anchored regexp has such a 4112 // any character one at a time. Any non-anchored regexp has such a
4130 // loop prepended to it in order to find where it starts. We look for 4113 // loop prepended to it in order to find where it starts. We look for
4131 // a pattern of the form ...abc... where we can look 6 characters ahead 4114 // a pattern of the form ...abc... where we can look 6 characters ahead
4132 // and step forwards 3 if the character is not one of abc. Abc need 4115 // and step forwards 3 if the character is not one of abc. Abc need
4133 // not be atoms, they can be any reasonably limited character class or 4116 // not be atoms, they can be any reasonably limited character class or
4134 // small alternation. 4117 // small alternation.
4135 BoyerMooreLookahead* bm = bm_info(false); 4118 BoyerMooreLookahead* bm = bm_info(false);
4136 if (bm == NULL) { 4119 if (bm == NULL) {
4137 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4120 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4138 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4121 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4139 kRecursionBudget, 4122 kRecursionBudget,
4140 false)); 4123 false));
4141 if (eats_at_least >= 1) { 4124 if (eats_at_least >= 1) {
4142 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4125 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4143 compiler, 4126 compiler,
4144 zone()); 4127 zone());
4145 GuardedAlternative alt0 = alternatives_->at(0); 4128 GuardedAlternative alt0 = alternatives_->at(0);
4146 alt0.node()->FillInBMInfo(compiler, 0, kRecursionBudget, bm, false); 4129 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4147 } 4130 }
4148 } 4131 }
4149 if (bm != NULL) { 4132 if (bm != NULL) {
4150 bm->EmitSkipInstructions(macro_assembler); 4133 bm->EmitSkipInstructions(macro_assembler);
4151 } 4134 }
4152 return eats_at_least; 4135 return eats_at_least;
4153 } 4136 }
4154 4137
4155 4138
4156 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4139 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
(...skipping 2241 matching lines...) Expand 10 before | Expand all | Expand 10 after
6398 6381
6399 void Analysis::VisitBackReference(BackReferenceNode* that) { 6382 void Analysis::VisitBackReference(BackReferenceNode* that) {
6400 EnsureAnalyzed(that->on_success()); 6383 EnsureAnalyzed(that->on_success());
6401 } 6384 }
6402 6385
6403 6386
6404 void Analysis::VisitAssertion(AssertionNode* that) { 6387 void Analysis::VisitAssertion(AssertionNode* that) {
6405 EnsureAnalyzed(that->on_success()); 6388 EnsureAnalyzed(that->on_success());
6406 } 6389 }
6407 6390
6408 void BackReferenceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, 6391
6409 int budget, BoyerMooreLookahead* bm, 6392 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6393 BoyerMooreLookahead* bm,
6410 bool not_at_start) { 6394 bool not_at_start) {
6411 // Working out the set of characters that a backreference can match is too 6395 // Working out the set of characters that a backreference can match is too
6412 // hard, so we just say that any character can match. 6396 // hard, so we just say that any character can match.
6413 bm->SetRest(offset); 6397 bm->SetRest(offset);
6414 SaveBMInfo(bm, not_at_start, offset); 6398 SaveBMInfo(bm, not_at_start, offset);
6415 } 6399 }
6416 6400
6417 6401
6418 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 6402 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6419 RegExpMacroAssembler::kTableSize); 6403 RegExpMacroAssembler::kTableSize);
6420 6404
6421 void ChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget, 6405
6406 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6422 BoyerMooreLookahead* bm, bool not_at_start) { 6407 BoyerMooreLookahead* bm, bool not_at_start) {
6423 ZoneList<GuardedAlternative>* alts = alternatives(); 6408 ZoneList<GuardedAlternative>* alts = alternatives();
6424 budget = (budget - 1) / alts->length(); 6409 budget = (budget - 1) / alts->length();
6425 for (int i = 0; i < alts->length(); i++) { 6410 for (int i = 0; i < alts->length(); i++) {
6426 GuardedAlternative& alt = alts->at(i); 6411 GuardedAlternative& alt = alts->at(i);
6427 if (alt.guards() != NULL && alt.guards()->length() != 0) { 6412 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6428 bm->SetRest(offset); // Give up trying to fill in info. 6413 bm->SetRest(offset); // Give up trying to fill in info.
6429 SaveBMInfo(bm, not_at_start, offset); 6414 SaveBMInfo(bm, not_at_start, offset);
6430 return; 6415 return;
6431 } 6416 }
6432 alt.node()->FillInBMInfo(compiler, offset, budget, bm, not_at_start); 6417 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6433 } 6418 }
6434 SaveBMInfo(bm, not_at_start, offset); 6419 SaveBMInfo(bm, not_at_start, offset);
6435 } 6420 }
6436 6421
6437 void TextNode::FillInBMInfo(RegExpCompiler* compiler, int initial_offset, 6422
6438 int budget, BoyerMooreLookahead* bm, 6423 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6439 bool not_at_start) { 6424 BoyerMooreLookahead* bm, bool not_at_start) {
6440 if (initial_offset >= bm->length()) return; 6425 if (initial_offset >= bm->length()) return;
6441 int offset = initial_offset; 6426 int offset = initial_offset;
6442 int max_char = bm->max_char(); 6427 int max_char = bm->max_char();
6443 for (int i = 0; i < elements()->length(); i++) { 6428 for (int i = 0; i < elements()->length(); i++) {
6444 if (offset >= bm->length()) { 6429 if (offset >= bm->length()) {
6445 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6430 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6446 return; 6431 return;
6447 } 6432 }
6448 TextElement text = elements()->at(i); 6433 TextElement text = elements()->at(i);
6449 if (text.text_type() == TextElement::ATOM) { 6434 if (text.text_type() == TextElement::ATOM) {
6450 RegExpAtom* atom = text.atom(); 6435 RegExpAtom* atom = text.atom();
6451 for (int j = 0; j < atom->length(); j++, offset++) { 6436 for (int j = 0; j < atom->length(); j++, offset++) {
6452 if (offset >= bm->length()) { 6437 if (offset >= bm->length()) {
6453 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6438 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6454 return; 6439 return;
6455 } 6440 }
6456 uc16 character = atom->data()[j]; 6441 uc16 character = atom->data()[j];
6457 if (bm->compiler()->ignore_case()) { 6442 if (bm->compiler()->ignore_case()) {
6458 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 6443 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6459 int length = GetCaseIndependentLetters(compiler, character, chars); 6444 int length = GetCaseIndependentLetters(
6445 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6446 chars);
6460 for (int j = 0; j < length; j++) { 6447 for (int j = 0; j < length; j++) {
6461 bm->Set(offset, chars[j]); 6448 bm->Set(offset, chars[j]);
6462 } 6449 }
6463 } else { 6450 } else {
6464 if (character <= max_char) bm->Set(offset, character); 6451 if (character <= max_char) bm->Set(offset, character);
6465 } 6452 }
6466 } 6453 }
6467 } else { 6454 } else {
6468 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 6455 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6469 RegExpCharacterClass* char_class = text.char_class(); 6456 RegExpCharacterClass* char_class = text.char_class();
6470 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 6457 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6471 if (char_class->is_negated()) { 6458 if (char_class->is_negated()) {
6472 bm->SetAll(offset); 6459 bm->SetAll(offset);
6473 } else { 6460 } else {
6474 for (int k = 0; k < ranges->length(); k++) { 6461 for (int k = 0; k < ranges->length(); k++) {
6475 CharacterRange& range = ranges->at(k); 6462 CharacterRange& range = ranges->at(k);
6476 if (range.from() > max_char) continue; 6463 if (range.from() > max_char) continue;
6477 int to = Min(max_char, static_cast<int>(range.to())); 6464 int to = Min(max_char, static_cast<int>(range.to()));
6478 bm->SetInterval(offset, Interval(range.from(), to)); 6465 bm->SetInterval(offset, Interval(range.from(), to));
6479 } 6466 }
6480 } 6467 }
6481 offset++; 6468 offset++;
6482 } 6469 }
6483 } 6470 }
6484 if (offset >= bm->length()) { 6471 if (offset >= bm->length()) {
6485 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6472 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6486 return; 6473 return;
6487 } 6474 }
6488 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, 6475 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6489 true); // Not at start after a text node. 6476 true); // Not at start after a text node.
6490 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6477 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6491 } 6478 }
6492 6479
6493 6480
6494 // ------------------------------------------------------------------- 6481 // -------------------------------------------------------------------
6495 // Dispatch table construction 6482 // Dispatch table construction
6496 6483
6497 6484
6498 void DispatchTableConstructor::VisitEnd(EndNode* that) { 6485 void DispatchTableConstructor::VisitEnd(EndNode* that) {
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
6636 } 6623 }
6637 6624
6638 6625
6639 RegExpEngine::CompilationResult RegExpEngine::Compile( 6626 RegExpEngine::CompilationResult RegExpEngine::Compile(
6640 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6627 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6641 JSRegExp::Flags flags, Handle<String> pattern, 6628 JSRegExp::Flags flags, Handle<String> pattern,
6642 Handle<String> sample_subject, bool is_one_byte) { 6629 Handle<String> sample_subject, bool is_one_byte) {
6643 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6630 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6644 return IrregexpRegExpTooBig(isolate); 6631 return IrregexpRegExpTooBig(isolate);
6645 } 6632 }
6633 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6646 bool is_sticky = flags & JSRegExp::kSticky; 6634 bool is_sticky = flags & JSRegExp::kSticky;
6647 bool is_global = flags & JSRegExp::kGlobal; 6635 bool is_global = flags & JSRegExp::kGlobal;
6648 bool is_unicode = flags & JSRegExp::kUnicode; 6636 bool is_unicode = flags & JSRegExp::kUnicode;
6649 RegExpCompiler compiler(isolate, zone, data->capture_count, flags, 6637 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6650 is_one_byte); 6638 is_one_byte);
6651 6639
6652 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); 6640 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6653 6641
6654 // Sample some characters from the middle of the string. 6642 // Sample some characters from the middle of the string.
6655 static const int kSampleSize = 128; 6643 static const int kSampleSize = 128;
(...skipping 29 matching lines...) Expand all
6685 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6673 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6686 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6674 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6687 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( 6675 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6688 new (zone) RegExpCharacterClass('*'), false, loop_node))); 6676 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6689 node = first_step_node; 6677 node = first_step_node;
6690 } else { 6678 } else {
6691 node = loop_node; 6679 node = loop_node;
6692 } 6680 }
6693 } 6681 }
6694 if (is_one_byte) { 6682 if (is_one_byte) {
6695 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler); 6683 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6696 // Do it again to propagate the new nodes to places where they were not 6684 // Do it again to propagate the new nodes to places where they were not
6697 // put because they had not been calculated yet. 6685 // put because they had not been calculated yet.
6698 if (node != NULL) { 6686 if (node != NULL) {
6699 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler); 6687 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6700 } 6688 }
6701 } else if (compiler.unicode() && (is_global || is_sticky)) { 6689 } else if (compiler.unicode() && (is_global || is_sticky)) {
6702 node = OptionallyStepBackToLeadSurrogate(&compiler, node); 6690 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6703 } 6691 }
6704 6692
6705 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6693 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6706 data->node = node; 6694 data->node = node;
6707 Analysis analysis(isolate, flags, is_one_byte); 6695 Analysis analysis(isolate, flags, is_one_byte);
6708 analysis.EnsureAnalyzed(node); 6696 analysis.EnsureAnalyzed(node);
6709 if (analysis.has_failed()) { 6697 if (analysis.has_failed()) {
(...skipping 177 matching lines...) Expand 10 before | Expand all | Expand 10 after
6887 6875
6888 6876
6889 void RegExpResultsCache::Clear(FixedArray* cache) { 6877 void RegExpResultsCache::Clear(FixedArray* cache) {
6890 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6878 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6891 cache->set(i, Smi::FromInt(0)); 6879 cache->set(i, Smi::FromInt(0));
6892 } 6880 }
6893 } 6881 }
6894 6882
6895 } // namespace internal 6883 } // namespace internal
6896 } // namespace v8 6884 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698