Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(107)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1641613002: [regexp] implement /ui to mirror the implementation for /i. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include "src/ast/ast.h" 7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 1570 matching lines...) Expand 10 before | Expand all | Expand 10 after
1581 macro_assembler->IfRegisterLT(guard->reg(), 1581 macro_assembler->IfRegisterLT(guard->reg(),
1582 guard->value(), 1582 guard->value(),
1583 trace->backtrack()); 1583 trace->backtrack());
1584 break; 1584 break;
1585 } 1585 }
1586 } 1586 }
1587 1587
1588 1588
1589 // Returns the number of characters in the equivalence class, omitting those 1589 // Returns the number of characters in the equivalence class, omitting those
1590 // that cannot occur in the source string because it is Latin1. 1590 // that cannot occur in the source string because it is Latin1.
1591 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1591 static int GetCaseIndependentLetters(RegExpCompiler* compiler, uc16 character,
1592 bool one_byte_subject, 1592 bool one_byte_subject, uc32* letters) {
erikcorry 2016/02/01 15:21:40 The compiler knows whether we have a one-byte subj
Yang 2016/02/02 06:06:48 Done.
1593 unibrow::uchar* letters) { 1593 int length;
1594 int length = 1594 #ifdef V8_I18N_SUPPORT
1595 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1595 if (compiler->unicode()) {
1596 // Unibrow returns 0 or 1 for characters where case independence is 1596 USet* set = uset_open(character, character);
1597 // trivial. 1597 uset_closeOver(set, USET_CASE_INSENSITIVE);
1598 if (length == 0) { 1598 uset_removeAllStrings(set);
1599 letters[0] = character; 1599 length = uset_size(set);
1600 length = 1; 1600 for (int i = 0; i < length; i++) {
1601 letters[i] = uset_charAt(set, i);
1602 }
1603 uset_close(set);
1604 } else // NOLINT
1605 // Fallback in case ICU is not included.
1606 #endif // V8_I18N_SUPPORT
1607 {
1608 length = compiler->isolate()->jsregexp_uncanonicalize()->get(character,
1609 '\0', letters);
1610 // Unibrow returns 0 or 1 for characters where case independence is
1611 // trivial.
1612 if (length == 0) {
1613 letters[0] = character;
1614 length = 1;
1615 }
1601 } 1616 }
1602 1617
1603 if (one_byte_subject) { 1618 if (one_byte_subject) {
1604 int new_length = 0; 1619 int new_length = 0;
1605 for (int i = 0; i < length; i++) { 1620 for (int i = 0; i < length; i++) {
1606 if (letters[i] <= String::kMaxOneByteCharCode) { 1621 if (letters[i] <= String::kMaxOneByteCharCode) {
1607 letters[new_length++] = letters[i]; 1622 letters[new_length++] = letters[i];
1608 } 1623 }
1609 } 1624 }
1610 length = new_length; 1625 length = new_length;
1611 } 1626 }
1612 1627
1613 return length; 1628 return length;
1614 } 1629 }
1615 1630
1616 1631 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, uc16 c,
1617 static inline bool EmitSimpleCharacter(Isolate* isolate, 1632 Label* on_failure, int cp_offset,
1618 RegExpCompiler* compiler, 1633 bool check, bool preloaded) {
1619 uc16 c,
1620 Label* on_failure,
1621 int cp_offset,
1622 bool check,
1623 bool preloaded) {
1624 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1634 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1625 bool bound_checked = false; 1635 bool bound_checked = false;
1626 if (!preloaded) { 1636 if (!preloaded) {
1627 assembler->LoadCurrentCharacter( 1637 assembler->LoadCurrentCharacter(
1628 cp_offset, 1638 cp_offset,
1629 on_failure, 1639 on_failure,
1630 check); 1640 check);
1631 bound_checked = true; 1641 bound_checked = true;
1632 } 1642 }
1633 assembler->CheckNotCharacter(c, on_failure); 1643 assembler->CheckNotCharacter(c, on_failure);
1634 return bound_checked; 1644 return bound_checked;
1635 } 1645 }
1636 1646
1637 1647
1638 // Only emits non-letters (things that don't have case). Only used for case 1648 // Only emits non-letters (things that don't have case). Only used for case
1639 // independent matches. 1649 // independent matches.
1640 static inline bool EmitAtomNonLetter(Isolate* isolate, 1650 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, uc16 c,
1641 RegExpCompiler* compiler, 1651 Label* on_failure, int cp_offset,
1642 uc16 c, 1652 bool check, bool preloaded) {
1643 Label* on_failure,
1644 int cp_offset,
1645 bool check,
1646 bool preloaded) {
1647 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1653 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1648 bool one_byte = compiler->one_byte(); 1654 bool one_byte = compiler->one_byte();
1649 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1655 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1650 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1656 int length = GetCaseIndependentLetters(compiler, c, one_byte, chars);
1651 if (length < 1) { 1657 if (length < 1) {
1652 // This can't match. Must be an one-byte subject and a non-one-byte 1658 // This can't match. Must be an one-byte subject and a non-one-byte
1653 // character. We do not need to do anything since the one-byte pass 1659 // character. We do not need to do anything since the one-byte pass
1654 // already handled this. 1660 // already handled this.
1655 return false; // Bounds not checked. 1661 return false; // Bounds not checked.
1656 } 1662 }
1657 bool checked = false; 1663 bool checked = false;
1658 // We handle the length > 1 case in a later pass. 1664 // We handle the length > 1 case in a later pass.
1659 if (length == 1) { 1665 if (length == 1) {
1660 if (one_byte && c > String::kMaxOneByteCharCodeU) {
erikcorry 2016/02/01 15:21:40 I think this is still worth having. For these cha
Yang 2016/02/02 06:06:48 I'm confused here. Wouldn't the FilterOneByte pass
1661 // Can't match - see above.
1662 return false; // Bounds not checked.
1663 }
1664 if (!preloaded) { 1666 if (!preloaded) {
1665 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1667 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1666 checked = check; 1668 checked = check;
1667 } 1669 }
1668 macro_assembler->CheckNotCharacter(c, on_failure); 1670 macro_assembler->CheckNotCharacter(c, on_failure);
1669 } 1671 }
1670 return checked; 1672 return checked;
1671 } 1673 }
1672 1674
1673 1675
(...skipping 26 matching lines...) Expand all
1700 uc16 mask = char_mask ^ diff; 1702 uc16 mask = char_mask ^ diff;
1701 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1703 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1702 diff, 1704 diff,
1703 mask, 1705 mask,
1704 on_failure); 1706 on_failure);
1705 return true; 1707 return true;
1706 } 1708 }
1707 return false; 1709 return false;
1708 } 1710 }
1709 1711
1710 1712 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, uc16 c,
1711 typedef bool EmitCharacterFunction(Isolate* isolate, 1713 Label* on_failure, int cp_offset, bool check,
1712 RegExpCompiler* compiler,
1713 uc16 c,
1714 Label* on_failure,
1715 int cp_offset,
1716 bool check,
1717 bool preloaded); 1714 bool preloaded);
1718 1715
1719 // Only emits letters (things that have case). Only used for case independent 1716 // Only emits letters (things that have case). Only used for case independent
1720 // matches. 1717 // matches.
1721 static inline bool EmitAtomLetter(Isolate* isolate, 1718 static inline bool EmitAtomLetter(RegExpCompiler* compiler, uc16 c,
1722 RegExpCompiler* compiler, 1719 Label* on_failure, int cp_offset, bool check,
1723 uc16 c,
1724 Label* on_failure,
1725 int cp_offset,
1726 bool check,
1727 bool preloaded) { 1720 bool preloaded) {
1728 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1721 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1729 bool one_byte = compiler->one_byte(); 1722 bool one_byte = compiler->one_byte();
1730 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1723 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1731 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1724 int length = GetCaseIndependentLetters(compiler, c, one_byte, chars);
1732 if (length <= 1) return false; 1725 if (length <= 1) return false;
1733 // We may not need to check against the end of the input string 1726 // We may not need to check against the end of the input string
1734 // if this character lies before a character that matched. 1727 // if this character lies before a character that matched.
1735 if (!preloaded) { 1728 if (!preloaded) {
1736 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1729 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1737 } 1730 }
1738 Label ok; 1731 Label ok;
1739 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1732 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1740 switch (length) { 1733 switch (length) {
1741 case 2: { 1734 case 2: {
(...skipping 528 matching lines...) Expand 10 before | Expand all | Expand 10 after
2270 int ActionNode::EatsAtLeast(int still_to_find, 2263 int ActionNode::EatsAtLeast(int still_to_find,
2271 int budget, 2264 int budget,
2272 bool not_at_start) { 2265 bool not_at_start) {
2273 if (budget <= 0) return 0; 2266 if (budget <= 0) return 0;
2274 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2267 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2275 return on_success()->EatsAtLeast(still_to_find, 2268 return on_success()->EatsAtLeast(still_to_find,
2276 budget - 1, 2269 budget - 1,
2277 not_at_start); 2270 not_at_start);
2278 } 2271 }
2279 2272
2280 2273 void ActionNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
2281 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2282 BoyerMooreLookahead* bm, bool not_at_start) { 2274 BoyerMooreLookahead* bm, bool not_at_start) {
2283 if (action_type_ == BEGIN_SUBMATCH) { 2275 if (action_type_ == BEGIN_SUBMATCH) {
2284 bm->SetRest(offset); 2276 bm->SetRest(offset);
2285 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2277 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2286 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2278 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
2287 } 2279 }
2288 SaveBMInfo(bm, not_at_start, offset); 2280 SaveBMInfo(bm, not_at_start, offset);
2289 } 2281 }
2290 2282
2291 2283
2292 int AssertionNode::EatsAtLeast(int still_to_find, 2284 int AssertionNode::EatsAtLeast(int still_to_find,
2293 int budget, 2285 int budget,
2294 bool not_at_start) { 2286 bool not_at_start) {
2295 if (budget <= 0) return 0; 2287 if (budget <= 0) return 0;
2296 // If we know we are not at the start and we are asked "how many characters 2288 // If we know we are not at the start and we are asked "how many characters
2297 // will you match if you succeed?" then we can answer anything since false 2289 // will you match if you succeed?" then we can answer anything since false
2298 // implies false. So lets just return the max answer (still_to_find) since 2290 // implies false. So lets just return the max answer (still_to_find) since
2299 // that won't prevent us from preloading a lot of characters for the other 2291 // that won't prevent us from preloading a lot of characters for the other
2300 // branches in the node graph. 2292 // branches in the node graph.
2301 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2293 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2302 return on_success()->EatsAtLeast(still_to_find, 2294 return on_success()->EatsAtLeast(still_to_find,
2303 budget - 1, 2295 budget - 1,
2304 not_at_start); 2296 not_at_start);
2305 } 2297 }
2306 2298
2307 2299 void AssertionNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
2308 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2300 int budget, BoyerMooreLookahead* bm,
2309 BoyerMooreLookahead* bm, bool not_at_start) { 2301 bool not_at_start) {
2310 // Match the behaviour of EatsAtLeast on this node. 2302 // Match the behaviour of EatsAtLeast on this node.
2311 if (assertion_type() == AT_START && not_at_start) return; 2303 if (assertion_type() == AT_START && not_at_start) return;
2312 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2304 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
2313 SaveBMInfo(bm, not_at_start, offset); 2305 SaveBMInfo(bm, not_at_start, offset);
2314 } 2306 }
2315 2307
2316 2308
2317 int BackReferenceNode::EatsAtLeast(int still_to_find, 2309 int BackReferenceNode::EatsAtLeast(int still_to_find,
2318 int budget, 2310 int budget,
2319 bool not_at_start) { 2311 bool not_at_start) {
2320 if (read_backward()) return 0; 2312 if (read_backward()) return 0;
2321 if (budget <= 0) return 0; 2313 if (budget <= 0) return 0;
2322 return on_success()->EatsAtLeast(still_to_find, 2314 return on_success()->EatsAtLeast(still_to_find,
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after
2516 // The masks and values for the positions will be combined into a single 2508 // The masks and values for the positions will be combined into a single
2517 // machine word for the current character width in order to be used in 2509 // machine word for the current character width in order to be used in
2518 // generating a quick check. 2510 // generating a quick check.
2519 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2511 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2520 RegExpCompiler* compiler, 2512 RegExpCompiler* compiler,
2521 int characters_filled_in, 2513 int characters_filled_in,
2522 bool not_at_start) { 2514 bool not_at_start) {
2523 // Do not collect any quick check details if the text node reads backward, 2515 // Do not collect any quick check details if the text node reads backward,
2524 // since it reads in the opposite direction than we use for quick checks. 2516 // since it reads in the opposite direction than we use for quick checks.
2525 if (read_backward()) return; 2517 if (read_backward()) return;
2526 Isolate* isolate = compiler->macro_assembler()->isolate();
2527 DCHECK(characters_filled_in < details->characters()); 2518 DCHECK(characters_filled_in < details->characters());
2528 int characters = details->characters(); 2519 int characters = details->characters();
2529 int char_mask; 2520 int char_mask;
2530 if (compiler->one_byte()) { 2521 if (compiler->one_byte()) {
2531 char_mask = String::kMaxOneByteCharCode; 2522 char_mask = String::kMaxOneByteCharCode;
2532 } else { 2523 } else {
2533 char_mask = String::kMaxUtf16CodeUnit; 2524 char_mask = String::kMaxUtf16CodeUnit;
2534 } 2525 }
2535 for (int k = 0; k < elements()->length(); k++) { 2526 for (int k = 0; k < elements()->length(); k++) {
2536 TextElement elm = elements()->at(k); 2527 TextElement elm = elements()->at(k);
2537 if (elm.text_type() == TextElement::ATOM) { 2528 if (elm.text_type() == TextElement::ATOM) {
2538 Vector<const uc16> quarks = elm.atom()->data(); 2529 Vector<const uc16> quarks = elm.atom()->data();
2539 for (int i = 0; i < characters && i < quarks.length(); i++) { 2530 for (int i = 0; i < characters && i < quarks.length(); i++) {
2540 QuickCheckDetails::Position* pos = 2531 QuickCheckDetails::Position* pos =
2541 details->positions(characters_filled_in); 2532 details->positions(characters_filled_in);
2542 uc16 c = quarks[i]; 2533 uc16 c = quarks[i];
2543 if (compiler->ignore_case()) { 2534 if (compiler->ignore_case()) {
2544 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2535 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2545 int length = GetCaseIndependentLetters(isolate, c, 2536 int length = GetCaseIndependentLetters(compiler, c,
2546 compiler->one_byte(), chars); 2537 compiler->one_byte(), chars);
2547 if (length == 0) { 2538 if (length == 0) {
2548 // This can happen because all case variants are non-Latin1, but we 2539 // This can happen because all case variants are non-Latin1, but we
2549 // know the input is Latin1. 2540 // know the input is Latin1.
2550 details->set_cannot_match(); 2541 details->set_cannot_match();
2551 pos->determines_perfectly = false; 2542 pos->determines_perfectly = false;
2552 return; 2543 return;
2553 } 2544 }
2554 if (length == 1) { 2545 if (length == 1) {
2555 // This letter has no case equivalents, so it's nice and simple 2546 // This letter has no case equivalents, so it's nice and simple
(...skipping 185 matching lines...) Expand 10 before | Expand all | Expand 10 after
2741 DCHECK(!info->visited); 2732 DCHECK(!info->visited);
2742 info->visited = true; 2733 info->visited = true;
2743 } 2734 }
2744 ~VisitMarker() { 2735 ~VisitMarker() {
2745 info_->visited = false; 2736 info_->visited = false;
2746 } 2737 }
2747 private: 2738 private:
2748 NodeInfo* info_; 2739 NodeInfo* info_;
2749 }; 2740 };
2750 2741
2751 2742 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
2752 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2753 if (info()->replacement_calculated) return replacement(); 2743 if (info()->replacement_calculated) return replacement();
2754 if (depth < 0) return this; 2744 if (depth < 0) return this;
2755 DCHECK(!info()->visited); 2745 DCHECK(!info()->visited);
2756 VisitMarker marker(info()); 2746 VisitMarker marker(info());
2757 return FilterSuccessor(depth - 1, ignore_case); 2747 return FilterSuccessor(depth - 1, compiler);
2758 } 2748 }
2759 2749
2760 2750 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth,
2761 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2751 RegExpCompiler* compiler) {
2762 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); 2752 RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler);
2763 if (next == NULL) return set_replacement(NULL); 2753 if (next == NULL) return set_replacement(NULL);
2764 on_success_ = next; 2754 on_success_ = next;
2765 return set_replacement(this); 2755 return set_replacement(this);
2766 } 2756 }
2767 2757
2768 2758
2769 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2759 // We need to check for the following characters: 0x39c 0x3bc 0x178.
2770 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2760 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2771 // TODO(dcarney): this could be a lot more efficient. 2761 // TODO(dcarney): this could be a lot more efficient.
2772 return range.Contains(0x39c) || 2762 return range.Contains(0x39c) ||
2773 range.Contains(0x3bc) || range.Contains(0x178); 2763 range.Contains(0x3bc) || range.Contains(0x178);
2774 } 2764 }
2775 2765
2776 2766
2777 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2767 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2778 for (int i = 0; i < ranges->length(); i++) { 2768 for (int i = 0; i < ranges->length(); i++) {
2779 // TODO(dcarney): this could be a lot more efficient. 2769 // TODO(dcarney): this could be a lot more efficient.
2780 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2770 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2781 } 2771 }
2782 return false; 2772 return false;
2783 } 2773 }
2784 2774
2775 static uc16 ConvertNonLatin1ToEquivalentLatin1(bool unicode, uc16 c) {
2776 #ifdef V8_I18N_SUPPORT
2777 if (unicode) {
2778 USet* set = uset_open(c, c);
2779 uset_closeOver(set, USET_CASE_INSENSITIVE);
2780 uset_removeAllStrings(set);
2781 int length = uset_size(set);
2782 uc16 result = 0;
2783 for (int i = 0; i < length; i++) {
2784 uc32 c = uset_charAt(set, i);
2785 if (c <= String::kMaxOneByteCharCode) {
2786 result = static_cast<uc16>(c);
2787 break;
2788 }
2789 }
2790 uset_close(set);
2791 return result;
2792 }
2793 // Fallback to unibrow if ICU is not included.
2794 #endif // V8_I18N_SUPPORT
2795 return unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2796 }
2785 2797
2786 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { 2798 RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
2787 if (info()->replacement_calculated) return replacement(); 2799 if (info()->replacement_calculated) return replacement();
2788 if (depth < 0) return this; 2800 if (depth < 0) return this;
2789 DCHECK(!info()->visited); 2801 DCHECK(!info()->visited);
2790 VisitMarker marker(info()); 2802 VisitMarker marker(info());
2791 int element_count = elements()->length(); 2803 int element_count = elements()->length();
2792 for (int i = 0; i < element_count; i++) { 2804 for (int i = 0; i < element_count; i++) {
2793 TextElement elm = elements()->at(i); 2805 TextElement elm = elements()->at(i);
2794 if (elm.text_type() == TextElement::ATOM) { 2806 if (elm.text_type() == TextElement::ATOM) {
2795 Vector<const uc16> quarks = elm.atom()->data(); 2807 Vector<const uc16> quarks = elm.atom()->data();
2796 for (int j = 0; j < quarks.length(); j++) { 2808 for (int j = 0; j < quarks.length(); j++) {
2797 uint16_t c = quarks[j]; 2809 uc16 c = quarks[j];
2798 if (c <= String::kMaxOneByteCharCode) continue; 2810 if (c <= String::kMaxOneByteCharCode) continue;
2799 if (!ignore_case) return set_replacement(NULL); 2811 if (!compiler->ignore_case()) return set_replacement(NULL);
2800 // Here, we need to check for characters whose upper and lower cases 2812 // Here, we need to check for characters whose upper and lower cases
2801 // are outside the Latin-1 range. 2813 // are outside the Latin-1 range.
2802 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2814 uc16 converted =
2815 ConvertNonLatin1ToEquivalentLatin1(compiler->unicode(), c);
2803 // Character is outside Latin-1 completely 2816 // Character is outside Latin-1 completely
2804 if (converted == 0) return set_replacement(NULL); 2817 if (converted == 0) return set_replacement(NULL);
2805 // Convert quark to Latin-1 in place. 2818 // Convert quark to Latin-1 in place.
2806 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2819 uc16* copy = const_cast<uc16*>(quarks.start());
2807 copy[j] = converted; 2820 copy[j] = converted;
2808 } 2821 }
2809 } else { 2822 } else {
2810 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2823 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2811 RegExpCharacterClass* cc = elm.char_class(); 2824 RegExpCharacterClass* cc = elm.char_class();
2812 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2825 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2813 CharacterRange::Canonicalize(ranges); 2826 CharacterRange::Canonicalize(ranges);
2814 // Now they are in order so we only need to look at the first. 2827 // Now they are in order so we only need to look at the first.
2815 int range_count = ranges->length(); 2828 int range_count = ranges->length();
2816 if (cc->is_negated()) { 2829 if (cc->is_negated()) {
2817 if (range_count != 0 && 2830 if (range_count != 0 &&
2818 ranges->at(0).from() == 0 && 2831 ranges->at(0).from() == 0 &&
2819 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2832 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2820 // This will be handled in a later filter. 2833 // This will be handled in a later filter.
2821 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2834 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))
2835 continue;
2822 return set_replacement(NULL); 2836 return set_replacement(NULL);
2823 } 2837 }
2824 } else { 2838 } else {
2825 if (range_count == 0 || 2839 if (range_count == 0 ||
2826 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2840 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2827 // This will be handled in a later filter. 2841 // This will be handled in a later filter.
2828 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2842 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))
2843 continue;
2829 return set_replacement(NULL); 2844 return set_replacement(NULL);
2830 } 2845 }
2831 } 2846 }
2832 } 2847 }
2833 } 2848 }
2834 return FilterSuccessor(depth - 1, ignore_case); 2849 return FilterSuccessor(depth - 1, compiler);
2835 } 2850 }
2836 2851
2837 2852 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
2838 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2839 if (info()->replacement_calculated) return replacement(); 2853 if (info()->replacement_calculated) return replacement();
2840 if (depth < 0) return this; 2854 if (depth < 0) return this;
2841 if (info()->visited) return this; 2855 if (info()->visited) return this;
2842 { 2856 {
2843 VisitMarker marker(info()); 2857 VisitMarker marker(info());
2844 2858
2845 RegExpNode* continue_replacement = 2859 RegExpNode* continue_replacement =
2846 continue_node_->FilterOneByte(depth - 1, ignore_case); 2860 continue_node_->FilterOneByte(depth - 1, compiler);
2847 // If we can't continue after the loop then there is no sense in doing the 2861 // If we can't continue after the loop then there is no sense in doing the
2848 // loop. 2862 // loop.
2849 if (continue_replacement == NULL) return set_replacement(NULL); 2863 if (continue_replacement == NULL) return set_replacement(NULL);
2850 } 2864 }
2851 2865
2852 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); 2866 return ChoiceNode::FilterOneByte(depth - 1, compiler);
2853 } 2867 }
2854 2868
2855 2869 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
2856 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2857 if (info()->replacement_calculated) return replacement(); 2870 if (info()->replacement_calculated) return replacement();
2858 if (depth < 0) return this; 2871 if (depth < 0) return this;
2859 if (info()->visited) return this; 2872 if (info()->visited) return this;
2860 VisitMarker marker(info()); 2873 VisitMarker marker(info());
2861 int choice_count = alternatives_->length(); 2874 int choice_count = alternatives_->length();
2862 2875
2863 for (int i = 0; i < choice_count; i++) { 2876 for (int i = 0; i < choice_count; i++) {
2864 GuardedAlternative alternative = alternatives_->at(i); 2877 GuardedAlternative alternative = alternatives_->at(i);
2865 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2878 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2866 set_replacement(this); 2879 set_replacement(this);
2867 return this; 2880 return this;
2868 } 2881 }
2869 } 2882 }
2870 2883
2871 int surviving = 0; 2884 int surviving = 0;
2872 RegExpNode* survivor = NULL; 2885 RegExpNode* survivor = NULL;
2873 for (int i = 0; i < choice_count; i++) { 2886 for (int i = 0; i < choice_count; i++) {
2874 GuardedAlternative alternative = alternatives_->at(i); 2887 GuardedAlternative alternative = alternatives_->at(i);
2875 RegExpNode* replacement = 2888 RegExpNode* replacement =
2876 alternative.node()->FilterOneByte(depth - 1, ignore_case); 2889 alternative.node()->FilterOneByte(depth - 1, compiler);
2877 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2890 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2878 if (replacement != NULL) { 2891 if (replacement != NULL) {
2879 alternatives_->at(i).set_node(replacement); 2892 alternatives_->at(i).set_node(replacement);
2880 surviving++; 2893 surviving++;
2881 survivor = replacement; 2894 survivor = replacement;
2882 } 2895 }
2883 } 2896 }
2884 if (surviving < 2) return set_replacement(survivor); 2897 if (surviving < 2) return set_replacement(survivor);
2885 2898
2886 set_replacement(this); 2899 set_replacement(this);
2887 if (surviving == choice_count) { 2900 if (surviving == choice_count) {
2888 return this; 2901 return this;
2889 } 2902 }
2890 // Only some of the nodes survived the filtering. We need to rebuild the 2903 // Only some of the nodes survived the filtering. We need to rebuild the
2891 // alternatives list. 2904 // alternatives list.
2892 ZoneList<GuardedAlternative>* new_alternatives = 2905 ZoneList<GuardedAlternative>* new_alternatives =
2893 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2906 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2894 for (int i = 0; i < choice_count; i++) { 2907 for (int i = 0; i < choice_count; i++) {
2895 RegExpNode* replacement = 2908 RegExpNode* replacement =
2896 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); 2909 alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler);
2897 if (replacement != NULL) { 2910 if (replacement != NULL) {
2898 alternatives_->at(i).set_node(replacement); 2911 alternatives_->at(i).set_node(replacement);
2899 new_alternatives->Add(alternatives_->at(i), zone()); 2912 new_alternatives->Add(alternatives_->at(i), zone());
2900 } 2913 }
2901 } 2914 }
2902 alternatives_ = new_alternatives; 2915 alternatives_ = new_alternatives;
2903 return this; 2916 return this;
2904 } 2917 }
2905 2918
2906 2919 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(
2907 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, 2920 int depth, RegExpCompiler* compiler) {
2908 bool ignore_case) {
2909 if (info()->replacement_calculated) return replacement(); 2921 if (info()->replacement_calculated) return replacement();
2910 if (depth < 0) return this; 2922 if (depth < 0) return this;
2911 if (info()->visited) return this; 2923 if (info()->visited) return this;
2912 VisitMarker marker(info()); 2924 VisitMarker marker(info());
2913 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2925 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2914 // afterwards. 2926 // afterwards.
2915 RegExpNode* node = alternatives_->at(1).node(); 2927 RegExpNode* node = alternatives_->at(1).node();
2916 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); 2928 RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler);
2917 if (replacement == NULL) return set_replacement(NULL); 2929 if (replacement == NULL) return set_replacement(NULL);
2918 alternatives_->at(1).set_node(replacement); 2930 alternatives_->at(1).set_node(replacement);
2919 2931
2920 RegExpNode* neg_node = alternatives_->at(0).node(); 2932 RegExpNode* neg_node = alternatives_->at(0).node();
2921 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); 2933 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler);
2922 // If the negative lookahead is always going to fail then 2934 // If the negative lookahead is always going to fail then
2923 // we don't need to check it. 2935 // we don't need to check it.
2924 if (neg_replacement == NULL) return set_replacement(replacement); 2936 if (neg_replacement == NULL) return set_replacement(replacement);
2925 alternatives_->at(0).set_node(neg_replacement); 2937 alternatives_->at(0).set_node(neg_replacement);
2926 return set_replacement(this); 2938 return set_replacement(this);
2927 } 2939 }
2928 2940
2929 2941
2930 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2942 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2931 RegExpCompiler* compiler, 2943 RegExpCompiler* compiler,
2932 int characters_filled_in, 2944 int characters_filled_in,
2933 bool not_at_start) { 2945 bool not_at_start) {
2934 if (body_can_be_zero_length_ || info()->visited) return; 2946 if (body_can_be_zero_length_ || info()->visited) return;
2935 VisitMarker marker(info()); 2947 VisitMarker marker(info());
2936 return ChoiceNode::GetQuickCheckDetails(details, 2948 return ChoiceNode::GetQuickCheckDetails(details,
2937 compiler, 2949 compiler,
2938 characters_filled_in, 2950 characters_filled_in,
2939 not_at_start); 2951 not_at_start);
2940 } 2952 }
2941 2953
2942 2954 void LoopChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
2943 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2955 int budget, BoyerMooreLookahead* bm,
2944 BoyerMooreLookahead* bm, bool not_at_start) { 2956 bool not_at_start) {
2945 if (body_can_be_zero_length_ || budget <= 0) { 2957 if (body_can_be_zero_length_ || budget <= 0) {
2946 bm->SetRest(offset); 2958 bm->SetRest(offset);
2947 SaveBMInfo(bm, not_at_start, offset); 2959 SaveBMInfo(bm, not_at_start, offset);
2948 return; 2960 return;
2949 } 2961 }
2950 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2962 ChoiceNode::FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
2951 SaveBMInfo(bm, not_at_start, offset); 2963 SaveBMInfo(bm, not_at_start, offset);
2952 } 2964 }
2953 2965
2954 2966
2955 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2967 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2956 RegExpCompiler* compiler, 2968 RegExpCompiler* compiler,
2957 int characters_filled_in, 2969 int characters_filled_in,
2958 bool not_at_start) { 2970 bool not_at_start) {
2959 not_at_start = (not_at_start || not_at_start_); 2971 not_at_start = (not_at_start || not_at_start_);
2960 int choice_count = alternatives_->length(); 2972 int choice_count = alternatives_->length();
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
3032 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3044 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3033 } 3045 }
3034 assembler->Bind(&ok); 3046 assembler->Bind(&ok);
3035 on_success->Emit(compiler, &new_trace); 3047 on_success->Emit(compiler, &new_trace);
3036 } 3048 }
3037 3049
3038 3050
3039 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3051 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3040 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3052 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3041 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3053 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3042 Isolate* isolate = assembler->isolate();
3043 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3054 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3044 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3055 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3045 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3056 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3046 if (lookahead == NULL) { 3057 if (lookahead == NULL) {
3047 int eats_at_least = 3058 int eats_at_least =
3048 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3059 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3049 kRecursionBudget, 3060 kRecursionBudget,
3050 not_at_start)); 3061 not_at_start));
3051 if (eats_at_least >= 1) { 3062 if (eats_at_least >= 1) {
3052 BoyerMooreLookahead* bm = 3063 BoyerMooreLookahead* bm =
3053 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3064 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3054 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 3065 FillInBMInfo(compiler, 0, kRecursionBudget, bm, not_at_start);
3055 if (bm->at(0)->is_non_word()) 3066 if (bm->at(0)->is_non_word())
3056 next_is_word_character = Trace::FALSE_VALUE; 3067 next_is_word_character = Trace::FALSE_VALUE;
3057 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3068 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3058 } 3069 }
3059 } else { 3070 } else {
3060 if (lookahead->at(0)->is_non_word()) 3071 if (lookahead->at(0)->is_non_word())
3061 next_is_word_character = Trace::FALSE_VALUE; 3072 next_is_word_character = Trace::FALSE_VALUE;
3062 if (lookahead->at(0)->is_word()) 3073 if (lookahead->at(0)->is_word())
3063 next_is_word_character = Trace::TRUE_VALUE; 3074 next_is_word_character = Trace::TRUE_VALUE;
3064 } 3075 }
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
3216 // up to the limit the quick check already checked. In addition the quick 3227 // up to the limit the quick check already checked. In addition the quick
3217 // check can have involved a mask and compare operation which may simplify 3228 // check can have involved a mask and compare operation which may simplify
3218 // or obviate the need for further checks at some character positions. 3229 // or obviate the need for further checks at some character positions.
3219 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3230 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3220 TextEmitPassType pass, 3231 TextEmitPassType pass,
3221 bool preloaded, 3232 bool preloaded,
3222 Trace* trace, 3233 Trace* trace,
3223 bool first_element_checked, 3234 bool first_element_checked,
3224 int* checked_up_to) { 3235 int* checked_up_to) {
3225 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3236 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3226 Isolate* isolate = assembler->isolate();
3227 bool one_byte = compiler->one_byte(); 3237 bool one_byte = compiler->one_byte();
3228 Label* backtrack = trace->backtrack(); 3238 Label* backtrack = trace->backtrack();
3229 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3239 QuickCheckDetails* quick_check = trace->quick_check_performed();
3230 int element_count = elements()->length(); 3240 int element_count = elements()->length();
3231 int backward_offset = read_backward() ? -Length() : 0; 3241 int backward_offset = read_backward() ? -Length() : 0;
3232 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3242 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3233 TextElement elm = elements()->at(i); 3243 TextElement elm = elements()->at(i);
3234 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 3244 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3235 if (elm.text_type() == TextElement::ATOM) { 3245 if (elm.text_type() == TextElement::ATOM) {
3236 Vector<const uc16> quarks = elm.atom()->data(); 3246 Vector<const uc16> quarks = elm.atom()->data();
3237 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3247 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3238 if (first_element_checked && i == 0 && j == 0) continue; 3248 if (first_element_checked && i == 0 && j == 0) continue;
3239 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3249 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3240 EmitCharacterFunction* emit_function = NULL; 3250 EmitCharacterFunction* emit_function = NULL;
3241 switch (pass) { 3251 switch (pass) {
3242 case NON_LATIN1_MATCH: 3252 case NON_LATIN1_MATCH:
3243 DCHECK(one_byte); 3253 DCHECK(one_byte);
3244 if (quarks[j] > String::kMaxOneByteCharCode) { 3254 if (quarks[j] > String::kMaxOneByteCharCode &&
3255 !(compiler->unicode() && compiler->ignore_case())) {
3256 // In the non-unicode case, if the pattern is non-Latin1, it
erikcorry 2016/02/01 15:21:40 I think you mean in the one-byte case (non-unicode
Yang 2016/02/02 06:06:48 I did mean this. With /i, if a character is outsid
3257 // cannot possibly match a Latin1 character, unless we have the
3258 // /ui flags, e.g. /\u212b/ui matches "\u00c5".
3245 assembler->GoTo(backtrack); 3259 assembler->GoTo(backtrack);
3246 return; 3260 return;
3247 } 3261 }
3248 break; 3262 break;
3249 case NON_LETTER_CHARACTER_MATCH: 3263 case NON_LETTER_CHARACTER_MATCH:
3250 emit_function = &EmitAtomNonLetter; 3264 emit_function = &EmitAtomNonLetter;
3251 break; 3265 break;
3252 case SIMPLE_CHARACTER_MATCH: 3266 case SIMPLE_CHARACTER_MATCH:
3253 emit_function = &EmitSimpleCharacter; 3267 emit_function = &EmitSimpleCharacter;
3254 break; 3268 break;
3255 case CASE_CHARACTER_MATCH: 3269 case CASE_CHARACTER_MATCH:
3256 emit_function = &EmitAtomLetter; 3270 emit_function = &EmitAtomLetter;
3257 break; 3271 break;
3258 default: 3272 default:
3259 break; 3273 break;
3260 } 3274 }
3261 if (emit_function != NULL) { 3275 if (emit_function != NULL) {
3262 bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); 3276 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3263 bool bound_checked = 3277 bool bound_checked =
3264 emit_function(isolate, compiler, quarks[j], backtrack, 3278 emit_function(compiler, quarks[j], backtrack, cp_offset + j,
3265 cp_offset + j, bounds_check, preloaded); 3279 bounds_check, preloaded);
3266 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3280 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3267 } 3281 }
3268 } 3282 }
3269 } else { 3283 } else {
3270 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3284 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3271 if (pass == CHARACTER_CLASS_MATCH) { 3285 if (pass == CHARACTER_CLASS_MATCH) {
3272 if (first_element_checked && i == 0) continue; 3286 if (first_element_checked && i == 0) continue;
3273 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3287 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3274 RegExpCharacterClass* cc = elm.char_class(); 3288 RegExpCharacterClass* cc = elm.char_class();
3275 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 3289 bool bounds_check = *checked_up_to < cp_offset || read_backward();
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
3338 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3352 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3339 LimitResult limit_result = LimitVersions(compiler, trace); 3353 LimitResult limit_result = LimitVersions(compiler, trace);
3340 if (limit_result == DONE) return; 3354 if (limit_result == DONE) return;
3341 DCHECK(limit_result == CONTINUE); 3355 DCHECK(limit_result == CONTINUE);
3342 3356
3343 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3357 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3344 compiler->SetRegExpTooBig(); 3358 compiler->SetRegExpTooBig();
3345 return; 3359 return;
3346 } 3360 }
3347 3361
3348 if (compiler->one_byte()) { 3362 if (compiler->one_byte()) {
erikcorry 2016/02/01 15:21:40 You should check for the /ui flags here instead of
Yang 2016/02/02 06:06:48 Done.
3349 int dummy = 0; 3363 int dummy = 0;
3350 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3364 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3351 } 3365 }
3352 3366
3353 bool first_elt_done = false; 3367 bool first_elt_done = false;
3354 int bound_checked_to = trace->cp_offset() - 1; 3368 int bound_checked_to = trace->cp_offset() - 1;
3355 bound_checked_to += trace->bound_checked_up_to(); 3369 bound_checked_to += trace->bound_checked_up_to();
3356 3370
3357 // If a character is preloaded into the current character register then 3371 // If a character is preloaded into the current character register then
3358 // check that now. 3372 // check that now.
(...skipping 730 matching lines...) Expand 10 before | Expand all | Expand 10 after
4089 4103
4090 // Really we should be creating a new trace when we execute this function, 4104 // Really we should be creating a new trace when we execute this function,
4091 // but there is no need, because the code it generates cannot backtrack, and 4105 // but there is no need, because the code it generates cannot backtrack, and
4092 // we always arrive here with a trivial trace (since it's the entry to a 4106 // we always arrive here with a trivial trace (since it's the entry to a
4093 // loop. That also implies that there are no preloaded characters, which is 4107 // loop. That also implies that there are no preloaded characters, which is
4094 // good, because it means we won't be violating any assumptions by 4108 // good, because it means we won't be violating any assumptions by
4095 // overwriting those characters with new load instructions. 4109 // overwriting those characters with new load instructions.
4096 DCHECK(trace->is_trivial()); 4110 DCHECK(trace->is_trivial());
4097 4111
4098 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4112 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4099 Isolate* isolate = macro_assembler->isolate();
4100 // At this point we know that we are at a non-greedy loop that will eat 4113 // At this point we know that we are at a non-greedy loop that will eat
4101 // any character one at a time. Any non-anchored regexp has such a 4114 // any character one at a time. Any non-anchored regexp has such a
4102 // loop prepended to it in order to find where it starts. We look for 4115 // loop prepended to it in order to find where it starts. We look for
4103 // a pattern of the form ...abc... where we can look 6 characters ahead 4116 // a pattern of the form ...abc... where we can look 6 characters ahead
4104 // and step forwards 3 if the character is not one of abc. Abc need 4117 // and step forwards 3 if the character is not one of abc. Abc need
4105 // not be atoms, they can be any reasonably limited character class or 4118 // not be atoms, they can be any reasonably limited character class or
4106 // small alternation. 4119 // small alternation.
4107 BoyerMooreLookahead* bm = bm_info(false); 4120 BoyerMooreLookahead* bm = bm_info(false);
4108 if (bm == NULL) { 4121 if (bm == NULL) {
4109 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4122 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4110 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4123 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4111 kRecursionBudget, 4124 kRecursionBudget,
4112 false)); 4125 false));
4113 if (eats_at_least >= 1) { 4126 if (eats_at_least >= 1) {
4114 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4127 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4115 compiler, 4128 compiler,
4116 zone()); 4129 zone());
4117 GuardedAlternative alt0 = alternatives_->at(0); 4130 GuardedAlternative alt0 = alternatives_->at(0);
4118 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 4131 alt0.node()->FillInBMInfo(compiler, 0, kRecursionBudget, bm, false);
4119 } 4132 }
4120 } 4133 }
4121 if (bm != NULL) { 4134 if (bm != NULL) {
4122 bm->EmitSkipInstructions(macro_assembler); 4135 bm->EmitSkipInstructions(macro_assembler);
4123 } 4136 }
4124 return eats_at_least; 4137 return eats_at_least;
4125 } 4138 }
4126 4139
4127 4140
4128 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4141 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
(...skipping 2236 matching lines...) Expand 10 before | Expand all | Expand 10 after
6365 6378
6366 void Analysis::VisitBackReference(BackReferenceNode* that) { 6379 void Analysis::VisitBackReference(BackReferenceNode* that) {
6367 EnsureAnalyzed(that->on_success()); 6380 EnsureAnalyzed(that->on_success());
6368 } 6381 }
6369 6382
6370 6383
6371 void Analysis::VisitAssertion(AssertionNode* that) { 6384 void Analysis::VisitAssertion(AssertionNode* that) {
6372 EnsureAnalyzed(that->on_success()); 6385 EnsureAnalyzed(that->on_success());
6373 } 6386 }
6374 6387
6375 6388 void BackReferenceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
6376 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6389 int budget, BoyerMooreLookahead* bm,
6377 BoyerMooreLookahead* bm,
6378 bool not_at_start) { 6390 bool not_at_start) {
6379 // Working out the set of characters that a backreference can match is too 6391 // Working out the set of characters that a backreference can match is too
6380 // hard, so we just say that any character can match. 6392 // hard, so we just say that any character can match.
6381 bm->SetRest(offset); 6393 bm->SetRest(offset);
6382 SaveBMInfo(bm, not_at_start, offset); 6394 SaveBMInfo(bm, not_at_start, offset);
6383 } 6395 }
6384 6396
6385 6397
6386 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 6398 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6387 RegExpMacroAssembler::kTableSize); 6399 RegExpMacroAssembler::kTableSize);
6388 6400
6389 6401 void ChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
6390 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6391 BoyerMooreLookahead* bm, bool not_at_start) { 6402 BoyerMooreLookahead* bm, bool not_at_start) {
6392 ZoneList<GuardedAlternative>* alts = alternatives(); 6403 ZoneList<GuardedAlternative>* alts = alternatives();
6393 budget = (budget - 1) / alts->length(); 6404 budget = (budget - 1) / alts->length();
6394 for (int i = 0; i < alts->length(); i++) { 6405 for (int i = 0; i < alts->length(); i++) {
6395 GuardedAlternative& alt = alts->at(i); 6406 GuardedAlternative& alt = alts->at(i);
6396 if (alt.guards() != NULL && alt.guards()->length() != 0) { 6407 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6397 bm->SetRest(offset); // Give up trying to fill in info. 6408 bm->SetRest(offset); // Give up trying to fill in info.
6398 SaveBMInfo(bm, not_at_start, offset); 6409 SaveBMInfo(bm, not_at_start, offset);
6399 return; 6410 return;
6400 } 6411 }
6401 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 6412 alt.node()->FillInBMInfo(compiler, offset, budget, bm, not_at_start);
6402 } 6413 }
6403 SaveBMInfo(bm, not_at_start, offset); 6414 SaveBMInfo(bm, not_at_start, offset);
6404 } 6415 }
6405 6416
6406 6417 void TextNode::FillInBMInfo(RegExpCompiler* compiler, int initial_offset,
6407 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 6418 int budget, BoyerMooreLookahead* bm,
6408 BoyerMooreLookahead* bm, bool not_at_start) { 6419 bool not_at_start) {
6409 if (initial_offset >= bm->length()) return; 6420 if (initial_offset >= bm->length()) return;
6410 int offset = initial_offset; 6421 int offset = initial_offset;
6411 int max_char = bm->max_char(); 6422 int max_char = bm->max_char();
6412 for (int i = 0; i < elements()->length(); i++) { 6423 for (int i = 0; i < elements()->length(); i++) {
6413 if (offset >= bm->length()) { 6424 if (offset >= bm->length()) {
6414 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6425 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6415 return; 6426 return;
6416 } 6427 }
6417 TextElement text = elements()->at(i); 6428 TextElement text = elements()->at(i);
6418 if (text.text_type() == TextElement::ATOM) { 6429 if (text.text_type() == TextElement::ATOM) {
6419 RegExpAtom* atom = text.atom(); 6430 RegExpAtom* atom = text.atom();
6420 for (int j = 0; j < atom->length(); j++, offset++) { 6431 for (int j = 0; j < atom->length(); j++, offset++) {
6421 if (offset >= bm->length()) { 6432 if (offset >= bm->length()) {
6422 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6433 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6423 return; 6434 return;
6424 } 6435 }
6425 uc16 character = atom->data()[j]; 6436 uc16 character = atom->data()[j];
6426 if (bm->compiler()->ignore_case()) { 6437 if (bm->compiler()->ignore_case()) {
6427 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 6438 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6428 int length = GetCaseIndependentLetters( 6439 int length = GetCaseIndependentLetters(
6429 isolate, character, bm->max_char() == String::kMaxOneByteCharCode, 6440 compiler, character,
6430 chars); 6441 bm->max_char() == String::kMaxOneByteCharCode, chars);
6431 for (int j = 0; j < length; j++) { 6442 for (int j = 0; j < length; j++) {
6432 bm->Set(offset, chars[j]); 6443 bm->Set(offset, chars[j]);
6433 } 6444 }
6434 } else { 6445 } else {
6435 if (character <= max_char) bm->Set(offset, character); 6446 if (character <= max_char) bm->Set(offset, character);
6436 } 6447 }
6437 } 6448 }
6438 } else { 6449 } else {
6439 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 6450 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6440 RegExpCharacterClass* char_class = text.char_class(); 6451 RegExpCharacterClass* char_class = text.char_class();
6441 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 6452 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6442 if (char_class->is_negated()) { 6453 if (char_class->is_negated()) {
6443 bm->SetAll(offset); 6454 bm->SetAll(offset);
6444 } else { 6455 } else {
6445 for (int k = 0; k < ranges->length(); k++) { 6456 for (int k = 0; k < ranges->length(); k++) {
6446 CharacterRange& range = ranges->at(k); 6457 CharacterRange& range = ranges->at(k);
6447 if (range.from() > max_char) continue; 6458 if (range.from() > max_char) continue;
6448 int to = Min(max_char, static_cast<int>(range.to())); 6459 int to = Min(max_char, static_cast<int>(range.to()));
6449 bm->SetInterval(offset, Interval(range.from(), to)); 6460 bm->SetInterval(offset, Interval(range.from(), to));
6450 } 6461 }
6451 } 6462 }
6452 offset++; 6463 offset++;
6453 } 6464 }
6454 } 6465 }
6455 if (offset >= bm->length()) { 6466 if (offset >= bm->length()) {
6456 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6467 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6457 return; 6468 return;
6458 } 6469 }
6459 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 6470 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm,
6460 true); // Not at start after a text node. 6471 true); // Not at start after a text node.
6461 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6472 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6462 } 6473 }
6463 6474
6464 6475
6465 // ------------------------------------------------------------------- 6476 // -------------------------------------------------------------------
6466 // Dispatch table construction 6477 // Dispatch table construction
6467 6478
6468 6479
6469 void DispatchTableConstructor::VisitEnd(EndNode* that) { 6480 void DispatchTableConstructor::VisitEnd(EndNode* that) {
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
6607 } 6618 }
6608 6619
6609 6620
6610 RegExpEngine::CompilationResult RegExpEngine::Compile( 6621 RegExpEngine::CompilationResult RegExpEngine::Compile(
6611 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6622 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6612 JSRegExp::Flags flags, Handle<String> pattern, 6623 JSRegExp::Flags flags, Handle<String> pattern,
6613 Handle<String> sample_subject, bool is_one_byte) { 6624 Handle<String> sample_subject, bool is_one_byte) {
6614 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6625 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6615 return IrregexpRegExpTooBig(isolate); 6626 return IrregexpRegExpTooBig(isolate);
6616 } 6627 }
6617 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6618 bool is_sticky = flags & JSRegExp::kSticky; 6628 bool is_sticky = flags & JSRegExp::kSticky;
6619 bool is_global = flags & JSRegExp::kGlobal; 6629 bool is_global = flags & JSRegExp::kGlobal;
6620 RegExpCompiler compiler(isolate, zone, data->capture_count, flags, 6630 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6621 is_one_byte); 6631 is_one_byte);
6622 6632
6623 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); 6633 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6624 6634
6625 // Sample some characters from the middle of the string. 6635 // Sample some characters from the middle of the string.
6626 static const int kSampleSize = 128; 6636 static const int kSampleSize = 128;
6627 6637
(...skipping 28 matching lines...) Expand all
6656 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6666 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6657 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6667 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6658 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( 6668 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6659 new (zone) RegExpCharacterClass('*'), false, loop_node))); 6669 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6660 node = first_step_node; 6670 node = first_step_node;
6661 } else { 6671 } else {
6662 node = loop_node; 6672 node = loop_node;
6663 } 6673 }
6664 } 6674 }
6665 if (is_one_byte) { 6675 if (is_one_byte) {
6666 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6676 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);
6667 // Do it again to propagate the new nodes to places where they were not 6677 // Do it again to propagate the new nodes to places where they were not
6668 // put because they had not been calculated yet. 6678 // put because they had not been calculated yet.
6669 if (node != NULL) { 6679 if (node != NULL) {
6670 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6680 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);
6671 } 6681 }
6672 } else if (compiler.unicode() && (is_global || is_sticky)) { 6682 } else if (compiler.unicode() && (is_global || is_sticky)) {
6673 node = OptionallyStepBackToLeadSurrogate(&compiler, node); 6683 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6674 } 6684 }
6675 6685
6676 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6686 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6677 data->node = node; 6687 data->node = node;
6678 Analysis analysis(isolate, flags, is_one_byte); 6688 Analysis analysis(isolate, flags, is_one_byte);
6679 analysis.EnsureAnalyzed(node); 6689 analysis.EnsureAnalyzed(node);
6680 if (analysis.has_failed()) { 6690 if (analysis.has_failed()) {
(...skipping 174 matching lines...) Expand 10 before | Expand all | Expand 10 after
6855 6865
6856 6866
6857 void RegExpResultsCache::Clear(FixedArray* cache) { 6867 void RegExpResultsCache::Clear(FixedArray* cache) {
6858 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6868 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6859 cache->set(i, Smi::FromInt(0)); 6869 cache->set(i, Smi::FromInt(0));
6860 } 6870 }
6861 } 6871 }
6862 6872
6863 } // namespace internal 6873 } // namespace internal
6864 } // namespace v8 6874 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698