Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(103)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1641613002: [regexp] implement /ui to mirror the implementation for /i. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: addressed comment Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include "src/ast/ast.h" 7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 1580 matching lines...) Expand 10 before | Expand all | Expand 10 after
1591 macro_assembler->IfRegisterLT(guard->reg(), 1591 macro_assembler->IfRegisterLT(guard->reg(),
1592 guard->value(), 1592 guard->value(),
1593 trace->backtrack()); 1593 trace->backtrack());
1594 break; 1594 break;
1595 } 1595 }
1596 } 1596 }
1597 1597
1598 1598
1599 // Returns the number of characters in the equivalence class, omitting those 1599 // Returns the number of characters in the equivalence class, omitting those
1600 // that cannot occur in the source string because it is Latin1. 1600 // that cannot occur in the source string because it is Latin1.
1601 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, 1601 static int GetCaseIndependentLetters(RegExpCompiler* compiler, uc16 character,
1602 bool one_byte_subject, 1602 uc32* letters) {
1603 unibrow::uchar* letters) { 1603 int length;
1604 int length = 1604 #ifdef V8_I18N_SUPPORT
1605 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); 1605 if (compiler->unicode()) {
1606 // Unibrow returns 0 or 1 for characters where case independence is 1606 USet* set = uset_open(character, character);
1607 // trivial. 1607 uset_closeOver(set, USET_CASE_INSENSITIVE);
1608 if (length == 0) { 1608 uset_removeAllStrings(set);
1609 letters[0] = character; 1609 length = uset_size(set);
1610 length = 1; 1610 for (int i = 0; i < length; i++) {
1611 letters[i] = uset_charAt(set, i);
1612 }
1613 uset_close(set);
1614 } else // NOLINT
1615 // Fallback in case ICU is not included.
1616 #endif // V8_I18N_SUPPORT
1617 {
1618 length = compiler->isolate()->jsregexp_uncanonicalize()->get(character,
1619 '\0', letters);
1620 // Unibrow returns 0 or 1 for characters where case independence is
1621 // trivial.
1622 if (length == 0) {
1623 letters[0] = character;
1624 length = 1;
1625 }
1611 } 1626 }
1612 1627
1613 if (one_byte_subject) { 1628 if (compiler->one_byte()) {
1614 int new_length = 0; 1629 int new_length = 0;
1615 for (int i = 0; i < length; i++) { 1630 for (int i = 0; i < length; i++) {
1616 if (letters[i] <= String::kMaxOneByteCharCode) { 1631 if (letters[i] <= String::kMaxOneByteCharCode) {
1617 letters[new_length++] = letters[i]; 1632 letters[new_length++] = letters[i];
1618 } 1633 }
1619 } 1634 }
1620 length = new_length; 1635 length = new_length;
1621 } 1636 }
1622 1637
1623 return length; 1638 return length;
1624 } 1639 }
1625 1640
1626 1641 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, uc16 c,
1627 static inline bool EmitSimpleCharacter(Isolate* isolate, 1642 Label* on_failure, int cp_offset,
1628 RegExpCompiler* compiler, 1643 bool check, bool preloaded) {
1629 uc16 c,
1630 Label* on_failure,
1631 int cp_offset,
1632 bool check,
1633 bool preloaded) {
1634 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 1644 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1635 bool bound_checked = false; 1645 bool bound_checked = false;
1636 if (!preloaded) { 1646 if (!preloaded) {
1637 assembler->LoadCurrentCharacter( 1647 assembler->LoadCurrentCharacter(
1638 cp_offset, 1648 cp_offset,
1639 on_failure, 1649 on_failure,
1640 check); 1650 check);
1641 bound_checked = true; 1651 bound_checked = true;
1642 } 1652 }
1643 assembler->CheckNotCharacter(c, on_failure); 1653 assembler->CheckNotCharacter(c, on_failure);
1644 return bound_checked; 1654 return bound_checked;
1645 } 1655 }
1646 1656
1647 1657
1648 // Only emits non-letters (things that don't have case). Only used for case 1658 // Only emits non-letters (things that don't have case). Only used for case
1649 // independent matches. 1659 // independent matches.
1650 static inline bool EmitAtomNonLetter(Isolate* isolate, 1660 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, uc16 c,
1651 RegExpCompiler* compiler, 1661 Label* on_failure, int cp_offset,
1652 uc16 c, 1662 bool check, bool preloaded) {
1653 Label* on_failure,
1654 int cp_offset,
1655 bool check,
1656 bool preloaded) {
1657 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1663 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1658 bool one_byte = compiler->one_byte();
1659 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1660 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1665 int length = GetCaseIndependentLetters(compiler, c, chars);
1661 if (length < 1) { 1666 if (length < 1) {
1662 // This can't match. Must be an one-byte subject and a non-one-byte 1667 // This can't match. Must be an one-byte subject and a non-one-byte
1663 // character. We do not need to do anything since the one-byte pass 1668 // character. We do not need to do anything since the one-byte pass
1664 // already handled this. 1669 // already handled this.
1665 return false; // Bounds not checked. 1670 return false; // Bounds not checked.
1666 } 1671 }
1667 bool checked = false; 1672 bool checked = false;
1668 // We handle the length > 1 case in a later pass. 1673 // We handle the length > 1 case in a later pass.
1669 if (length == 1) { 1674 if (length == 1) {
1670 if (one_byte && c > String::kMaxOneByteCharCodeU) { 1675 if (compiler->one_byte() && c > String::kMaxOneByteCharCodeU) {
1671 // Can't match - see above. 1676 // This cannot match.
1672 return false; // Bounds not checked. 1677 return false; // Bounds not checked.
1673 } 1678 }
1674 if (!preloaded) { 1679 if (!preloaded) {
1675 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1680 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1676 checked = check; 1681 checked = check;
1677 } 1682 }
1678 macro_assembler->CheckNotCharacter(c, on_failure); 1683 macro_assembler->CheckNotCharacter(c, on_failure);
1679 } 1684 }
1680 return checked; 1685 return checked;
1681 } 1686 }
(...skipping 28 matching lines...) Expand all
1710 uc16 mask = char_mask ^ diff; 1715 uc16 mask = char_mask ^ diff;
1711 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, 1716 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1712 diff, 1717 diff,
1713 mask, 1718 mask,
1714 on_failure); 1719 on_failure);
1715 return true; 1720 return true;
1716 } 1721 }
1717 return false; 1722 return false;
1718 } 1723 }
1719 1724
1720 1725 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, uc16 c,
1721 typedef bool EmitCharacterFunction(Isolate* isolate, 1726 Label* on_failure, int cp_offset, bool check,
1722 RegExpCompiler* compiler,
1723 uc16 c,
1724 Label* on_failure,
1725 int cp_offset,
1726 bool check,
1727 bool preloaded); 1727 bool preloaded);
1728 1728
1729 // Only emits letters (things that have case). Only used for case independent 1729 // Only emits letters (things that have case). Only used for case independent
1730 // matches. 1730 // matches.
1731 static inline bool EmitAtomLetter(Isolate* isolate, 1731 static inline bool EmitAtomLetter(RegExpCompiler* compiler, uc16 c,
1732 RegExpCompiler* compiler, 1732 Label* on_failure, int cp_offset, bool check,
1733 uc16 c,
1734 Label* on_failure,
1735 int cp_offset,
1736 bool check,
1737 bool preloaded) { 1733 bool preloaded) {
1738 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 1734 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1739 bool one_byte = compiler->one_byte();
1740 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 1735 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1741 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); 1736 int length = GetCaseIndependentLetters(compiler, c, chars);
1742 if (length <= 1) return false; 1737 if (length <= 1) return false;
1743 // We may not need to check against the end of the input string 1738 // We may not need to check against the end of the input string
1744 // if this character lies before a character that matched. 1739 // if this character lies before a character that matched.
1745 if (!preloaded) { 1740 if (!preloaded) {
1746 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); 1741 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1747 } 1742 }
1748 Label ok; 1743 Label ok;
1749 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); 1744 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1750 switch (length) { 1745 switch (length) {
1751 case 2: { 1746 case 2: {
1752 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], 1747 if (ShortCutEmitCharacterPair(macro_assembler, compiler->one_byte(),
1753 chars[1], on_failure)) { 1748 chars[0], chars[1], on_failure)) {
1754 } else { 1749 } else {
1755 macro_assembler->CheckCharacter(chars[0], &ok); 1750 macro_assembler->CheckCharacter(chars[0], &ok);
1756 macro_assembler->CheckNotCharacter(chars[1], on_failure); 1751 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1757 macro_assembler->Bind(&ok); 1752 macro_assembler->Bind(&ok);
1758 } 1753 }
1759 break; 1754 break;
1760 } 1755 }
1761 case 4: 1756 case 4:
1762 macro_assembler->CheckCharacter(chars[3], &ok); 1757 macro_assembler->CheckCharacter(chars[3], &ok);
1763 // Fall through! 1758 // Fall through!
(...skipping 516 matching lines...) Expand 10 before | Expand all | Expand 10 after
2280 int ActionNode::EatsAtLeast(int still_to_find, 2275 int ActionNode::EatsAtLeast(int still_to_find,
2281 int budget, 2276 int budget,
2282 bool not_at_start) { 2277 bool not_at_start) {
2283 if (budget <= 0) return 0; 2278 if (budget <= 0) return 0;
2284 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2279 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2285 return on_success()->EatsAtLeast(still_to_find, 2280 return on_success()->EatsAtLeast(still_to_find,
2286 budget - 1, 2281 budget - 1,
2287 not_at_start); 2282 not_at_start);
2288 } 2283 }
2289 2284
2290 2285 void ActionNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
2291 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2292 BoyerMooreLookahead* bm, bool not_at_start) { 2286 BoyerMooreLookahead* bm, bool not_at_start) {
2293 if (action_type_ == BEGIN_SUBMATCH) { 2287 if (action_type_ == BEGIN_SUBMATCH) {
2294 bm->SetRest(offset); 2288 bm->SetRest(offset);
2295 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) { 2289 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2296 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2290 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
2297 } 2291 }
2298 SaveBMInfo(bm, not_at_start, offset); 2292 SaveBMInfo(bm, not_at_start, offset);
2299 } 2293 }
2300 2294
2301 2295
2302 int AssertionNode::EatsAtLeast(int still_to_find, 2296 int AssertionNode::EatsAtLeast(int still_to_find,
2303 int budget, 2297 int budget,
2304 bool not_at_start) { 2298 bool not_at_start) {
2305 if (budget <= 0) return 0; 2299 if (budget <= 0) return 0;
2306 // If we know we are not at the start and we are asked "how many characters 2300 // If we know we are not at the start and we are asked "how many characters
2307 // will you match if you succeed?" then we can answer anything since false 2301 // will you match if you succeed?" then we can answer anything since false
2308 // implies false. So lets just return the max answer (still_to_find) since 2302 // implies false. So lets just return the max answer (still_to_find) since
2309 // that won't prevent us from preloading a lot of characters for the other 2303 // that won't prevent us from preloading a lot of characters for the other
2310 // branches in the node graph. 2304 // branches in the node graph.
2311 if (assertion_type() == AT_START && not_at_start) return still_to_find; 2305 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2312 return on_success()->EatsAtLeast(still_to_find, 2306 return on_success()->EatsAtLeast(still_to_find,
2313 budget - 1, 2307 budget - 1,
2314 not_at_start); 2308 not_at_start);
2315 } 2309 }
2316 2310
2317 2311 void AssertionNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
2318 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2312 int budget, BoyerMooreLookahead* bm,
2319 BoyerMooreLookahead* bm, bool not_at_start) { 2313 bool not_at_start) {
2320 // Match the behaviour of EatsAtLeast on this node. 2314 // Match the behaviour of EatsAtLeast on this node.
2321 if (assertion_type() == AT_START && not_at_start) return; 2315 if (assertion_type() == AT_START && not_at_start) return;
2322 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2316 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
2323 SaveBMInfo(bm, not_at_start, offset); 2317 SaveBMInfo(bm, not_at_start, offset);
2324 } 2318 }
2325 2319
2326 2320
2327 int BackReferenceNode::EatsAtLeast(int still_to_find, 2321 int BackReferenceNode::EatsAtLeast(int still_to_find,
2328 int budget, 2322 int budget,
2329 bool not_at_start) { 2323 bool not_at_start) {
2330 if (read_backward()) return 0; 2324 if (read_backward()) return 0;
2331 if (budget <= 0) return 0; 2325 if (budget <= 0) return 0;
2332 return on_success()->EatsAtLeast(still_to_find, 2326 return on_success()->EatsAtLeast(still_to_find,
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after
2526 // The masks and values for the positions will be combined into a single 2520 // The masks and values for the positions will be combined into a single
2527 // machine word for the current character width in order to be used in 2521 // machine word for the current character width in order to be used in
2528 // generating a quick check. 2522 // generating a quick check.
2529 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, 2523 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2530 RegExpCompiler* compiler, 2524 RegExpCompiler* compiler,
2531 int characters_filled_in, 2525 int characters_filled_in,
2532 bool not_at_start) { 2526 bool not_at_start) {
2533 // Do not collect any quick check details if the text node reads backward, 2527 // Do not collect any quick check details if the text node reads backward,
2534 // since it reads in the opposite direction than we use for quick checks. 2528 // since it reads in the opposite direction than we use for quick checks.
2535 if (read_backward()) return; 2529 if (read_backward()) return;
2536 Isolate* isolate = compiler->macro_assembler()->isolate();
2537 DCHECK(characters_filled_in < details->characters()); 2530 DCHECK(characters_filled_in < details->characters());
2538 int characters = details->characters(); 2531 int characters = details->characters();
2539 int char_mask; 2532 int char_mask;
2540 if (compiler->one_byte()) { 2533 if (compiler->one_byte()) {
2541 char_mask = String::kMaxOneByteCharCode; 2534 char_mask = String::kMaxOneByteCharCode;
2542 } else { 2535 } else {
2543 char_mask = String::kMaxUtf16CodeUnit; 2536 char_mask = String::kMaxUtf16CodeUnit;
2544 } 2537 }
2545 for (int k = 0; k < elements()->length(); k++) { 2538 for (int k = 0; k < elements()->length(); k++) {
2546 TextElement elm = elements()->at(k); 2539 TextElement elm = elements()->at(k);
2547 if (elm.text_type() == TextElement::ATOM) { 2540 if (elm.text_type() == TextElement::ATOM) {
2548 Vector<const uc16> quarks = elm.atom()->data(); 2541 Vector<const uc16> quarks = elm.atom()->data();
2549 for (int i = 0; i < characters && i < quarks.length(); i++) { 2542 for (int i = 0; i < characters && i < quarks.length(); i++) {
2550 QuickCheckDetails::Position* pos = 2543 QuickCheckDetails::Position* pos =
2551 details->positions(characters_filled_in); 2544 details->positions(characters_filled_in);
2552 uc16 c = quarks[i]; 2545 uc16 c = quarks[i];
2553 if (compiler->ignore_case()) { 2546 if (compiler->ignore_case()) {
2554 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 2547 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2555 int length = GetCaseIndependentLetters(isolate, c, 2548 int length = GetCaseIndependentLetters(compiler, c, chars);
2556 compiler->one_byte(), chars);
2557 if (length == 0) { 2549 if (length == 0) {
2558 // This can happen because all case variants are non-Latin1, but we 2550 // This can happen because all case variants are non-Latin1, but we
2559 // know the input is Latin1. 2551 // know the input is Latin1.
2560 details->set_cannot_match(); 2552 details->set_cannot_match();
2561 pos->determines_perfectly = false; 2553 pos->determines_perfectly = false;
2562 return; 2554 return;
2563 } 2555 }
2564 if (length == 1) { 2556 if (length == 1) {
2565 // This letter has no case equivalents, so it's nice and simple 2557 // This letter has no case equivalents, so it's nice and simple
2566 // and the mask-compare will determine definitely whether we have 2558 // and the mask-compare will determine definitely whether we have
(...skipping 184 matching lines...) Expand 10 before | Expand all | Expand 10 after
2751 DCHECK(!info->visited); 2743 DCHECK(!info->visited);
2752 info->visited = true; 2744 info->visited = true;
2753 } 2745 }
2754 ~VisitMarker() { 2746 ~VisitMarker() {
2755 info_->visited = false; 2747 info_->visited = false;
2756 } 2748 }
2757 private: 2749 private:
2758 NodeInfo* info_; 2750 NodeInfo* info_;
2759 }; 2751 };
2760 2752
2761 2753 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
2762 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2763 if (info()->replacement_calculated) return replacement(); 2754 if (info()->replacement_calculated) return replacement();
2764 if (depth < 0) return this; 2755 if (depth < 0) return this;
2765 DCHECK(!info()->visited); 2756 DCHECK(!info()->visited);
2766 VisitMarker marker(info()); 2757 VisitMarker marker(info());
2767 return FilterSuccessor(depth - 1, ignore_case); 2758 return FilterSuccessor(depth - 1, compiler);
2768 } 2759 }
2769 2760
2770 2761 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth,
2771 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { 2762 RegExpCompiler* compiler) {
2772 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); 2763 RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler);
2773 if (next == NULL) return set_replacement(NULL); 2764 if (next == NULL) return set_replacement(NULL);
2774 on_success_ = next; 2765 on_success_ = next;
2775 return set_replacement(this); 2766 return set_replacement(this);
2776 } 2767 }
2777 2768
2778 2769
2779 // We need to check for the following characters: 0x39c 0x3bc 0x178. 2770 // We need to check for the following characters: 0x39c 0x3bc 0x178.
2780 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { 2771 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2781 // TODO(dcarney): this could be a lot more efficient. 2772 // TODO(dcarney): this could be a lot more efficient.
2782 return range.Contains(0x39c) || 2773 return range.Contains(0x39c) ||
2783 range.Contains(0x3bc) || range.Contains(0x178); 2774 range.Contains(0x3bc) || range.Contains(0x178);
2784 } 2775 }
2785 2776
2786 2777
2787 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { 2778 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2788 for (int i = 0; i < ranges->length(); i++) { 2779 for (int i = 0; i < ranges->length(); i++) {
2789 // TODO(dcarney): this could be a lot more efficient. 2780 // TODO(dcarney): this could be a lot more efficient.
2790 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; 2781 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2791 } 2782 }
2792 return false; 2783 return false;
2793 } 2784 }
2794 2785
2786 static uc16 ConvertNonLatin1ToEquivalentLatin1(bool unicode, uc16 c) {
2787 #ifdef V8_I18N_SUPPORT
2788 if (unicode) {
2789 USet* set = uset_open(c, c);
2790 uset_closeOver(set, USET_CASE_INSENSITIVE);
2791 uset_removeAllStrings(set);
2792 int length = uset_size(set);
2793 uc16 result = 0;
2794 for (int i = 0; i < length; i++) {
2795 uc32 c = uset_charAt(set, i);
2796 if (c <= String::kMaxOneByteCharCode) {
2797 result = static_cast<uc16>(c);
2798 break;
2799 }
2800 }
2801 uset_close(set);
2802 return result;
2803 }
2804 // Fallback to unibrow if ICU is not included.
2805 #endif // V8_I18N_SUPPORT
2806 return unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2807 }
2795 2808
2796 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { 2809 RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
2797 if (info()->replacement_calculated) return replacement(); 2810 if (info()->replacement_calculated) return replacement();
2798 if (depth < 0) return this; 2811 if (depth < 0) return this;
2799 DCHECK(!info()->visited); 2812 DCHECK(!info()->visited);
2800 VisitMarker marker(info()); 2813 VisitMarker marker(info());
2801 int element_count = elements()->length(); 2814 int element_count = elements()->length();
2802 for (int i = 0; i < element_count; i++) { 2815 for (int i = 0; i < element_count; i++) {
2803 TextElement elm = elements()->at(i); 2816 TextElement elm = elements()->at(i);
2804 if (elm.text_type() == TextElement::ATOM) { 2817 if (elm.text_type() == TextElement::ATOM) {
2805 Vector<const uc16> quarks = elm.atom()->data(); 2818 Vector<const uc16> quarks = elm.atom()->data();
2806 for (int j = 0; j < quarks.length(); j++) { 2819 for (int j = 0; j < quarks.length(); j++) {
2807 uint16_t c = quarks[j]; 2820 uc16 c = quarks[j];
2808 if (c <= String::kMaxOneByteCharCode) continue; 2821 if (c <= String::kMaxOneByteCharCode) continue;
2809 if (!ignore_case) return set_replacement(NULL); 2822 if (!compiler->ignore_case()) return set_replacement(NULL);
2810 // Here, we need to check for characters whose upper and lower cases 2823 // Here, we need to check for characters whose upper and lower cases
2811 // are outside the Latin-1 range. 2824 // are outside the Latin-1 range.
2812 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c); 2825 uc16 converted =
2826 ConvertNonLatin1ToEquivalentLatin1(compiler->unicode(), c);
2813 // Character is outside Latin-1 completely 2827 // Character is outside Latin-1 completely
2814 if (converted == 0) return set_replacement(NULL); 2828 if (converted == 0) return set_replacement(NULL);
2815 // Convert quark to Latin-1 in place. 2829 // Convert quark to Latin-1 in place.
2816 uint16_t* copy = const_cast<uint16_t*>(quarks.start()); 2830 uc16* copy = const_cast<uc16*>(quarks.start());
2817 copy[j] = converted; 2831 copy[j] = converted;
2818 } 2832 }
2819 } else { 2833 } else {
2820 DCHECK(elm.text_type() == TextElement::CHAR_CLASS); 2834 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2821 RegExpCharacterClass* cc = elm.char_class(); 2835 RegExpCharacterClass* cc = elm.char_class();
2822 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); 2836 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2823 CharacterRange::Canonicalize(ranges); 2837 CharacterRange::Canonicalize(ranges);
2824 // Now they are in order so we only need to look at the first. 2838 // Now they are in order so we only need to look at the first.
2825 int range_count = ranges->length(); 2839 int range_count = ranges->length();
2826 if (cc->is_negated()) { 2840 if (cc->is_negated()) {
2827 if (range_count != 0 && 2841 if (range_count != 0 &&
2828 ranges->at(0).from() == 0 && 2842 ranges->at(0).from() == 0 &&
2829 ranges->at(0).to() >= String::kMaxOneByteCharCode) { 2843 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2830 // This will be handled in a later filter. 2844 // This will be handled in a later filter.
2831 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2845 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))
2846 continue;
2832 return set_replacement(NULL); 2847 return set_replacement(NULL);
2833 } 2848 }
2834 } else { 2849 } else {
2835 if (range_count == 0 || 2850 if (range_count == 0 ||
2836 ranges->at(0).from() > String::kMaxOneByteCharCode) { 2851 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2837 // This will be handled in a later filter. 2852 // This will be handled in a later filter.
2838 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; 2853 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))
2854 continue;
2839 return set_replacement(NULL); 2855 return set_replacement(NULL);
2840 } 2856 }
2841 } 2857 }
2842 } 2858 }
2843 } 2859 }
2844 return FilterSuccessor(depth - 1, ignore_case); 2860 return FilterSuccessor(depth - 1, compiler);
2845 } 2861 }
2846 2862
2847 2863 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
2848 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2849 if (info()->replacement_calculated) return replacement(); 2864 if (info()->replacement_calculated) return replacement();
2850 if (depth < 0) return this; 2865 if (depth < 0) return this;
2851 if (info()->visited) return this; 2866 if (info()->visited) return this;
2852 { 2867 {
2853 VisitMarker marker(info()); 2868 VisitMarker marker(info());
2854 2869
2855 RegExpNode* continue_replacement = 2870 RegExpNode* continue_replacement =
2856 continue_node_->FilterOneByte(depth - 1, ignore_case); 2871 continue_node_->FilterOneByte(depth - 1, compiler);
2857 // If we can't continue after the loop then there is no sense in doing the 2872 // If we can't continue after the loop then there is no sense in doing the
2858 // loop. 2873 // loop.
2859 if (continue_replacement == NULL) return set_replacement(NULL); 2874 if (continue_replacement == NULL) return set_replacement(NULL);
2860 } 2875 }
2861 2876
2862 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); 2877 return ChoiceNode::FilterOneByte(depth - 1, compiler);
2863 } 2878 }
2864 2879
2865 2880 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {
2866 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2867 if (info()->replacement_calculated) return replacement(); 2881 if (info()->replacement_calculated) return replacement();
2868 if (depth < 0) return this; 2882 if (depth < 0) return this;
2869 if (info()->visited) return this; 2883 if (info()->visited) return this;
2870 VisitMarker marker(info()); 2884 VisitMarker marker(info());
2871 int choice_count = alternatives_->length(); 2885 int choice_count = alternatives_->length();
2872 2886
2873 for (int i = 0; i < choice_count; i++) { 2887 for (int i = 0; i < choice_count; i++) {
2874 GuardedAlternative alternative = alternatives_->at(i); 2888 GuardedAlternative alternative = alternatives_->at(i);
2875 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { 2889 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2876 set_replacement(this); 2890 set_replacement(this);
2877 return this; 2891 return this;
2878 } 2892 }
2879 } 2893 }
2880 2894
2881 int surviving = 0; 2895 int surviving = 0;
2882 RegExpNode* survivor = NULL; 2896 RegExpNode* survivor = NULL;
2883 for (int i = 0; i < choice_count; i++) { 2897 for (int i = 0; i < choice_count; i++) {
2884 GuardedAlternative alternative = alternatives_->at(i); 2898 GuardedAlternative alternative = alternatives_->at(i);
2885 RegExpNode* replacement = 2899 RegExpNode* replacement =
2886 alternative.node()->FilterOneByte(depth - 1, ignore_case); 2900 alternative.node()->FilterOneByte(depth - 1, compiler);
2887 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. 2901 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2888 if (replacement != NULL) { 2902 if (replacement != NULL) {
2889 alternatives_->at(i).set_node(replacement); 2903 alternatives_->at(i).set_node(replacement);
2890 surviving++; 2904 surviving++;
2891 survivor = replacement; 2905 survivor = replacement;
2892 } 2906 }
2893 } 2907 }
2894 if (surviving < 2) return set_replacement(survivor); 2908 if (surviving < 2) return set_replacement(survivor);
2895 2909
2896 set_replacement(this); 2910 set_replacement(this);
2897 if (surviving == choice_count) { 2911 if (surviving == choice_count) {
2898 return this; 2912 return this;
2899 } 2913 }
2900 // Only some of the nodes survived the filtering. We need to rebuild the 2914 // Only some of the nodes survived the filtering. We need to rebuild the
2901 // alternatives list. 2915 // alternatives list.
2902 ZoneList<GuardedAlternative>* new_alternatives = 2916 ZoneList<GuardedAlternative>* new_alternatives =
2903 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); 2917 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2904 for (int i = 0; i < choice_count; i++) { 2918 for (int i = 0; i < choice_count; i++) {
2905 RegExpNode* replacement = 2919 RegExpNode* replacement =
2906 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); 2920 alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler);
2907 if (replacement != NULL) { 2921 if (replacement != NULL) {
2908 alternatives_->at(i).set_node(replacement); 2922 alternatives_->at(i).set_node(replacement);
2909 new_alternatives->Add(alternatives_->at(i), zone()); 2923 new_alternatives->Add(alternatives_->at(i), zone());
2910 } 2924 }
2911 } 2925 }
2912 alternatives_ = new_alternatives; 2926 alternatives_ = new_alternatives;
2913 return this; 2927 return this;
2914 } 2928 }
2915 2929
2916 2930 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(
2917 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, 2931 int depth, RegExpCompiler* compiler) {
2918 bool ignore_case) {
2919 if (info()->replacement_calculated) return replacement(); 2932 if (info()->replacement_calculated) return replacement();
2920 if (depth < 0) return this; 2933 if (depth < 0) return this;
2921 if (info()->visited) return this; 2934 if (info()->visited) return this;
2922 VisitMarker marker(info()); 2935 VisitMarker marker(info());
2923 // Alternative 0 is the negative lookahead, alternative 1 is what comes 2936 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2924 // afterwards. 2937 // afterwards.
2925 RegExpNode* node = alternatives_->at(1).node(); 2938 RegExpNode* node = alternatives_->at(1).node();
2926 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); 2939 RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler);
2927 if (replacement == NULL) return set_replacement(NULL); 2940 if (replacement == NULL) return set_replacement(NULL);
2928 alternatives_->at(1).set_node(replacement); 2941 alternatives_->at(1).set_node(replacement);
2929 2942
2930 RegExpNode* neg_node = alternatives_->at(0).node(); 2943 RegExpNode* neg_node = alternatives_->at(0).node();
2931 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); 2944 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler);
2932 // If the negative lookahead is always going to fail then 2945 // If the negative lookahead is always going to fail then
2933 // we don't need to check it. 2946 // we don't need to check it.
2934 if (neg_replacement == NULL) return set_replacement(replacement); 2947 if (neg_replacement == NULL) return set_replacement(replacement);
2935 alternatives_->at(0).set_node(neg_replacement); 2948 alternatives_->at(0).set_node(neg_replacement);
2936 return set_replacement(this); 2949 return set_replacement(this);
2937 } 2950 }
2938 2951
2939 2952
2940 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2953 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2941 RegExpCompiler* compiler, 2954 RegExpCompiler* compiler,
2942 int characters_filled_in, 2955 int characters_filled_in,
2943 bool not_at_start) { 2956 bool not_at_start) {
2944 if (body_can_be_zero_length_ || info()->visited) return; 2957 if (body_can_be_zero_length_ || info()->visited) return;
2945 VisitMarker marker(info()); 2958 VisitMarker marker(info());
2946 return ChoiceNode::GetQuickCheckDetails(details, 2959 return ChoiceNode::GetQuickCheckDetails(details,
2947 compiler, 2960 compiler,
2948 characters_filled_in, 2961 characters_filled_in,
2949 not_at_start); 2962 not_at_start);
2950 } 2963 }
2951 2964
2952 2965 void LoopChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
2953 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 2966 int budget, BoyerMooreLookahead* bm,
2954 BoyerMooreLookahead* bm, bool not_at_start) { 2967 bool not_at_start) {
2955 if (body_can_be_zero_length_ || budget <= 0) { 2968 if (body_can_be_zero_length_ || budget <= 0) {
2956 bm->SetRest(offset); 2969 bm->SetRest(offset);
2957 SaveBMInfo(bm, not_at_start, offset); 2970 SaveBMInfo(bm, not_at_start, offset);
2958 return; 2971 return;
2959 } 2972 }
2960 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); 2973 ChoiceNode::FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);
2961 SaveBMInfo(bm, not_at_start, offset); 2974 SaveBMInfo(bm, not_at_start, offset);
2962 } 2975 }
2963 2976
2964 2977
2965 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2978 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2966 RegExpCompiler* compiler, 2979 RegExpCompiler* compiler,
2967 int characters_filled_in, 2980 int characters_filled_in,
2968 bool not_at_start) { 2981 bool not_at_start) {
2969 not_at_start = (not_at_start || not_at_start_); 2982 not_at_start = (not_at_start || not_at_start_);
2970 int choice_count = alternatives_->length(); 2983 int choice_count = alternatives_->length();
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
3042 assembler->CheckNotCharacter('\r', new_trace.backtrack()); 3055 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3043 } 3056 }
3044 assembler->Bind(&ok); 3057 assembler->Bind(&ok);
3045 on_success->Emit(compiler, &new_trace); 3058 on_success->Emit(compiler, &new_trace);
3046 } 3059 }
3047 3060
3048 3061
3049 // Emit the code to handle \b and \B (word-boundary or non-word-boundary). 3062 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3050 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) { 3063 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3051 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3064 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3052 Isolate* isolate = assembler->isolate();
3053 Trace::TriBool next_is_word_character = Trace::UNKNOWN; 3065 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3054 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE); 3066 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3055 BoyerMooreLookahead* lookahead = bm_info(not_at_start); 3067 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3056 if (lookahead == NULL) { 3068 if (lookahead == NULL) {
3057 int eats_at_least = 3069 int eats_at_least =
3058 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore, 3070 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3059 kRecursionBudget, 3071 kRecursionBudget,
3060 not_at_start)); 3072 not_at_start));
3061 if (eats_at_least >= 1) { 3073 if (eats_at_least >= 1) {
3062 BoyerMooreLookahead* bm = 3074 BoyerMooreLookahead* bm =
3063 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone()); 3075 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3064 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start); 3076 FillInBMInfo(compiler, 0, kRecursionBudget, bm, not_at_start);
3065 if (bm->at(0)->is_non_word()) 3077 if (bm->at(0)->is_non_word())
3066 next_is_word_character = Trace::FALSE_VALUE; 3078 next_is_word_character = Trace::FALSE_VALUE;
3067 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE; 3079 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3068 } 3080 }
3069 } else { 3081 } else {
3070 if (lookahead->at(0)->is_non_word()) 3082 if (lookahead->at(0)->is_non_word())
3071 next_is_word_character = Trace::FALSE_VALUE; 3083 next_is_word_character = Trace::FALSE_VALUE;
3072 if (lookahead->at(0)->is_word()) 3084 if (lookahead->at(0)->is_word())
3073 next_is_word_character = Trace::TRUE_VALUE; 3085 next_is_word_character = Trace::TRUE_VALUE;
3074 } 3086 }
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
3226 // up to the limit the quick check already checked. In addition the quick 3238 // up to the limit the quick check already checked. In addition the quick
3227 // check can have involved a mask and compare operation which may simplify 3239 // check can have involved a mask and compare operation which may simplify
3228 // or obviate the need for further checks at some character positions. 3240 // or obviate the need for further checks at some character positions.
3229 void TextNode::TextEmitPass(RegExpCompiler* compiler, 3241 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3230 TextEmitPassType pass, 3242 TextEmitPassType pass,
3231 bool preloaded, 3243 bool preloaded,
3232 Trace* trace, 3244 Trace* trace,
3233 bool first_element_checked, 3245 bool first_element_checked,
3234 int* checked_up_to) { 3246 int* checked_up_to) {
3235 RegExpMacroAssembler* assembler = compiler->macro_assembler(); 3247 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3236 Isolate* isolate = assembler->isolate();
3237 bool one_byte = compiler->one_byte(); 3248 bool one_byte = compiler->one_byte();
3238 Label* backtrack = trace->backtrack(); 3249 Label* backtrack = trace->backtrack();
3239 QuickCheckDetails* quick_check = trace->quick_check_performed(); 3250 QuickCheckDetails* quick_check = trace->quick_check_performed();
3240 int element_count = elements()->length(); 3251 int element_count = elements()->length();
3241 int backward_offset = read_backward() ? -Length() : 0; 3252 int backward_offset = read_backward() ? -Length() : 0;
3242 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { 3253 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3243 TextElement elm = elements()->at(i); 3254 TextElement elm = elements()->at(i);
3244 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; 3255 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3245 if (elm.text_type() == TextElement::ATOM) { 3256 if (elm.text_type() == TextElement::ATOM) {
3246 Vector<const uc16> quarks = elm.atom()->data(); 3257 Vector<const uc16> quarks = elm.atom()->data();
3247 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { 3258 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3248 if (first_element_checked && i == 0 && j == 0) continue; 3259 if (first_element_checked && i == 0 && j == 0) continue;
3249 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; 3260 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3250 EmitCharacterFunction* emit_function = NULL; 3261 EmitCharacterFunction* emit_function = NULL;
3251 switch (pass) { 3262 switch (pass) {
3252 case NON_LATIN1_MATCH: 3263 case NON_LATIN1_MATCH:
3253 DCHECK(one_byte); 3264 DCHECK(one_byte);
3265 DCHECK(!(compiler->unicode() && compiler->ignore_case()));
3254 if (quarks[j] > String::kMaxOneByteCharCode) { 3266 if (quarks[j] > String::kMaxOneByteCharCode) {
3255 assembler->GoTo(backtrack); 3267 assembler->GoTo(backtrack);
3256 return; 3268 return;
3257 } 3269 }
3258 break; 3270 break;
3259 case NON_LETTER_CHARACTER_MATCH: 3271 case NON_LETTER_CHARACTER_MATCH:
3260 emit_function = &EmitAtomNonLetter; 3272 emit_function = &EmitAtomNonLetter;
3261 break; 3273 break;
3262 case SIMPLE_CHARACTER_MATCH: 3274 case SIMPLE_CHARACTER_MATCH:
3263 emit_function = &EmitSimpleCharacter; 3275 emit_function = &EmitSimpleCharacter;
3264 break; 3276 break;
3265 case CASE_CHARACTER_MATCH: 3277 case CASE_CHARACTER_MATCH:
3266 emit_function = &EmitAtomLetter; 3278 emit_function = &EmitAtomLetter;
3267 break; 3279 break;
3268 default: 3280 default:
3269 break; 3281 break;
3270 } 3282 }
3271 if (emit_function != NULL) { 3283 if (emit_function != NULL) {
3272 bool bounds_check = *checked_up_to < cp_offset + j || read_backward(); 3284 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3273 bool bound_checked = 3285 bool bound_checked =
3274 emit_function(isolate, compiler, quarks[j], backtrack, 3286 emit_function(compiler, quarks[j], backtrack, cp_offset + j,
3275 cp_offset + j, bounds_check, preloaded); 3287 bounds_check, preloaded);
3276 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); 3288 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3277 } 3289 }
3278 } 3290 }
3279 } else { 3291 } else {
3280 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); 3292 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3281 if (pass == CHARACTER_CLASS_MATCH) { 3293 if (pass == CHARACTER_CLASS_MATCH) {
3282 if (first_element_checked && i == 0) continue; 3294 if (first_element_checked && i == 0) continue;
3283 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; 3295 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3284 RegExpCharacterClass* cc = elm.char_class(); 3296 RegExpCharacterClass* cc = elm.char_class();
3285 bool bounds_check = *checked_up_to < cp_offset || read_backward(); 3297 bool bounds_check = *checked_up_to < cp_offset || read_backward();
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
3348 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3360 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3349 LimitResult limit_result = LimitVersions(compiler, trace); 3361 LimitResult limit_result = LimitVersions(compiler, trace);
3350 if (limit_result == DONE) return; 3362 if (limit_result == DONE) return;
3351 DCHECK(limit_result == CONTINUE); 3363 DCHECK(limit_result == CONTINUE);
3352 3364
3353 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { 3365 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3354 compiler->SetRegExpTooBig(); 3366 compiler->SetRegExpTooBig();
3355 return; 3367 return;
3356 } 3368 }
3357 3369
3358 if (compiler->one_byte()) { 3370 if (compiler->one_byte() &&
3371 !(compiler->unicode() && compiler->ignore_case())) {
3372 // If any character within the text node is outside the Latin1 range, it
3373 // cannot possibly match anything in a one-byte string. This still holds
3374 // for case-insensitive non-unicode regexp patterns. However, for
3375 // case-insensitive unicode regexp patterns, this is no longer true, e.g.
3376 // /\u212b/ui matches "\u00c5".
3359 int dummy = 0; 3377 int dummy = 0;
3360 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); 3378 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3361 } 3379 }
3362 3380
3363 bool first_elt_done = false; 3381 bool first_elt_done = false;
3364 int bound_checked_to = trace->cp_offset() - 1; 3382 int bound_checked_to = trace->cp_offset() - 1;
3365 bound_checked_to += trace->bound_checked_up_to(); 3383 bound_checked_to += trace->bound_checked_up_to();
3366 3384
3367 // If a character is preloaded into the current character register then 3385 // If a character is preloaded into the current character register then
3368 // check that now. 3386 // check that now.
(...skipping 731 matching lines...) Expand 10 before | Expand all | Expand 10 after
4100 4118
4101 // Really we should be creating a new trace when we execute this function, 4119 // Really we should be creating a new trace when we execute this function,
4102 // but there is no need, because the code it generates cannot backtrack, and 4120 // but there is no need, because the code it generates cannot backtrack, and
4103 // we always arrive here with a trivial trace (since it's the entry to a 4121 // we always arrive here with a trivial trace (since it's the entry to a
4104 // loop. That also implies that there are no preloaded characters, which is 4122 // loop. That also implies that there are no preloaded characters, which is
4105 // good, because it means we won't be violating any assumptions by 4123 // good, because it means we won't be violating any assumptions by
4106 // overwriting those characters with new load instructions. 4124 // overwriting those characters with new load instructions.
4107 DCHECK(trace->is_trivial()); 4125 DCHECK(trace->is_trivial());
4108 4126
4109 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); 4127 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4110 Isolate* isolate = macro_assembler->isolate();
4111 // At this point we know that we are at a non-greedy loop that will eat 4128 // At this point we know that we are at a non-greedy loop that will eat
4112 // any character one at a time. Any non-anchored regexp has such a 4129 // any character one at a time. Any non-anchored regexp has such a
4113 // loop prepended to it in order to find where it starts. We look for 4130 // loop prepended to it in order to find where it starts. We look for
4114 // a pattern of the form ...abc... where we can look 6 characters ahead 4131 // a pattern of the form ...abc... where we can look 6 characters ahead
4115 // and step forwards 3 if the character is not one of abc. Abc need 4132 // and step forwards 3 if the character is not one of abc. Abc need
4116 // not be atoms, they can be any reasonably limited character class or 4133 // not be atoms, they can be any reasonably limited character class or
4117 // small alternation. 4134 // small alternation.
4118 BoyerMooreLookahead* bm = bm_info(false); 4135 BoyerMooreLookahead* bm = bm_info(false);
4119 if (bm == NULL) { 4136 if (bm == NULL) {
4120 eats_at_least = Min(kMaxLookaheadForBoyerMoore, 4137 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4121 EatsAtLeast(kMaxLookaheadForBoyerMoore, 4138 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4122 kRecursionBudget, 4139 kRecursionBudget,
4123 false)); 4140 false));
4124 if (eats_at_least >= 1) { 4141 if (eats_at_least >= 1) {
4125 bm = new(zone()) BoyerMooreLookahead(eats_at_least, 4142 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4126 compiler, 4143 compiler,
4127 zone()); 4144 zone());
4128 GuardedAlternative alt0 = alternatives_->at(0); 4145 GuardedAlternative alt0 = alternatives_->at(0);
4129 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false); 4146 alt0.node()->FillInBMInfo(compiler, 0, kRecursionBudget, bm, false);
4130 } 4147 }
4131 } 4148 }
4132 if (bm != NULL) { 4149 if (bm != NULL) {
4133 bm->EmitSkipInstructions(macro_assembler); 4150 bm->EmitSkipInstructions(macro_assembler);
4134 } 4151 }
4135 return eats_at_least; 4152 return eats_at_least;
4136 } 4153 }
4137 4154
4138 4155
4139 void ChoiceNode::EmitChoices(RegExpCompiler* compiler, 4156 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
(...skipping 2241 matching lines...) Expand 10 before | Expand all | Expand 10 after
6381 6398
6382 void Analysis::VisitBackReference(BackReferenceNode* that) { 6399 void Analysis::VisitBackReference(BackReferenceNode* that) {
6383 EnsureAnalyzed(that->on_success()); 6400 EnsureAnalyzed(that->on_success());
6384 } 6401 }
6385 6402
6386 6403
6387 void Analysis::VisitAssertion(AssertionNode* that) { 6404 void Analysis::VisitAssertion(AssertionNode* that) {
6388 EnsureAnalyzed(that->on_success()); 6405 EnsureAnalyzed(that->on_success());
6389 } 6406 }
6390 6407
6391 6408 void BackReferenceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,
6392 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, 6409 int budget, BoyerMooreLookahead* bm,
6393 BoyerMooreLookahead* bm,
6394 bool not_at_start) { 6410 bool not_at_start) {
6395 // Working out the set of characters that a backreference can match is too 6411 // Working out the set of characters that a backreference can match is too
6396 // hard, so we just say that any character can match. 6412 // hard, so we just say that any character can match.
6397 bm->SetRest(offset); 6413 bm->SetRest(offset);
6398 SaveBMInfo(bm, not_at_start, offset); 6414 SaveBMInfo(bm, not_at_start, offset);
6399 } 6415 }
6400 6416
6401 6417
6402 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == 6418 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6403 RegExpMacroAssembler::kTableSize); 6419 RegExpMacroAssembler::kTableSize);
6404 6420
6405 6421 void ChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,
6406 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6407 BoyerMooreLookahead* bm, bool not_at_start) { 6422 BoyerMooreLookahead* bm, bool not_at_start) {
6408 ZoneList<GuardedAlternative>* alts = alternatives(); 6423 ZoneList<GuardedAlternative>* alts = alternatives();
6409 budget = (budget - 1) / alts->length(); 6424 budget = (budget - 1) / alts->length();
6410 for (int i = 0; i < alts->length(); i++) { 6425 for (int i = 0; i < alts->length(); i++) {
6411 GuardedAlternative& alt = alts->at(i); 6426 GuardedAlternative& alt = alts->at(i);
6412 if (alt.guards() != NULL && alt.guards()->length() != 0) { 6427 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6413 bm->SetRest(offset); // Give up trying to fill in info. 6428 bm->SetRest(offset); // Give up trying to fill in info.
6414 SaveBMInfo(bm, not_at_start, offset); 6429 SaveBMInfo(bm, not_at_start, offset);
6415 return; 6430 return;
6416 } 6431 }
6417 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start); 6432 alt.node()->FillInBMInfo(compiler, offset, budget, bm, not_at_start);
6418 } 6433 }
6419 SaveBMInfo(bm, not_at_start, offset); 6434 SaveBMInfo(bm, not_at_start, offset);
6420 } 6435 }
6421 6436
6422 6437 void TextNode::FillInBMInfo(RegExpCompiler* compiler, int initial_offset,
6423 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, 6438 int budget, BoyerMooreLookahead* bm,
6424 BoyerMooreLookahead* bm, bool not_at_start) { 6439 bool not_at_start) {
6425 if (initial_offset >= bm->length()) return; 6440 if (initial_offset >= bm->length()) return;
6426 int offset = initial_offset; 6441 int offset = initial_offset;
6427 int max_char = bm->max_char(); 6442 int max_char = bm->max_char();
6428 for (int i = 0; i < elements()->length(); i++) { 6443 for (int i = 0; i < elements()->length(); i++) {
6429 if (offset >= bm->length()) { 6444 if (offset >= bm->length()) {
6430 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6445 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6431 return; 6446 return;
6432 } 6447 }
6433 TextElement text = elements()->at(i); 6448 TextElement text = elements()->at(i);
6434 if (text.text_type() == TextElement::ATOM) { 6449 if (text.text_type() == TextElement::ATOM) {
6435 RegExpAtom* atom = text.atom(); 6450 RegExpAtom* atom = text.atom();
6436 for (int j = 0; j < atom->length(); j++, offset++) { 6451 for (int j = 0; j < atom->length(); j++, offset++) {
6437 if (offset >= bm->length()) { 6452 if (offset >= bm->length()) {
6438 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6453 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6439 return; 6454 return;
6440 } 6455 }
6441 uc16 character = atom->data()[j]; 6456 uc16 character = atom->data()[j];
6442 if (bm->compiler()->ignore_case()) { 6457 if (bm->compiler()->ignore_case()) {
6443 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 6458 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6444 int length = GetCaseIndependentLetters( 6459 int length = GetCaseIndependentLetters(compiler, character, chars);
6445 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6446 chars);
6447 for (int j = 0; j < length; j++) { 6460 for (int j = 0; j < length; j++) {
6448 bm->Set(offset, chars[j]); 6461 bm->Set(offset, chars[j]);
6449 } 6462 }
6450 } else { 6463 } else {
6451 if (character <= max_char) bm->Set(offset, character); 6464 if (character <= max_char) bm->Set(offset, character);
6452 } 6465 }
6453 } 6466 }
6454 } else { 6467 } else {
6455 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type()); 6468 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6456 RegExpCharacterClass* char_class = text.char_class(); 6469 RegExpCharacterClass* char_class = text.char_class();
6457 ZoneList<CharacterRange>* ranges = char_class->ranges(zone()); 6470 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6458 if (char_class->is_negated()) { 6471 if (char_class->is_negated()) {
6459 bm->SetAll(offset); 6472 bm->SetAll(offset);
6460 } else { 6473 } else {
6461 for (int k = 0; k < ranges->length(); k++) { 6474 for (int k = 0; k < ranges->length(); k++) {
6462 CharacterRange& range = ranges->at(k); 6475 CharacterRange& range = ranges->at(k);
6463 if (range.from() > max_char) continue; 6476 if (range.from() > max_char) continue;
6464 int to = Min(max_char, static_cast<int>(range.to())); 6477 int to = Min(max_char, static_cast<int>(range.to()));
6465 bm->SetInterval(offset, Interval(range.from(), to)); 6478 bm->SetInterval(offset, Interval(range.from(), to));
6466 } 6479 }
6467 } 6480 }
6468 offset++; 6481 offset++;
6469 } 6482 }
6470 } 6483 }
6471 if (offset >= bm->length()) { 6484 if (offset >= bm->length()) {
6472 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6485 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6473 return; 6486 return;
6474 } 6487 }
6475 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, 6488 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm,
6476 true); // Not at start after a text node. 6489 true); // Not at start after a text node.
6477 if (initial_offset == 0) set_bm_info(not_at_start, bm); 6490 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6478 } 6491 }
6479 6492
6480 6493
6481 // ------------------------------------------------------------------- 6494 // -------------------------------------------------------------------
6482 // Dispatch table construction 6495 // Dispatch table construction
6483 6496
6484 6497
6485 void DispatchTableConstructor::VisitEnd(EndNode* that) { 6498 void DispatchTableConstructor::VisitEnd(EndNode* that) {
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
6623 } 6636 }
6624 6637
6625 6638
6626 RegExpEngine::CompilationResult RegExpEngine::Compile( 6639 RegExpEngine::CompilationResult RegExpEngine::Compile(
6627 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6640 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6628 JSRegExp::Flags flags, Handle<String> pattern, 6641 JSRegExp::Flags flags, Handle<String> pattern,
6629 Handle<String> sample_subject, bool is_one_byte) { 6642 Handle<String> sample_subject, bool is_one_byte) {
6630 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6643 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6631 return IrregexpRegExpTooBig(isolate); 6644 return IrregexpRegExpTooBig(isolate);
6632 } 6645 }
6633 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6634 bool is_sticky = flags & JSRegExp::kSticky; 6646 bool is_sticky = flags & JSRegExp::kSticky;
6635 bool is_global = flags & JSRegExp::kGlobal; 6647 bool is_global = flags & JSRegExp::kGlobal;
6636 bool is_unicode = flags & JSRegExp::kUnicode; 6648 bool is_unicode = flags & JSRegExp::kUnicode;
6637 RegExpCompiler compiler(isolate, zone, data->capture_count, flags, 6649 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6638 is_one_byte); 6650 is_one_byte);
6639 6651
6640 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern)); 6652 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6641 6653
6642 // Sample some characters from the middle of the string. 6654 // Sample some characters from the middle of the string.
6643 static const int kSampleSize = 128; 6655 static const int kSampleSize = 128;
(...skipping 29 matching lines...) Expand all
6673 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); 6685 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6674 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 6686 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6675 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( 6687 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6676 new (zone) RegExpCharacterClass('*'), false, loop_node))); 6688 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6677 node = first_step_node; 6689 node = first_step_node;
6678 } else { 6690 } else {
6679 node = loop_node; 6691 node = loop_node;
6680 } 6692 }
6681 } 6693 }
6682 if (is_one_byte) { 6694 if (is_one_byte) {
6683 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6695 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);
6684 // Do it again to propagate the new nodes to places where they were not 6696 // Do it again to propagate the new nodes to places where they were not
6685 // put because they had not been calculated yet. 6697 // put because they had not been calculated yet.
6686 if (node != NULL) { 6698 if (node != NULL) {
6687 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6699 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);
6688 } 6700 }
6689 } else if (compiler.unicode() && (is_global || is_sticky)) { 6701 } else if (compiler.unicode() && (is_global || is_sticky)) {
6690 node = OptionallyStepBackToLeadSurrogate(&compiler, node); 6702 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6691 } 6703 }
6692 6704
6693 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6705 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6694 data->node = node; 6706 data->node = node;
6695 Analysis analysis(isolate, flags, is_one_byte); 6707 Analysis analysis(isolate, flags, is_one_byte);
6696 analysis.EnsureAnalyzed(node); 6708 analysis.EnsureAnalyzed(node);
6697 if (analysis.has_failed()) { 6709 if (analysis.has_failed()) {
(...skipping 177 matching lines...) Expand 10 before | Expand all | Expand 10 after
6875 6887
6876 6888
6877 void RegExpResultsCache::Clear(FixedArray* cache) { 6889 void RegExpResultsCache::Clear(FixedArray* cache) {
6878 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6890 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6879 cache->set(i, Smi::FromInt(0)); 6891 cache->set(i, Smi::FromInt(0));
6880 } 6892 }
6881 } 6893 }
6882 6894
6883 } // namespace internal 6895 } // namespace internal
6884 } // namespace v8 6896 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698