src/regexp/jsregexp.cc - Issue 1661483002: Revert of [regexp] implement /ui to mirror the implementation for /i.

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1661483002: Revert of [regexp] implement /ui to mirror the implementation for /i. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: add cause of revert as test case Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/jsregexp.h"	5 #include "src/regexp/jsregexp.h"

6	6

7 #include "src/ast/ast.h"	7 #include "src/ast/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 1580 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1591 macro_assembler->IfRegisterLT(guard->reg(),	1591 macro_assembler->IfRegisterLT(guard->reg(),

1592 guard->value(),	1592 guard->value(),

1593 trace->backtrack());	1593 trace->backtrack());

1594 break;	1594 break;

1595 }	1595 }

1596 }	1596 }

1597	1597

1598	1598

1599 // Returns the number of characters in the equivalence class, omitting those	1599 // Returns the number of characters in the equivalence class, omitting those

1600 // that cannot occur in the source string because it is Latin1.	1600 // that cannot occur in the source string because it is Latin1.

1601 static int GetCaseIndependentLetters(RegExpCompiler* compiler, uc16 character,	1601 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,

1602 uc32* letters) {	1602 bool one_byte_subject,

1603 int length;	1603 unibrow::uchar* letters) {

1604 #ifdef V8_I18N_SUPPORT	1604 int length =

1605 if (compiler->unicode()) {	1605 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);

1606 USet* set = uset_open(character, character);	1606 // Unibrow returns 0 or 1 for characters where case independence is

1607 uset_closeOver(set, USET_CASE_INSENSITIVE);	1607 // trivial.

1608 uset_removeAllStrings(set);	1608 if (length == 0) {

1609 length = uset_size(set);	1609 letters[0] = character;

1610 for (int i = 0; i < length; i++) {	1610 length = 1;

1611 letters[i] = uset_charAt(set, i);

1612 }

1613 uset_close(set);

1614 } else // NOLINT

1615 // Fallback in case ICU is not included.

1616 #endif // V8_I18N_SUPPORT

1617 {

1618 length = compiler->isolate()->jsregexp_uncanonicalize()->get(character,

1619 '\0', letters);

1620 // Unibrow returns 0 or 1 for characters where case independence is

1621 // trivial.

1622 if (length == 0) {

1623 letters[0] = character;

1624 length = 1;

1625 }

1626 }	1611 }

1627	1612

1628 if (compiler->one_byte()) {	1613 if (one_byte_subject) {

1629 int new_length = 0;	1614 int new_length = 0;

1630 for (int i = 0; i < length; i++) {	1615 for (int i = 0; i < length; i++) {

1631 if (letters[i] <= String::kMaxOneByteCharCode) {	1616 if (letters[i] <= String::kMaxOneByteCharCode) {

1632 letters[new_length++] = letters[i];	1617 letters[new_length++] = letters[i];

1633 }	1618 }

1634 }	1619 }

1635 length = new_length;	1620 length = new_length;

1636 }	1621 }

1637	1622

1638 return length;	1623 return length;

1639 }	1624 }

1640	1625

1641 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, uc16 c,	1626

1642 Label* on_failure, int cp_offset,	1627 static inline bool EmitSimpleCharacter(Isolate* isolate,

1643 bool check, bool preloaded) {	1628 RegExpCompiler* compiler,

	1629 uc16 c,

	1630 Label* on_failure,

	1631 int cp_offset,

	1632 bool check,

	1633 bool preloaded) {

1644 RegExpMacroAssembler* assembler = compiler->macro_assembler();	1634 RegExpMacroAssembler* assembler = compiler->macro_assembler();

1645 bool bound_checked = false;	1635 bool bound_checked = false;

1646 if (!preloaded) {	1636 if (!preloaded) {

1647 assembler->LoadCurrentCharacter(	1637 assembler->LoadCurrentCharacter(

1648 cp_offset,	1638 cp_offset,

1649 on_failure,	1639 on_failure,

1650 check);	1640 check);

1651 bound_checked = true;	1641 bound_checked = true;

1652 }	1642 }

1653 assembler->CheckNotCharacter(c, on_failure);	1643 assembler->CheckNotCharacter(c, on_failure);

1654 return bound_checked;	1644 return bound_checked;

1655 }	1645 }

1656	1646

1657	1647

1658 // Only emits non-letters (things that don't have case). Only used for case	1648 // Only emits non-letters (things that don't have case). Only used for case

1659 // independent matches.	1649 // independent matches.

1660 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, uc16 c,	1650 static inline bool EmitAtomNonLetter(Isolate* isolate,

1661 Label* on_failure, int cp_offset,	1651 RegExpCompiler* compiler,

1662 bool check, bool preloaded) {	1652 uc16 c,

	1653 Label* on_failure,

	1654 int cp_offset,

	1655 bool check,

	1656 bool preloaded) {

1663 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	1657 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1658 bool one_byte = compiler->one_byte();

1664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1659 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1665 int length = GetCaseIndependentLetters(compiler, c, chars);	1660 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);

1666 if (length < 1) {	1661 if (length < 1) {

1667 // This can't match. Must be an one-byte subject and a non-one-byte	1662 // This can't match. Must be an one-byte subject and a non-one-byte

1668 // character. We do not need to do anything since the one-byte pass	1663 // character. We do not need to do anything since the one-byte pass

1669 // already handled this.	1664 // already handled this.

1670 return false; // Bounds not checked.	1665 return false; // Bounds not checked.

1671 }	1666 }

1672 bool checked = false;	1667 bool checked = false;

1673 // We handle the length > 1 case in a later pass.	1668 // We handle the length > 1 case in a later pass.

1674 if (length == 1) {	1669 if (length == 1) {

1675 if (compiler->one_byte() && c > String::kMaxOneByteCharCodeU) {	1670 if (one_byte && c > String::kMaxOneByteCharCodeU) {

1676 // This cannot match.	1671 // Can't match - see above.

1677 return false; // Bounds not checked.	1672 return false; // Bounds not checked.

1678 }	1673 }

1679 if (!preloaded) {	1674 if (!preloaded) {

1680 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1675 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1681 checked = check;	1676 checked = check;

1682 }	1677 }

1683 macro_assembler->CheckNotCharacter(c, on_failure);	1678 macro_assembler->CheckNotCharacter(c, on_failure);

1684 }	1679 }

1685 return checked;	1680 return checked;

1686 }	1681 }

(...skipping 28 matching lines...) Expand all Loading...
1715 uc16 mask = char_mask ^ diff;	1710 uc16 mask = char_mask ^ diff;

1716 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,	1711 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,

1717 diff,	1712 diff,

1718 mask,	1713 mask,

1719 on_failure);	1714 on_failure);

1720 return true;	1715 return true;

1721 }	1716 }

1722 return false;	1717 return false;

1723 }	1718 }

1724	1719

1725 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, uc16 c,	1720

1726 Label* on_failure, int cp_offset, bool check,	1721 typedef bool EmitCharacterFunction(Isolate* isolate,

	1722 RegExpCompiler* compiler,

	1723 uc16 c,

	1724 Label* on_failure,

	1725 int cp_offset,

	1726 bool check,

1727 bool preloaded);	1727 bool preloaded);

1728	1728

1729 // Only emits letters (things that have case). Only used for case independent	1729 // Only emits letters (things that have case). Only used for case independent

1730 // matches.	1730 // matches.

1731 static inline bool EmitAtomLetter(RegExpCompiler* compiler, uc16 c,	1731 static inline bool EmitAtomLetter(Isolate* isolate,

1732 Label* on_failure, int cp_offset, bool check,	1732 RegExpCompiler* compiler,

	1733 uc16 c,

	1734 Label* on_failure,

	1735 int cp_offset,

	1736 bool check,

1733 bool preloaded) {	1737 bool preloaded) {

1734 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	1738 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	1739 bool one_byte = compiler->one_byte();

1735 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1740 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1736 int length = GetCaseIndependentLetters(compiler, c, chars);	1741 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);

1737 if (length <= 1) return false;	1742 if (length <= 1) return false;

1738 // We may not need to check against the end of the input string	1743 // We may not need to check against the end of the input string

1739 // if this character lies before a character that matched.	1744 // if this character lies before a character that matched.

1740 if (!preloaded) {	1745 if (!preloaded) {

1741 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1746 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1742 }	1747 }

1743 Label ok;	1748 Label ok;

1744 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);	1749 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

1745 switch (length) {	1750 switch (length) {

1746 case 2: {	1751 case 2: {

1747 if (ShortCutEmitCharacterPair(macro_assembler, compiler->one_byte(),	1752 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],

1748 chars[0], chars[1], on_failure)) {	1753 chars[1], on_failure)) {

1749 } else {	1754 } else {

1750 macro_assembler->CheckCharacter(chars[0], &ok);	1755 macro_assembler->CheckCharacter(chars[0], &ok);

1751 macro_assembler->CheckNotCharacter(chars[1], on_failure);	1756 macro_assembler->CheckNotCharacter(chars[1], on_failure);

1752 macro_assembler->Bind(&ok);	1757 macro_assembler->Bind(&ok);

1753 }	1758 }

1754 break;	1759 break;

1755 }	1760 }

1756 case 4:	1761 case 4:

1757 macro_assembler->CheckCharacter(chars[3], &ok);	1762 macro_assembler->CheckCharacter(chars[3], &ok);

1758 // Fall through!	1763 // Fall through!

(...skipping 516 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2275 int ActionNode::EatsAtLeast(int still_to_find,	2280 int ActionNode::EatsAtLeast(int still_to_find,

2276 int budget,	2281 int budget,

2277 bool not_at_start) {	2282 bool not_at_start) {

2278 if (budget <= 0) return 0;	2283 if (budget <= 0) return 0;

2279 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!	2284 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!

2280 return on_success()->EatsAtLeast(still_to_find,	2285 return on_success()->EatsAtLeast(still_to_find,

2281 budget - 1,	2286 budget - 1,

2282 not_at_start);	2287 not_at_start);

2283 }	2288 }

2284	2289

2285 void ActionNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,	2290

	2291 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

2286 BoyerMooreLookahead* bm, bool not_at_start) {	2292 BoyerMooreLookahead* bm, bool not_at_start) {

2287 if (action_type_ == BEGIN_SUBMATCH) {	2293 if (action_type_ == BEGIN_SUBMATCH) {

2288 bm->SetRest(offset);	2294 bm->SetRest(offset);

2289 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {	2295 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {

2290 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);	2296 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);

2291 }	2297 }

2292 SaveBMInfo(bm, not_at_start, offset);	2298 SaveBMInfo(bm, not_at_start, offset);

2293 }	2299 }

2294	2300

2295	2301

2296 int AssertionNode::EatsAtLeast(int still_to_find,	2302 int AssertionNode::EatsAtLeast(int still_to_find,

2297 int budget,	2303 int budget,

2298 bool not_at_start) {	2304 bool not_at_start) {

2299 if (budget <= 0) return 0;	2305 if (budget <= 0) return 0;

2300 // If we know we are not at the start and we are asked "how many characters	2306 // If we know we are not at the start and we are asked "how many characters

2301 // will you match if you succeed?" then we can answer anything since false	2307 // will you match if you succeed?" then we can answer anything since false

2302 // implies false. So lets just return the max answer (still_to_find) since	2308 // implies false. So lets just return the max answer (still_to_find) since

2303 // that won't prevent us from preloading a lot of characters for the other	2309 // that won't prevent us from preloading a lot of characters for the other

2304 // branches in the node graph.	2310 // branches in the node graph.

2305 if (assertion_type() == AT_START && not_at_start) return still_to_find;	2311 if (assertion_type() == AT_START && not_at_start) return still_to_find;

2306 return on_success()->EatsAtLeast(still_to_find,	2312 return on_success()->EatsAtLeast(still_to_find,

2307 budget - 1,	2313 budget - 1,

2308 not_at_start);	2314 not_at_start);

2309 }	2315 }

2310	2316

2311 void AssertionNode::FillInBMInfo(RegExpCompiler* compiler, int offset,	2317

2312 int budget, BoyerMooreLookahead* bm,	2318 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

2313 bool not_at_start) {	2319 BoyerMooreLookahead* bm, bool not_at_start) {

2314 // Match the behaviour of EatsAtLeast on this node.	2320 // Match the behaviour of EatsAtLeast on this node.

2315 if (assertion_type() == AT_START && not_at_start) return;	2321 if (assertion_type() == AT_START && not_at_start) return;

2316 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);	2322 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);

2317 SaveBMInfo(bm, not_at_start, offset);	2323 SaveBMInfo(bm, not_at_start, offset);

2318 }	2324 }

2319	2325

2320	2326

2321 int BackReferenceNode::EatsAtLeast(int still_to_find,	2327 int BackReferenceNode::EatsAtLeast(int still_to_find,

2322 int budget,	2328 int budget,

2323 bool not_at_start) {	2329 bool not_at_start) {

2324 if (read_backward()) return 0;	2330 if (read_backward()) return 0;

2325 if (budget <= 0) return 0;	2331 if (budget <= 0) return 0;

2326 return on_success()->EatsAtLeast(still_to_find,	2332 return on_success()->EatsAtLeast(still_to_find,

(...skipping 193 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2520 // The masks and values for the positions will be combined into a single	2526 // The masks and values for the positions will be combined into a single

2521 // machine word for the current character width in order to be used in	2527 // machine word for the current character width in order to be used in

2522 // generating a quick check.	2528 // generating a quick check.

2523 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,	2529 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,

2524 RegExpCompiler* compiler,	2530 RegExpCompiler* compiler,

2525 int characters_filled_in,	2531 int characters_filled_in,

2526 bool not_at_start) {	2532 bool not_at_start) {

2527 // Do not collect any quick check details if the text node reads backward,	2533 // Do not collect any quick check details if the text node reads backward,

2528 // since it reads in the opposite direction than we use for quick checks.	2534 // since it reads in the opposite direction than we use for quick checks.

2529 if (read_backward()) return;	2535 if (read_backward()) return;

	2536 Isolate* isolate = compiler->macro_assembler()->isolate();

2530 DCHECK(characters_filled_in < details->characters());	2537 DCHECK(characters_filled_in < details->characters());

2531 int characters = details->characters();	2538 int characters = details->characters();

2532 int char_mask;	2539 int char_mask;

2533 if (compiler->one_byte()) {	2540 if (compiler->one_byte()) {

2534 char_mask = String::kMaxOneByteCharCode;	2541 char_mask = String::kMaxOneByteCharCode;

2535 } else {	2542 } else {

2536 char_mask = String::kMaxUtf16CodeUnit;	2543 char_mask = String::kMaxUtf16CodeUnit;

2537 }	2544 }

2538 for (int k = 0; k < elements()->length(); k++) {	2545 for (int k = 0; k < elements()->length(); k++) {

2539 TextElement elm = elements()->at(k);	2546 TextElement elm = elements()->at(k);

2540 if (elm.text_type() == TextElement::ATOM) {	2547 if (elm.text_type() == TextElement::ATOM) {

2541 Vector<const uc16> quarks = elm.atom()->data();	2548 Vector<const uc16> quarks = elm.atom()->data();

2542 for (int i = 0; i < characters && i < quarks.length(); i++) {	2549 for (int i = 0; i < characters && i < quarks.length(); i++) {

2543 QuickCheckDetails::Position* pos =	2550 QuickCheckDetails::Position* pos =

2544 details->positions(characters_filled_in);	2551 details->positions(characters_filled_in);

2545 uc16 c = quarks[i];	2552 uc16 c = quarks[i];

2546 if (compiler->ignore_case()) {	2553 if (compiler->ignore_case()) {

2547 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	2554 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

2548 int length = GetCaseIndependentLetters(compiler, c, chars);	2555 int length = GetCaseIndependentLetters(isolate, c,

	2556 compiler->one_byte(), chars);

2549 if (length == 0) {	2557 if (length == 0) {

2550 // This can happen because all case variants are non-Latin1, but we	2558 // This can happen because all case variants are non-Latin1, but we

2551 // know the input is Latin1.	2559 // know the input is Latin1.

2552 details->set_cannot_match();	2560 details->set_cannot_match();

2553 pos->determines_perfectly = false;	2561 pos->determines_perfectly = false;

2554 return;	2562 return;

2555 }	2563 }

2556 if (length == 1) {	2564 if (length == 1) {

2557 // This letter has no case equivalents, so it's nice and simple	2565 // This letter has no case equivalents, so it's nice and simple

2558 // and the mask-compare will determine definitely whether we have	2566 // and the mask-compare will determine definitely whether we have

(...skipping 184 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2743 DCHECK(!info->visited);	2751 DCHECK(!info->visited);

2744 info->visited = true;	2752 info->visited = true;

2745 }	2753 }

2746 ~VisitMarker() {	2754 ~VisitMarker() {

2747 info_->visited = false;	2755 info_->visited = false;

2748 }	2756 }

2749 private:	2757 private:

2750 NodeInfo* info_;	2758 NodeInfo* info_;

2751 };	2759 };

2752	2760

2753 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) {	2761

	2762 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {

2754 if (info()->replacement_calculated) return replacement();	2763 if (info()->replacement_calculated) return replacement();

2755 if (depth < 0) return this;	2764 if (depth < 0) return this;

2756 DCHECK(!info()->visited);	2765 DCHECK(!info()->visited);

2757 VisitMarker marker(info());	2766 VisitMarker marker(info());

2758 return FilterSuccessor(depth - 1, compiler);	2767 return FilterSuccessor(depth - 1, ignore_case);

2759 }	2768 }

2760	2769

2761 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth,	2770

2762 RegExpCompiler* compiler) {	2771 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {

2763 RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler);	2772 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);

2764 if (next == NULL) return set_replacement(NULL);	2773 if (next == NULL) return set_replacement(NULL);

2765 on_success_ = next;	2774 on_success_ = next;

2766 return set_replacement(this);	2775 return set_replacement(this);

2767 }	2776 }

2768	2777

2769	2778

2770 // We need to check for the following characters: 0x39c 0x3bc 0x178.	2779 // We need to check for the following characters: 0x39c 0x3bc 0x178.

2771 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {	2780 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {

2772 // TODO(dcarney): this could be a lot more efficient.	2781 // TODO(dcarney): this could be a lot more efficient.

2773 return range.Contains(0x39c) \|\|	2782 return range.Contains(0x39c) \|\|

2774 range.Contains(0x3bc) \|\| range.Contains(0x178);	2783 range.Contains(0x3bc) \|\| range.Contains(0x178);

2775 }	2784 }

2776	2785

2777	2786

2778 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {	2787 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {

2779 for (int i = 0; i < ranges->length(); i++) {	2788 for (int i = 0; i < ranges->length(); i++) {

2780 // TODO(dcarney): this could be a lot more efficient.	2789 // TODO(dcarney): this could be a lot more efficient.

2781 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;	2790 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;

2782 }	2791 }

2783 return false;	2792 return false;

2784 }	2793 }

2785	2794

2786 static uc16 ConvertNonLatin1ToEquivalentLatin1(bool unicode, uc16 c) {

2787 #ifdef V8_I18N_SUPPORT

2788 if (unicode) {

2789 USet* set = uset_open(c, c);

2790 uset_closeOver(set, USET_CASE_INSENSITIVE);

2791 uset_removeAllStrings(set);

2792 int length = uset_size(set);

2793 uc16 result = 0;

2794 for (int i = 0; i < length; i++) {

2795 uc32 c = uset_charAt(set, i);

2796 if (c <= String::kMaxOneByteCharCode) {

2797 result = static_cast<uc16>(c);

2798 break;

2799 }

2800 }

2801 uset_close(set);

2802 return result;

2803 }

2804 // Fallback to unibrow if ICU is not included.

2805 #endif // V8_I18N_SUPPORT

2806 return unibrow::Latin1::ConvertNonLatin1ToLatin1(c);

2807 }

2808	2795

2809 RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) {	2796 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {

2810 if (info()->replacement_calculated) return replacement();	2797 if (info()->replacement_calculated) return replacement();

2811 if (depth < 0) return this;	2798 if (depth < 0) return this;

2812 DCHECK(!info()->visited);	2799 DCHECK(!info()->visited);

2813 VisitMarker marker(info());	2800 VisitMarker marker(info());

2814 int element_count = elements()->length();	2801 int element_count = elements()->length();

2815 for (int i = 0; i < element_count; i++) {	2802 for (int i = 0; i < element_count; i++) {

2816 TextElement elm = elements()->at(i);	2803 TextElement elm = elements()->at(i);

2817 if (elm.text_type() == TextElement::ATOM) {	2804 if (elm.text_type() == TextElement::ATOM) {

2818 Vector<const uc16> quarks = elm.atom()->data();	2805 Vector<const uc16> quarks = elm.atom()->data();

2819 for (int j = 0; j < quarks.length(); j++) {	2806 for (int j = 0; j < quarks.length(); j++) {

2820 uc16 c = quarks[j];	2807 uint16_t c = quarks[j];

2821 if (c <= String::kMaxOneByteCharCode) continue;	2808 if (c <= String::kMaxOneByteCharCode) continue;

2822 if (!compiler->ignore_case()) return set_replacement(NULL);	2809 if (!ignore_case) return set_replacement(NULL);

2823 // Here, we need to check for characters whose upper and lower cases	2810 // Here, we need to check for characters whose upper and lower cases

2824 // are outside the Latin-1 range.	2811 // are outside the Latin-1 range.

2825 uc16 converted =	2812 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);

2826 ConvertNonLatin1ToEquivalentLatin1(compiler->unicode(), c);

2827 // Character is outside Latin-1 completely	2813 // Character is outside Latin-1 completely

2828 if (converted == 0) return set_replacement(NULL);	2814 if (converted == 0) return set_replacement(NULL);

2829 // Convert quark to Latin-1 in place.	2815 // Convert quark to Latin-1 in place.

2830 uc16* copy = const_cast<uc16*>(quarks.start());	2816 uint16_t* copy = const_cast<uint16_t*>(quarks.start());

2831 copy[j] = converted;	2817 copy[j] = converted;

2832 }	2818 }

2833 } else {	2819 } else {

2834 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);	2820 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);

2835 RegExpCharacterClass* cc = elm.char_class();	2821 RegExpCharacterClass* cc = elm.char_class();

2836 ZoneList<CharacterRange>* ranges = cc->ranges(zone());	2822 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

2837 CharacterRange::Canonicalize(ranges);	2823 CharacterRange::Canonicalize(ranges);

2838 // Now they are in order so we only need to look at the first.	2824 // Now they are in order so we only need to look at the first.

2839 int range_count = ranges->length();	2825 int range_count = ranges->length();

2840 if (cc->is_negated()) {	2826 if (cc->is_negated()) {

2841 if (range_count != 0 &&	2827 if (range_count != 0 &&

2842 ranges->at(0).from() == 0 &&	2828 ranges->at(0).from() == 0 &&

2843 ranges->at(0).to() >= String::kMaxOneByteCharCode) {	2829 ranges->at(0).to() >= String::kMaxOneByteCharCode) {

2844 // This will be handled in a later filter.	2830 // This will be handled in a later filter.

2845 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))	2831 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

2846 continue;

2847 return set_replacement(NULL);	2832 return set_replacement(NULL);

2848 }	2833 }

2849 } else {	2834 } else {

2850 if (range_count == 0 \|\|	2835 if (range_count == 0 \|\|

2851 ranges->at(0).from() > String::kMaxOneByteCharCode) {	2836 ranges->at(0).from() > String::kMaxOneByteCharCode) {

2852 // This will be handled in a later filter.	2837 // This will be handled in a later filter.

2853 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))	2838 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

2854 continue;

2855 return set_replacement(NULL);	2839 return set_replacement(NULL);

2856 }	2840 }

2857 }	2841 }

2858 }	2842 }

2859 }	2843 }

2860 return FilterSuccessor(depth - 1, compiler);	2844 return FilterSuccessor(depth - 1, ignore_case);

2861 }	2845 }

2862	2846

2863 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {	2847

	2848 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {

2864 if (info()->replacement_calculated) return replacement();	2849 if (info()->replacement_calculated) return replacement();

2865 if (depth < 0) return this;	2850 if (depth < 0) return this;

2866 if (info()->visited) return this;	2851 if (info()->visited) return this;

2867 {	2852 {

2868 VisitMarker marker(info());	2853 VisitMarker marker(info());

2869	2854

2870 RegExpNode* continue_replacement =	2855 RegExpNode* continue_replacement =

2871 continue_node_->FilterOneByte(depth - 1, compiler);	2856 continue_node_->FilterOneByte(depth - 1, ignore_case);

2872 // If we can't continue after the loop then there is no sense in doing the	2857 // If we can't continue after the loop then there is no sense in doing the

2873 // loop.	2858 // loop.

2874 if (continue_replacement == NULL) return set_replacement(NULL);	2859 if (continue_replacement == NULL) return set_replacement(NULL);

2875 }	2860 }

2876	2861

2877 return ChoiceNode::FilterOneByte(depth - 1, compiler);	2862 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);

2878 }	2863 }

2879	2864

2880 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {	2865

	2866 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {

2881 if (info()->replacement_calculated) return replacement();	2867 if (info()->replacement_calculated) return replacement();

2882 if (depth < 0) return this;	2868 if (depth < 0) return this;

2883 if (info()->visited) return this;	2869 if (info()->visited) return this;

2884 VisitMarker marker(info());	2870 VisitMarker marker(info());

2885 int choice_count = alternatives_->length();	2871 int choice_count = alternatives_->length();

2886	2872

2887 for (int i = 0; i < choice_count; i++) {	2873 for (int i = 0; i < choice_count; i++) {

2888 GuardedAlternative alternative = alternatives_->at(i);	2874 GuardedAlternative alternative = alternatives_->at(i);

2889 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {	2875 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {

2890 set_replacement(this);	2876 set_replacement(this);

2891 return this;	2877 return this;

2892 }	2878 }

2893 }	2879 }

2894	2880

2895 int surviving = 0;	2881 int surviving = 0;

2896 RegExpNode* survivor = NULL;	2882 RegExpNode* survivor = NULL;

2897 for (int i = 0; i < choice_count; i++) {	2883 for (int i = 0; i < choice_count; i++) {

2898 GuardedAlternative alternative = alternatives_->at(i);	2884 GuardedAlternative alternative = alternatives_->at(i);

2899 RegExpNode* replacement =	2885 RegExpNode* replacement =

2900 alternative.node()->FilterOneByte(depth - 1, compiler);	2886 alternative.node()->FilterOneByte(depth - 1, ignore_case);

2901 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.	2887 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.

2902 if (replacement != NULL) {	2888 if (replacement != NULL) {

2903 alternatives_->at(i).set_node(replacement);	2889 alternatives_->at(i).set_node(replacement);

2904 surviving++;	2890 surviving++;

2905 survivor = replacement;	2891 survivor = replacement;

2906 }	2892 }

2907 }	2893 }

2908 if (surviving < 2) return set_replacement(survivor);	2894 if (surviving < 2) return set_replacement(survivor);

2909	2895

2910 set_replacement(this);	2896 set_replacement(this);

2911 if (surviving == choice_count) {	2897 if (surviving == choice_count) {

2912 return this;	2898 return this;

2913 }	2899 }

2914 // Only some of the nodes survived the filtering. We need to rebuild the	2900 // Only some of the nodes survived the filtering. We need to rebuild the

2915 // alternatives list.	2901 // alternatives list.

2916 ZoneList<GuardedAlternative>* new_alternatives =	2902 ZoneList<GuardedAlternative>* new_alternatives =

2917 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());	2903 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());

2918 for (int i = 0; i < choice_count; i++) {	2904 for (int i = 0; i < choice_count; i++) {

2919 RegExpNode* replacement =	2905 RegExpNode* replacement =

2920 alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler);	2906 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);

2921 if (replacement != NULL) {	2907 if (replacement != NULL) {

2922 alternatives_->at(i).set_node(replacement);	2908 alternatives_->at(i).set_node(replacement);

2923 new_alternatives->Add(alternatives_->at(i), zone());	2909 new_alternatives->Add(alternatives_->at(i), zone());

2924 }	2910 }

2925 }	2911 }

2926 alternatives_ = new_alternatives;	2912 alternatives_ = new_alternatives;

2927 return this;	2913 return this;

2928 }	2914 }

2929	2915

2930 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(	2916

2931 int depth, RegExpCompiler* compiler) {	2917 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,

	2918 bool ignore_case) {

2932 if (info()->replacement_calculated) return replacement();	2919 if (info()->replacement_calculated) return replacement();

2933 if (depth < 0) return this;	2920 if (depth < 0) return this;

2934 if (info()->visited) return this;	2921 if (info()->visited) return this;

2935 VisitMarker marker(info());	2922 VisitMarker marker(info());

2936 // Alternative 0 is the negative lookahead, alternative 1 is what comes	2923 // Alternative 0 is the negative lookahead, alternative 1 is what comes

2937 // afterwards.	2924 // afterwards.

2938 RegExpNode* node = alternatives_->at(1).node();	2925 RegExpNode* node = alternatives_->at(1).node();

2939 RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler);	2926 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);

2940 if (replacement == NULL) return set_replacement(NULL);	2927 if (replacement == NULL) return set_replacement(NULL);

2941 alternatives_->at(1).set_node(replacement);	2928 alternatives_->at(1).set_node(replacement);

2942	2929

2943 RegExpNode* neg_node = alternatives_->at(0).node();	2930 RegExpNode* neg_node = alternatives_->at(0).node();

2944 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler);	2931 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);

2945 // If the negative lookahead is always going to fail then	2932 // If the negative lookahead is always going to fail then

2946 // we don't need to check it.	2933 // we don't need to check it.

2947 if (neg_replacement == NULL) return set_replacement(replacement);	2934 if (neg_replacement == NULL) return set_replacement(replacement);

2948 alternatives_->at(0).set_node(neg_replacement);	2935 alternatives_->at(0).set_node(neg_replacement);

2949 return set_replacement(this);	2936 return set_replacement(this);

2950 }	2937 }

2951	2938

2952	2939

2953 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	2940 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2954 RegExpCompiler* compiler,	2941 RegExpCompiler* compiler,

2955 int characters_filled_in,	2942 int characters_filled_in,

2956 bool not_at_start) {	2943 bool not_at_start) {

2957 if (body_can_be_zero_length_ \|\| info()->visited) return;	2944 if (body_can_be_zero_length_ \|\| info()->visited) return;

2958 VisitMarker marker(info());	2945 VisitMarker marker(info());

2959 return ChoiceNode::GetQuickCheckDetails(details,	2946 return ChoiceNode::GetQuickCheckDetails(details,

2960 compiler,	2947 compiler,

2961 characters_filled_in,	2948 characters_filled_in,

2962 not_at_start);	2949 not_at_start);

2963 }	2950 }

2964	2951

2965 void LoopChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,	2952

2966 int budget, BoyerMooreLookahead* bm,	2953 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

2967 bool not_at_start) {	2954 BoyerMooreLookahead* bm, bool not_at_start) {

2968 if (body_can_be_zero_length_ \|\| budget <= 0) {	2955 if (body_can_be_zero_length_ \|\| budget <= 0) {

2969 bm->SetRest(offset);	2956 bm->SetRest(offset);

2970 SaveBMInfo(bm, not_at_start, offset);	2957 SaveBMInfo(bm, not_at_start, offset);

2971 return;	2958 return;

2972 }	2959 }

2973 ChoiceNode::FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);	2960 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);

2974 SaveBMInfo(bm, not_at_start, offset);	2961 SaveBMInfo(bm, not_at_start, offset);

2975 }	2962 }

2976	2963

2977	2964

2978 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	2965 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2979 RegExpCompiler* compiler,	2966 RegExpCompiler* compiler,

2980 int characters_filled_in,	2967 int characters_filled_in,

2981 bool not_at_start) {	2968 bool not_at_start) {

2982 not_at_start = (not_at_start \|\| not_at_start_);	2969 not_at_start = (not_at_start \|\| not_at_start_);

2983 int choice_count = alternatives_->length();	2970 int choice_count = alternatives_->length();

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3055 assembler->CheckNotCharacter('\r', new_trace.backtrack());	3042 assembler->CheckNotCharacter('\r', new_trace.backtrack());

3056 }	3043 }

3057 assembler->Bind(&ok);	3044 assembler->Bind(&ok);

3058 on_success->Emit(compiler, &new_trace);	3045 on_success->Emit(compiler, &new_trace);

3059 }	3046 }

3060	3047

3061	3048

3062 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).	3049 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).

3063 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {	3050 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {

3064 RegExpMacroAssembler* assembler = compiler->macro_assembler();	3051 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	3052 Isolate* isolate = assembler->isolate();

3065 Trace::TriBool next_is_word_character = Trace::UNKNOWN;	3053 Trace::TriBool next_is_word_character = Trace::UNKNOWN;

3066 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);	3054 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);

3067 BoyerMooreLookahead* lookahead = bm_info(not_at_start);	3055 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

3068 if (lookahead == NULL) {	3056 if (lookahead == NULL) {

3069 int eats_at_least =	3057 int eats_at_least =

3070 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,	3058 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,

3071 kRecursionBudget,	3059 kRecursionBudget,

3072 not_at_start));	3060 not_at_start));

3073 if (eats_at_least >= 1) {	3061 if (eats_at_least >= 1) {

3074 BoyerMooreLookahead* bm =	3062 BoyerMooreLookahead* bm =

3075 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());	3063 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());

3076 FillInBMInfo(compiler, 0, kRecursionBudget, bm, not_at_start);	3064 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);

3077 if (bm->at(0)->is_non_word())	3065 if (bm->at(0)->is_non_word())

3078 next_is_word_character = Trace::FALSE_VALUE;	3066 next_is_word_character = Trace::FALSE_VALUE;

3079 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;	3067 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;

3080 }	3068 }

3081 } else {	3069 } else {

3082 if (lookahead->at(0)->is_non_word())	3070 if (lookahead->at(0)->is_non_word())

3083 next_is_word_character = Trace::FALSE_VALUE;	3071 next_is_word_character = Trace::FALSE_VALUE;

3084 if (lookahead->at(0)->is_word())	3072 if (lookahead->at(0)->is_word())

3085 next_is_word_character = Trace::TRUE_VALUE;	3073 next_is_word_character = Trace::TRUE_VALUE;

3086 }	3074 }

(...skipping 151 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3238 // up to the limit the quick check already checked. In addition the quick	3226 // up to the limit the quick check already checked. In addition the quick

3239 // check can have involved a mask and compare operation which may simplify	3227 // check can have involved a mask and compare operation which may simplify

3240 // or obviate the need for further checks at some character positions.	3228 // or obviate the need for further checks at some character positions.

3241 void TextNode::TextEmitPass(RegExpCompiler* compiler,	3229 void TextNode::TextEmitPass(RegExpCompiler* compiler,

3242 TextEmitPassType pass,	3230 TextEmitPassType pass,

3243 bool preloaded,	3231 bool preloaded,

3244 Trace* trace,	3232 Trace* trace,

3245 bool first_element_checked,	3233 bool first_element_checked,

3246 int* checked_up_to) {	3234 int* checked_up_to) {

3247 RegExpMacroAssembler* assembler = compiler->macro_assembler();	3235 RegExpMacroAssembler* assembler = compiler->macro_assembler();

	3236 Isolate* isolate = assembler->isolate();

3248 bool one_byte = compiler->one_byte();	3237 bool one_byte = compiler->one_byte();

3249 Label* backtrack = trace->backtrack();	3238 Label* backtrack = trace->backtrack();

3250 QuickCheckDetails* quick_check = trace->quick_check_performed();	3239 QuickCheckDetails* quick_check = trace->quick_check_performed();

3251 int element_count = elements()->length();	3240 int element_count = elements()->length();

3252 int backward_offset = read_backward() ? -Length() : 0;	3241 int backward_offset = read_backward() ? -Length() : 0;

3253 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {	3242 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {

3254 TextElement elm = elements()->at(i);	3243 TextElement elm = elements()->at(i);

3255 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;	3244 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;

3256 if (elm.text_type() == TextElement::ATOM) {	3245 if (elm.text_type() == TextElement::ATOM) {

3257 Vector<const uc16> quarks = elm.atom()->data();	3246 Vector<const uc16> quarks = elm.atom()->data();

3258 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {	3247 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {

3259 if (first_element_checked && i == 0 && j == 0) continue;	3248 if (first_element_checked && i == 0 && j == 0) continue;

3260 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;	3249 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;

3261 EmitCharacterFunction* emit_function = NULL;	3250 EmitCharacterFunction* emit_function = NULL;

3262 switch (pass) {	3251 switch (pass) {

3263 case NON_LATIN1_MATCH:	3252 case NON_LATIN1_MATCH:

3264 DCHECK(one_byte);	3253 DCHECK(one_byte);

3265 DCHECK(!(compiler->unicode() && compiler->ignore_case()));

3266 if (quarks[j] > String::kMaxOneByteCharCode) {	3254 if (quarks[j] > String::kMaxOneByteCharCode) {

3267 assembler->GoTo(backtrack);	3255 assembler->GoTo(backtrack);

3268 return;	3256 return;

3269 }	3257 }

3270 break;	3258 break;

3271 case NON_LETTER_CHARACTER_MATCH:	3259 case NON_LETTER_CHARACTER_MATCH:

3272 emit_function = &EmitAtomNonLetter;	3260 emit_function = &EmitAtomNonLetter;

3273 break;	3261 break;

3274 case SIMPLE_CHARACTER_MATCH:	3262 case SIMPLE_CHARACTER_MATCH:

3275 emit_function = &EmitSimpleCharacter;	3263 emit_function = &EmitSimpleCharacter;

3276 break;	3264 break;

3277 case CASE_CHARACTER_MATCH:	3265 case CASE_CHARACTER_MATCH:

3278 emit_function = &EmitAtomLetter;	3266 emit_function = &EmitAtomLetter;

3279 break;	3267 break;

3280 default:	3268 default:

3281 break;	3269 break;

3282 }	3270 }

3283 if (emit_function != NULL) {	3271 if (emit_function != NULL) {

3284 bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();	3272 bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();

3285 bool bound_checked =	3273 bool bound_checked =

3286 emit_function(compiler, quarks[j], backtrack, cp_offset + j,	3274 emit_function(isolate, compiler, quarks[j], backtrack,

3287 bounds_check, preloaded);	3275 cp_offset + j, bounds_check, preloaded);

3288 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);	3276 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);

3289 }	3277 }

3290 }	3278 }

3291 } else {	3279 } else {

3292 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());	3280 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());

3293 if (pass == CHARACTER_CLASS_MATCH) {	3281 if (pass == CHARACTER_CLASS_MATCH) {

3294 if (first_element_checked && i == 0) continue;	3282 if (first_element_checked && i == 0) continue;

3295 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;	3283 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;

3296 RegExpCharacterClass* cc = elm.char_class();	3284 RegExpCharacterClass* cc = elm.char_class();

3297 bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();	3285 bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3360 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {	3348 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {

3361 LimitResult limit_result = LimitVersions(compiler, trace);	3349 LimitResult limit_result = LimitVersions(compiler, trace);

3362 if (limit_result == DONE) return;	3350 if (limit_result == DONE) return;

3363 DCHECK(limit_result == CONTINUE);	3351 DCHECK(limit_result == CONTINUE);

3364	3352

3365 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {	3353 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {

3366 compiler->SetRegExpTooBig();	3354 compiler->SetRegExpTooBig();

3367 return;	3355 return;

3368 }	3356 }

3369	3357

3370 if (compiler->one_byte() &&	3358 if (compiler->one_byte()) {

3371 !(compiler->unicode() && compiler->ignore_case())) {

3372 // If any character within the text node is outside the Latin1 range, it

3373 // cannot possibly match anything in a one-byte string. This still holds

3374 // for case-insensitive non-unicode regexp patterns. However, for

3375 // case-insensitive unicode regexp patterns, this is no longer true, e.g.

3376 // /\u212b/ui matches "\u00c5".

3377 int dummy = 0;	3359 int dummy = 0;

3378 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);	3360 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);

3379 }	3361 }

3380	3362

3381 bool first_elt_done = false;	3363 bool first_elt_done = false;

3382 int bound_checked_to = trace->cp_offset() - 1;	3364 int bound_checked_to = trace->cp_offset() - 1;

3383 bound_checked_to += trace->bound_checked_up_to();	3365 bound_checked_to += trace->bound_checked_up_to();

3384	3366

3385 // If a character is preloaded into the current character register then	3367 // If a character is preloaded into the current character register then

3386 // check that now.	3368 // check that now.

(...skipping 731 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4118	4100

4119 // Really we should be creating a new trace when we execute this function,	4101 // Really we should be creating a new trace when we execute this function,

4120 // but there is no need, because the code it generates cannot backtrack, and	4102 // but there is no need, because the code it generates cannot backtrack, and

4121 // we always arrive here with a trivial trace (since it's the entry to a	4103 // we always arrive here with a trivial trace (since it's the entry to a

4122 // loop. That also implies that there are no preloaded characters, which is	4104 // loop. That also implies that there are no preloaded characters, which is

4123 // good, because it means we won't be violating any assumptions by	4105 // good, because it means we won't be violating any assumptions by

4124 // overwriting those characters with new load instructions.	4106 // overwriting those characters with new load instructions.

4125 DCHECK(trace->is_trivial());	4107 DCHECK(trace->is_trivial());

4126	4108

4127 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	4109 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

	4110 Isolate* isolate = macro_assembler->isolate();

4128 // At this point we know that we are at a non-greedy loop that will eat	4111 // At this point we know that we are at a non-greedy loop that will eat

4129 // any character one at a time. Any non-anchored regexp has such a	4112 // any character one at a time. Any non-anchored regexp has such a

4130 // loop prepended to it in order to find where it starts. We look for	4113 // loop prepended to it in order to find where it starts. We look for

4131 // a pattern of the form ...abc... where we can look 6 characters ahead	4114 // a pattern of the form ...abc... where we can look 6 characters ahead

4132 // and step forwards 3 if the character is not one of abc. Abc need	4115 // and step forwards 3 if the character is not one of abc. Abc need

4133 // not be atoms, they can be any reasonably limited character class or	4116 // not be atoms, they can be any reasonably limited character class or

4134 // small alternation.	4117 // small alternation.

4135 BoyerMooreLookahead* bm = bm_info(false);	4118 BoyerMooreLookahead* bm = bm_info(false);

4136 if (bm == NULL) {	4119 if (bm == NULL) {

4137 eats_at_least = Min(kMaxLookaheadForBoyerMoore,	4120 eats_at_least = Min(kMaxLookaheadForBoyerMoore,

4138 EatsAtLeast(kMaxLookaheadForBoyerMoore,	4121 EatsAtLeast(kMaxLookaheadForBoyerMoore,

4139 kRecursionBudget,	4122 kRecursionBudget,

4140 false));	4123 false));

4141 if (eats_at_least >= 1) {	4124 if (eats_at_least >= 1) {

4142 bm = new(zone()) BoyerMooreLookahead(eats_at_least,	4125 bm = new(zone()) BoyerMooreLookahead(eats_at_least,

4143 compiler,	4126 compiler,

4144 zone());	4127 zone());

4145 GuardedAlternative alt0 = alternatives_->at(0);	4128 GuardedAlternative alt0 = alternatives_->at(0);

4146 alt0.node()->FillInBMInfo(compiler, 0, kRecursionBudget, bm, false);	4129 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);

4147 }	4130 }

4148 }	4131 }

4149 if (bm != NULL) {	4132 if (bm != NULL) {

4150 bm->EmitSkipInstructions(macro_assembler);	4133 bm->EmitSkipInstructions(macro_assembler);

4151 }	4134 }

4152 return eats_at_least;	4135 return eats_at_least;

4153 }	4136 }

4154	4137

4155	4138

4156 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,	4139 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,

(...skipping 2241 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6398	6381

6399 void Analysis::VisitBackReference(BackReferenceNode* that) {	6382 void Analysis::VisitBackReference(BackReferenceNode* that) {

6400 EnsureAnalyzed(that->on_success());	6383 EnsureAnalyzed(that->on_success());

6401 }	6384 }

6402	6385

6403	6386

6404 void Analysis::VisitAssertion(AssertionNode* that) {	6387 void Analysis::VisitAssertion(AssertionNode* that) {

6405 EnsureAnalyzed(that->on_success());	6388 EnsureAnalyzed(that->on_success());

6406 }	6389 }

6407	6390

6408 void BackReferenceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,	6391

6409 int budget, BoyerMooreLookahead* bm,	6392 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

	6393 BoyerMooreLookahead* bm,

6410 bool not_at_start) {	6394 bool not_at_start) {

6411 // Working out the set of characters that a backreference can match is too	6395 // Working out the set of characters that a backreference can match is too

6412 // hard, so we just say that any character can match.	6396 // hard, so we just say that any character can match.

6413 bm->SetRest(offset);	6397 bm->SetRest(offset);

6414 SaveBMInfo(bm, not_at_start, offset);	6398 SaveBMInfo(bm, not_at_start, offset);

6415 }	6399 }

6416	6400

6417	6401

6418 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==	6402 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==

6419 RegExpMacroAssembler::kTableSize);	6403 RegExpMacroAssembler::kTableSize);

6420	6404

6421 void ChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,	6405

	6406 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

6422 BoyerMooreLookahead* bm, bool not_at_start) {	6407 BoyerMooreLookahead* bm, bool not_at_start) {

6423 ZoneList<GuardedAlternative>* alts = alternatives();	6408 ZoneList<GuardedAlternative>* alts = alternatives();

6424 budget = (budget - 1) / alts->length();	6409 budget = (budget - 1) / alts->length();

6425 for (int i = 0; i < alts->length(); i++) {	6410 for (int i = 0; i < alts->length(); i++) {

6426 GuardedAlternative& alt = alts->at(i);	6411 GuardedAlternative& alt = alts->at(i);

6427 if (alt.guards() != NULL && alt.guards()->length() != 0) {	6412 if (alt.guards() != NULL && alt.guards()->length() != 0) {

6428 bm->SetRest(offset); // Give up trying to fill in info.	6413 bm->SetRest(offset); // Give up trying to fill in info.

6429 SaveBMInfo(bm, not_at_start, offset);	6414 SaveBMInfo(bm, not_at_start, offset);

6430 return;	6415 return;

6431 }	6416 }

6432 alt.node()->FillInBMInfo(compiler, offset, budget, bm, not_at_start);	6417 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);

6433 }	6418 }

6434 SaveBMInfo(bm, not_at_start, offset);	6419 SaveBMInfo(bm, not_at_start, offset);

6435 }	6420 }

6436	6421

6437 void TextNode::FillInBMInfo(RegExpCompiler* compiler, int initial_offset,	6422

6438 int budget, BoyerMooreLookahead* bm,	6423 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,

6439 bool not_at_start) {	6424 BoyerMooreLookahead* bm, bool not_at_start) {

6440 if (initial_offset >= bm->length()) return;	6425 if (initial_offset >= bm->length()) return;

6441 int offset = initial_offset;	6426 int offset = initial_offset;

6442 int max_char = bm->max_char();	6427 int max_char = bm->max_char();

6443 for (int i = 0; i < elements()->length(); i++) {	6428 for (int i = 0; i < elements()->length(); i++) {

6444 if (offset >= bm->length()) {	6429 if (offset >= bm->length()) {

6445 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6430 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6446 return;	6431 return;

6447 }	6432 }

6448 TextElement text = elements()->at(i);	6433 TextElement text = elements()->at(i);

6449 if (text.text_type() == TextElement::ATOM) {	6434 if (text.text_type() == TextElement::ATOM) {

6450 RegExpAtom* atom = text.atom();	6435 RegExpAtom* atom = text.atom();

6451 for (int j = 0; j < atom->length(); j++, offset++) {	6436 for (int j = 0; j < atom->length(); j++, offset++) {

6452 if (offset >= bm->length()) {	6437 if (offset >= bm->length()) {

6453 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6438 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6454 return;	6439 return;

6455 }	6440 }

6456 uc16 character = atom->data()[j];	6441 uc16 character = atom->data()[j];

6457 if (bm->compiler()->ignore_case()) {	6442 if (bm->compiler()->ignore_case()) {

6458 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	6443 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

6459 int length = GetCaseIndependentLetters(compiler, character, chars);	6444 int length = GetCaseIndependentLetters(

	6445 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,

	6446 chars);

6460 for (int j = 0; j < length; j++) {	6447 for (int j = 0; j < length; j++) {

6461 bm->Set(offset, chars[j]);	6448 bm->Set(offset, chars[j]);

6462 }	6449 }

6463 } else {	6450 } else {

6464 if (character <= max_char) bm->Set(offset, character);	6451 if (character <= max_char) bm->Set(offset, character);

6465 }	6452 }

6466 }	6453 }

6467 } else {	6454 } else {

6468 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());	6455 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());

6469 RegExpCharacterClass* char_class = text.char_class();	6456 RegExpCharacterClass* char_class = text.char_class();

6470 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());	6457 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());

6471 if (char_class->is_negated()) {	6458 if (char_class->is_negated()) {

6472 bm->SetAll(offset);	6459 bm->SetAll(offset);

6473 } else {	6460 } else {

6474 for (int k = 0; k < ranges->length(); k++) {	6461 for (int k = 0; k < ranges->length(); k++) {

6475 CharacterRange& range = ranges->at(k);	6462 CharacterRange& range = ranges->at(k);

6476 if (range.from() > max_char) continue;	6463 if (range.from() > max_char) continue;

6477 int to = Min(max_char, static_cast<int>(range.to()));	6464 int to = Min(max_char, static_cast<int>(range.to()));

6478 bm->SetInterval(offset, Interval(range.from(), to));	6465 bm->SetInterval(offset, Interval(range.from(), to));

6479 }	6466 }

6480 }	6467 }

6481 offset++;	6468 offset++;

6482 }	6469 }

6483 }	6470 }

6484 if (offset >= bm->length()) {	6471 if (offset >= bm->length()) {

6485 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6472 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6486 return;	6473 return;

6487 }	6474 }

6488 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm,	6475 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,

6489 true); // Not at start after a text node.	6476 true); // Not at start after a text node.

6490 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6477 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6491 }	6478 }

6492	6479

6493	6480

6494 // -------------------------------------------------------------------	6481 // -------------------------------------------------------------------

6495 // Dispatch table construction	6482 // Dispatch table construction

6496	6483

6497	6484

6498 void DispatchTableConstructor::VisitEnd(EndNode* that) {	6485 void DispatchTableConstructor::VisitEnd(EndNode* that) {

(...skipping 137 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6636 }	6623 }

6637	6624

6638	6625

6639 RegExpEngine::CompilationResult RegExpEngine::Compile(	6626 RegExpEngine::CompilationResult RegExpEngine::Compile(

6640 Isolate* isolate, Zone* zone, RegExpCompileData* data,	6627 Isolate* isolate, Zone* zone, RegExpCompileData* data,

6641 JSRegExp::Flags flags, Handle<String> pattern,	6628 JSRegExp::Flags flags, Handle<String> pattern,

6642 Handle<String> sample_subject, bool is_one_byte) {	6629 Handle<String> sample_subject, bool is_one_byte) {

6643 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {	6630 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {

6644 return IrregexpRegExpTooBig(isolate);	6631 return IrregexpRegExpTooBig(isolate);

6645 }	6632 }

	6633 bool ignore_case = flags & JSRegExp::kIgnoreCase;

6646 bool is_sticky = flags & JSRegExp::kSticky;	6634 bool is_sticky = flags & JSRegExp::kSticky;

6647 bool is_global = flags & JSRegExp::kGlobal;	6635 bool is_global = flags & JSRegExp::kGlobal;

6648 bool is_unicode = flags & JSRegExp::kUnicode;	6636 bool is_unicode = flags & JSRegExp::kUnicode;

6649 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,	6637 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,

6650 is_one_byte);	6638 is_one_byte);

6651	6639

6652 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));	6640 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));

6653	6641

6654 // Sample some characters from the middle of the string.	6642 // Sample some characters from the middle of the string.

6655 static const int kSampleSize = 128;	6643 static const int kSampleSize = 128;

(...skipping 29 matching lines...) Expand all Loading...
6685 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);	6673 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);

6686 first_step_node->AddAlternative(GuardedAlternative(captured_body));	6674 first_step_node->AddAlternative(GuardedAlternative(captured_body));

6687 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(	6675 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(

6688 new (zone) RegExpCharacterClass('*'), false, loop_node)));	6676 new (zone) RegExpCharacterClass('*'), false, loop_node)));

6689 node = first_step_node;	6677 node = first_step_node;

6690 } else {	6678 } else {

6691 node = loop_node;	6679 node = loop_node;

6692 }	6680 }

6693 }	6681 }

6694 if (is_one_byte) {	6682 if (is_one_byte) {

6695 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);	6683 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);

6696 // Do it again to propagate the new nodes to places where they were not	6684 // Do it again to propagate the new nodes to places where they were not

6697 // put because they had not been calculated yet.	6685 // put because they had not been calculated yet.

6698 if (node != NULL) {	6686 if (node != NULL) {

6699 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);	6687 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);

6700 }	6688 }

6701 } else if (compiler.unicode() && (is_global \|\| is_sticky)) {	6689 } else if (compiler.unicode() && (is_global \|\| is_sticky)) {

6702 node = OptionallyStepBackToLeadSurrogate(&compiler, node);	6690 node = OptionallyStepBackToLeadSurrogate(&compiler, node);

6703 }	6691 }

6704	6692

6705 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);	6693 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);

6706 data->node = node;	6694 data->node = node;

6707 Analysis analysis(isolate, flags, is_one_byte);	6695 Analysis analysis(isolate, flags, is_one_byte);

6708 analysis.EnsureAnalyzed(node);	6696 analysis.EnsureAnalyzed(node);

6709 if (analysis.has_failed()) {	6697 if (analysis.has_failed()) {

(...skipping 177 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6887	6875

6888	6876

6889 void RegExpResultsCache::Clear(FixedArray* cache) {	6877 void RegExpResultsCache::Clear(FixedArray* cache) {

6890 for (int i = 0; i < kRegExpResultsCacheSize; i++) {	6878 for (int i = 0; i < kRegExpResultsCacheSize; i++) {

6891 cache->set(i, Smi::FromInt(0));	6879 cache->set(i, Smi::FromInt(0));

6892 }	6880 }

6893 }	6881 }

6894	6882

6895 } // namespace internal	6883 } // namespace internal

6896 } // namespace v8	6884 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »