src/regexp/jsregexp.cc - Issue 1641613002: [regexp] implement /ui to mirror the implementation for /i.

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1641613002: [regexp] implement /ui to mirror the implementation for /i. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: addressed comment Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/jsregexp.h"	5 #include "src/regexp/jsregexp.h"

6	6

7 #include "src/ast/ast.h"	7 #include "src/ast/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 1580 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1591 macro_assembler->IfRegisterLT(guard->reg(),	1591 macro_assembler->IfRegisterLT(guard->reg(),

1592 guard->value(),	1592 guard->value(),

1593 trace->backtrack());	1593 trace->backtrack());

1594 break;	1594 break;

1595 }	1595 }

1596 }	1596 }

1597	1597

1598	1598

1599 // Returns the number of characters in the equivalence class, omitting those	1599 // Returns the number of characters in the equivalence class, omitting those

1600 // that cannot occur in the source string because it is Latin1.	1600 // that cannot occur in the source string because it is Latin1.

1601 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,	1601 static int GetCaseIndependentLetters(RegExpCompiler* compiler, uc16 character,

1602 bool one_byte_subject,	1602 uc32* letters) {

1603 unibrow::uchar* letters) {	1603 int length;

1604 int length =	1604 #ifdef V8_I18N_SUPPORT

1605 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);	1605 if (compiler->unicode()) {

1606 // Unibrow returns 0 or 1 for characters where case independence is	1606 USet* set = uset_open(character, character);

1607 // trivial.	1607 uset_closeOver(set, USET_CASE_INSENSITIVE);

1608 if (length == 0) {	1608 uset_removeAllStrings(set);

1609 letters[0] = character;	1609 length = uset_size(set);

1610 length = 1;	1610 for (int i = 0; i < length; i++) {

	1611 letters[i] = uset_charAt(set, i);

	1612 }

	1613 uset_close(set);

	1614 } else // NOLINT

	1615 // Fallback in case ICU is not included.

	1616 #endif // V8_I18N_SUPPORT

	1617 {

	1618 length = compiler->isolate()->jsregexp_uncanonicalize()->get(character,

	1619 '\0', letters);

	1620 // Unibrow returns 0 or 1 for characters where case independence is

	1621 // trivial.

	1622 if (length == 0) {

	1623 letters[0] = character;

	1624 length = 1;

	1625 }

1611 }	1626 }

1612	1627

1613 if (one_byte_subject) {	1628 if (compiler->one_byte()) {

1614 int new_length = 0;	1629 int new_length = 0;

1615 for (int i = 0; i < length; i++) {	1630 for (int i = 0; i < length; i++) {

1616 if (letters[i] <= String::kMaxOneByteCharCode) {	1631 if (letters[i] <= String::kMaxOneByteCharCode) {

1617 letters[new_length++] = letters[i];	1632 letters[new_length++] = letters[i];

1618 }	1633 }

1619 }	1634 }

1620 length = new_length;	1635 length = new_length;

1621 }	1636 }

1622	1637

1623 return length;	1638 return length;

1624 }	1639 }

1625	1640

1626	1641 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, uc16 c,

1627 static inline bool EmitSimpleCharacter(Isolate* isolate,	1642 Label* on_failure, int cp_offset,

1628 RegExpCompiler* compiler,	1643 bool check, bool preloaded) {

1629 uc16 c,

1630 Label* on_failure,

1631 int cp_offset,

1632 bool check,

1633 bool preloaded) {

1634 RegExpMacroAssembler* assembler = compiler->macro_assembler();	1644 RegExpMacroAssembler* assembler = compiler->macro_assembler();

1635 bool bound_checked = false;	1645 bool bound_checked = false;

1636 if (!preloaded) {	1646 if (!preloaded) {

1637 assembler->LoadCurrentCharacter(	1647 assembler->LoadCurrentCharacter(

1638 cp_offset,	1648 cp_offset,

1639 on_failure,	1649 on_failure,

1640 check);	1650 check);

1641 bound_checked = true;	1651 bound_checked = true;

1642 }	1652 }

1643 assembler->CheckNotCharacter(c, on_failure);	1653 assembler->CheckNotCharacter(c, on_failure);

1644 return bound_checked;	1654 return bound_checked;

1645 }	1655 }

1646	1656

1647	1657

1648 // Only emits non-letters (things that don't have case). Only used for case	1658 // Only emits non-letters (things that don't have case). Only used for case

1649 // independent matches.	1659 // independent matches.

1650 static inline bool EmitAtomNonLetter(Isolate* isolate,	1660 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, uc16 c,

1651 RegExpCompiler* compiler,	1661 Label* on_failure, int cp_offset,

1652 uc16 c,	1662 bool check, bool preloaded) {

1653 Label* on_failure,

1654 int cp_offset,

1655 bool check,

1656 bool preloaded) {

1657 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	1663 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

1658 bool one_byte = compiler->one_byte();

1659 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1664 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1660 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);	1665 int length = GetCaseIndependentLetters(compiler, c, chars);

1661 if (length < 1) {	1666 if (length < 1) {

1662 // This can't match. Must be an one-byte subject and a non-one-byte	1667 // This can't match. Must be an one-byte subject and a non-one-byte

1663 // character. We do not need to do anything since the one-byte pass	1668 // character. We do not need to do anything since the one-byte pass

1664 // already handled this.	1669 // already handled this.

1665 return false; // Bounds not checked.	1670 return false; // Bounds not checked.

1666 }	1671 }

1667 bool checked = false;	1672 bool checked = false;

1668 // We handle the length > 1 case in a later pass.	1673 // We handle the length > 1 case in a later pass.

1669 if (length == 1) {	1674 if (length == 1) {

1670 if (one_byte && c > String::kMaxOneByteCharCodeU) {	1675 if (compiler->one_byte() && c > String::kMaxOneByteCharCodeU) {

1671 // Can't match - see above.	1676 // This cannot match.

1672 return false; // Bounds not checked.	1677 return false; // Bounds not checked.

1673 }	1678 }

1674 if (!preloaded) {	1679 if (!preloaded) {

1675 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1680 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1676 checked = check;	1681 checked = check;

1677 }	1682 }

1678 macro_assembler->CheckNotCharacter(c, on_failure);	1683 macro_assembler->CheckNotCharacter(c, on_failure);

1679 }	1684 }

1680 return checked;	1685 return checked;

1681 }	1686 }

(...skipping 28 matching lines...) Expand all Loading...
1710 uc16 mask = char_mask ^ diff;	1715 uc16 mask = char_mask ^ diff;

1711 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,	1716 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,

1712 diff,	1717 diff,

1713 mask,	1718 mask,

1714 on_failure);	1719 on_failure);

1715 return true;	1720 return true;

1716 }	1721 }

1717 return false;	1722 return false;

1718 }	1723 }

1719	1724

1720	1725 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, uc16 c,

1721 typedef bool EmitCharacterFunction(Isolate* isolate,	1726 Label* on_failure, int cp_offset, bool check,

1722 RegExpCompiler* compiler,

1723 uc16 c,

1724 Label* on_failure,

1725 int cp_offset,

1726 bool check,

1727 bool preloaded);	1727 bool preloaded);

1728	1728

1729 // Only emits letters (things that have case). Only used for case independent	1729 // Only emits letters (things that have case). Only used for case independent

1730 // matches.	1730 // matches.

1731 static inline bool EmitAtomLetter(Isolate* isolate,	1731 static inline bool EmitAtomLetter(RegExpCompiler* compiler, uc16 c,

1732 RegExpCompiler* compiler,	1732 Label* on_failure, int cp_offset, bool check,

1733 uc16 c,

1734 Label* on_failure,

1735 int cp_offset,

1736 bool check,

1737 bool preloaded) {	1733 bool preloaded) {

1738 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	1734 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

1739 bool one_byte = compiler->one_byte();

1740 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1735 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1741 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);	1736 int length = GetCaseIndependentLetters(compiler, c, chars);

1742 if (length <= 1) return false;	1737 if (length <= 1) return false;

1743 // We may not need to check against the end of the input string	1738 // We may not need to check against the end of the input string

1744 // if this character lies before a character that matched.	1739 // if this character lies before a character that matched.

1745 if (!preloaded) {	1740 if (!preloaded) {

1746 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1741 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1747 }	1742 }

1748 Label ok;	1743 Label ok;

1749 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);	1744 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

1750 switch (length) {	1745 switch (length) {

1751 case 2: {	1746 case 2: {

1752 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],	1747 if (ShortCutEmitCharacterPair(macro_assembler, compiler->one_byte(),

1753 chars[1], on_failure)) {	1748 chars[0], chars[1], on_failure)) {

1754 } else {	1749 } else {

1755 macro_assembler->CheckCharacter(chars[0], &ok);	1750 macro_assembler->CheckCharacter(chars[0], &ok);

1756 macro_assembler->CheckNotCharacter(chars[1], on_failure);	1751 macro_assembler->CheckNotCharacter(chars[1], on_failure);

1757 macro_assembler->Bind(&ok);	1752 macro_assembler->Bind(&ok);

1758 }	1753 }

1759 break;	1754 break;

1760 }	1755 }

1761 case 4:	1756 case 4:

1762 macro_assembler->CheckCharacter(chars[3], &ok);	1757 macro_assembler->CheckCharacter(chars[3], &ok);

1763 // Fall through!	1758 // Fall through!

(...skipping 516 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2280 int ActionNode::EatsAtLeast(int still_to_find,	2275 int ActionNode::EatsAtLeast(int still_to_find,

2281 int budget,	2276 int budget,

2282 bool not_at_start) {	2277 bool not_at_start) {

2283 if (budget <= 0) return 0;	2278 if (budget <= 0) return 0;

2284 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!	2279 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!

2285 return on_success()->EatsAtLeast(still_to_find,	2280 return on_success()->EatsAtLeast(still_to_find,

2286 budget - 1,	2281 budget - 1,

2287 not_at_start);	2282 not_at_start);

2288 }	2283 }

2289	2284

2290	2285 void ActionNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,

2291 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

2292 BoyerMooreLookahead* bm, bool not_at_start) {	2286 BoyerMooreLookahead* bm, bool not_at_start) {

2293 if (action_type_ == BEGIN_SUBMATCH) {	2287 if (action_type_ == BEGIN_SUBMATCH) {

2294 bm->SetRest(offset);	2288 bm->SetRest(offset);

2295 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {	2289 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {

2296 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);	2290 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);

2297 }	2291 }

2298 SaveBMInfo(bm, not_at_start, offset);	2292 SaveBMInfo(bm, not_at_start, offset);

2299 }	2293 }

2300	2294

2301	2295

2302 int AssertionNode::EatsAtLeast(int still_to_find,	2296 int AssertionNode::EatsAtLeast(int still_to_find,

2303 int budget,	2297 int budget,

2304 bool not_at_start) {	2298 bool not_at_start) {

2305 if (budget <= 0) return 0;	2299 if (budget <= 0) return 0;

2306 // If we know we are not at the start and we are asked "how many characters	2300 // If we know we are not at the start and we are asked "how many characters

2307 // will you match if you succeed?" then we can answer anything since false	2301 // will you match if you succeed?" then we can answer anything since false

2308 // implies false. So lets just return the max answer (still_to_find) since	2302 // implies false. So lets just return the max answer (still_to_find) since

2309 // that won't prevent us from preloading a lot of characters for the other	2303 // that won't prevent us from preloading a lot of characters for the other

2310 // branches in the node graph.	2304 // branches in the node graph.

2311 if (assertion_type() == AT_START && not_at_start) return still_to_find;	2305 if (assertion_type() == AT_START && not_at_start) return still_to_find;

2312 return on_success()->EatsAtLeast(still_to_find,	2306 return on_success()->EatsAtLeast(still_to_find,

2313 budget - 1,	2307 budget - 1,

2314 not_at_start);	2308 not_at_start);

2315 }	2309 }

2316	2310

2317	2311 void AssertionNode::FillInBMInfo(RegExpCompiler* compiler, int offset,

2318 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,	2312 int budget, BoyerMooreLookahead* bm,

2319 BoyerMooreLookahead* bm, bool not_at_start) {	2313 bool not_at_start) {

2320 // Match the behaviour of EatsAtLeast on this node.	2314 // Match the behaviour of EatsAtLeast on this node.

2321 if (assertion_type() == AT_START && not_at_start) return;	2315 if (assertion_type() == AT_START && not_at_start) return;

2322 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);	2316 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);

2323 SaveBMInfo(bm, not_at_start, offset);	2317 SaveBMInfo(bm, not_at_start, offset);

2324 }	2318 }

2325	2319

2326	2320

2327 int BackReferenceNode::EatsAtLeast(int still_to_find,	2321 int BackReferenceNode::EatsAtLeast(int still_to_find,

2328 int budget,	2322 int budget,

2329 bool not_at_start) {	2323 bool not_at_start) {

2330 if (read_backward()) return 0;	2324 if (read_backward()) return 0;

2331 if (budget <= 0) return 0;	2325 if (budget <= 0) return 0;

2332 return on_success()->EatsAtLeast(still_to_find,	2326 return on_success()->EatsAtLeast(still_to_find,

(...skipping 193 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2526 // The masks and values for the positions will be combined into a single	2520 // The masks and values for the positions will be combined into a single

2527 // machine word for the current character width in order to be used in	2521 // machine word for the current character width in order to be used in

2528 // generating a quick check.	2522 // generating a quick check.

2529 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,	2523 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,

2530 RegExpCompiler* compiler,	2524 RegExpCompiler* compiler,

2531 int characters_filled_in,	2525 int characters_filled_in,

2532 bool not_at_start) {	2526 bool not_at_start) {

2533 // Do not collect any quick check details if the text node reads backward,	2527 // Do not collect any quick check details if the text node reads backward,

2534 // since it reads in the opposite direction than we use for quick checks.	2528 // since it reads in the opposite direction than we use for quick checks.

2535 if (read_backward()) return;	2529 if (read_backward()) return;

2536 Isolate* isolate = compiler->macro_assembler()->isolate();

2537 DCHECK(characters_filled_in < details->characters());	2530 DCHECK(characters_filled_in < details->characters());

2538 int characters = details->characters();	2531 int characters = details->characters();

2539 int char_mask;	2532 int char_mask;

2540 if (compiler->one_byte()) {	2533 if (compiler->one_byte()) {

2541 char_mask = String::kMaxOneByteCharCode;	2534 char_mask = String::kMaxOneByteCharCode;

2542 } else {	2535 } else {

2543 char_mask = String::kMaxUtf16CodeUnit;	2536 char_mask = String::kMaxUtf16CodeUnit;

2544 }	2537 }

2545 for (int k = 0; k < elements()->length(); k++) {	2538 for (int k = 0; k < elements()->length(); k++) {

2546 TextElement elm = elements()->at(k);	2539 TextElement elm = elements()->at(k);

2547 if (elm.text_type() == TextElement::ATOM) {	2540 if (elm.text_type() == TextElement::ATOM) {

2548 Vector<const uc16> quarks = elm.atom()->data();	2541 Vector<const uc16> quarks = elm.atom()->data();

2549 for (int i = 0; i < characters && i < quarks.length(); i++) {	2542 for (int i = 0; i < characters && i < quarks.length(); i++) {

2550 QuickCheckDetails::Position* pos =	2543 QuickCheckDetails::Position* pos =

2551 details->positions(characters_filled_in);	2544 details->positions(characters_filled_in);

2552 uc16 c = quarks[i];	2545 uc16 c = quarks[i];

2553 if (compiler->ignore_case()) {	2546 if (compiler->ignore_case()) {

2554 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	2547 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

2555 int length = GetCaseIndependentLetters(isolate, c,	2548 int length = GetCaseIndependentLetters(compiler, c, chars);

2556 compiler->one_byte(), chars);

2557 if (length == 0) {	2549 if (length == 0) {

2558 // This can happen because all case variants are non-Latin1, but we	2550 // This can happen because all case variants are non-Latin1, but we

2559 // know the input is Latin1.	2551 // know the input is Latin1.

2560 details->set_cannot_match();	2552 details->set_cannot_match();

2561 pos->determines_perfectly = false;	2553 pos->determines_perfectly = false;

2562 return;	2554 return;

2563 }	2555 }

2564 if (length == 1) {	2556 if (length == 1) {

2565 // This letter has no case equivalents, so it's nice and simple	2557 // This letter has no case equivalents, so it's nice and simple

2566 // and the mask-compare will determine definitely whether we have	2558 // and the mask-compare will determine definitely whether we have

(...skipping 184 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2751 DCHECK(!info->visited);	2743 DCHECK(!info->visited);

2752 info->visited = true;	2744 info->visited = true;

2753 }	2745 }

2754 ~VisitMarker() {	2746 ~VisitMarker() {

2755 info_->visited = false;	2747 info_->visited = false;

2756 }	2748 }

2757 private:	2749 private:

2758 NodeInfo* info_;	2750 NodeInfo* info_;

2759 };	2751 };

2760	2752

2761	2753 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) {

2762 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {

2763 if (info()->replacement_calculated) return replacement();	2754 if (info()->replacement_calculated) return replacement();

2764 if (depth < 0) return this;	2755 if (depth < 0) return this;

2765 DCHECK(!info()->visited);	2756 DCHECK(!info()->visited);

2766 VisitMarker marker(info());	2757 VisitMarker marker(info());

2767 return FilterSuccessor(depth - 1, ignore_case);	2758 return FilterSuccessor(depth - 1, compiler);

2768 }	2759 }

2769	2760

2770	2761 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth,

2771 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {	2762 RegExpCompiler* compiler) {

2772 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);	2763 RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler);

2773 if (next == NULL) return set_replacement(NULL);	2764 if (next == NULL) return set_replacement(NULL);

2774 on_success_ = next;	2765 on_success_ = next;

2775 return set_replacement(this);	2766 return set_replacement(this);

2776 }	2767 }

2777	2768

2778	2769

2779 // We need to check for the following characters: 0x39c 0x3bc 0x178.	2770 // We need to check for the following characters: 0x39c 0x3bc 0x178.

2780 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {	2771 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {

2781 // TODO(dcarney): this could be a lot more efficient.	2772 // TODO(dcarney): this could be a lot more efficient.

2782 return range.Contains(0x39c) \|\|	2773 return range.Contains(0x39c) \|\|

2783 range.Contains(0x3bc) \|\| range.Contains(0x178);	2774 range.Contains(0x3bc) \|\| range.Contains(0x178);

2784 }	2775 }

2785	2776

2786	2777

2787 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {	2778 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {

2788 for (int i = 0; i < ranges->length(); i++) {	2779 for (int i = 0; i < ranges->length(); i++) {

2789 // TODO(dcarney): this could be a lot more efficient.	2780 // TODO(dcarney): this could be a lot more efficient.

2790 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;	2781 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;

2791 }	2782 }

2792 return false;	2783 return false;

2793 }	2784 }

2794	2785

	2786 static uc16 ConvertNonLatin1ToEquivalentLatin1(bool unicode, uc16 c) {

	2787 #ifdef V8_I18N_SUPPORT

	2788 if (unicode) {

	2789 USet* set = uset_open(c, c);

	2790 uset_closeOver(set, USET_CASE_INSENSITIVE);

	2791 uset_removeAllStrings(set);

	2792 int length = uset_size(set);

	2793 uc16 result = 0;

	2794 for (int i = 0; i < length; i++) {

	2795 uc32 c = uset_charAt(set, i);

	2796 if (c <= String::kMaxOneByteCharCode) {

	2797 result = static_cast<uc16>(c);

	2798 break;

	2799 }

	2800 }

	2801 uset_close(set);

	2802 return result;

	2803 }

	2804 // Fallback to unibrow if ICU is not included.

	2805 #endif // V8_I18N_SUPPORT

	2806 return unibrow::Latin1::ConvertNonLatin1ToLatin1(c);

	2807 }

2795	2808

2796 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {	2809 RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) {

2797 if (info()->replacement_calculated) return replacement();	2810 if (info()->replacement_calculated) return replacement();

2798 if (depth < 0) return this;	2811 if (depth < 0) return this;

2799 DCHECK(!info()->visited);	2812 DCHECK(!info()->visited);

2800 VisitMarker marker(info());	2813 VisitMarker marker(info());

2801 int element_count = elements()->length();	2814 int element_count = elements()->length();

2802 for (int i = 0; i < element_count; i++) {	2815 for (int i = 0; i < element_count; i++) {

2803 TextElement elm = elements()->at(i);	2816 TextElement elm = elements()->at(i);

2804 if (elm.text_type() == TextElement::ATOM) {	2817 if (elm.text_type() == TextElement::ATOM) {

2805 Vector<const uc16> quarks = elm.atom()->data();	2818 Vector<const uc16> quarks = elm.atom()->data();

2806 for (int j = 0; j < quarks.length(); j++) {	2819 for (int j = 0; j < quarks.length(); j++) {

2807 uint16_t c = quarks[j];	2820 uc16 c = quarks[j];

2808 if (c <= String::kMaxOneByteCharCode) continue;	2821 if (c <= String::kMaxOneByteCharCode) continue;

2809 if (!ignore_case) return set_replacement(NULL);	2822 if (!compiler->ignore_case()) return set_replacement(NULL);

2810 // Here, we need to check for characters whose upper and lower cases	2823 // Here, we need to check for characters whose upper and lower cases

2811 // are outside the Latin-1 range.	2824 // are outside the Latin-1 range.

2812 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);	2825 uc16 converted =

	2826 ConvertNonLatin1ToEquivalentLatin1(compiler->unicode(), c);

2813 // Character is outside Latin-1 completely	2827 // Character is outside Latin-1 completely

2814 if (converted == 0) return set_replacement(NULL);	2828 if (converted == 0) return set_replacement(NULL);

2815 // Convert quark to Latin-1 in place.	2829 // Convert quark to Latin-1 in place.

2816 uint16_t* copy = const_cast<uint16_t*>(quarks.start());	2830 uc16* copy = const_cast<uc16*>(quarks.start());

2817 copy[j] = converted;	2831 copy[j] = converted;

2818 }	2832 }

2819 } else {	2833 } else {

2820 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);	2834 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);

2821 RegExpCharacterClass* cc = elm.char_class();	2835 RegExpCharacterClass* cc = elm.char_class();

2822 ZoneList<CharacterRange>* ranges = cc->ranges(zone());	2836 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

2823 CharacterRange::Canonicalize(ranges);	2837 CharacterRange::Canonicalize(ranges);

2824 // Now they are in order so we only need to look at the first.	2838 // Now they are in order so we only need to look at the first.

2825 int range_count = ranges->length();	2839 int range_count = ranges->length();

2826 if (cc->is_negated()) {	2840 if (cc->is_negated()) {

2827 if (range_count != 0 &&	2841 if (range_count != 0 &&

2828 ranges->at(0).from() == 0 &&	2842 ranges->at(0).from() == 0 &&

2829 ranges->at(0).to() >= String::kMaxOneByteCharCode) {	2843 ranges->at(0).to() >= String::kMaxOneByteCharCode) {

2830 // This will be handled in a later filter.	2844 // This will be handled in a later filter.

2831 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;	2845 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))

	2846 continue;

2832 return set_replacement(NULL);	2847 return set_replacement(NULL);

2833 }	2848 }

2834 } else {	2849 } else {

2835 if (range_count == 0 \|\|	2850 if (range_count == 0 \|\|

2836 ranges->at(0).from() > String::kMaxOneByteCharCode) {	2851 ranges->at(0).from() > String::kMaxOneByteCharCode) {

2837 // This will be handled in a later filter.	2852 // This will be handled in a later filter.

2838 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;	2853 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))

	2854 continue;

2839 return set_replacement(NULL);	2855 return set_replacement(NULL);

2840 }	2856 }

2841 }	2857 }

2842 }	2858 }

2843 }	2859 }

2844 return FilterSuccessor(depth - 1, ignore_case);	2860 return FilterSuccessor(depth - 1, compiler);

2845 }	2861 }

2846	2862

2847	2863 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {

2848 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {

2849 if (info()->replacement_calculated) return replacement();	2864 if (info()->replacement_calculated) return replacement();

2850 if (depth < 0) return this;	2865 if (depth < 0) return this;

2851 if (info()->visited) return this;	2866 if (info()->visited) return this;

2852 {	2867 {

2853 VisitMarker marker(info());	2868 VisitMarker marker(info());

2854	2869

2855 RegExpNode* continue_replacement =	2870 RegExpNode* continue_replacement =

2856 continue_node_->FilterOneByte(depth - 1, ignore_case);	2871 continue_node_->FilterOneByte(depth - 1, compiler);

2857 // If we can't continue after the loop then there is no sense in doing the	2872 // If we can't continue after the loop then there is no sense in doing the

2858 // loop.	2873 // loop.

2859 if (continue_replacement == NULL) return set_replacement(NULL);	2874 if (continue_replacement == NULL) return set_replacement(NULL);

2860 }	2875 }

2861	2876

2862 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);	2877 return ChoiceNode::FilterOneByte(depth - 1, compiler);

2863 }	2878 }

2864	2879

2865	2880 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {

2866 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {

2867 if (info()->replacement_calculated) return replacement();	2881 if (info()->replacement_calculated) return replacement();

2868 if (depth < 0) return this;	2882 if (depth < 0) return this;

2869 if (info()->visited) return this;	2883 if (info()->visited) return this;

2870 VisitMarker marker(info());	2884 VisitMarker marker(info());

2871 int choice_count = alternatives_->length();	2885 int choice_count = alternatives_->length();

2872	2886

2873 for (int i = 0; i < choice_count; i++) {	2887 for (int i = 0; i < choice_count; i++) {

2874 GuardedAlternative alternative = alternatives_->at(i);	2888 GuardedAlternative alternative = alternatives_->at(i);

2875 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {	2889 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {

2876 set_replacement(this);	2890 set_replacement(this);

2877 return this;	2891 return this;

2878 }	2892 }

2879 }	2893 }

2880	2894

2881 int surviving = 0;	2895 int surviving = 0;

2882 RegExpNode* survivor = NULL;	2896 RegExpNode* survivor = NULL;

2883 for (int i = 0; i < choice_count; i++) {	2897 for (int i = 0; i < choice_count; i++) {

2884 GuardedAlternative alternative = alternatives_->at(i);	2898 GuardedAlternative alternative = alternatives_->at(i);

2885 RegExpNode* replacement =	2899 RegExpNode* replacement =

2886 alternative.node()->FilterOneByte(depth - 1, ignore_case);	2900 alternative.node()->FilterOneByte(depth - 1, compiler);

2887 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.	2901 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.

2888 if (replacement != NULL) {	2902 if (replacement != NULL) {

2889 alternatives_->at(i).set_node(replacement);	2903 alternatives_->at(i).set_node(replacement);

2890 surviving++;	2904 surviving++;

2891 survivor = replacement;	2905 survivor = replacement;

2892 }	2906 }

2893 }	2907 }

2894 if (surviving < 2) return set_replacement(survivor);	2908 if (surviving < 2) return set_replacement(survivor);

2895	2909

2896 set_replacement(this);	2910 set_replacement(this);

2897 if (surviving == choice_count) {	2911 if (surviving == choice_count) {

2898 return this;	2912 return this;

2899 }	2913 }

2900 // Only some of the nodes survived the filtering. We need to rebuild the	2914 // Only some of the nodes survived the filtering. We need to rebuild the

2901 // alternatives list.	2915 // alternatives list.

2902 ZoneList<GuardedAlternative>* new_alternatives =	2916 ZoneList<GuardedAlternative>* new_alternatives =

2903 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());	2917 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());

2904 for (int i = 0; i < choice_count; i++) {	2918 for (int i = 0; i < choice_count; i++) {

2905 RegExpNode* replacement =	2919 RegExpNode* replacement =

2906 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);	2920 alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler);

2907 if (replacement != NULL) {	2921 if (replacement != NULL) {

2908 alternatives_->at(i).set_node(replacement);	2922 alternatives_->at(i).set_node(replacement);

2909 new_alternatives->Add(alternatives_->at(i), zone());	2923 new_alternatives->Add(alternatives_->at(i), zone());

2910 }	2924 }

2911 }	2925 }

2912 alternatives_ = new_alternatives;	2926 alternatives_ = new_alternatives;

2913 return this;	2927 return this;

2914 }	2928 }

2915	2929

2916	2930 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(

2917 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,	2931 int depth, RegExpCompiler* compiler) {

2918 bool ignore_case) {

2919 if (info()->replacement_calculated) return replacement();	2932 if (info()->replacement_calculated) return replacement();

2920 if (depth < 0) return this;	2933 if (depth < 0) return this;

2921 if (info()->visited) return this;	2934 if (info()->visited) return this;

2922 VisitMarker marker(info());	2935 VisitMarker marker(info());

2923 // Alternative 0 is the negative lookahead, alternative 1 is what comes	2936 // Alternative 0 is the negative lookahead, alternative 1 is what comes

2924 // afterwards.	2937 // afterwards.

2925 RegExpNode* node = alternatives_->at(1).node();	2938 RegExpNode* node = alternatives_->at(1).node();

2926 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);	2939 RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler);

2927 if (replacement == NULL) return set_replacement(NULL);	2940 if (replacement == NULL) return set_replacement(NULL);

2928 alternatives_->at(1).set_node(replacement);	2941 alternatives_->at(1).set_node(replacement);

2929	2942

2930 RegExpNode* neg_node = alternatives_->at(0).node();	2943 RegExpNode* neg_node = alternatives_->at(0).node();

2931 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);	2944 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler);

2932 // If the negative lookahead is always going to fail then	2945 // If the negative lookahead is always going to fail then

2933 // we don't need to check it.	2946 // we don't need to check it.

2934 if (neg_replacement == NULL) return set_replacement(replacement);	2947 if (neg_replacement == NULL) return set_replacement(replacement);

2935 alternatives_->at(0).set_node(neg_replacement);	2948 alternatives_->at(0).set_node(neg_replacement);

2936 return set_replacement(this);	2949 return set_replacement(this);

2937 }	2950 }

2938	2951

2939	2952

2940 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	2953 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2941 RegExpCompiler* compiler,	2954 RegExpCompiler* compiler,

2942 int characters_filled_in,	2955 int characters_filled_in,

2943 bool not_at_start) {	2956 bool not_at_start) {

2944 if (body_can_be_zero_length_ \|\| info()->visited) return;	2957 if (body_can_be_zero_length_ \|\| info()->visited) return;

2945 VisitMarker marker(info());	2958 VisitMarker marker(info());

2946 return ChoiceNode::GetQuickCheckDetails(details,	2959 return ChoiceNode::GetQuickCheckDetails(details,

2947 compiler,	2960 compiler,

2948 characters_filled_in,	2961 characters_filled_in,

2949 not_at_start);	2962 not_at_start);

2950 }	2963 }

2951	2964

2952	2965 void LoopChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,

2953 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,	2966 int budget, BoyerMooreLookahead* bm,

2954 BoyerMooreLookahead* bm, bool not_at_start) {	2967 bool not_at_start) {

2955 if (body_can_be_zero_length_ \|\| budget <= 0) {	2968 if (body_can_be_zero_length_ \|\| budget <= 0) {

2956 bm->SetRest(offset);	2969 bm->SetRest(offset);

2957 SaveBMInfo(bm, not_at_start, offset);	2970 SaveBMInfo(bm, not_at_start, offset);

2958 return;	2971 return;

2959 }	2972 }

2960 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);	2973 ChoiceNode::FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);

2961 SaveBMInfo(bm, not_at_start, offset);	2974 SaveBMInfo(bm, not_at_start, offset);

2962 }	2975 }

2963	2976

2964	2977

2965 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	2978 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2966 RegExpCompiler* compiler,	2979 RegExpCompiler* compiler,

2967 int characters_filled_in,	2980 int characters_filled_in,

2968 bool not_at_start) {	2981 bool not_at_start) {

2969 not_at_start = (not_at_start \|\| not_at_start_);	2982 not_at_start = (not_at_start \|\| not_at_start_);

2970 int choice_count = alternatives_->length();	2983 int choice_count = alternatives_->length();

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3042 assembler->CheckNotCharacter('\r', new_trace.backtrack());	3055 assembler->CheckNotCharacter('\r', new_trace.backtrack());

3043 }	3056 }

3044 assembler->Bind(&ok);	3057 assembler->Bind(&ok);

3045 on_success->Emit(compiler, &new_trace);	3058 on_success->Emit(compiler, &new_trace);

3046 }	3059 }

3047	3060

3048	3061

3049 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).	3062 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).

3050 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {	3063 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {

3051 RegExpMacroAssembler* assembler = compiler->macro_assembler();	3064 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3052 Isolate* isolate = assembler->isolate();

3053 Trace::TriBool next_is_word_character = Trace::UNKNOWN;	3065 Trace::TriBool next_is_word_character = Trace::UNKNOWN;

3054 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);	3066 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);

3055 BoyerMooreLookahead* lookahead = bm_info(not_at_start);	3067 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

3056 if (lookahead == NULL) {	3068 if (lookahead == NULL) {

3057 int eats_at_least =	3069 int eats_at_least =

3058 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,	3070 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,

3059 kRecursionBudget,	3071 kRecursionBudget,

3060 not_at_start));	3072 not_at_start));

3061 if (eats_at_least >= 1) {	3073 if (eats_at_least >= 1) {

3062 BoyerMooreLookahead* bm =	3074 BoyerMooreLookahead* bm =

3063 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());	3075 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());

3064 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);	3076 FillInBMInfo(compiler, 0, kRecursionBudget, bm, not_at_start);

3065 if (bm->at(0)->is_non_word())	3077 if (bm->at(0)->is_non_word())

3066 next_is_word_character = Trace::FALSE_VALUE;	3078 next_is_word_character = Trace::FALSE_VALUE;

3067 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;	3079 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;

3068 }	3080 }

3069 } else {	3081 } else {

3070 if (lookahead->at(0)->is_non_word())	3082 if (lookahead->at(0)->is_non_word())

3071 next_is_word_character = Trace::FALSE_VALUE;	3083 next_is_word_character = Trace::FALSE_VALUE;

3072 if (lookahead->at(0)->is_word())	3084 if (lookahead->at(0)->is_word())

3073 next_is_word_character = Trace::TRUE_VALUE;	3085 next_is_word_character = Trace::TRUE_VALUE;

3074 }	3086 }

(...skipping 151 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3226 // up to the limit the quick check already checked. In addition the quick	3238 // up to the limit the quick check already checked. In addition the quick

3227 // check can have involved a mask and compare operation which may simplify	3239 // check can have involved a mask and compare operation which may simplify

3228 // or obviate the need for further checks at some character positions.	3240 // or obviate the need for further checks at some character positions.

3229 void TextNode::TextEmitPass(RegExpCompiler* compiler,	3241 void TextNode::TextEmitPass(RegExpCompiler* compiler,

3230 TextEmitPassType pass,	3242 TextEmitPassType pass,

3231 bool preloaded,	3243 bool preloaded,

3232 Trace* trace,	3244 Trace* trace,

3233 bool first_element_checked,	3245 bool first_element_checked,

3234 int* checked_up_to) {	3246 int* checked_up_to) {

3235 RegExpMacroAssembler* assembler = compiler->macro_assembler();	3247 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3236 Isolate* isolate = assembler->isolate();

3237 bool one_byte = compiler->one_byte();	3248 bool one_byte = compiler->one_byte();

3238 Label* backtrack = trace->backtrack();	3249 Label* backtrack = trace->backtrack();

3239 QuickCheckDetails* quick_check = trace->quick_check_performed();	3250 QuickCheckDetails* quick_check = trace->quick_check_performed();

3240 int element_count = elements()->length();	3251 int element_count = elements()->length();

3241 int backward_offset = read_backward() ? -Length() : 0;	3252 int backward_offset = read_backward() ? -Length() : 0;

3242 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {	3253 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {

3243 TextElement elm = elements()->at(i);	3254 TextElement elm = elements()->at(i);

3244 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;	3255 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;

3245 if (elm.text_type() == TextElement::ATOM) {	3256 if (elm.text_type() == TextElement::ATOM) {

3246 Vector<const uc16> quarks = elm.atom()->data();	3257 Vector<const uc16> quarks = elm.atom()->data();

3247 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {	3258 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {

3248 if (first_element_checked && i == 0 && j == 0) continue;	3259 if (first_element_checked && i == 0 && j == 0) continue;

3249 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;	3260 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;

3250 EmitCharacterFunction* emit_function = NULL;	3261 EmitCharacterFunction* emit_function = NULL;

3251 switch (pass) {	3262 switch (pass) {

3252 case NON_LATIN1_MATCH:	3263 case NON_LATIN1_MATCH:

3253 DCHECK(one_byte);	3264 DCHECK(one_byte);

	3265 DCHECK(!(compiler->unicode() && compiler->ignore_case()));

3254 if (quarks[j] > String::kMaxOneByteCharCode) {	3266 if (quarks[j] > String::kMaxOneByteCharCode) {

3255 assembler->GoTo(backtrack);	3267 assembler->GoTo(backtrack);

3256 return;	3268 return;

3257 }	3269 }

3258 break;	3270 break;

3259 case NON_LETTER_CHARACTER_MATCH:	3271 case NON_LETTER_CHARACTER_MATCH:

3260 emit_function = &EmitAtomNonLetter;	3272 emit_function = &EmitAtomNonLetter;

3261 break;	3273 break;

3262 case SIMPLE_CHARACTER_MATCH:	3274 case SIMPLE_CHARACTER_MATCH:

3263 emit_function = &EmitSimpleCharacter;	3275 emit_function = &EmitSimpleCharacter;

3264 break;	3276 break;

3265 case CASE_CHARACTER_MATCH:	3277 case CASE_CHARACTER_MATCH:

3266 emit_function = &EmitAtomLetter;	3278 emit_function = &EmitAtomLetter;

3267 break;	3279 break;

3268 default:	3280 default:

3269 break;	3281 break;

3270 }	3282 }

3271 if (emit_function != NULL) {	3283 if (emit_function != NULL) {

3272 bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();	3284 bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();

3273 bool bound_checked =	3285 bool bound_checked =

3274 emit_function(isolate, compiler, quarks[j], backtrack,	3286 emit_function(compiler, quarks[j], backtrack, cp_offset + j,

3275 cp_offset + j, bounds_check, preloaded);	3287 bounds_check, preloaded);

3276 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);	3288 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);

3277 }	3289 }

3278 }	3290 }

3279 } else {	3291 } else {

3280 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());	3292 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());

3281 if (pass == CHARACTER_CLASS_MATCH) {	3293 if (pass == CHARACTER_CLASS_MATCH) {

3282 if (first_element_checked && i == 0) continue;	3294 if (first_element_checked && i == 0) continue;

3283 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;	3295 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;

3284 RegExpCharacterClass* cc = elm.char_class();	3296 RegExpCharacterClass* cc = elm.char_class();

3285 bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();	3297 bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3348 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {	3360 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {

3349 LimitResult limit_result = LimitVersions(compiler, trace);	3361 LimitResult limit_result = LimitVersions(compiler, trace);

3350 if (limit_result == DONE) return;	3362 if (limit_result == DONE) return;

3351 DCHECK(limit_result == CONTINUE);	3363 DCHECK(limit_result == CONTINUE);

3352	3364

3353 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {	3365 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {

3354 compiler->SetRegExpTooBig();	3366 compiler->SetRegExpTooBig();

3355 return;	3367 return;

3356 }	3368 }

3357	3369

3358 if (compiler->one_byte()) {	3370 if (compiler->one_byte() &&

	3371 !(compiler->unicode() && compiler->ignore_case())) {

	3372 // If any character within the text node is outside the Latin1 range, it

	3373 // cannot possibly match anything in a one-byte string. This still holds

	3374 // for case-insensitive non-unicode regexp patterns. However, for

	3375 // case-insensitive unicode regexp patterns, this is no longer true, e.g.

	3376 // /\u212b/ui matches "\u00c5".

3359 int dummy = 0;	3377 int dummy = 0;

3360 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);	3378 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);

3361 }	3379 }

3362	3380

3363 bool first_elt_done = false;	3381 bool first_elt_done = false;

3364 int bound_checked_to = trace->cp_offset() - 1;	3382 int bound_checked_to = trace->cp_offset() - 1;

3365 bound_checked_to += trace->bound_checked_up_to();	3383 bound_checked_to += trace->bound_checked_up_to();

3366	3384

3367 // If a character is preloaded into the current character register then	3385 // If a character is preloaded into the current character register then

3368 // check that now.	3386 // check that now.

(...skipping 731 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4100	4118

4101 // Really we should be creating a new trace when we execute this function,	4119 // Really we should be creating a new trace when we execute this function,

4102 // but there is no need, because the code it generates cannot backtrack, and	4120 // but there is no need, because the code it generates cannot backtrack, and

4103 // we always arrive here with a trivial trace (since it's the entry to a	4121 // we always arrive here with a trivial trace (since it's the entry to a

4104 // loop. That also implies that there are no preloaded characters, which is	4122 // loop. That also implies that there are no preloaded characters, which is

4105 // good, because it means we won't be violating any assumptions by	4123 // good, because it means we won't be violating any assumptions by

4106 // overwriting those characters with new load instructions.	4124 // overwriting those characters with new load instructions.

4107 DCHECK(trace->is_trivial());	4125 DCHECK(trace->is_trivial());

4108	4126

4109 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	4127 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

4110 Isolate* isolate = macro_assembler->isolate();

4111 // At this point we know that we are at a non-greedy loop that will eat	4128 // At this point we know that we are at a non-greedy loop that will eat

4112 // any character one at a time. Any non-anchored regexp has such a	4129 // any character one at a time. Any non-anchored regexp has such a

4113 // loop prepended to it in order to find where it starts. We look for	4130 // loop prepended to it in order to find where it starts. We look for

4114 // a pattern of the form ...abc... where we can look 6 characters ahead	4131 // a pattern of the form ...abc... where we can look 6 characters ahead

4115 // and step forwards 3 if the character is not one of abc. Abc need	4132 // and step forwards 3 if the character is not one of abc. Abc need

4116 // not be atoms, they can be any reasonably limited character class or	4133 // not be atoms, they can be any reasonably limited character class or

4117 // small alternation.	4134 // small alternation.

4118 BoyerMooreLookahead* bm = bm_info(false);	4135 BoyerMooreLookahead* bm = bm_info(false);

4119 if (bm == NULL) {	4136 if (bm == NULL) {

4120 eats_at_least = Min(kMaxLookaheadForBoyerMoore,	4137 eats_at_least = Min(kMaxLookaheadForBoyerMoore,

4121 EatsAtLeast(kMaxLookaheadForBoyerMoore,	4138 EatsAtLeast(kMaxLookaheadForBoyerMoore,

4122 kRecursionBudget,	4139 kRecursionBudget,

4123 false));	4140 false));

4124 if (eats_at_least >= 1) {	4141 if (eats_at_least >= 1) {

4125 bm = new(zone()) BoyerMooreLookahead(eats_at_least,	4142 bm = new(zone()) BoyerMooreLookahead(eats_at_least,

4126 compiler,	4143 compiler,

4127 zone());	4144 zone());

4128 GuardedAlternative alt0 = alternatives_->at(0);	4145 GuardedAlternative alt0 = alternatives_->at(0);

4129 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);	4146 alt0.node()->FillInBMInfo(compiler, 0, kRecursionBudget, bm, false);

4130 }	4147 }

4131 }	4148 }

4132 if (bm != NULL) {	4149 if (bm != NULL) {

4133 bm->EmitSkipInstructions(macro_assembler);	4150 bm->EmitSkipInstructions(macro_assembler);

4134 }	4151 }

4135 return eats_at_least;	4152 return eats_at_least;

4136 }	4153 }

4137	4154

4138	4155

4139 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,	4156 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,

(...skipping 2241 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6381	6398

6382 void Analysis::VisitBackReference(BackReferenceNode* that) {	6399 void Analysis::VisitBackReference(BackReferenceNode* that) {

6383 EnsureAnalyzed(that->on_success());	6400 EnsureAnalyzed(that->on_success());

6384 }	6401 }

6385	6402

6386	6403

6387 void Analysis::VisitAssertion(AssertionNode* that) {	6404 void Analysis::VisitAssertion(AssertionNode* that) {

6388 EnsureAnalyzed(that->on_success());	6405 EnsureAnalyzed(that->on_success());

6389 }	6406 }

6390	6407

6391	6408 void BackReferenceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,

6392 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,	6409 int budget, BoyerMooreLookahead* bm,

6393 BoyerMooreLookahead* bm,

6394 bool not_at_start) {	6410 bool not_at_start) {

6395 // Working out the set of characters that a backreference can match is too	6411 // Working out the set of characters that a backreference can match is too

6396 // hard, so we just say that any character can match.	6412 // hard, so we just say that any character can match.

6397 bm->SetRest(offset);	6413 bm->SetRest(offset);

6398 SaveBMInfo(bm, not_at_start, offset);	6414 SaveBMInfo(bm, not_at_start, offset);

6399 }	6415 }

6400	6416

6401	6417

6402 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==	6418 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==

6403 RegExpMacroAssembler::kTableSize);	6419 RegExpMacroAssembler::kTableSize);

6404	6420

6405	6421 void ChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,

6406 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

6407 BoyerMooreLookahead* bm, bool not_at_start) {	6422 BoyerMooreLookahead* bm, bool not_at_start) {

6408 ZoneList<GuardedAlternative>* alts = alternatives();	6423 ZoneList<GuardedAlternative>* alts = alternatives();

6409 budget = (budget - 1) / alts->length();	6424 budget = (budget - 1) / alts->length();

6410 for (int i = 0; i < alts->length(); i++) {	6425 for (int i = 0; i < alts->length(); i++) {

6411 GuardedAlternative& alt = alts->at(i);	6426 GuardedAlternative& alt = alts->at(i);

6412 if (alt.guards() != NULL && alt.guards()->length() != 0) {	6427 if (alt.guards() != NULL && alt.guards()->length() != 0) {

6413 bm->SetRest(offset); // Give up trying to fill in info.	6428 bm->SetRest(offset); // Give up trying to fill in info.

6414 SaveBMInfo(bm, not_at_start, offset);	6429 SaveBMInfo(bm, not_at_start, offset);

6415 return;	6430 return;

6416 }	6431 }

6417 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);	6432 alt.node()->FillInBMInfo(compiler, offset, budget, bm, not_at_start);

6418 }	6433 }

6419 SaveBMInfo(bm, not_at_start, offset);	6434 SaveBMInfo(bm, not_at_start, offset);

6420 }	6435 }

6421	6436

6422	6437 void TextNode::FillInBMInfo(RegExpCompiler* compiler, int initial_offset,

6423 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,	6438 int budget, BoyerMooreLookahead* bm,

6424 BoyerMooreLookahead* bm, bool not_at_start) {	6439 bool not_at_start) {

6425 if (initial_offset >= bm->length()) return;	6440 if (initial_offset >= bm->length()) return;

6426 int offset = initial_offset;	6441 int offset = initial_offset;

6427 int max_char = bm->max_char();	6442 int max_char = bm->max_char();

6428 for (int i = 0; i < elements()->length(); i++) {	6443 for (int i = 0; i < elements()->length(); i++) {

6429 if (offset >= bm->length()) {	6444 if (offset >= bm->length()) {

6430 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6445 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6431 return;	6446 return;

6432 }	6447 }

6433 TextElement text = elements()->at(i);	6448 TextElement text = elements()->at(i);

6434 if (text.text_type() == TextElement::ATOM) {	6449 if (text.text_type() == TextElement::ATOM) {

6435 RegExpAtom* atom = text.atom();	6450 RegExpAtom* atom = text.atom();

6436 for (int j = 0; j < atom->length(); j++, offset++) {	6451 for (int j = 0; j < atom->length(); j++, offset++) {

6437 if (offset >= bm->length()) {	6452 if (offset >= bm->length()) {

6438 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6453 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6439 return;	6454 return;

6440 }	6455 }

6441 uc16 character = atom->data()[j];	6456 uc16 character = atom->data()[j];

6442 if (bm->compiler()->ignore_case()) {	6457 if (bm->compiler()->ignore_case()) {

6443 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	6458 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

6444 int length = GetCaseIndependentLetters(	6459 int length = GetCaseIndependentLetters(compiler, character, chars);

6445 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,

6446 chars);

6447 for (int j = 0; j < length; j++) {	6460 for (int j = 0; j < length; j++) {

6448 bm->Set(offset, chars[j]);	6461 bm->Set(offset, chars[j]);

6449 }	6462 }

6450 } else {	6463 } else {

6451 if (character <= max_char) bm->Set(offset, character);	6464 if (character <= max_char) bm->Set(offset, character);

6452 }	6465 }

6453 }	6466 }

6454 } else {	6467 } else {

6455 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());	6468 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());

6456 RegExpCharacterClass* char_class = text.char_class();	6469 RegExpCharacterClass* char_class = text.char_class();

6457 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());	6470 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());

6458 if (char_class->is_negated()) {	6471 if (char_class->is_negated()) {

6459 bm->SetAll(offset);	6472 bm->SetAll(offset);

6460 } else {	6473 } else {

6461 for (int k = 0; k < ranges->length(); k++) {	6474 for (int k = 0; k < ranges->length(); k++) {

6462 CharacterRange& range = ranges->at(k);	6475 CharacterRange& range = ranges->at(k);

6463 if (range.from() > max_char) continue;	6476 if (range.from() > max_char) continue;

6464 int to = Min(max_char, static_cast<int>(range.to()));	6477 int to = Min(max_char, static_cast<int>(range.to()));

6465 bm->SetInterval(offset, Interval(range.from(), to));	6478 bm->SetInterval(offset, Interval(range.from(), to));

6466 }	6479 }

6467 }	6480 }

6468 offset++;	6481 offset++;

6469 }	6482 }

6470 }	6483 }

6471 if (offset >= bm->length()) {	6484 if (offset >= bm->length()) {

6472 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6485 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6473 return;	6486 return;

6474 }	6487 }

6475 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,	6488 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm,

6476 true); // Not at start after a text node.	6489 true); // Not at start after a text node.

6477 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6490 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6478 }	6491 }

6479	6492

6480	6493

6481 // -------------------------------------------------------------------	6494 // -------------------------------------------------------------------

6482 // Dispatch table construction	6495 // Dispatch table construction

6483	6496

6484	6497

6485 void DispatchTableConstructor::VisitEnd(EndNode* that) {	6498 void DispatchTableConstructor::VisitEnd(EndNode* that) {

(...skipping 137 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6623 }	6636 }

6624	6637

6625	6638

6626 RegExpEngine::CompilationResult RegExpEngine::Compile(	6639 RegExpEngine::CompilationResult RegExpEngine::Compile(

6627 Isolate* isolate, Zone* zone, RegExpCompileData* data,	6640 Isolate* isolate, Zone* zone, RegExpCompileData* data,

6628 JSRegExp::Flags flags, Handle<String> pattern,	6641 JSRegExp::Flags flags, Handle<String> pattern,

6629 Handle<String> sample_subject, bool is_one_byte) {	6642 Handle<String> sample_subject, bool is_one_byte) {

6630 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {	6643 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {

6631 return IrregexpRegExpTooBig(isolate);	6644 return IrregexpRegExpTooBig(isolate);

6632 }	6645 }

6633 bool ignore_case = flags & JSRegExp::kIgnoreCase;

6634 bool is_sticky = flags & JSRegExp::kSticky;	6646 bool is_sticky = flags & JSRegExp::kSticky;

6635 bool is_global = flags & JSRegExp::kGlobal;	6647 bool is_global = flags & JSRegExp::kGlobal;

6636 bool is_unicode = flags & JSRegExp::kUnicode;	6648 bool is_unicode = flags & JSRegExp::kUnicode;

6637 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,	6649 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,

6638 is_one_byte);	6650 is_one_byte);

6639	6651

6640 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));	6652 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));

6641	6653

6642 // Sample some characters from the middle of the string.	6654 // Sample some characters from the middle of the string.

6643 static const int kSampleSize = 128;	6655 static const int kSampleSize = 128;

(...skipping 29 matching lines...) Expand all Loading...
6673 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);	6685 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);

6674 first_step_node->AddAlternative(GuardedAlternative(captured_body));	6686 first_step_node->AddAlternative(GuardedAlternative(captured_body));

6675 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(	6687 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(

6676 new (zone) RegExpCharacterClass('*'), false, loop_node)));	6688 new (zone) RegExpCharacterClass('*'), false, loop_node)));

6677 node = first_step_node;	6689 node = first_step_node;

6678 } else {	6690 } else {

6679 node = loop_node;	6691 node = loop_node;

6680 }	6692 }

6681 }	6693 }

6682 if (is_one_byte) {	6694 if (is_one_byte) {

6683 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);	6695 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);

6684 // Do it again to propagate the new nodes to places where they were not	6696 // Do it again to propagate the new nodes to places where they were not

6685 // put because they had not been calculated yet.	6697 // put because they had not been calculated yet.

6686 if (node != NULL) {	6698 if (node != NULL) {

6687 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);	6699 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);

6688 }	6700 }

6689 } else if (compiler.unicode() && (is_global \|\| is_sticky)) {	6701 } else if (compiler.unicode() && (is_global \|\| is_sticky)) {

6690 node = OptionallyStepBackToLeadSurrogate(&compiler, node);	6702 node = OptionallyStepBackToLeadSurrogate(&compiler, node);

6691 }	6703 }

6692	6704

6693 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);	6705 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);

6694 data->node = node;	6706 data->node = node;

6695 Analysis analysis(isolate, flags, is_one_byte);	6707 Analysis analysis(isolate, flags, is_one_byte);

6696 analysis.EnsureAnalyzed(node);	6708 analysis.EnsureAnalyzed(node);

6697 if (analysis.has_failed()) {	6709 if (analysis.has_failed()) {

(...skipping 177 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6875	6887

6876	6888

6877 void RegExpResultsCache::Clear(FixedArray* cache) {	6889 void RegExpResultsCache::Clear(FixedArray* cache) {

6878 for (int i = 0; i < kRegExpResultsCacheSize; i++) {	6890 for (int i = 0; i < kRegExpResultsCacheSize; i++) {

6879 cache->set(i, Smi::FromInt(0));	6891 cache->set(i, Smi::FromInt(0));

6880 }	6892 }

6881 }	6893 }

6882	6894

6883 } // namespace internal	6895 } // namespace internal

6884 } // namespace v8	6896 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »