src/regexp/jsregexp.cc - Issue 1641613002: [regexp] implement /ui to mirror the implementation for /i.

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1641613002: [regexp] implement /ui to mirror the implementation for /i. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/jsregexp.h"	5 #include "src/regexp/jsregexp.h"

6	6

7 #include "src/ast/ast.h"	7 #include "src/ast/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 1570 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1581 macro_assembler->IfRegisterLT(guard->reg(),	1581 macro_assembler->IfRegisterLT(guard->reg(),

1582 guard->value(),	1582 guard->value(),

1583 trace->backtrack());	1583 trace->backtrack());

1584 break;	1584 break;

1585 }	1585 }

1586 }	1586 }

1587	1587

1588	1588

1589 // Returns the number of characters in the equivalence class, omitting those	1589 // Returns the number of characters in the equivalence class, omitting those

1590 // that cannot occur in the source string because it is Latin1.	1590 // that cannot occur in the source string because it is Latin1.

1591 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,	1591 static int GetCaseIndependentLetters(RegExpCompiler* compiler, uc16 character,

1592 bool one_byte_subject,	1592 bool one_byte_subject, uc32* letters) {
	erikcorry 2016/02/01 15:21:40 The compiler knows whether we have a one-byte subj The compiler knows whether we have a one-byte subject, so we don't need that argument any more. Yang 2016/02/02 06:06:48 Done. Show quoted text On 2016/02/01 15:21:40, erikcorry wrote: > The compiler knows whether we have a one-byte subject, so we don't need that > argument any more. Done.
1593 unibrow::uchar* letters) {	1593 int length;

1594 int length =	1594 #ifdef V8_I18N_SUPPORT

1595 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);	1595 if (compiler->unicode()) {

1596 // Unibrow returns 0 or 1 for characters where case independence is	1596 USet* set = uset_open(character, character);

1597 // trivial.	1597 uset_closeOver(set, USET_CASE_INSENSITIVE);

1598 if (length == 0) {	1598 uset_removeAllStrings(set);

1599 letters[0] = character;	1599 length = uset_size(set);

1600 length = 1;	1600 for (int i = 0; i < length; i++) {

	1601 letters[i] = uset_charAt(set, i);

	1602 }

	1603 uset_close(set);

	1604 } else // NOLINT

	1605 // Fallback in case ICU is not included.

	1606 #endif // V8_I18N_SUPPORT

	1607 {

	1608 length = compiler->isolate()->jsregexp_uncanonicalize()->get(character,

	1609 '\0', letters);

	1610 // Unibrow returns 0 or 1 for characters where case independence is

	1611 // trivial.

	1612 if (length == 0) {

	1613 letters[0] = character;

	1614 length = 1;

	1615 }

1601 }	1616 }

1602	1617

1603 if (one_byte_subject) {	1618 if (one_byte_subject) {

1604 int new_length = 0;	1619 int new_length = 0;

1605 for (int i = 0; i < length; i++) {	1620 for (int i = 0; i < length; i++) {

1606 if (letters[i] <= String::kMaxOneByteCharCode) {	1621 if (letters[i] <= String::kMaxOneByteCharCode) {

1607 letters[new_length++] = letters[i];	1622 letters[new_length++] = letters[i];

1608 }	1623 }

1609 }	1624 }

1610 length = new_length;	1625 length = new_length;

1611 }	1626 }

1612	1627

1613 return length;	1628 return length;

1614 }	1629 }

1615	1630

1616	1631 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler, uc16 c,

1617 static inline bool EmitSimpleCharacter(Isolate* isolate,	1632 Label* on_failure, int cp_offset,

1618 RegExpCompiler* compiler,	1633 bool check, bool preloaded) {

1619 uc16 c,

1620 Label* on_failure,

1621 int cp_offset,

1622 bool check,

1623 bool preloaded) {

1624 RegExpMacroAssembler* assembler = compiler->macro_assembler();	1634 RegExpMacroAssembler* assembler = compiler->macro_assembler();

1625 bool bound_checked = false;	1635 bool bound_checked = false;

1626 if (!preloaded) {	1636 if (!preloaded) {

1627 assembler->LoadCurrentCharacter(	1637 assembler->LoadCurrentCharacter(

1628 cp_offset,	1638 cp_offset,

1629 on_failure,	1639 on_failure,

1630 check);	1640 check);

1631 bound_checked = true;	1641 bound_checked = true;

1632 }	1642 }

1633 assembler->CheckNotCharacter(c, on_failure);	1643 assembler->CheckNotCharacter(c, on_failure);

1634 return bound_checked;	1644 return bound_checked;

1635 }	1645 }

1636	1646

1637	1647

1638 // Only emits non-letters (things that don't have case). Only used for case	1648 // Only emits non-letters (things that don't have case). Only used for case

1639 // independent matches.	1649 // independent matches.

1640 static inline bool EmitAtomNonLetter(Isolate* isolate,	1650 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler, uc16 c,

1641 RegExpCompiler* compiler,	1651 Label* on_failure, int cp_offset,

1642 uc16 c,	1652 bool check, bool preloaded) {

1643 Label* on_failure,

1644 int cp_offset,

1645 bool check,

1646 bool preloaded) {

1647 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	1653 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

1648 bool one_byte = compiler->one_byte();	1654 bool one_byte = compiler->one_byte();

1649 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1655 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1650 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);	1656 int length = GetCaseIndependentLetters(compiler, c, one_byte, chars);

1651 if (length < 1) {	1657 if (length < 1) {

1652 // This can't match. Must be an one-byte subject and a non-one-byte	1658 // This can't match. Must be an one-byte subject and a non-one-byte

1653 // character. We do not need to do anything since the one-byte pass	1659 // character. We do not need to do anything since the one-byte pass

1654 // already handled this.	1660 // already handled this.

1655 return false; // Bounds not checked.	1661 return false; // Bounds not checked.

1656 }	1662 }

1657 bool checked = false;	1663 bool checked = false;

1658 // We handle the length > 1 case in a later pass.	1664 // We handle the length > 1 case in a later pass.

1659 if (length == 1) {	1665 if (length == 1) {

1660 if (one_byte && c > String::kMaxOneByteCharCodeU) {
erikcorry 2016/02/01 15:21:40 I think this is still worth having. For these cha I think this is still worth having. For these characters we will emit an unconditional backtrack elsewhere (they can never match on a one-byte subject string) so it is just bloat to emit code here to load and check them. We could do more to avoid outputting code in that case, but this looks like an OK thing to have here. Yang 2016/02/02 06:06:48 I'm confused here. Wouldn't the FilterOneByte pass Show quoted text On 2016/02/01 15:21:40, erikcorry wrote: > I think this is still worth having. For these characters we will emit an > unconditional backtrack elsewhere (they can never match on a one-byte subject > string) so it is just bloat to emit code here to load and check them. > > We could do more to avoid outputting code in that case, but this looks like an > OK thing to have here. I'm confused here. Wouldn't the FilterOneByte pass remove this case before we reach here, unless FilterOneByte gives up due to too deep recursion? I put this line back just in case you disagree.
1661 // Can't match - see above.

1662 return false; // Bounds not checked.

1663 }

1664 if (!preloaded) {	1666 if (!preloaded) {

1665 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1667 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1666 checked = check;	1668 checked = check;

1667 }	1669 }

1668 macro_assembler->CheckNotCharacter(c, on_failure);	1670 macro_assembler->CheckNotCharacter(c, on_failure);

1669 }	1671 }

1670 return checked;	1672 return checked;

1671 }	1673 }

1672	1674

1673	1675

(...skipping 26 matching lines...) Expand all Loading...
1700 uc16 mask = char_mask ^ diff;	1702 uc16 mask = char_mask ^ diff;

1701 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,	1703 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,

1702 diff,	1704 diff,

1703 mask,	1705 mask,

1704 on_failure);	1706 on_failure);

1705 return true;	1707 return true;

1706 }	1708 }

1707 return false;	1709 return false;

1708 }	1710 }

1709	1711

1710	1712 typedef bool EmitCharacterFunction(RegExpCompiler* compiler, uc16 c,

1711 typedef bool EmitCharacterFunction(Isolate* isolate,	1713 Label* on_failure, int cp_offset, bool check,

1712 RegExpCompiler* compiler,

1713 uc16 c,

1714 Label* on_failure,

1715 int cp_offset,

1716 bool check,

1717 bool preloaded);	1714 bool preloaded);

1718	1715

1719 // Only emits letters (things that have case). Only used for case independent	1716 // Only emits letters (things that have case). Only used for case independent

1720 // matches.	1717 // matches.

1721 static inline bool EmitAtomLetter(Isolate* isolate,	1718 static inline bool EmitAtomLetter(RegExpCompiler* compiler, uc16 c,

1722 RegExpCompiler* compiler,	1719 Label* on_failure, int cp_offset, bool check,

1723 uc16 c,

1724 Label* on_failure,

1725 int cp_offset,

1726 bool check,

1727 bool preloaded) {	1720 bool preloaded) {

1728 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	1721 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

1729 bool one_byte = compiler->one_byte();	1722 bool one_byte = compiler->one_byte();

1730 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	1723 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1731 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);	1724 int length = GetCaseIndependentLetters(compiler, c, one_byte, chars);

1732 if (length <= 1) return false;	1725 if (length <= 1) return false;

1733 // We may not need to check against the end of the input string	1726 // We may not need to check against the end of the input string

1734 // if this character lies before a character that matched.	1727 // if this character lies before a character that matched.

1735 if (!preloaded) {	1728 if (!preloaded) {

1736 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);	1729 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1737 }	1730 }

1738 Label ok;	1731 Label ok;

1739 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);	1732 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

1740 switch (length) {	1733 switch (length) {

1741 case 2: {	1734 case 2: {

(...skipping 528 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2270 int ActionNode::EatsAtLeast(int still_to_find,	2263 int ActionNode::EatsAtLeast(int still_to_find,

2271 int budget,	2264 int budget,

2272 bool not_at_start) {	2265 bool not_at_start) {

2273 if (budget <= 0) return 0;	2266 if (budget <= 0) return 0;

2274 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!	2267 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!

2275 return on_success()->EatsAtLeast(still_to_find,	2268 return on_success()->EatsAtLeast(still_to_find,

2276 budget - 1,	2269 budget - 1,

2277 not_at_start);	2270 not_at_start);

2278 }	2271 }

2279	2272

2280	2273 void ActionNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,

2281 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

2282 BoyerMooreLookahead* bm, bool not_at_start) {	2274 BoyerMooreLookahead* bm, bool not_at_start) {

2283 if (action_type_ == BEGIN_SUBMATCH) {	2275 if (action_type_ == BEGIN_SUBMATCH) {

2284 bm->SetRest(offset);	2276 bm->SetRest(offset);

2285 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {	2277 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {

2286 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);	2278 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);

2287 }	2279 }

2288 SaveBMInfo(bm, not_at_start, offset);	2280 SaveBMInfo(bm, not_at_start, offset);

2289 }	2281 }

2290	2282

2291	2283

2292 int AssertionNode::EatsAtLeast(int still_to_find,	2284 int AssertionNode::EatsAtLeast(int still_to_find,

2293 int budget,	2285 int budget,

2294 bool not_at_start) {	2286 bool not_at_start) {

2295 if (budget <= 0) return 0;	2287 if (budget <= 0) return 0;

2296 // If we know we are not at the start and we are asked "how many characters	2288 // If we know we are not at the start and we are asked "how many characters

2297 // will you match if you succeed?" then we can answer anything since false	2289 // will you match if you succeed?" then we can answer anything since false

2298 // implies false. So lets just return the max answer (still_to_find) since	2290 // implies false. So lets just return the max answer (still_to_find) since

2299 // that won't prevent us from preloading a lot of characters for the other	2291 // that won't prevent us from preloading a lot of characters for the other

2300 // branches in the node graph.	2292 // branches in the node graph.

2301 if (assertion_type() == AT_START && not_at_start) return still_to_find;	2293 if (assertion_type() == AT_START && not_at_start) return still_to_find;

2302 return on_success()->EatsAtLeast(still_to_find,	2294 return on_success()->EatsAtLeast(still_to_find,

2303 budget - 1,	2295 budget - 1,

2304 not_at_start);	2296 not_at_start);

2305 }	2297 }

2306	2298

2307	2299 void AssertionNode::FillInBMInfo(RegExpCompiler* compiler, int offset,

2308 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,	2300 int budget, BoyerMooreLookahead* bm,

2309 BoyerMooreLookahead* bm, bool not_at_start) {	2301 bool not_at_start) {

2310 // Match the behaviour of EatsAtLeast on this node.	2302 // Match the behaviour of EatsAtLeast on this node.

2311 if (assertion_type() == AT_START && not_at_start) return;	2303 if (assertion_type() == AT_START && not_at_start) return;

2312 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);	2304 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);

2313 SaveBMInfo(bm, not_at_start, offset);	2305 SaveBMInfo(bm, not_at_start, offset);

2314 }	2306 }

2315	2307

2316	2308

2317 int BackReferenceNode::EatsAtLeast(int still_to_find,	2309 int BackReferenceNode::EatsAtLeast(int still_to_find,

2318 int budget,	2310 int budget,

2319 bool not_at_start) {	2311 bool not_at_start) {

2320 if (read_backward()) return 0;	2312 if (read_backward()) return 0;

2321 if (budget <= 0) return 0;	2313 if (budget <= 0) return 0;

2322 return on_success()->EatsAtLeast(still_to_find,	2314 return on_success()->EatsAtLeast(still_to_find,

(...skipping 193 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2516 // The masks and values for the positions will be combined into a single	2508 // The masks and values for the positions will be combined into a single

2517 // machine word for the current character width in order to be used in	2509 // machine word for the current character width in order to be used in

2518 // generating a quick check.	2510 // generating a quick check.

2519 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,	2511 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,

2520 RegExpCompiler* compiler,	2512 RegExpCompiler* compiler,

2521 int characters_filled_in,	2513 int characters_filled_in,

2522 bool not_at_start) {	2514 bool not_at_start) {

2523 // Do not collect any quick check details if the text node reads backward,	2515 // Do not collect any quick check details if the text node reads backward,

2524 // since it reads in the opposite direction than we use for quick checks.	2516 // since it reads in the opposite direction than we use for quick checks.

2525 if (read_backward()) return;	2517 if (read_backward()) return;

2526 Isolate* isolate = compiler->macro_assembler()->isolate();

2527 DCHECK(characters_filled_in < details->characters());	2518 DCHECK(characters_filled_in < details->characters());

2528 int characters = details->characters();	2519 int characters = details->characters();

2529 int char_mask;	2520 int char_mask;

2530 if (compiler->one_byte()) {	2521 if (compiler->one_byte()) {

2531 char_mask = String::kMaxOneByteCharCode;	2522 char_mask = String::kMaxOneByteCharCode;

2532 } else {	2523 } else {

2533 char_mask = String::kMaxUtf16CodeUnit;	2524 char_mask = String::kMaxUtf16CodeUnit;

2534 }	2525 }

2535 for (int k = 0; k < elements()->length(); k++) {	2526 for (int k = 0; k < elements()->length(); k++) {

2536 TextElement elm = elements()->at(k);	2527 TextElement elm = elements()->at(k);

2537 if (elm.text_type() == TextElement::ATOM) {	2528 if (elm.text_type() == TextElement::ATOM) {

2538 Vector<const uc16> quarks = elm.atom()->data();	2529 Vector<const uc16> quarks = elm.atom()->data();

2539 for (int i = 0; i < characters && i < quarks.length(); i++) {	2530 for (int i = 0; i < characters && i < quarks.length(); i++) {

2540 QuickCheckDetails::Position* pos =	2531 QuickCheckDetails::Position* pos =

2541 details->positions(characters_filled_in);	2532 details->positions(characters_filled_in);

2542 uc16 c = quarks[i];	2533 uc16 c = quarks[i];

2543 if (compiler->ignore_case()) {	2534 if (compiler->ignore_case()) {

2544 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	2535 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

2545 int length = GetCaseIndependentLetters(isolate, c,	2536 int length = GetCaseIndependentLetters(compiler, c,

2546 compiler->one_byte(), chars);	2537 compiler->one_byte(), chars);

2547 if (length == 0) {	2538 if (length == 0) {

2548 // This can happen because all case variants are non-Latin1, but we	2539 // This can happen because all case variants are non-Latin1, but we

2549 // know the input is Latin1.	2540 // know the input is Latin1.

2550 details->set_cannot_match();	2541 details->set_cannot_match();

2551 pos->determines_perfectly = false;	2542 pos->determines_perfectly = false;

2552 return;	2543 return;

2553 }	2544 }

2554 if (length == 1) {	2545 if (length == 1) {

2555 // This letter has no case equivalents, so it's nice and simple	2546 // This letter has no case equivalents, so it's nice and simple

(...skipping 185 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2741 DCHECK(!info->visited);	2732 DCHECK(!info->visited);

2742 info->visited = true;	2733 info->visited = true;

2743 }	2734 }

2744 ~VisitMarker() {	2735 ~VisitMarker() {

2745 info_->visited = false;	2736 info_->visited = false;

2746 }	2737 }

2747 private:	2738 private:

2748 NodeInfo* info_;	2739 NodeInfo* info_;

2749 };	2740 };

2750	2741

2751	2742 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpCompiler* compiler) {

2752 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {

2753 if (info()->replacement_calculated) return replacement();	2743 if (info()->replacement_calculated) return replacement();

2754 if (depth < 0) return this;	2744 if (depth < 0) return this;

2755 DCHECK(!info()->visited);	2745 DCHECK(!info()->visited);

2756 VisitMarker marker(info());	2746 VisitMarker marker(info());

2757 return FilterSuccessor(depth - 1, ignore_case);	2747 return FilterSuccessor(depth - 1, compiler);

2758 }	2748 }

2759	2749

2760	2750 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth,

2761 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {	2751 RegExpCompiler* compiler) {

2762 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);	2752 RegExpNode* next = on_success_->FilterOneByte(depth - 1, compiler);

2763 if (next == NULL) return set_replacement(NULL);	2753 if (next == NULL) return set_replacement(NULL);

2764 on_success_ = next;	2754 on_success_ = next;

2765 return set_replacement(this);	2755 return set_replacement(this);

2766 }	2756 }

2767	2757

2768	2758

2769 // We need to check for the following characters: 0x39c 0x3bc 0x178.	2759 // We need to check for the following characters: 0x39c 0x3bc 0x178.

2770 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {	2760 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {

2771 // TODO(dcarney): this could be a lot more efficient.	2761 // TODO(dcarney): this could be a lot more efficient.

2772 return range.Contains(0x39c) \|\|	2762 return range.Contains(0x39c) \|\|

2773 range.Contains(0x3bc) \|\| range.Contains(0x178);	2763 range.Contains(0x3bc) \|\| range.Contains(0x178);

2774 }	2764 }

2775	2765

2776	2766

2777 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {	2767 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {

2778 for (int i = 0; i < ranges->length(); i++) {	2768 for (int i = 0; i < ranges->length(); i++) {

2779 // TODO(dcarney): this could be a lot more efficient.	2769 // TODO(dcarney): this could be a lot more efficient.

2780 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;	2770 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;

2781 }	2771 }

2782 return false;	2772 return false;

2783 }	2773 }

2784	2774

	2775 static uc16 ConvertNonLatin1ToEquivalentLatin1(bool unicode, uc16 c) {

	2776 #ifdef V8_I18N_SUPPORT

	2777 if (unicode) {

	2778 USet* set = uset_open(c, c);

	2779 uset_closeOver(set, USET_CASE_INSENSITIVE);

	2780 uset_removeAllStrings(set);

	2781 int length = uset_size(set);

	2782 uc16 result = 0;

	2783 for (int i = 0; i < length; i++) {

	2784 uc32 c = uset_charAt(set, i);

	2785 if (c <= String::kMaxOneByteCharCode) {

	2786 result = static_cast<uc16>(c);

	2787 break;

	2788 }

	2789 }

	2790 uset_close(set);

	2791 return result;

	2792 }

	2793 // Fallback to unibrow if ICU is not included.

	2794 #endif // V8_I18N_SUPPORT

	2795 return unibrow::Latin1::ConvertNonLatin1ToLatin1(c);

	2796 }

2785	2797

2786 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {	2798 RegExpNode* TextNode::FilterOneByte(int depth, RegExpCompiler* compiler) {

2787 if (info()->replacement_calculated) return replacement();	2799 if (info()->replacement_calculated) return replacement();

2788 if (depth < 0) return this;	2800 if (depth < 0) return this;

2789 DCHECK(!info()->visited);	2801 DCHECK(!info()->visited);

2790 VisitMarker marker(info());	2802 VisitMarker marker(info());

2791 int element_count = elements()->length();	2803 int element_count = elements()->length();

2792 for (int i = 0; i < element_count; i++) {	2804 for (int i = 0; i < element_count; i++) {

2793 TextElement elm = elements()->at(i);	2805 TextElement elm = elements()->at(i);

2794 if (elm.text_type() == TextElement::ATOM) {	2806 if (elm.text_type() == TextElement::ATOM) {

2795 Vector<const uc16> quarks = elm.atom()->data();	2807 Vector<const uc16> quarks = elm.atom()->data();

2796 for (int j = 0; j < quarks.length(); j++) {	2808 for (int j = 0; j < quarks.length(); j++) {

2797 uint16_t c = quarks[j];	2809 uc16 c = quarks[j];

2798 if (c <= String::kMaxOneByteCharCode) continue;	2810 if (c <= String::kMaxOneByteCharCode) continue;

2799 if (!ignore_case) return set_replacement(NULL);	2811 if (!compiler->ignore_case()) return set_replacement(NULL);

2800 // Here, we need to check for characters whose upper and lower cases	2812 // Here, we need to check for characters whose upper and lower cases

2801 // are outside the Latin-1 range.	2813 // are outside the Latin-1 range.

2802 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);	2814 uc16 converted =

	2815 ConvertNonLatin1ToEquivalentLatin1(compiler->unicode(), c);

2803 // Character is outside Latin-1 completely	2816 // Character is outside Latin-1 completely

2804 if (converted == 0) return set_replacement(NULL);	2817 if (converted == 0) return set_replacement(NULL);

2805 // Convert quark to Latin-1 in place.	2818 // Convert quark to Latin-1 in place.

2806 uint16_t* copy = const_cast<uint16_t*>(quarks.start());	2819 uc16* copy = const_cast<uc16*>(quarks.start());

2807 copy[j] = converted;	2820 copy[j] = converted;

2808 }	2821 }

2809 } else {	2822 } else {

2810 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);	2823 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);

2811 RegExpCharacterClass* cc = elm.char_class();	2824 RegExpCharacterClass* cc = elm.char_class();

2812 ZoneList<CharacterRange>* ranges = cc->ranges(zone());	2825 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

2813 CharacterRange::Canonicalize(ranges);	2826 CharacterRange::Canonicalize(ranges);

2814 // Now they are in order so we only need to look at the first.	2827 // Now they are in order so we only need to look at the first.

2815 int range_count = ranges->length();	2828 int range_count = ranges->length();

2816 if (cc->is_negated()) {	2829 if (cc->is_negated()) {

2817 if (range_count != 0 &&	2830 if (range_count != 0 &&

2818 ranges->at(0).from() == 0 &&	2831 ranges->at(0).from() == 0 &&

2819 ranges->at(0).to() >= String::kMaxOneByteCharCode) {	2832 ranges->at(0).to() >= String::kMaxOneByteCharCode) {

2820 // This will be handled in a later filter.	2833 // This will be handled in a later filter.

2821 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;	2834 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))

	2835 continue;

2822 return set_replacement(NULL);	2836 return set_replacement(NULL);

2823 }	2837 }

2824 } else {	2838 } else {

2825 if (range_count == 0 \|\|	2839 if (range_count == 0 \|\|

2826 ranges->at(0).from() > String::kMaxOneByteCharCode) {	2840 ranges->at(0).from() > String::kMaxOneByteCharCode) {

2827 // This will be handled in a later filter.	2841 // This will be handled in a later filter.

2828 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;	2842 if (compiler->ignore_case() && RangesContainLatin1Equivalents(ranges))

	2843 continue;

2829 return set_replacement(NULL);	2844 return set_replacement(NULL);

2830 }	2845 }

2831 }	2846 }

2832 }	2847 }

2833 }	2848 }

2834 return FilterSuccessor(depth - 1, ignore_case);	2849 return FilterSuccessor(depth - 1, compiler);

2835 }	2850 }

2836	2851

2837	2852 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {

2838 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {

2839 if (info()->replacement_calculated) return replacement();	2853 if (info()->replacement_calculated) return replacement();

2840 if (depth < 0) return this;	2854 if (depth < 0) return this;

2841 if (info()->visited) return this;	2855 if (info()->visited) return this;

2842 {	2856 {

2843 VisitMarker marker(info());	2857 VisitMarker marker(info());

2844	2858

2845 RegExpNode* continue_replacement =	2859 RegExpNode* continue_replacement =

2846 continue_node_->FilterOneByte(depth - 1, ignore_case);	2860 continue_node_->FilterOneByte(depth - 1, compiler);

2847 // If we can't continue after the loop then there is no sense in doing the	2861 // If we can't continue after the loop then there is no sense in doing the

2848 // loop.	2862 // loop.

2849 if (continue_replacement == NULL) return set_replacement(NULL);	2863 if (continue_replacement == NULL) return set_replacement(NULL);

2850 }	2864 }

2851	2865

2852 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);	2866 return ChoiceNode::FilterOneByte(depth - 1, compiler);

2853 }	2867 }

2854	2868

2855	2869 RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpCompiler* compiler) {

2856 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {

2857 if (info()->replacement_calculated) return replacement();	2870 if (info()->replacement_calculated) return replacement();

2858 if (depth < 0) return this;	2871 if (depth < 0) return this;

2859 if (info()->visited) return this;	2872 if (info()->visited) return this;

2860 VisitMarker marker(info());	2873 VisitMarker marker(info());

2861 int choice_count = alternatives_->length();	2874 int choice_count = alternatives_->length();

2862	2875

2863 for (int i = 0; i < choice_count; i++) {	2876 for (int i = 0; i < choice_count; i++) {

2864 GuardedAlternative alternative = alternatives_->at(i);	2877 GuardedAlternative alternative = alternatives_->at(i);

2865 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {	2878 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {

2866 set_replacement(this);	2879 set_replacement(this);

2867 return this;	2880 return this;

2868 }	2881 }

2869 }	2882 }

2870	2883

2871 int surviving = 0;	2884 int surviving = 0;

2872 RegExpNode* survivor = NULL;	2885 RegExpNode* survivor = NULL;

2873 for (int i = 0; i < choice_count; i++) {	2886 for (int i = 0; i < choice_count; i++) {

2874 GuardedAlternative alternative = alternatives_->at(i);	2887 GuardedAlternative alternative = alternatives_->at(i);

2875 RegExpNode* replacement =	2888 RegExpNode* replacement =

2876 alternative.node()->FilterOneByte(depth - 1, ignore_case);	2889 alternative.node()->FilterOneByte(depth - 1, compiler);

2877 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.	2890 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.

2878 if (replacement != NULL) {	2891 if (replacement != NULL) {

2879 alternatives_->at(i).set_node(replacement);	2892 alternatives_->at(i).set_node(replacement);

2880 surviving++;	2893 surviving++;

2881 survivor = replacement;	2894 survivor = replacement;

2882 }	2895 }

2883 }	2896 }

2884 if (surviving < 2) return set_replacement(survivor);	2897 if (surviving < 2) return set_replacement(survivor);

2885	2898

2886 set_replacement(this);	2899 set_replacement(this);

2887 if (surviving == choice_count) {	2900 if (surviving == choice_count) {

2888 return this;	2901 return this;

2889 }	2902 }

2890 // Only some of the nodes survived the filtering. We need to rebuild the	2903 // Only some of the nodes survived the filtering. We need to rebuild the

2891 // alternatives list.	2904 // alternatives list.

2892 ZoneList<GuardedAlternative>* new_alternatives =	2905 ZoneList<GuardedAlternative>* new_alternatives =

2893 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());	2906 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());

2894 for (int i = 0; i < choice_count; i++) {	2907 for (int i = 0; i < choice_count; i++) {

2895 RegExpNode* replacement =	2908 RegExpNode* replacement =

2896 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);	2909 alternatives_->at(i).node()->FilterOneByte(depth - 1, compiler);

2897 if (replacement != NULL) {	2910 if (replacement != NULL) {

2898 alternatives_->at(i).set_node(replacement);	2911 alternatives_->at(i).set_node(replacement);

2899 new_alternatives->Add(alternatives_->at(i), zone());	2912 new_alternatives->Add(alternatives_->at(i), zone());

2900 }	2913 }

2901 }	2914 }

2902 alternatives_ = new_alternatives;	2915 alternatives_ = new_alternatives;

2903 return this;	2916 return this;

2904 }	2917 }

2905	2918

2906	2919 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(

2907 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,	2920 int depth, RegExpCompiler* compiler) {

2908 bool ignore_case) {

2909 if (info()->replacement_calculated) return replacement();	2921 if (info()->replacement_calculated) return replacement();

2910 if (depth < 0) return this;	2922 if (depth < 0) return this;

2911 if (info()->visited) return this;	2923 if (info()->visited) return this;

2912 VisitMarker marker(info());	2924 VisitMarker marker(info());

2913 // Alternative 0 is the negative lookahead, alternative 1 is what comes	2925 // Alternative 0 is the negative lookahead, alternative 1 is what comes

2914 // afterwards.	2926 // afterwards.

2915 RegExpNode* node = alternatives_->at(1).node();	2927 RegExpNode* node = alternatives_->at(1).node();

2916 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);	2928 RegExpNode* replacement = node->FilterOneByte(depth - 1, compiler);

2917 if (replacement == NULL) return set_replacement(NULL);	2929 if (replacement == NULL) return set_replacement(NULL);

2918 alternatives_->at(1).set_node(replacement);	2930 alternatives_->at(1).set_node(replacement);

2919	2931

2920 RegExpNode* neg_node = alternatives_->at(0).node();	2932 RegExpNode* neg_node = alternatives_->at(0).node();

2921 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);	2933 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, compiler);

2922 // If the negative lookahead is always going to fail then	2934 // If the negative lookahead is always going to fail then

2923 // we don't need to check it.	2935 // we don't need to check it.

2924 if (neg_replacement == NULL) return set_replacement(replacement);	2936 if (neg_replacement == NULL) return set_replacement(replacement);

2925 alternatives_->at(0).set_node(neg_replacement);	2937 alternatives_->at(0).set_node(neg_replacement);

2926 return set_replacement(this);	2938 return set_replacement(this);

2927 }	2939 }

2928	2940

2929	2941

2930 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	2942 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2931 RegExpCompiler* compiler,	2943 RegExpCompiler* compiler,

2932 int characters_filled_in,	2944 int characters_filled_in,

2933 bool not_at_start) {	2945 bool not_at_start) {

2934 if (body_can_be_zero_length_ \|\| info()->visited) return;	2946 if (body_can_be_zero_length_ \|\| info()->visited) return;

2935 VisitMarker marker(info());	2947 VisitMarker marker(info());

2936 return ChoiceNode::GetQuickCheckDetails(details,	2948 return ChoiceNode::GetQuickCheckDetails(details,

2937 compiler,	2949 compiler,

2938 characters_filled_in,	2950 characters_filled_in,

2939 not_at_start);	2951 not_at_start);

2940 }	2952 }

2941	2953

2942	2954 void LoopChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,

2943 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,	2955 int budget, BoyerMooreLookahead* bm,

2944 BoyerMooreLookahead* bm, bool not_at_start) {	2956 bool not_at_start) {

2945 if (body_can_be_zero_length_ \|\| budget <= 0) {	2957 if (body_can_be_zero_length_ \|\| budget <= 0) {

2946 bm->SetRest(offset);	2958 bm->SetRest(offset);

2947 SaveBMInfo(bm, not_at_start, offset);	2959 SaveBMInfo(bm, not_at_start, offset);

2948 return;	2960 return;

2949 }	2961 }

2950 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);	2962 ChoiceNode::FillInBMInfo(compiler, offset, budget - 1, bm, not_at_start);

2951 SaveBMInfo(bm, not_at_start, offset);	2963 SaveBMInfo(bm, not_at_start, offset);

2952 }	2964 }

2953	2965

2954	2966

2955 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,	2967 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2956 RegExpCompiler* compiler,	2968 RegExpCompiler* compiler,

2957 int characters_filled_in,	2969 int characters_filled_in,

2958 bool not_at_start) {	2970 bool not_at_start) {

2959 not_at_start = (not_at_start \|\| not_at_start_);	2971 not_at_start = (not_at_start \|\| not_at_start_);

2960 int choice_count = alternatives_->length();	2972 int choice_count = alternatives_->length();

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3032 assembler->CheckNotCharacter('\r', new_trace.backtrack());	3044 assembler->CheckNotCharacter('\r', new_trace.backtrack());

3033 }	3045 }

3034 assembler->Bind(&ok);	3046 assembler->Bind(&ok);

3035 on_success->Emit(compiler, &new_trace);	3047 on_success->Emit(compiler, &new_trace);

3036 }	3048 }

3037	3049

3038	3050

3039 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).	3051 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).

3040 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {	3052 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {

3041 RegExpMacroAssembler* assembler = compiler->macro_assembler();	3053 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3042 Isolate* isolate = assembler->isolate();

3043 Trace::TriBool next_is_word_character = Trace::UNKNOWN;	3054 Trace::TriBool next_is_word_character = Trace::UNKNOWN;

3044 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);	3055 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);

3045 BoyerMooreLookahead* lookahead = bm_info(not_at_start);	3056 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

3046 if (lookahead == NULL) {	3057 if (lookahead == NULL) {

3047 int eats_at_least =	3058 int eats_at_least =

3048 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,	3059 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,

3049 kRecursionBudget,	3060 kRecursionBudget,

3050 not_at_start));	3061 not_at_start));

3051 if (eats_at_least >= 1) {	3062 if (eats_at_least >= 1) {

3052 BoyerMooreLookahead* bm =	3063 BoyerMooreLookahead* bm =

3053 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());	3064 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());

3054 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);	3065 FillInBMInfo(compiler, 0, kRecursionBudget, bm, not_at_start);

3055 if (bm->at(0)->is_non_word())	3066 if (bm->at(0)->is_non_word())

3056 next_is_word_character = Trace::FALSE_VALUE;	3067 next_is_word_character = Trace::FALSE_VALUE;

3057 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;	3068 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;

3058 }	3069 }

3059 } else {	3070 } else {

3060 if (lookahead->at(0)->is_non_word())	3071 if (lookahead->at(0)->is_non_word())

3061 next_is_word_character = Trace::FALSE_VALUE;	3072 next_is_word_character = Trace::FALSE_VALUE;

3062 if (lookahead->at(0)->is_word())	3073 if (lookahead->at(0)->is_word())

3063 next_is_word_character = Trace::TRUE_VALUE;	3074 next_is_word_character = Trace::TRUE_VALUE;

3064 }	3075 }

(...skipping 151 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3216 // up to the limit the quick check already checked. In addition the quick	3227 // up to the limit the quick check already checked. In addition the quick

3217 // check can have involved a mask and compare operation which may simplify	3228 // check can have involved a mask and compare operation which may simplify

3218 // or obviate the need for further checks at some character positions.	3229 // or obviate the need for further checks at some character positions.

3219 void TextNode::TextEmitPass(RegExpCompiler* compiler,	3230 void TextNode::TextEmitPass(RegExpCompiler* compiler,

3220 TextEmitPassType pass,	3231 TextEmitPassType pass,

3221 bool preloaded,	3232 bool preloaded,

3222 Trace* trace,	3233 Trace* trace,

3223 bool first_element_checked,	3234 bool first_element_checked,

3224 int* checked_up_to) {	3235 int* checked_up_to) {

3225 RegExpMacroAssembler* assembler = compiler->macro_assembler();	3236 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3226 Isolate* isolate = assembler->isolate();

3227 bool one_byte = compiler->one_byte();	3237 bool one_byte = compiler->one_byte();

3228 Label* backtrack = trace->backtrack();	3238 Label* backtrack = trace->backtrack();

3229 QuickCheckDetails* quick_check = trace->quick_check_performed();	3239 QuickCheckDetails* quick_check = trace->quick_check_performed();

3230 int element_count = elements()->length();	3240 int element_count = elements()->length();

3231 int backward_offset = read_backward() ? -Length() : 0;	3241 int backward_offset = read_backward() ? -Length() : 0;

3232 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {	3242 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {

3233 TextElement elm = elements()->at(i);	3243 TextElement elm = elements()->at(i);

3234 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;	3244 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;

3235 if (elm.text_type() == TextElement::ATOM) {	3245 if (elm.text_type() == TextElement::ATOM) {

3236 Vector<const uc16> quarks = elm.atom()->data();	3246 Vector<const uc16> quarks = elm.atom()->data();

3237 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {	3247 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {

3238 if (first_element_checked && i == 0 && j == 0) continue;	3248 if (first_element_checked && i == 0 && j == 0) continue;

3239 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;	3249 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;

3240 EmitCharacterFunction* emit_function = NULL;	3250 EmitCharacterFunction* emit_function = NULL;

3241 switch (pass) {	3251 switch (pass) {

3242 case NON_LATIN1_MATCH:	3252 case NON_LATIN1_MATCH:

3243 DCHECK(one_byte);	3253 DCHECK(one_byte);

3244 if (quarks[j] > String::kMaxOneByteCharCode) {	3254 if (quarks[j] > String::kMaxOneByteCharCode &&

	3255 !(compiler->unicode() && compiler->ignore_case())) {

	3256 // In the non-unicode case, if the pattern is non-Latin1, it
	erikcorry 2016/02/01 15:21:40 I think you mean in the one-byte case (non-unicode I think you mean in the one-byte case (non-unicode sounds like you mean in the absence of the /u flag, but that doesn't fit with the rest of the sentence). Yang 2016/02/02 06:06:48 I did mean this. With /i, if a character is outsid Show quoted text On 2016/02/01 15:21:40, erikcorry wrote: > I think you mean in the one-byte case (non-unicode sounds like you mean in the > absence of the /u flag, but that doesn't fit with the rest of the sentence). I did mean this. With /i, if a character is outside of the latin1 range, it cannot match a character inside the latin1 range, so we had this optimization here. With /ui, we have to be careful not to miss the case where \u212b matches \u00c5. Changed the comment to be more clear about this.
	3257 // cannot possibly match a Latin1 character, unless we have the

	3258 // /ui flags, e.g. /\u212b/ui matches "\u00c5".

3245 assembler->GoTo(backtrack);	3259 assembler->GoTo(backtrack);

3246 return;	3260 return;

3247 }	3261 }

3248 break;	3262 break;

3249 case NON_LETTER_CHARACTER_MATCH:	3263 case NON_LETTER_CHARACTER_MATCH:

3250 emit_function = &EmitAtomNonLetter;	3264 emit_function = &EmitAtomNonLetter;

3251 break;	3265 break;

3252 case SIMPLE_CHARACTER_MATCH:	3266 case SIMPLE_CHARACTER_MATCH:

3253 emit_function = &EmitSimpleCharacter;	3267 emit_function = &EmitSimpleCharacter;

3254 break;	3268 break;

3255 case CASE_CHARACTER_MATCH:	3269 case CASE_CHARACTER_MATCH:

3256 emit_function = &EmitAtomLetter;	3270 emit_function = &EmitAtomLetter;

3257 break;	3271 break;

3258 default:	3272 default:

3259 break;	3273 break;

3260 }	3274 }

3261 if (emit_function != NULL) {	3275 if (emit_function != NULL) {

3262 bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();	3276 bool bounds_check = *checked_up_to < cp_offset + j \|\| read_backward();

3263 bool bound_checked =	3277 bool bound_checked =

3264 emit_function(isolate, compiler, quarks[j], backtrack,	3278 emit_function(compiler, quarks[j], backtrack, cp_offset + j,

3265 cp_offset + j, bounds_check, preloaded);	3279 bounds_check, preloaded);

3266 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);	3280 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);

3267 }	3281 }

3268 }	3282 }

3269 } else {	3283 } else {

3270 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());	3284 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());

3271 if (pass == CHARACTER_CLASS_MATCH) {	3285 if (pass == CHARACTER_CLASS_MATCH) {

3272 if (first_element_checked && i == 0) continue;	3286 if (first_element_checked && i == 0) continue;

3273 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;	3287 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;

3274 RegExpCharacterClass* cc = elm.char_class();	3288 RegExpCharacterClass* cc = elm.char_class();

3275 bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();	3289 bool bounds_check = *checked_up_to < cp_offset \|\| read_backward();

(...skipping 62 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3338 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {	3352 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {

3339 LimitResult limit_result = LimitVersions(compiler, trace);	3353 LimitResult limit_result = LimitVersions(compiler, trace);

3340 if (limit_result == DONE) return;	3354 if (limit_result == DONE) return;

3341 DCHECK(limit_result == CONTINUE);	3355 DCHECK(limit_result == CONTINUE);

3342	3356

3343 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {	3357 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {

3344 compiler->SetRegExpTooBig();	3358 compiler->SetRegExpTooBig();

3345 return;	3359 return;

3346 }	3360 }

3347	3361

3348 if (compiler->one_byte()) {	3362 if (compiler->one_byte()) {
	erikcorry 2016/02/01 15:21:40 You should check for the /ui flags here instead of You should check for the /ui flags here instead of inside TextEmitPass, since there's no point stepping through the text, only to decide every time that there's nothing to do. Yang 2016/02/02 06:06:48 Done. Show quoted text On 2016/02/01 15:21:40, erikcorry wrote: > You should check for the /ui flags here instead of inside TextEmitPass, since > there's no point stepping through the text, only to decide every time that > there's nothing to do. Done.
3349 int dummy = 0;	3363 int dummy = 0;

3350 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);	3364 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);

3351 }	3365 }

3352	3366

3353 bool first_elt_done = false;	3367 bool first_elt_done = false;

3354 int bound_checked_to = trace->cp_offset() - 1;	3368 int bound_checked_to = trace->cp_offset() - 1;

3355 bound_checked_to += trace->bound_checked_up_to();	3369 bound_checked_to += trace->bound_checked_up_to();

3356	3370

3357 // If a character is preloaded into the current character register then	3371 // If a character is preloaded into the current character register then

3358 // check that now.	3372 // check that now.

(...skipping 730 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4089	4103

4090 // Really we should be creating a new trace when we execute this function,	4104 // Really we should be creating a new trace when we execute this function,

4091 // but there is no need, because the code it generates cannot backtrack, and	4105 // but there is no need, because the code it generates cannot backtrack, and

4092 // we always arrive here with a trivial trace (since it's the entry to a	4106 // we always arrive here with a trivial trace (since it's the entry to a

4093 // loop. That also implies that there are no preloaded characters, which is	4107 // loop. That also implies that there are no preloaded characters, which is

4094 // good, because it means we won't be violating any assumptions by	4108 // good, because it means we won't be violating any assumptions by

4095 // overwriting those characters with new load instructions.	4109 // overwriting those characters with new load instructions.

4096 DCHECK(trace->is_trivial());	4110 DCHECK(trace->is_trivial());

4097	4111

4098 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();	4112 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

4099 Isolate* isolate = macro_assembler->isolate();

4100 // At this point we know that we are at a non-greedy loop that will eat	4113 // At this point we know that we are at a non-greedy loop that will eat

4101 // any character one at a time. Any non-anchored regexp has such a	4114 // any character one at a time. Any non-anchored regexp has such a

4102 // loop prepended to it in order to find where it starts. We look for	4115 // loop prepended to it in order to find where it starts. We look for

4103 // a pattern of the form ...abc... where we can look 6 characters ahead	4116 // a pattern of the form ...abc... where we can look 6 characters ahead

4104 // and step forwards 3 if the character is not one of abc. Abc need	4117 // and step forwards 3 if the character is not one of abc. Abc need

4105 // not be atoms, they can be any reasonably limited character class or	4118 // not be atoms, they can be any reasonably limited character class or

4106 // small alternation.	4119 // small alternation.

4107 BoyerMooreLookahead* bm = bm_info(false);	4120 BoyerMooreLookahead* bm = bm_info(false);

4108 if (bm == NULL) {	4121 if (bm == NULL) {

4109 eats_at_least = Min(kMaxLookaheadForBoyerMoore,	4122 eats_at_least = Min(kMaxLookaheadForBoyerMoore,

4110 EatsAtLeast(kMaxLookaheadForBoyerMoore,	4123 EatsAtLeast(kMaxLookaheadForBoyerMoore,

4111 kRecursionBudget,	4124 kRecursionBudget,

4112 false));	4125 false));

4113 if (eats_at_least >= 1) {	4126 if (eats_at_least >= 1) {

4114 bm = new(zone()) BoyerMooreLookahead(eats_at_least,	4127 bm = new(zone()) BoyerMooreLookahead(eats_at_least,

4115 compiler,	4128 compiler,

4116 zone());	4129 zone());

4117 GuardedAlternative alt0 = alternatives_->at(0);	4130 GuardedAlternative alt0 = alternatives_->at(0);

4118 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);	4131 alt0.node()->FillInBMInfo(compiler, 0, kRecursionBudget, bm, false);

4119 }	4132 }

4120 }	4133 }

4121 if (bm != NULL) {	4134 if (bm != NULL) {

4122 bm->EmitSkipInstructions(macro_assembler);	4135 bm->EmitSkipInstructions(macro_assembler);

4123 }	4136 }

4124 return eats_at_least;	4137 return eats_at_least;

4125 }	4138 }

4126	4139

4127	4140

4128 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,	4141 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,

(...skipping 2236 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6365	6378

6366 void Analysis::VisitBackReference(BackReferenceNode* that) {	6379 void Analysis::VisitBackReference(BackReferenceNode* that) {

6367 EnsureAnalyzed(that->on_success());	6380 EnsureAnalyzed(that->on_success());

6368 }	6381 }

6369	6382

6370	6383

6371 void Analysis::VisitAssertion(AssertionNode* that) {	6384 void Analysis::VisitAssertion(AssertionNode* that) {

6372 EnsureAnalyzed(that->on_success());	6385 EnsureAnalyzed(that->on_success());

6373 }	6386 }

6374	6387

6375	6388 void BackReferenceNode::FillInBMInfo(RegExpCompiler* compiler, int offset,

6376 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,	6389 int budget, BoyerMooreLookahead* bm,

6377 BoyerMooreLookahead* bm,

6378 bool not_at_start) {	6390 bool not_at_start) {

6379 // Working out the set of characters that a backreference can match is too	6391 // Working out the set of characters that a backreference can match is too

6380 // hard, so we just say that any character can match.	6392 // hard, so we just say that any character can match.

6381 bm->SetRest(offset);	6393 bm->SetRest(offset);

6382 SaveBMInfo(bm, not_at_start, offset);	6394 SaveBMInfo(bm, not_at_start, offset);

6383 }	6395 }

6384	6396

6385	6397

6386 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==	6398 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==

6387 RegExpMacroAssembler::kTableSize);	6399 RegExpMacroAssembler::kTableSize);

6388	6400

6389	6401 void ChoiceNode::FillInBMInfo(RegExpCompiler* compiler, int offset, int budget,

6390 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

6391 BoyerMooreLookahead* bm, bool not_at_start) {	6402 BoyerMooreLookahead* bm, bool not_at_start) {

6392 ZoneList<GuardedAlternative>* alts = alternatives();	6403 ZoneList<GuardedAlternative>* alts = alternatives();

6393 budget = (budget - 1) / alts->length();	6404 budget = (budget - 1) / alts->length();

6394 for (int i = 0; i < alts->length(); i++) {	6405 for (int i = 0; i < alts->length(); i++) {

6395 GuardedAlternative& alt = alts->at(i);	6406 GuardedAlternative& alt = alts->at(i);

6396 if (alt.guards() != NULL && alt.guards()->length() != 0) {	6407 if (alt.guards() != NULL && alt.guards()->length() != 0) {

6397 bm->SetRest(offset); // Give up trying to fill in info.	6408 bm->SetRest(offset); // Give up trying to fill in info.

6398 SaveBMInfo(bm, not_at_start, offset);	6409 SaveBMInfo(bm, not_at_start, offset);

6399 return;	6410 return;

6400 }	6411 }

6401 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);	6412 alt.node()->FillInBMInfo(compiler, offset, budget, bm, not_at_start);

6402 }	6413 }

6403 SaveBMInfo(bm, not_at_start, offset);	6414 SaveBMInfo(bm, not_at_start, offset);

6404 }	6415 }

6405	6416

6406	6417 void TextNode::FillInBMInfo(RegExpCompiler* compiler, int initial_offset,

6407 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,	6418 int budget, BoyerMooreLookahead* bm,

6408 BoyerMooreLookahead* bm, bool not_at_start) {	6419 bool not_at_start) {

6409 if (initial_offset >= bm->length()) return;	6420 if (initial_offset >= bm->length()) return;

6410 int offset = initial_offset;	6421 int offset = initial_offset;

6411 int max_char = bm->max_char();	6422 int max_char = bm->max_char();

6412 for (int i = 0; i < elements()->length(); i++) {	6423 for (int i = 0; i < elements()->length(); i++) {

6413 if (offset >= bm->length()) {	6424 if (offset >= bm->length()) {

6414 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6425 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6415 return;	6426 return;

6416 }	6427 }

6417 TextElement text = elements()->at(i);	6428 TextElement text = elements()->at(i);

6418 if (text.text_type() == TextElement::ATOM) {	6429 if (text.text_type() == TextElement::ATOM) {

6419 RegExpAtom* atom = text.atom();	6430 RegExpAtom* atom = text.atom();

6420 for (int j = 0; j < atom->length(); j++, offset++) {	6431 for (int j = 0; j < atom->length(); j++, offset++) {

6421 if (offset >= bm->length()) {	6432 if (offset >= bm->length()) {

6422 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6433 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6423 return;	6434 return;

6424 }	6435 }

6425 uc16 character = atom->data()[j];	6436 uc16 character = atom->data()[j];

6426 if (bm->compiler()->ignore_case()) {	6437 if (bm->compiler()->ignore_case()) {

6427 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	6438 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

6428 int length = GetCaseIndependentLetters(	6439 int length = GetCaseIndependentLetters(

6429 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,	6440 compiler, character,

6430 chars);	6441 bm->max_char() == String::kMaxOneByteCharCode, chars);

6431 for (int j = 0; j < length; j++) {	6442 for (int j = 0; j < length; j++) {

6432 bm->Set(offset, chars[j]);	6443 bm->Set(offset, chars[j]);

6433 }	6444 }

6434 } else {	6445 } else {

6435 if (character <= max_char) bm->Set(offset, character);	6446 if (character <= max_char) bm->Set(offset, character);

6436 }	6447 }

6437 }	6448 }

6438 } else {	6449 } else {

6439 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());	6450 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());

6440 RegExpCharacterClass* char_class = text.char_class();	6451 RegExpCharacterClass* char_class = text.char_class();

6441 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());	6452 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());

6442 if (char_class->is_negated()) {	6453 if (char_class->is_negated()) {

6443 bm->SetAll(offset);	6454 bm->SetAll(offset);

6444 } else {	6455 } else {

6445 for (int k = 0; k < ranges->length(); k++) {	6456 for (int k = 0; k < ranges->length(); k++) {

6446 CharacterRange& range = ranges->at(k);	6457 CharacterRange& range = ranges->at(k);

6447 if (range.from() > max_char) continue;	6458 if (range.from() > max_char) continue;

6448 int to = Min(max_char, static_cast<int>(range.to()));	6459 int to = Min(max_char, static_cast<int>(range.to()));

6449 bm->SetInterval(offset, Interval(range.from(), to));	6460 bm->SetInterval(offset, Interval(range.from(), to));

6450 }	6461 }

6451 }	6462 }

6452 offset++;	6463 offset++;

6453 }	6464 }

6454 }	6465 }

6455 if (offset >= bm->length()) {	6466 if (offset >= bm->length()) {

6456 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6467 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6457 return;	6468 return;

6458 }	6469 }

6459 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,	6470 on_success()->FillInBMInfo(compiler, offset, budget - 1, bm,

6460 true); // Not at start after a text node.	6471 true); // Not at start after a text node.

6461 if (initial_offset == 0) set_bm_info(not_at_start, bm);	6472 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6462 }	6473 }

6463	6474

6464	6475

6465 // -------------------------------------------------------------------	6476 // -------------------------------------------------------------------

6466 // Dispatch table construction	6477 // Dispatch table construction

6467	6478

6468	6479

6469 void DispatchTableConstructor::VisitEnd(EndNode* that) {	6480 void DispatchTableConstructor::VisitEnd(EndNode* that) {

(...skipping 137 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6607 }	6618 }

6608	6619

6609	6620

6610 RegExpEngine::CompilationResult RegExpEngine::Compile(	6621 RegExpEngine::CompilationResult RegExpEngine::Compile(

6611 Isolate* isolate, Zone* zone, RegExpCompileData* data,	6622 Isolate* isolate, Zone* zone, RegExpCompileData* data,

6612 JSRegExp::Flags flags, Handle<String> pattern,	6623 JSRegExp::Flags flags, Handle<String> pattern,

6613 Handle<String> sample_subject, bool is_one_byte) {	6624 Handle<String> sample_subject, bool is_one_byte) {

6614 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {	6625 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {

6615 return IrregexpRegExpTooBig(isolate);	6626 return IrregexpRegExpTooBig(isolate);

6616 }	6627 }

6617 bool ignore_case = flags & JSRegExp::kIgnoreCase;

6618 bool is_sticky = flags & JSRegExp::kSticky;	6628 bool is_sticky = flags & JSRegExp::kSticky;

6619 bool is_global = flags & JSRegExp::kGlobal;	6629 bool is_global = flags & JSRegExp::kGlobal;

6620 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,	6630 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,

6621 is_one_byte);	6631 is_one_byte);

6622	6632

6623 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));	6633 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));

6624	6634

6625 // Sample some characters from the middle of the string.	6635 // Sample some characters from the middle of the string.

6626 static const int kSampleSize = 128;	6636 static const int kSampleSize = 128;

6627	6637

(...skipping 28 matching lines...) Expand all Loading...
6656 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);	6666 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);

6657 first_step_node->AddAlternative(GuardedAlternative(captured_body));	6667 first_step_node->AddAlternative(GuardedAlternative(captured_body));

6658 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(	6668 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(

6659 new (zone) RegExpCharacterClass('*'), false, loop_node)));	6669 new (zone) RegExpCharacterClass('*'), false, loop_node)));

6660 node = first_step_node;	6670 node = first_step_node;

6661 } else {	6671 } else {

6662 node = loop_node;	6672 node = loop_node;

6663 }	6673 }

6664 }	6674 }

6665 if (is_one_byte) {	6675 if (is_one_byte) {

6666 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);	6676 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);

6667 // Do it again to propagate the new nodes to places where they were not	6677 // Do it again to propagate the new nodes to places where they were not

6668 // put because they had not been calculated yet.	6678 // put because they had not been calculated yet.

6669 if (node != NULL) {	6679 if (node != NULL) {

6670 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);	6680 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, &compiler);

6671 }	6681 }

6672 } else if (compiler.unicode() && (is_global \|\| is_sticky)) {	6682 } else if (compiler.unicode() && (is_global \|\| is_sticky)) {

6673 node = OptionallyStepBackToLeadSurrogate(&compiler, node);	6683 node = OptionallyStepBackToLeadSurrogate(&compiler, node);

6674 }	6684 }

6675	6685

6676 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);	6686 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);

6677 data->node = node;	6687 data->node = node;

6678 Analysis analysis(isolate, flags, is_one_byte);	6688 Analysis analysis(isolate, flags, is_one_byte);

6679 analysis.EnsureAnalyzed(node);	6689 analysis.EnsureAnalyzed(node);

6680 if (analysis.has_failed()) {	6690 if (analysis.has_failed()) {

(...skipping 174 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6855	6865

6856	6866

6857 void RegExpResultsCache::Clear(FixedArray* cache) {	6867 void RegExpResultsCache::Clear(FixedArray* cache) {

6858 for (int i = 0; i < kRegExpResultsCacheSize; i++) {	6868 for (int i = 0; i < kRegExpResultsCacheSize; i++) {

6859 cache->set(i, Smi::FromInt(0));	6869 cache->set(i, Smi::FromInt(0));

6860 }	6870 }

6861 }	6871 }

6862	6872

6863 } // namespace internal	6873 } // namespace internal

6864 } // namespace v8	6874 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/jsregexp.h ('k') | src/regexp/regexp-parser.cc » ('j') | no next file with comments »