Chromium Code Reviews| Index: src/jsregexp.cc |
| =================================================================== |
| --- src/jsregexp.cc (revision 11190) |
| +++ src/jsregexp.cc (working copy) |
| @@ -280,7 +280,8 @@ |
| // from the source pattern. |
| // If compilation fails, an exception is thrown and this function |
| // returns false. |
| -bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { |
| +bool RegExpImpl::EnsureCompiledIrregexp( |
| + Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { |
| Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); |
| #ifdef V8_INTERPRETED_REGEXP |
| if (compiled_code->IsByteArray()) return true; |
| @@ -296,7 +297,7 @@ |
| ASSERT(compiled_code->IsSmi()); |
| return true; |
| } |
| - return CompileIrregexp(re, is_ascii); |
| + return CompileIrregexp(re, sample_subject, is_ascii); |
| } |
| @@ -316,7 +317,9 @@ |
| } |
| -bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { |
| +bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
| + Handle<String> sample_subject, |
| + bool is_ascii) { |
| // Compile the RegExp. |
| Isolate* isolate = re->GetIsolate(); |
| ZoneScope zone_scope(isolate, DELETE_ON_EXIT); |
| @@ -365,6 +368,7 @@ |
| flags.is_ignore_case(), |
| flags.is_multiline(), |
| pattern, |
| + sample_subject, |
| is_ascii); |
| if (result.error_message != NULL) { |
| // Unable to compile regexp. |
| @@ -435,7 +439,7 @@ |
| // Check the asciiness of the underlying storage. |
| bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
| - if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; |
| + if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; |
| #ifdef V8_INTERPRETED_REGEXP |
| // Byte-code regexp needs space allocated for all its registers. |
| @@ -466,7 +470,7 @@ |
| #ifndef V8_INTERPRETED_REGEXP |
| ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
| do { |
| - EnsureCompiledIrregexp(regexp, is_ascii); |
| + EnsureCompiledIrregexp(regexp, subject, is_ascii); |
| Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); |
| NativeRegExpMacroAssembler::Result res = |
| NativeRegExpMacroAssembler::Match(code, |
| @@ -784,6 +788,65 @@ |
| } |
| +class FrequencyCollator { |
| + public: |
| + FrequencyCollator() : total_frequencies_(0) { |
| + for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { |
| + frequencies_[i] = CharacterFrequency(i); |
| + } |
| + } |
| + |
| + void CountCharacter(int character) { |
| + int index = (character & RegExpMacroAssembler::kTableMask); |
| + frequencies_[index].Increment(); |
| + total_frequencies_++; |
| + } |
| + |
| + void SortFrequencies() { |
| + typedef int (*RawComparer)(const void*, const void*); |
| + memcpy( |
| + &sorted_frequencies_[0], &frequencies_[0], sizeof(sorted_frequencies_)); |
|
ulan
2012/03/30 13:04:48
A nit: for consistency I would prefer breaking mem
Erik Corry
2012/04/01 00:48:46
Done.
|
| + qsort(&sorted_frequencies_[0], |
| + RegExpMacroAssembler::kTableSize, |
| + sizeof(sorted_frequencies_[0]), |
| + reinterpret_cast<RawComparer>(&CharacterFrequency::Compare)); |
| + } |
| + |
| + bool IsFrequent(int in_character) { |
| + if (total_frequencies_ < 10) return false; // Too little statistical basis. |
| + int freq = frequencies_[in_character].counter(); |
|
ulan
2012/03/30 13:04:48
Either mask the in_character or ASSERT(in_characte
Erik Corry
2012/04/01 00:48:46
Done.
|
| + bool popular = ((freq * 100) / total_frequencies_ > 20); |
| + return popular; |
|
ulan
2012/03/30 13:04:48
Named magic numbers become less magic :)
int freq
Erik Corry
2012/04/01 00:48:46
Done.
|
| + } |
| + |
| + private: |
| + class CharacterFrequency { |
| + public: |
| + CharacterFrequency() : counter_(0), character_(-1) { } |
| + explicit CharacterFrequency(int character) : counter_(0), character_(character) { } |
|
ulan
2012/03/30 13:04:48
Long line.
Erik Corry
2012/04/01 00:48:46
Done.
|
| + |
| + static int Compare(const CharacterFrequency* a, |
| + const CharacterFrequency* b) { |
| + return b->counter_ - a->counter_; |
| + } |
| + |
| + void Increment() { counter_++; } |
| + int counter() { return counter_; } |
| + int character() { return character_; } |
| + |
| + private: |
| + int counter_; |
| + int character_; |
| + }; |
| + |
| + |
| + private: |
| + CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; |
| + CharacterFrequency sorted_frequencies_[RegExpMacroAssembler::kTableSize]; |
| + int total_frequencies_; |
| +}; |
| + |
| + |
| class RegExpCompiler { |
| public: |
| RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); |
| @@ -819,6 +882,7 @@ |
| inline bool ignore_case() { return ignore_case_; } |
| inline bool ascii() { return ascii_; } |
| + FrequencyCollator* frequency_collator() { return &frequency_collator_; } |
| int current_expansion_factor() { return current_expansion_factor_; } |
| void set_current_expansion_factor(int value) { |
| @@ -837,6 +901,7 @@ |
| bool ascii_; |
| bool reg_exp_too_big_; |
| int current_expansion_factor_; |
| + FrequencyCollator frequency_collator_; |
| }; |
| @@ -865,7 +930,8 @@ |
| ignore_case_(ignore_case), |
| ascii_(ascii), |
| reg_exp_too_big_(false), |
| - current_expansion_factor_(1) { |
| + current_expansion_factor_(1), |
| + frequency_collator_() { |
| accept_ = new EndNode(EndNode::ACCEPT); |
| ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); |
| } |
| @@ -2056,6 +2122,16 @@ |
| } |
| +void ActionNode::FillInBMInfo( |
| + int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
|
ulan
2012/03/30 13:04:48
For consistency with the surrounding code consider
Erik Corry
2012/04/01 00:48:46
Done.
|
| + if (type_ == BEGIN_SUBMATCH) { |
| + bm->SetRest(offset); |
| + } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { |
| + on_success()->FillInBMInfo(offset, bm, not_at_start); |
| + } |
| +} |
| + |
| + |
| int AssertionNode::EatsAtLeast(int still_to_find, |
| int recursion_depth, |
| bool not_at_start) { |
| @@ -2072,6 +2148,14 @@ |
| } |
| +void AssertionNode::FillInBMInfo( |
| + int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| + // Match the behaviour of EatsAtLeast on this node. |
| + if (type() == AT_START && not_at_start) return; |
| + on_success()->FillInBMInfo(offset, bm, not_at_start); |
| +} |
| + |
| + |
| int BackReferenceNode::EatsAtLeast(int still_to_find, |
| int recursion_depth, |
| bool not_at_start) { |
| @@ -2504,6 +2588,17 @@ |
| } |
| +void LoopChoiceNode::FillInBMInfo( |
| + int offset, BoyerMooreLookahead* bm, bool nas) { |
| + if (body_can_be_zero_length_) { // || info()->visited) { |
|
ulan
2012/03/30 13:04:48
Forgotten comment.
Erik Corry
2012/04/01 00:48:46
Done.
|
| + bm->SetRest(offset); |
| + return; |
| + } |
| + // VisitMarker marker(info()); |
|
ulan
2012/03/30 13:04:48
Comment.
Erik Corry
2012/04/01 00:48:46
Done.
|
| + ChoiceNode::FillInBMInfo(offset, bm, nas); |
| +} |
| + |
| + |
| void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| RegExpCompiler* compiler, |
| int characters_filled_in, |
| @@ -3000,6 +3095,30 @@ |
| } |
| +RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( |
| + RegExpCompiler* compiler) { |
| + if (elms_->length() != 1) return NULL; |
| + TextElement elm = elms_->at(0); |
| + if (elm.type != TextElement::CHAR_CLASS) return NULL; |
| + RegExpCharacterClass* node = elm.data.u_char_class; |
| + ZoneList<CharacterRange>* ranges = node->ranges(); |
| + if (!CharacterRange::IsCanonical(ranges)) { |
| + CharacterRange::Canonicalize(ranges); |
| + } |
| + if (node->is_negated() && ranges->length() == 0) return on_success(); |
| + if (ranges->length() != 1) return NULL; |
| + if (ranges->at(0).from() != 0) return NULL; |
| + int max_char; |
| + if (compiler->ascii()) { |
|
ulan
2012/03/30 13:04:48
A nit, I like
max_char = compiler->ascii() ? Stri
Erik Corry
2012/04/01 00:48:46
Yes, but a bug in gcc means you get a linker error
|
| + max_char = String::kMaxAsciiCharCode; |
| + } else { |
| + max_char = String::kMaxUtf16CodeUnit; |
| + } |
| + if (ranges->at(0).to() < max_char) return NULL; |
|
ulan
2012/03/30 13:04:48
Consider using ranges(0)->IsEverything()
Erik Corry
2012/04/01 00:48:46
Done.
|
| + return on_success(); |
| +} |
| + |
| + |
| // Finds the fixed match length of a sequence of nodes that goes from |
| // this alternative and back to this choice node. If there are variable |
| // length nodes or other complications in the way then return a sentinel |
| @@ -3064,8 +3183,8 @@ |
| int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
| - bool not_at_start) { |
| - int preload_characters = EatsAtLeast(4, 0, not_at_start); |
| + int eats_at_least) { |
| + int preload_characters = eats_at_least > 4 ? 4 : eats_at_least; |
|
ulan
2012/03/30 13:04:48
Min(4, eats_at_least);
Erik Corry
2012/04/01 00:48:46
Done.
|
| if (compiler->macro_assembler()->CanReadUnaligned()) { |
| bool ascii = compiler->ascii(); |
| if (ascii) { |
| @@ -3131,6 +3250,197 @@ |
| }; |
| +BoyerMooreLookahead::BoyerMooreLookahead( |
| + int length, int map_length, RegExpCompiler* compiler) |
| + : length_(length), |
| + map_length_(map_length), |
|
ulan
2012/03/30 13:04:48
the Set() method assumes that map_length is a powe
Erik Corry
2012/04/01 00:48:46
Done.
|
| + compiler_(compiler) { |
| + if (compiler->ascii()) { |
| + max_char_ = String::kMaxAsciiCharCode; |
| + } else { |
| + max_char_ = String::kMaxUtf16CodeUnit; |
| + } |
| + bitmaps_ = new ZoneList<ZoneList<bool>*>(length); |
|
ulan
2012/03/30 13:04:48
I wonder if bit-packing (ZoneList<bool>) would imp
Erik Corry
2012/04/01 00:48:46
We normally only do this once per regexp, so I don
|
| + for (int i = 0; i < length; i++) { |
| + bitmaps_->Add(new ZoneList<bool>(map_length)); |
| + ZoneList<bool>* map = bitmaps_->at(i); |
| + for (int i = 0; i < map_length; i++) { |
| + map->Add(false); |
| + } |
| + } |
| +} |
| + |
| + |
| +bool BoyerMooreLookahead::TooVague(int offset, int limit) { |
| + if (bitmaps_->at(offset) == NULL) return true; |
| + if (Count(offset) > limit) return true; |
| + return false; |
| +} |
| + |
| + |
| +bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { |
| + // The characters that are very frequent in the sample subject string should |
| + // not be used for Boyer-Moore operations, so lets mark those positions as |
| + // being too vague. |
| + for (int i = 0; i < length_; i++) { |
| + if (Count(i) >= kTooManyCharacters) { |
| + SetAll(i); |
| + } else { |
| + ZoneList<bool>* map = bitmaps_->at(i); |
| + for (int ch = 0; ch < map_length_; ch++) { |
| + if (map->at(ch) && compiler_->frequency_collator()->IsFrequent(ch)) { |
| + SetAll(i); |
| + break; |
| + } |
| + } |
| + } |
| + } |
| + const int kNoPoints = -1; |
| + int biggest_points = kNoPoints; |
| + int biggest_interval_max = 0; |
| + int biggest_interval = 0; |
| + int vagueness_factor = 10; |
|
ulan
2012/03/30 13:04:48
Could you please write a comment describing the ma
Erik Corry
2012/04/01 00:48:46
I refactored this function because it reflected fa
|
| + for (int initial_max_vagueness = 4; |
| + initial_max_vagueness <= kTooManyCharacters; |
| + initial_max_vagueness *= 2, vagueness_factor--) { |
| + int max = length_ - 1; |
|
ulan
2012/03/30 13:04:48
This 'max' can just be called 'i'. I think there a
Erik Corry
2012/04/01 00:48:46
Done.
|
| + while (true) { |
| + while (max >= 0 && TooVague(max, initial_max_vagueness)) max--; |
| + if (max < 0) break; |
| + int remembered_max = max; |
| + int max_vagueness = initial_max_vagueness; |
| + while (max >= 0 && !TooVague(max, max_vagueness)) { |
| + max--; |
| + if (max_vagueness < kTooManyCharacters) max_vagueness *= 2; |
|
ulan
2012/03/30 13:04:48
What is the rationale behind this?
Nit: max_vague
Erik Corry
2012/04/01 00:48:46
All refactored away.
|
| + } |
| + int points = |
| + ((remembered_max - max) * kTooManyCharacters) * vagueness_factor; |
| + if (points > biggest_points) { |
| + biggest_interval_max = remembered_max; |
| + biggest_interval = remembered_max - max; |
|
ulan
2012/03/30 13:04:48
Consider maintaining (left, right) instead of (big
Erik Corry
2012/04/01 00:48:46
Done.
|
| + biggest_points = points; |
| + } |
| + if (max < 0) break; |
| + } |
| + } |
| + if (biggest_points == kNoPoints) return false; |
| + if (biggest_interval < 1) return false; |
| + *to = biggest_interval_max; |
| + *from = 1 + biggest_interval_max - biggest_interval; |
| + return true; |
| +} |
| + |
| + |
| +static int SortChar(const char* a, const char* b) { |
| + return static_cast<int>(*a) - *b; |
| +} |
| + |
| + |
| +int BoyerMooreLookahead::GetSkipTable(int max_lookahead, |
| + Handle<ByteArray> boolean_skip_table) { |
| + const int kSize = RegExpMacroAssembler::kTableSize; |
| + char table[kSize]; |
| + |
| + for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { |
| + table[i] = max_lookahead + 1; |
| + } |
| + |
| + for (int i = max_lookahead; i >= 0; i--) { |
| + int skip = max_lookahead - i; |
|
ulan
2012/03/30 16:54:51
This is the crucial part. A comment explaining why
Erik Corry
2012/04/01 00:48:46
Done.
|
| + ZoneList<bool>* map = bitmaps_->at(i); |
| + if (map == NULL) { |
| + for (int j = 0; j < map_length_; j++) { |
| + if (table[j] > skip) table[j] = skip; |
| + } |
| + break; |
| + } |
| + for (int j = 0; j < map_length_; j++) { |
| + if (map->at(j)) { |
| + if (table[j] > skip) table[j] = skip; |
| + } |
| + } |
| + } |
| + |
| + char sorted_table[kSize]; |
| + memcpy(sorted_table, table, kSize); |
| + typedef int (*RawComparer)(const void*, const void*); |
| + qsort(sorted_table, kSize, 1, reinterpret_cast<RawComparer>(&SortChar)); |
| + |
| + int boolean_skip_distance = sorted_table[kSize / 2]; |
|
ulan
2012/03/30 16:54:51
Instead of kSize / 2 we can take any i in [0..kSiz
Erik Corry
2012/04/01 00:48:46
Done.
|
| + for (int i = 0; i < kSize; i++) { |
| + boolean_skip_table->set(i, table[i] >= boolean_skip_distance ? 0 : 1); |
|
ulan
2012/03/30 13:04:48
Can (x >= y ? 0 : 1) be replaced with (x < y) ?
Erik Corry
2012/04/01 00:48:46
We don't normally allow implicit bool->int and int
|
| + } |
| + return boolean_skip_distance; |
| +} |
| + |
| + |
| +bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { |
| + int min_lookahead = 0; |
| + int max_lookahead = 0; |
| + |
| + if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; |
| + |
| + bool found_single_character = false; |
|
ulan
2012/03/30 16:54:51
Consider using a counter:
character_count = 0;
.
Erik Corry
2012/04/01 00:48:46
I want to bail out when I find the second characte
|
| + bool abandoned_search_for_single_character = false; |
| + int single_character = 0; |
| + for (int i = max_lookahead; i >= min_lookahead; i--) { |
| + ZoneList<bool>* map = bitmaps_->at(i); |
| + for (int j = 0; j < map_length_; j++) { |
| + if (map->at(j)) { |
| + if (found_single_character) { |
| + found_single_character = false; // Found two. |
| + abandoned_search_for_single_character = true; |
| + break; |
| + } else { |
| + found_single_character = true; |
| + single_character = j; |
| + } |
| + } |
| + } |
| + if (abandoned_search_for_single_character) break; |
| + } |
| + |
| + int lookahead_width = max_lookahead + 1 - min_lookahead; |
| + |
| + if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { |
| + // The mask-compare can probably handle this better. |
| + return false; |
| + } |
| + |
| + if (found_single_character) { |
| + Label cont, again; |
| + masm->Bind(&again); |
| + masm->LoadCurrentCharacter(max_lookahead, &cont, true); |
| + if (max_char_ > String::kMaxAsciiCharCode || |
| + map_length_ < RegExpMacroAssembler::kTableSize) { |
| + masm->CheckCharacterAfterAnd(single_character, |
| + RegExpMacroAssembler::kTableMask, |
| + &cont); |
| + } else { |
| + masm->CheckCharacter(single_character, &cont); |
|
ulan
2012/03/30 16:54:51
I think this implicitly assumes that
String::kMaxA
Erik Corry
2012/04/01 00:48:46
Fixed.
|
| + } |
| + masm->AdvanceCurrentPosition(lookahead_width); |
| + masm->GoTo(&again); |
| + masm->Bind(&cont); |
| + return true; |
| + } |
| + |
| + Handle<ByteArray> boolean_skip_table = |
| + FACTORY->NewByteArray(map_length_, TENURED); |
| + int boolean_skip_distance = GetSkipTable(max_lookahead, boolean_skip_table); |
| + if (boolean_skip_distance == 0) return false; |
| + |
| + Label cont, again; |
| + masm->Bind(&again); |
| + masm->LoadCurrentCharacter(max_lookahead, &cont, true); |
| + masm->CheckBitInTable(boolean_skip_table, &cont); |
| + masm->AdvanceCurrentPosition(boolean_skip_distance); |
| + masm->GoTo(&again); |
| + masm->Bind(&cont); |
| + |
| + return true; |
| +} |
| + |
| /* Code generation for choice nodes. |
| * |
| * We generate quick checks that do a mask and compare to eliminate a |
| @@ -3274,10 +3584,52 @@ |
| int first_normal_choice = greedy_loop ? 1 : 0; |
| - int preload_characters = |
| - CalculatePreloadCharacters(compiler, |
| - current_trace->at_start() == Trace::FALSE); |
| - bool preload_is_current = |
| + bool not_at_start = current_trace->at_start() == Trace::FALSE; |
| + const int kEatsAtLeastNotYetInitialized = -1; |
| + int eats_at_least = kEatsAtLeastNotYetInitialized; |
| + |
| + bool skip_was_emitted = false; |
| + |
| + // More makes code generation slower, less makes V8 benchmark score lower. |
| + const int kMaxLookaheadForBoyerMoore = 8; |
| + |
| + if (!greedy_loop && choice_count == 2) { |
| + GuardedAlternative alt1 = alternatives_->at(1); |
| + if (alt1.guards() == NULL || |
|
ulan
2012/03/30 16:54:51
This fits in one line.
Erik Corry
2012/04/01 00:48:46
Done.
Erik Corry
2012/04/01 00:48:46
Done.
|
| + alt1.guards()->length() == 0) { |
| + RegExpNode* eats_anything_node = alt1.node(); |
| + if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == |
| + this) { |
| + // At this point we know that we are at a non-greedy loop that will eat |
| + // any character one at a time. Any non-anchored regexp has such a |
| + // loop prepended to it in order to find where it starts. We look for |
| + // a pattern of the form ...abc... where we can look 6 characters ahead |
|
ulan
2012/03/30 16:54:51
Can we put a similar comment in description of Emi
Erik Corry
2012/04/01 00:48:46
Done.
|
| + // and step forwards 3 if the character is not one of abc. Abc need |
| + // not be atoms, they can be any reasonably limited character class or |
| + // small alternation. |
| + ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. |
| + eats_at_least = |
| + Min(kMaxLookaheadForBoyerMoore, |
| + EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); |
| + if (eats_at_least >= 1) { |
| + BoyerMooreLookahead bm(eats_at_least, |
| + RegExpMacroAssembler::kTableSize, |
| + compiler); |
| + GuardedAlternative alt0 = alternatives_->at(0); |
| + alt0.node()->FillInBMInfo(0, &bm, not_at_start); |
| + skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); |
| + } |
| + } |
| + } |
| + } |
| + |
| + if (eats_at_least == kEatsAtLeastNotYetInitialized) { |
| + // Save some time by looking at most one machine word ahead. |
| + eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); |
| + } |
| + int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); |
| + |
| + bool preload_is_current = !skip_was_emitted && |
| (current_trace->characters_preloaded() == preload_characters); |
| bool preload_has_checked_bounds = preload_is_current; |
| @@ -5432,6 +5784,77 @@ |
| } |
| +void ChoiceNode::FillInBMInfo( |
| + int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| + ZoneList<GuardedAlternative>* alts = alternatives(); |
| + for (int i = 0; i < alts->length(); i++) { |
| + GuardedAlternative& alt = alts->at(i); |
| + if (alt.guards() != NULL && |
|
ulan
2012/03/30 13:04:48
This fits in one line.
Erik Corry
2012/04/01 00:48:46
Done.
|
| + alt.guards()->length() != 0) { |
| + bm->SetRest(offset); // Give up trying to fill in info. |
| + return; |
| + } |
| + alt.node()->FillInBMInfo(offset, bm, not_at_start); |
| + } |
| +} |
| + |
| + |
| +void TextNode::FillInBMInfo( |
| + int offset, BoyerMooreLookahead* bm, bool not_at_start) { |
| + if (offset >= bm->length()) return; |
| + int max_char = bm->max_char(); |
| + for (int i = 0; i < elements()->length(); i++) { |
| + if (offset >= bm->length()) return; |
| + TextElement text = elements()->at(i); |
| + if (text.type == TextElement::ATOM) { |
| + RegExpAtom* atom = text.data.u_atom; |
| + for (int j = 0; j < atom->length(); j++, offset++) { |
| + if (offset >= bm->length()) return; |
| + uc16 character = atom->data()[j]; |
| + if (bm->compiler()->ignore_case()) { |
| + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| + int length = GetCaseIndependentLetters( |
| + ISOLATE, |
| + character, |
| + bm->max_char() == String::kMaxAsciiCharCode, |
| + chars); |
| + for (int j = 0; j < length; j++) { |
| + bm->Set(offset, chars[j]); |
| + } |
| + } else { |
| + if (character <= max_char) bm->Set(offset, character); |
| + } |
| + } |
| + } else { |
| + ASSERT(text.type == TextElement::CHAR_CLASS); |
| + RegExpCharacterClass* char_class = text.data.u_char_class; |
| + ZoneList<CharacterRange>* ranges = char_class->ranges(); |
| + if (char_class->is_negated()) { |
| + bm->SetAll(offset); |
| + } else { |
| + for (int k = 0; k < ranges->length(); k++) { |
| + CharacterRange& range = ranges->at(k); |
| + if (range.from() > max_char) continue; |
| + int to = Min(max_char, static_cast<int>(range.to())); |
| + if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { |
| + bm->SetAll(offset); |
| + break; |
| + } |
| + for (int m = range.from(); m <= to; m++) { |
|
ulan
2012/03/30 16:54:51
Why don't we handle ignore_case() here like in the
Erik Corry
2012/04/01 00:48:46
The character classes have had case-independence r
|
| + bm->Set(offset, m); |
| + } |
| + } |
| + } |
| + offset++; |
| + } |
| + } |
| + if (offset >= bm->length()) return; |
| + on_success()->FillInBMInfo(offset, |
|
ulan
2012/03/30 13:04:48
This fits in line if you put the comment in anothe
Erik Corry
2012/04/01 00:48:46
But the comment only applies to the line it is on,
|
| + bm, |
| + true); // Not at start after a text node. |
| +} |
| + |
| + |
| int TextNode::ComputeFirstCharacterSet(int budget) { |
| budget--; |
| if (budget >= 0) { |
| @@ -5589,15 +6012,27 @@ |
| } |
| -RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, |
| - bool ignore_case, |
| - bool is_multiline, |
| - Handle<String> pattern, |
| - bool is_ascii) { |
| +RegExpEngine::CompilationResult RegExpEngine::Compile( |
| + RegExpCompileData* data, |
| + bool ignore_case, |
| + bool is_multiline, |
| + Handle<String> pattern, |
| + Handle<String> sample_subject, |
| + bool is_ascii) { |
| if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
| return IrregexpRegExpTooBig(); |
| } |
| RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); |
| + |
| + FlattenString(sample_subject); |
| + int chars_sampled = 0; |
| + int half_way = sample_subject->length() / 2; |
|
ulan
2012/03/30 13:04:48
Why do we consider only the upper half of the samp
Erik Corry
2012/04/01 00:48:46
We consider a 100-character chunk in the middle of
|
| + for (int i = half_way; i < sample_subject->length(); i++) { |
| + if (chars_sampled++ > 100) break; |
|
ulan
2012/03/30 13:04:48
A nit: consider moving chars_sampled++ either to t
Erik Corry
2012/04/01 00:48:46
Done.
|
| + compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); |
| + } |
| + compiler.frequency_collator()->SortFrequencies(); |
| + |
| // Wrap the body of the regexp in capture #0. |
| RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, |
| 0, |