 Chromium Code Reviews
 Chromium Code Reviews Issue 9965010:
  Regexp: Improve the speed that we scan for an initial point where a non-anchored  (Closed) 
  Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
    
  
    Issue 9965010:
  Regexp: Improve the speed that we scan for an initial point where a non-anchored  (Closed) 
  Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/| Index: src/jsregexp.cc | 
| =================================================================== | 
| --- src/jsregexp.cc (revision 11190) | 
| +++ src/jsregexp.cc (working copy) | 
| @@ -280,7 +280,8 @@ | 
| // from the source pattern. | 
| // If compilation fails, an exception is thrown and this function | 
| // returns false. | 
| -bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 
| +bool RegExpImpl::EnsureCompiledIrregexp( | 
| + Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { | 
| Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); | 
| #ifdef V8_INTERPRETED_REGEXP | 
| if (compiled_code->IsByteArray()) return true; | 
| @@ -296,7 +297,7 @@ | 
| ASSERT(compiled_code->IsSmi()); | 
| return true; | 
| } | 
| - return CompileIrregexp(re, is_ascii); | 
| + return CompileIrregexp(re, sample_subject, is_ascii); | 
| } | 
| @@ -316,7 +317,9 @@ | 
| } | 
| -bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 
| +bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, | 
| + Handle<String> sample_subject, | 
| + bool is_ascii) { | 
| // Compile the RegExp. | 
| Isolate* isolate = re->GetIsolate(); | 
| ZoneScope zone_scope(isolate, DELETE_ON_EXIT); | 
| @@ -365,6 +368,7 @@ | 
| flags.is_ignore_case(), | 
| flags.is_multiline(), | 
| pattern, | 
| + sample_subject, | 
| is_ascii); | 
| if (result.error_message != NULL) { | 
| // Unable to compile regexp. | 
| @@ -435,7 +439,7 @@ | 
| // Check the asciiness of the underlying storage. | 
| bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 
| - if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; | 
| + if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; | 
| #ifdef V8_INTERPRETED_REGEXP | 
| // Byte-code regexp needs space allocated for all its registers. | 
| @@ -466,7 +470,7 @@ | 
| #ifndef V8_INTERPRETED_REGEXP | 
| ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); | 
| do { | 
| - EnsureCompiledIrregexp(regexp, is_ascii); | 
| + EnsureCompiledIrregexp(regexp, subject, is_ascii); | 
| Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); | 
| NativeRegExpMacroAssembler::Result res = | 
| NativeRegExpMacroAssembler::Match(code, | 
| @@ -784,6 +788,53 @@ | 
| } | 
| +class FrequencyCollator { | 
| + public: | 
| + FrequencyCollator() : total_samples_(0) { | 
| + for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { | 
| + frequencies_[i] = CharacterFrequency(i); | 
| + } | 
| + } | 
| + | 
| + void CountCharacter(int character) { | 
| + int index = (character & RegExpMacroAssembler::kTableMask); | 
| + frequencies_[index].Increment(); | 
| + total_samples_++; | 
| + } | 
| + | 
| + // Does not measure in percent, but rather per-128 (the table size from the | 
| + // regexp macro assembler). | 
| + int Frequency(int in_character) { | 
| + ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character); | 
| + if (total_samples_ < 1) return 1; // Division by zero. | 
| + int freq_in_per128 = | 
| + (frequencies_[in_character].counter() * 128) / total_samples_; | 
| + return freq_in_per128; | 
| + } | 
| + | 
| + private: | 
| + class CharacterFrequency { | 
| + public: | 
| + CharacterFrequency() : counter_(0), character_(-1) { } | 
| + explicit CharacterFrequency(int character) | 
| + : counter_(0), character_(character) { } | 
| + | 
| + void Increment() { counter_++; } | 
| + int counter() { return counter_; } | 
| + int character() { return character_; } | 
| + | 
| + private: | 
| + int counter_; | 
| + int character_; | 
| + }; | 
| + | 
| + | 
| + private: | 
| + CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; | 
| + int total_samples_; | 
| +}; | 
| + | 
| + | 
| class RegExpCompiler { | 
| public: | 
| RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); | 
| @@ -819,6 +870,7 @@ | 
| inline bool ignore_case() { return ignore_case_; } | 
| inline bool ascii() { return ascii_; } | 
| + FrequencyCollator* frequency_collator() { return &frequency_collator_; } | 
| int current_expansion_factor() { return current_expansion_factor_; } | 
| void set_current_expansion_factor(int value) { | 
| @@ -837,6 +889,7 @@ | 
| bool ascii_; | 
| bool reg_exp_too_big_; | 
| int current_expansion_factor_; | 
| + FrequencyCollator frequency_collator_; | 
| }; | 
| @@ -865,7 +918,8 @@ | 
| ignore_case_(ignore_case), | 
| ascii_(ascii), | 
| reg_exp_too_big_(false), | 
| - current_expansion_factor_(1) { | 
| + current_expansion_factor_(1), | 
| + frequency_collator_() { | 
| accept_ = new EndNode(EndNode::ACCEPT); | 
| ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); | 
| } | 
| @@ -2056,6 +2110,17 @@ | 
| } | 
| +void ActionNode::FillInBMInfo(int offset, | 
| + BoyerMooreLookahead* bm, | 
| + bool not_at_start) { | 
| + if (type_ == BEGIN_SUBMATCH) { | 
| + bm->SetRest(offset); | 
| + } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { | 
| + on_success()->FillInBMInfo(offset, bm, not_at_start); | 
| + } | 
| +} | 
| + | 
| + | 
| int AssertionNode::EatsAtLeast(int still_to_find, | 
| int recursion_depth, | 
| bool not_at_start) { | 
| @@ -2072,6 +2137,14 @@ | 
| } | 
| +void AssertionNode::FillInBMInfo( | 
| + int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 
| + // Match the behaviour of EatsAtLeast on this node. | 
| + if (type() == AT_START && not_at_start) return; | 
| + on_success()->FillInBMInfo(offset, bm, not_at_start); | 
| +} | 
| + | 
| + | 
| int BackReferenceNode::EatsAtLeast(int still_to_find, | 
| int recursion_depth, | 
| bool not_at_start) { | 
| @@ -2504,6 +2577,16 @@ | 
| } | 
| +void LoopChoiceNode::FillInBMInfo( | 
| + int offset, BoyerMooreLookahead* bm, bool nas) { | 
| + if (body_can_be_zero_length_) { | 
| + bm->SetRest(offset); | 
| + return; | 
| + } | 
| + ChoiceNode::FillInBMInfo(offset, bm, nas); | 
| +} | 
| + | 
| + | 
| void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 
| RegExpCompiler* compiler, | 
| int characters_filled_in, | 
| @@ -3000,6 +3083,30 @@ | 
| } | 
| +RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( | 
| + RegExpCompiler* compiler) { | 
| + if (elms_->length() != 1) return NULL; | 
| + TextElement elm = elms_->at(0); | 
| + if (elm.type != TextElement::CHAR_CLASS) return NULL; | 
| + RegExpCharacterClass* node = elm.data.u_char_class; | 
| + ZoneList<CharacterRange>* ranges = node->ranges(); | 
| + if (!CharacterRange::IsCanonical(ranges)) { | 
| + CharacterRange::Canonicalize(ranges); | 
| + } | 
| + if (node->is_negated()) { | 
| + return ranges->length() == 0 ? on_success() : NULL; | 
| + } | 
| + if (ranges->length() != 1) return NULL; | 
| + uint32_t max_char; | 
| + if (compiler->ascii()) { | 
| + max_char = String::kMaxAsciiCharCode; | 
| + } else { | 
| + max_char = String::kMaxUtf16CodeUnit; | 
| + } | 
| + return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; | 
| +} | 
| + | 
| + | 
| // Finds the fixed match length of a sequence of nodes that goes from | 
| // this alternative and back to this choice node. If there are variable | 
| // length nodes or other complications in the way then return a sentinel | 
| @@ -3064,8 +3171,8 @@ | 
| int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, | 
| - bool not_at_start) { | 
| - int preload_characters = EatsAtLeast(4, 0, not_at_start); | 
| + int eats_at_least) { | 
| + int preload_characters = Min(4, eats_at_least); | 
| if (compiler->macro_assembler()->CanReadUnaligned()) { | 
| bool ascii = compiler->ascii(); | 
| if (ascii) { | 
| @@ -3131,6 +3238,193 @@ | 
| }; | 
| +BoyerMooreLookahead::BoyerMooreLookahead( | 
| + int length, int map_length, RegExpCompiler* compiler) | 
| + : length_(length), | 
| + map_length_(map_length), | 
| + compiler_(compiler) { | 
| + ASSERT(IsPowerOf2(map_length)); | 
| + if (compiler->ascii()) { | 
| + max_char_ = String::kMaxAsciiCharCode; | 
| + } else { | 
| + max_char_ = String::kMaxUtf16CodeUnit; | 
| + } | 
| + bitmaps_ = new ZoneList<ZoneList<bool>*>(length); | 
| + for (int i = 0; i < length; i++) { | 
| + bitmaps_->Add(new ZoneList<bool>(map_length)); | 
| + ZoneList<bool>* map = bitmaps_->at(i); | 
| + for (int i = 0; i < map_length; i++) { | 
| + map->Add(false); | 
| + } | 
| + } | 
| +} | 
| + | 
| + | 
| +// Find the longest range of lookahead that has the fewest number of different | 
| +// characters that can occur at a given position. Since we are optimizing two | 
| +// different parameters at once this is a tradeoff. | 
| +bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { | 
| + int biggest_points = 0; | 
| + for (int max_number_of_chars = 4; | 
| + max_number_of_chars < kTooManyCharacters; | 
| + max_number_of_chars *= 2) { | 
| + biggest_points = | 
| + FindBestInterval(max_number_of_chars, biggest_points, from, to); | 
| + } | 
| + if (biggest_points == 0) return false; | 
| + return true; | 
| +} | 
| + | 
| + | 
| +// Find the highest-points range between 0 and length_ where the character | 
| +// information is not too vague. 'Too vague' means that there are more than | 
| +// max_number_of_chars that can occur at this position. Calculates the number | 
| +// of points as the product of width-of-the-range and | 
| +// probability-of-finding-one-of-the-characters, where the probability is | 
| +// calculated using the frequency distribution of the sample subject string. | 
| +int BoyerMooreLookahead::FindBestInterval( | 
| + int max_number_of_chars, int old_biggest_points, int* from, int* to) { | 
| + int biggest_points = old_biggest_points; | 
| + static const int kSize = RegExpMacroAssembler::kTableSize; | 
| + for (int i = 0; i < length_; ) { | 
| + while (i < length_ && Count(i) > max_number_of_chars) i++; | 
| + if (i == length_) break; | 
| + int remembered_from = i; | 
| + bool union_map[kSize]; | 
| + for (int j = 0; j < kSize; j++) union_map[j] = false; | 
| + while (i < length_ && Count(i) <= max_number_of_chars) { | 
| + ZoneList<bool>* map = bitmaps_->at(i); | 
| + for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); | 
| + i++; | 
| + } | 
| + int frequency = 0; | 
| + for (int j = 0; j < kSize; j++) { | 
| + if (union_map[j]) { | 
| + // Add 1 to the frequency to give a small per-character boost for | 
| + // the cases where our sampling is not good enough and many | 
| + // characters have a frequency of zero. | 
| + frequency += compiler_->frequency_collator()->Frequency(j) + 1; | 
| + } | 
| + } | 
| + // We use the probability of skipping times the distance we are skipping to | 
| + // judge the effectiveness of this. Actually we have a cut-off: By | 
| + // dividing by 2 we switch off the skipping if the probability of skipping | 
| + // is less than 50%. This is because the multibyte mask-and-compare | 
| + // skipping in quickcheck is more likely to do well on this case. | 
| + bool in_quickcheck_range = ((i - remembered_from < 4) || | 
| + (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); | 
| + int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; | 
| 
ulan
2012/04/02 08:12:24
This shouldn't affect the correctness but:
if I r
 
Erik Corry
2012/04/02 09:36:22
This code is as intended.  I added a few comments.
 | 
| + int points = (i - remembered_from) * probability; | 
| + if (points > biggest_points) { | 
| + *from = remembered_from; | 
| + *to = i - 1; | 
| + biggest_points = points; | 
| + } | 
| + } | 
| + return biggest_points; | 
| +} | 
| + | 
| + | 
| +// Take all the characters that will not prevent a successful match if they | 
| +// occur occur in the subject string in the range between min_lookahead and | 
| 
ulan
2012/04/02 08:12:24
'occur' twice.
 
Erik Corry
2012/04/02 09:36:22
Done.
 | 
| +// max_lookahead (inclusive) measured from the current position. If the | 
| +// character at max_lookahead offset is not one of these characters, then we | 
| +// can safely skip forwards by the number of characters in the range. | 
| +int BoyerMooreLookahead::GetSkipTable(int min_lookahead, | 
| + int max_lookahead, | 
| + Handle<ByteArray> boolean_skip_table) { | 
| + const int kSize = RegExpMacroAssembler::kTableSize; | 
| + | 
| + const int kSkipArrayEntry = 0; | 
| + const int kDontSkipArrayEntry = 1; | 
| + | 
| + for (int i = 0; i < kSize; i++) { | 
| + boolean_skip_table->set(i, kSkipArrayEntry); | 
| + } | 
| + int skip = max_lookahead + 1 - min_lookahead; | 
| + | 
| + for (int i = max_lookahead; i >= min_lookahead; i--) { | 
| + ZoneList<bool>* map = bitmaps_->at(i); | 
| + for (int j = 0; j < map_length_; j++) { | 
| + if (map->at(j)) { | 
| + boolean_skip_table->set(j, kDontSkipArrayEntry); | 
| + } | 
| + } | 
| + } | 
| + | 
| + return skip; | 
| +} | 
| + | 
| + | 
| +// See comment above on the implementation of GetSkipTable. | 
| +bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | 
| + int min_lookahead = 0; | 
| + int max_lookahead = 0; | 
| + | 
| + if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | 
| + | 
| + bool found_single_character = false; | 
| + bool abandoned_search_for_single_character = false; | 
| + int single_character = 0; | 
| + for (int i = max_lookahead; i >= min_lookahead; i--) { | 
| + ZoneList<bool>* map = bitmaps_->at(i); | 
| + for (int j = 0; j < map_length_; j++) { | 
| + if (map->at(j)) { | 
| + if (found_single_character) { | 
| + found_single_character = false; // Found two. | 
| + abandoned_search_for_single_character = true; | 
| + break; | 
| + } else { | 
| + found_single_character = true; | 
| + single_character = j; | 
| + } | 
| + } | 
| + } | 
| + if (abandoned_search_for_single_character) break; | 
| + } | 
| + | 
| + int lookahead_width = max_lookahead + 1 - min_lookahead; | 
| + | 
| + if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | 
| + // The mask-compare can probably handle this better. | 
| + return false; | 
| + } | 
| + | 
| + if (found_single_character) { | 
| + Label cont, again; | 
| + masm->Bind(&again); | 
| + masm->LoadCurrentCharacter(max_lookahead, &cont, true); | 
| + if (max_char_ > map_length_) { | 
| + ASSERT(map_length_ == RegExpMacroAssembler::kTableSize); | 
| + masm->CheckCharacterAfterAnd(single_character, | 
| + RegExpMacroAssembler::kTableMask, | 
| + &cont); | 
| + } else { | 
| + masm->CheckCharacter(single_character, &cont); | 
| + } | 
| + masm->AdvanceCurrentPosition(lookahead_width); | 
| + masm->GoTo(&again); | 
| + masm->Bind(&cont); | 
| + return true; | 
| + } | 
| + | 
| + Handle<ByteArray> boolean_skip_table = | 
| + FACTORY->NewByteArray(map_length_, TENURED); | 
| + int skip_distance = GetSkipTable( | 
| + min_lookahead, max_lookahead, boolean_skip_table); | 
| + ASSERT(skip_distance != 0); | 
| + | 
| + Label cont, again; | 
| + masm->Bind(&again); | 
| + masm->LoadCurrentCharacter(max_lookahead, &cont, true); | 
| + masm->CheckBitInTable(boolean_skip_table, &cont); | 
| + masm->AdvanceCurrentPosition(skip_distance); | 
| + masm->GoTo(&again); | 
| + masm->Bind(&cont); | 
| + | 
| + return true; | 
| +} | 
| + | 
| /* Code generation for choice nodes. | 
| * | 
| * We generate quick checks that do a mask and compare to eliminate a | 
| @@ -3274,10 +3568,51 @@ | 
| int first_normal_choice = greedy_loop ? 1 : 0; | 
| - int preload_characters = | 
| - CalculatePreloadCharacters(compiler, | 
| - current_trace->at_start() == Trace::FALSE); | 
| - bool preload_is_current = | 
| + bool not_at_start = current_trace->at_start() == Trace::FALSE; | 
| + const int kEatsAtLeastNotYetInitialized = -1; | 
| + int eats_at_least = kEatsAtLeastNotYetInitialized; | 
| + | 
| + bool skip_was_emitted = false; | 
| + | 
| + // More makes code generation slower, less makes V8 benchmark score lower. | 
| + const int kMaxLookaheadForBoyerMoore = 8; | 
| + | 
| + if (!greedy_loop && choice_count == 2) { | 
| + GuardedAlternative alt1 = alternatives_->at(1); | 
| + if (alt1.guards() == NULL || alt1.guards()->length() == 0) { | 
| + RegExpNode* eats_anything_node = alt1.node(); | 
| + if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | 
| + this) { | 
| + // At this point we know that we are at a non-greedy loop that will eat | 
| + // any character one at a time. Any non-anchored regexp has such a | 
| + // loop prepended to it in order to find where it starts. We look for | 
| + // a pattern of the form ...abc... where we can look 6 characters ahead | 
| + // and step forwards 3 if the character is not one of abc. Abc need | 
| + // not be atoms, they can be any reasonably limited character class or | 
| + // small alternation. | 
| + ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | 
| + eats_at_least = | 
| + Min(kMaxLookaheadForBoyerMoore, | 
| + EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | 
| + if (eats_at_least >= 1) { | 
| + BoyerMooreLookahead bm(eats_at_least, | 
| + RegExpMacroAssembler::kTableSize, | 
| + compiler); | 
| + GuardedAlternative alt0 = alternatives_->at(0); | 
| + alt0.node()->FillInBMInfo(0, &bm, not_at_start); | 
| + skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); | 
| + } | 
| + } | 
| + } | 
| + } | 
| + | 
| + if (eats_at_least == kEatsAtLeastNotYetInitialized) { | 
| + // Save some time by looking at most one machine word ahead. | 
| + eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); | 
| + } | 
| + int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); | 
| + | 
| + bool preload_is_current = !skip_was_emitted && | 
| (current_trace->characters_preloaded() == preload_characters); | 
| bool preload_has_checked_bounds = preload_is_current; | 
| @@ -5432,6 +5767,76 @@ | 
| } | 
| +void ChoiceNode::FillInBMInfo( | 
| + int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 
| + ZoneList<GuardedAlternative>* alts = alternatives(); | 
| + for (int i = 0; i < alts->length(); i++) { | 
| + GuardedAlternative& alt = alts->at(i); | 
| + if (alt.guards() != NULL && alt.guards()->length() != 0) { | 
| + bm->SetRest(offset); // Give up trying to fill in info. | 
| + return; | 
| + } | 
| + alt.node()->FillInBMInfo(offset, bm, not_at_start); | 
| + } | 
| +} | 
| + | 
| + | 
| +void TextNode::FillInBMInfo( | 
| + int offset, BoyerMooreLookahead* bm, bool not_at_start) { | 
| + if (offset >= bm->length()) return; | 
| + int max_char = bm->max_char(); | 
| + for (int i = 0; i < elements()->length(); i++) { | 
| + if (offset >= bm->length()) return; | 
| + TextElement text = elements()->at(i); | 
| + if (text.type == TextElement::ATOM) { | 
| + RegExpAtom* atom = text.data.u_atom; | 
| + for (int j = 0; j < atom->length(); j++, offset++) { | 
| + if (offset >= bm->length()) return; | 
| + uc16 character = atom->data()[j]; | 
| + if (bm->compiler()->ignore_case()) { | 
| + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 
| + int length = GetCaseIndependentLetters( | 
| + ISOLATE, | 
| + character, | 
| + bm->max_char() == String::kMaxAsciiCharCode, | 
| + chars); | 
| + for (int j = 0; j < length; j++) { | 
| + bm->Set(offset, chars[j]); | 
| + } | 
| + } else { | 
| + if (character <= max_char) bm->Set(offset, character); | 
| + } | 
| + } | 
| + } else { | 
| + ASSERT(text.type == TextElement::CHAR_CLASS); | 
| + RegExpCharacterClass* char_class = text.data.u_char_class; | 
| + ZoneList<CharacterRange>* ranges = char_class->ranges(); | 
| + if (char_class->is_negated()) { | 
| + bm->SetAll(offset); | 
| + } else { | 
| + for (int k = 0; k < ranges->length(); k++) { | 
| + CharacterRange& range = ranges->at(k); | 
| + if (range.from() > max_char) continue; | 
| + int to = Min(max_char, static_cast<int>(range.to())); | 
| + if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { | 
| + bm->SetAll(offset); | 
| + break; | 
| + } | 
| + for (int m = range.from(); m <= to; m++) { | 
| + bm->Set(offset, m); | 
| + } | 
| + } | 
| + } | 
| + offset++; | 
| + } | 
| + } | 
| + if (offset >= bm->length()) return; | 
| + on_success()->FillInBMInfo(offset, | 
| + bm, | 
| + true); // Not at start after a text node. | 
| +} | 
| + | 
| + | 
| int TextNode::ComputeFirstCharacterSet(int budget) { | 
| budget--; | 
| if (budget >= 0) { | 
| @@ -5589,15 +5994,31 @@ | 
| } | 
| -RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, | 
| - bool ignore_case, | 
| - bool is_multiline, | 
| - Handle<String> pattern, | 
| - bool is_ascii) { | 
| +RegExpEngine::CompilationResult RegExpEngine::Compile( | 
| + RegExpCompileData* data, | 
| + bool ignore_case, | 
| + bool is_multiline, | 
| + Handle<String> pattern, | 
| + Handle<String> sample_subject, | 
| + bool is_ascii) { | 
| if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { | 
| return IrregexpRegExpTooBig(); | 
| } | 
| RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); | 
| + | 
| + // Sample some characters from the middle of the string. | 
| + static const int kSampleSize = 128; | 
| + | 
| + FlattenString(sample_subject); | 
| + int chars_sampled = 0; | 
| + int half_way = (sample_subject->length() - kSampleSize) / 2; | 
| + for (int i = Max(0, half_way); | 
| + i < sample_subject->length(); | 
| + i++, chars_sampled++) { | 
| + if (chars_sampled > kSampleSize) break; | 
| 
ulan
2012/04/02 08:12:24
Now we can move this into the for loop condition.
 
Erik Corry
2012/04/02 09:36:22
Done.
 | 
| + compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); | 
| + } | 
| + | 
| // Wrap the body of the regexp in capture #0. | 
| RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, | 
| 0, |