src/jsregexp.cc - Issue 9965010: Regexp: Improve the speed that we scan for an initial point where a non-anchored

Unified Diff: src/jsregexp.cc

Issue 9965010: Regexp: Improve the speed that we scan for an initial point where a non-anchored (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: '' Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 11190)

+++ src/jsregexp.cc (working copy)

@@ -280,7 +280,8 @@

// from the source pattern.

// If compilation fails, an exception is thrown and this function

// returns false.

-bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {

+bool RegExpImpl::EnsureCompiledIrregexp(

+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {

Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));

#ifdef V8_INTERPRETED_REGEXP

if (compiled_code->IsByteArray()) return true;

@@ -296,7 +297,7 @@

ASSERT(compiled_code->IsSmi());

return true;

}

- return CompileIrregexp(re, is_ascii);

+ return CompileIrregexp(re, sample_subject, is_ascii);

}

@@ -316,7 +317,9 @@

}

-bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {

+bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,

+ Handle<String> sample_subject,

+ bool is_ascii) {

// Compile the RegExp.

Isolate* isolate = re->GetIsolate();

ZoneScope zone_scope(isolate, DELETE_ON_EXIT);

@@ -365,6 +368,7 @@

flags.is_ignore_case(),

flags.is_multiline(),

pattern,

+ sample_subject,

is_ascii);

if (result.error_message != NULL) {

// Unable to compile regexp.

@@ -435,7 +439,7 @@

// Check the asciiness of the underlying storage.

bool is_ascii = subject->IsAsciiRepresentationUnderneath();

- if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;

+ if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;

#ifdef V8_INTERPRETED_REGEXP

// Byte-code regexp needs space allocated for all its registers.

@@ -466,7 +470,7 @@

#ifndef V8_INTERPRETED_REGEXP

ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);

do {

- EnsureCompiledIrregexp(regexp, is_ascii);

+ EnsureCompiledIrregexp(regexp, subject, is_ascii);

Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);

NativeRegExpMacroAssembler::Result res =

NativeRegExpMacroAssembler::Match(code,

@@ -784,6 +788,65 @@

}

+class FrequencyCollator {

+ public:

+ FrequencyCollator() : total_frequencies_(0) {

+ for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {

+ frequencies_[i] = CharacterFrequency(i);

+ }

+ void CountCharacter(int character) {

+ int index = (character & RegExpMacroAssembler::kTableMask);

+ frequencies_[index].Increment();

+ total_frequencies_++;

+ }

+ void SortFrequencies() {

+ typedef int (*RawComparer)(const void*, const void*);

+ memcpy(

+ &sorted_frequencies_[0], &frequencies_[0], sizeof(sorted_frequencies_));

ulan 2012/03/30 13:04:48 A nit: for consistency I would prefer breaking mem

Erik Corry 2012/04/01 00:48:46 Done.

+ qsort(&sorted_frequencies_[0],

+ RegExpMacroAssembler::kTableSize,

+ sizeof(sorted_frequencies_[0]),

+ reinterpret_cast<RawComparer>(&CharacterFrequency::Compare));

+ }

+ bool IsFrequent(int in_character) {

+ if (total_frequencies_ < 10) return false; // Too little statistical basis.

+ int freq = frequencies_[in_character].counter();

ulan 2012/03/30 13:04:48 Either mask the in_character or ASSERT(in_characte

Erik Corry 2012/04/01 00:48:46 Done.

+ bool popular = ((freq * 100) / total_frequencies_ > 20);

+ return popular;

ulan 2012/03/30 13:04:48 Named magic numbers become less magic :) int freq

Erik Corry 2012/04/01 00:48:46 Done.

+ }

+ private:

+ class CharacterFrequency {

+ public:

+ CharacterFrequency() : counter_(0), character_(-1) { }

+ explicit CharacterFrequency(int character) : counter_(0), character_(character) { }

ulan 2012/03/30 13:04:48 Long line.

Erik Corry 2012/04/01 00:48:46 Done.

+ static int Compare(const CharacterFrequency* a,

+ const CharacterFrequency* b) {

+ return b->counter_ - a->counter_;

+ }

+ void Increment() { counter_++; }

+ int counter() { return counter_; }

+ int character() { return character_; }

+ private:

+ int counter_;

+ int character_;

+ };

+ private:

+ CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];

+ CharacterFrequency sorted_frequencies_[RegExpMacroAssembler::kTableSize];

+ int total_frequencies_;

+};

class RegExpCompiler {

public:

RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);

@@ -819,6 +882,7 @@

inline bool ignore_case() { return ignore_case_; }

inline bool ascii() { return ascii_; }

+ FrequencyCollator* frequency_collator() { return &frequency_collator_; }

int current_expansion_factor() { return current_expansion_factor_; }

void set_current_expansion_factor(int value) {

@@ -837,6 +901,7 @@

bool ascii_;

bool reg_exp_too_big_;

int current_expansion_factor_;

+ FrequencyCollator frequency_collator_;

};

@@ -865,7 +930,8 @@

ignore_case_(ignore_case),

ascii_(ascii),

reg_exp_too_big_(false),

- current_expansion_factor_(1) {

+ current_expansion_factor_(1),

+ frequency_collator_() {

accept_ = new EndNode(EndNode::ACCEPT);

ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);

}

@@ -2056,6 +2122,16 @@

}

+void ActionNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

ulan 2012/03/30 13:04:48 For consistency with the surrounding code consider

Erik Corry 2012/04/01 00:48:46 Done.

+ if (type_ == BEGIN_SUBMATCH) {

+ bm->SetRest(offset);

+ } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {

+ on_success()->FillInBMInfo(offset, bm, not_at_start);

+ }

int AssertionNode::EatsAtLeast(int still_to_find,

int recursion_depth,

bool not_at_start) {

@@ -2072,6 +2148,14 @@

}

+void AssertionNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ // Match the behaviour of EatsAtLeast on this node.

+ if (type() == AT_START && not_at_start) return;

+ on_success()->FillInBMInfo(offset, bm, not_at_start);

int BackReferenceNode::EatsAtLeast(int still_to_find,

int recursion_depth,

bool not_at_start) {

@@ -2504,6 +2588,17 @@

}

+void LoopChoiceNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool nas) {

+ if (body_can_be_zero_length_) { // || info()->visited) {

ulan 2012/03/30 13:04:48 Forgotten comment.

Erik Corry 2012/04/01 00:48:46 Done.

+ bm->SetRest(offset);

+ return;

+ }

+ // VisitMarker marker(info());

ulan 2012/03/30 13:04:48 Comment.

Erik Corry 2012/04/01 00:48:46 Done.

+ ChoiceNode::FillInBMInfo(offset, bm, nas);

void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

RegExpCompiler* compiler,

int characters_filled_in,

@@ -3000,6 +3095,30 @@

}

+RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(

+ RegExpCompiler* compiler) {

+ if (elms_->length() != 1) return NULL;

+ TextElement elm = elms_->at(0);

+ if (elm.type != TextElement::CHAR_CLASS) return NULL;

+ RegExpCharacterClass* node = elm.data.u_char_class;

+ ZoneList<CharacterRange>* ranges = node->ranges();

+ if (!CharacterRange::IsCanonical(ranges)) {

+ CharacterRange::Canonicalize(ranges);

+ }

+ if (node->is_negated() && ranges->length() == 0) return on_success();

+ if (ranges->length() != 1) return NULL;

+ if (ranges->at(0).from() != 0) return NULL;

+ int max_char;

+ if (compiler->ascii()) {

ulan 2012/03/30 13:04:48 A nit, I like max_char = compiler->ascii() ? Stri

Erik Corry 2012/04/01 00:48:46 Yes, but a bug in gcc means you get a linker error

+ max_char = String::kMaxAsciiCharCode;

+ } else {

+ max_char = String::kMaxUtf16CodeUnit;

+ }

+ if (ranges->at(0).to() < max_char) return NULL;

ulan 2012/03/30 13:04:48 Consider using ranges(0)->IsEverything()

Erik Corry 2012/04/01 00:48:46 Done.

+ return on_success();

// Finds the fixed match length of a sequence of nodes that goes from

// this alternative and back to this choice node. If there are variable

// length nodes or other complications in the way then return a sentinel

@@ -3064,8 +3183,8 @@

int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,

- bool not_at_start) {

- int preload_characters = EatsAtLeast(4, 0, not_at_start);

+ int eats_at_least) {

+ int preload_characters = eats_at_least > 4 ? 4 : eats_at_least;

ulan 2012/03/30 13:04:48 Min(4, eats_at_least);

Erik Corry 2012/04/01 00:48:46 Done.

if (compiler->macro_assembler()->CanReadUnaligned()) {

bool ascii = compiler->ascii();

if (ascii) {

@@ -3131,6 +3250,197 @@

};

+BoyerMooreLookahead::BoyerMooreLookahead(

+ int length, int map_length, RegExpCompiler* compiler)

+ : length_(length),

+ map_length_(map_length),

ulan 2012/03/30 13:04:48 the Set() method assumes that map_length is a powe

Erik Corry 2012/04/01 00:48:46 Done.

+ compiler_(compiler) {

+ if (compiler->ascii()) {

+ max_char_ = String::kMaxAsciiCharCode;

+ } else {

+ max_char_ = String::kMaxUtf16CodeUnit;

+ }

+ bitmaps_ = new ZoneList<ZoneList<bool>*>(length);

ulan 2012/03/30 13:04:48 I wonder if bit-packing (ZoneList<bool>) would imp

Erik Corry 2012/04/01 00:48:46 We normally only do this once per regexp, so I don

+ for (int i = 0; i < length; i++) {

+ bitmaps_->Add(new ZoneList<bool>(map_length));

+ ZoneList<bool>* map = bitmaps_->at(i);

+ for (int i = 0; i < map_length; i++) {

+ map->Add(false);

+ }

+bool BoyerMooreLookahead::TooVague(int offset, int limit) {

+ if (bitmaps_->at(offset) == NULL) return true;

+ if (Count(offset) > limit) return true;

+ return false;

+bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {

+ // The characters that are very frequent in the sample subject string should

+ // not be used for Boyer-Moore operations, so lets mark those positions as

+ // being too vague.

+ for (int i = 0; i < length_; i++) {

+ if (Count(i) >= kTooManyCharacters) {

+ SetAll(i);

+ } else {

+ ZoneList<bool>* map = bitmaps_->at(i);

+ for (int ch = 0; ch < map_length_; ch++) {

+ if (map->at(ch) && compiler_->frequency_collator()->IsFrequent(ch)) {

+ SetAll(i);

+ break;

+ }

+ const int kNoPoints = -1;

+ int biggest_points = kNoPoints;

+ int biggest_interval_max = 0;

+ int biggest_interval = 0;

+ int vagueness_factor = 10;

ulan 2012/03/30 13:04:48 Could you please write a comment describing the ma

Erik Corry 2012/04/01 00:48:46 I refactored this function because it reflected fa

+ for (int initial_max_vagueness = 4;

+ initial_max_vagueness <= kTooManyCharacters;

+ initial_max_vagueness *= 2, vagueness_factor--) {

+ int max = length_ - 1;

ulan 2012/03/30 13:04:48 This 'max' can just be called 'i'. I think there a

Erik Corry 2012/04/01 00:48:46 Done.

+ while (true) {

+ while (max >= 0 && TooVague(max, initial_max_vagueness)) max--;

+ if (max < 0) break;

+ int remembered_max = max;

+ int max_vagueness = initial_max_vagueness;

+ while (max >= 0 && !TooVague(max, max_vagueness)) {

+ max--;

+ if (max_vagueness < kTooManyCharacters) max_vagueness *= 2;

ulan 2012/03/30 13:04:48 What is the rationale behind this? Nit: max_vague

Erik Corry 2012/04/01 00:48:46 All refactored away.

+ }

+ int points =

+ ((remembered_max - max) * kTooManyCharacters) * vagueness_factor;

+ if (points > biggest_points) {

+ biggest_interval_max = remembered_max;

+ biggest_interval = remembered_max - max;

ulan 2012/03/30 13:04:48 Consider maintaining (left, right) instead of (big

Erik Corry 2012/04/01 00:48:46 Done.

+ biggest_points = points;

+ }

+ if (max < 0) break;

+ }

+ if (biggest_points == kNoPoints) return false;

+ if (biggest_interval < 1) return false;

+ *to = biggest_interval_max;

+ *from = 1 + biggest_interval_max - biggest_interval;

+ return true;

+static int SortChar(const char* a, const char* b) {

+ return static_cast<int>(*a) - *b;

+int BoyerMooreLookahead::GetSkipTable(int max_lookahead,

+ Handle<ByteArray> boolean_skip_table) {

+ const int kSize = RegExpMacroAssembler::kTableSize;

+ char table[kSize];

+ for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {

+ table[i] = max_lookahead + 1;

+ }

+ for (int i = max_lookahead; i >= 0; i--) {

+ int skip = max_lookahead - i;

ulan 2012/03/30 16:54:51 This is the crucial part. A comment explaining why

Erik Corry 2012/04/01 00:48:46 Done.

+ ZoneList<bool>* map = bitmaps_->at(i);

+ if (map == NULL) {

+ for (int j = 0; j < map_length_; j++) {

+ if (table[j] > skip) table[j] = skip;

+ }

+ break;

+ }

+ for (int j = 0; j < map_length_; j++) {

+ if (map->at(j)) {

+ if (table[j] > skip) table[j] = skip;

+ }

+ char sorted_table[kSize];

+ memcpy(sorted_table, table, kSize);

+ typedef int (*RawComparer)(const void*, const void*);

+ qsort(sorted_table, kSize, 1, reinterpret_cast<RawComparer>(&SortChar));

+ int boolean_skip_distance = sorted_table[kSize / 2];

ulan 2012/03/30 16:54:51 Instead of kSize / 2 we can take any i in [0..kSiz

Erik Corry 2012/04/01 00:48:46 Done.

+ for (int i = 0; i < kSize; i++) {

+ boolean_skip_table->set(i, table[i] >= boolean_skip_distance ? 0 : 1);

ulan 2012/03/30 13:04:48 Can (x >= y ? 0 : 1) be replaced with (x < y) ?

Erik Corry 2012/04/01 00:48:46 We don't normally allow implicit bool->int and int

+ }

+ return boolean_skip_distance;

+bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {

+ int min_lookahead = 0;

+ int max_lookahead = 0;

+ if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;

+ bool found_single_character = false;

ulan 2012/03/30 16:54:51 Consider using a counter: character_count = 0; .

Erik Corry 2012/04/01 00:48:46 I want to bail out when I find the second characte

+ bool abandoned_search_for_single_character = false;

+ int single_character = 0;

+ for (int i = max_lookahead; i >= min_lookahead; i--) {

+ ZoneList<bool>* map = bitmaps_->at(i);

+ for (int j = 0; j < map_length_; j++) {

+ if (map->at(j)) {

+ if (found_single_character) {

+ found_single_character = false; // Found two.

+ abandoned_search_for_single_character = true;

+ break;

+ } else {

+ found_single_character = true;

+ single_character = j;

+ }

+ if (abandoned_search_for_single_character) break;

+ }

+ int lookahead_width = max_lookahead + 1 - min_lookahead;

+ if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {

+ // The mask-compare can probably handle this better.

+ return false;

+ }

+ if (found_single_character) {

+ Label cont, again;

+ masm->Bind(&again);

+ masm->LoadCurrentCharacter(max_lookahead, &cont, true);

+ if (max_char_ > String::kMaxAsciiCharCode ||

+ map_length_ < RegExpMacroAssembler::kTableSize) {

+ masm->CheckCharacterAfterAnd(single_character,

+ RegExpMacroAssembler::kTableMask,

+ &cont);

+ } else {

+ masm->CheckCharacter(single_character, &cont);

ulan 2012/03/30 16:54:51 I think this implicitly assumes that String::kMaxA

Erik Corry 2012/04/01 00:48:46 Fixed.

+ }

+ masm->AdvanceCurrentPosition(lookahead_width);

+ masm->GoTo(&again);

+ masm->Bind(&cont);

+ return true;

+ }

+ Handle<ByteArray> boolean_skip_table =

+ FACTORY->NewByteArray(map_length_, TENURED);

+ int boolean_skip_distance = GetSkipTable(max_lookahead, boolean_skip_table);

+ if (boolean_skip_distance == 0) return false;

+ Label cont, again;

+ masm->Bind(&again);

+ masm->LoadCurrentCharacter(max_lookahead, &cont, true);

+ masm->CheckBitInTable(boolean_skip_table, &cont);

+ masm->AdvanceCurrentPosition(boolean_skip_distance);

+ masm->GoTo(&again);

+ masm->Bind(&cont);

+ return true;

/* Code generation for choice nodes.

* We generate quick checks that do a mask and compare to eliminate a

@@ -3274,10 +3584,52 @@

int first_normal_choice = greedy_loop ? 1 : 0;

- int preload_characters =

- CalculatePreloadCharacters(compiler,

- current_trace->at_start() == Trace::FALSE);

- bool preload_is_current =

+ bool not_at_start = current_trace->at_start() == Trace::FALSE;

+ const int kEatsAtLeastNotYetInitialized = -1;

+ int eats_at_least = kEatsAtLeastNotYetInitialized;

+ bool skip_was_emitted = false;

+ // More makes code generation slower, less makes V8 benchmark score lower.

+ const int kMaxLookaheadForBoyerMoore = 8;

+ if (!greedy_loop && choice_count == 2) {

+ GuardedAlternative alt1 = alternatives_->at(1);

+ if (alt1.guards() == NULL ||

ulan 2012/03/30 16:54:51 This fits in one line.

Erik Corry 2012/04/01 00:48:46 Done.

+ alt1.guards()->length() == 0) {

+ RegExpNode* eats_anything_node = alt1.node();

+ if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==

+ this) {

+ // At this point we know that we are at a non-greedy loop that will eat

+ // any character one at a time. Any non-anchored regexp has such a

+ // loop prepended to it in order to find where it starts. We look for

+ // a pattern of the form ...abc... where we can look 6 characters ahead

ulan 2012/03/30 16:54:51 Can we put a similar comment in description of Emi

Erik Corry 2012/04/01 00:48:46 Done.

+ // and step forwards 3 if the character is not one of abc. Abc need

+ // not be atoms, they can be any reasonably limited character class or

+ // small alternation.

+ ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.

+ eats_at_least =

+ Min(kMaxLookaheadForBoyerMoore,

+ EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));

+ if (eats_at_least >= 1) {

+ BoyerMooreLookahead bm(eats_at_least,

+ RegExpMacroAssembler::kTableSize,

+ compiler);

+ GuardedAlternative alt0 = alternatives_->at(0);

+ alt0.node()->FillInBMInfo(0, &bm, not_at_start);

+ skip_was_emitted = bm.EmitSkipInstructions(macro_assembler);

+ }

+ if (eats_at_least == kEatsAtLeastNotYetInitialized) {

+ // Save some time by looking at most one machine word ahead.

+ eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);

+ }

+ int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);

+ bool preload_is_current = !skip_was_emitted &&

(current_trace->characters_preloaded() == preload_characters);

bool preload_has_checked_bounds = preload_is_current;

@@ -5432,6 +5784,77 @@

}

+void ChoiceNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ ZoneList<GuardedAlternative>* alts = alternatives();

+ for (int i = 0; i < alts->length(); i++) {

+ GuardedAlternative& alt = alts->at(i);

+ if (alt.guards() != NULL &&

ulan 2012/03/30 13:04:48 This fits in one line.

Erik Corry 2012/04/01 00:48:46 Done.

+ alt.guards()->length() != 0) {

+ bm->SetRest(offset); // Give up trying to fill in info.

+ return;

+ }

+ alt.node()->FillInBMInfo(offset, bm, not_at_start);

+ }

+void TextNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ if (offset >= bm->length()) return;

+ int max_char = bm->max_char();

+ for (int i = 0; i < elements()->length(); i++) {

+ if (offset >= bm->length()) return;

+ TextElement text = elements()->at(i);

+ if (text.type == TextElement::ATOM) {

+ RegExpAtom* atom = text.data.u_atom;

+ for (int j = 0; j < atom->length(); j++, offset++) {

+ if (offset >= bm->length()) return;

+ uc16 character = atom->data()[j];

+ if (bm->compiler()->ignore_case()) {

+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ int length = GetCaseIndependentLetters(

+ ISOLATE,

+ character,

+ bm->max_char() == String::kMaxAsciiCharCode,

+ chars);

+ for (int j = 0; j < length; j++) {

+ bm->Set(offset, chars[j]);

+ }

+ } else {

+ if (character <= max_char) bm->Set(offset, character);

+ }

+ } else {

+ ASSERT(text.type == TextElement::CHAR_CLASS);

+ RegExpCharacterClass* char_class = text.data.u_char_class;

+ ZoneList<CharacterRange>* ranges = char_class->ranges();

+ if (char_class->is_negated()) {

+ bm->SetAll(offset);

+ } else {

+ for (int k = 0; k < ranges->length(); k++) {

+ CharacterRange& range = ranges->at(k);

+ if (range.from() > max_char) continue;

+ int to = Min(max_char, static_cast<int>(range.to()));

+ if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) {

+ bm->SetAll(offset);

+ break;

+ }

+ for (int m = range.from(); m <= to; m++) {

ulan 2012/03/30 16:54:51 Why don't we handle ignore_case() here like in the

Erik Corry 2012/04/01 00:48:46 The character classes have had case-independence r

+ bm->Set(offset, m);

+ }

+ offset++;

+ }

+ if (offset >= bm->length()) return;

+ on_success()->FillInBMInfo(offset,

ulan 2012/03/30 13:04:48 This fits in line if you put the comment in anothe

Erik Corry 2012/04/01 00:48:46 But the comment only applies to the line it is on,

+ bm,

+ true); // Not at start after a text node.

int TextNode::ComputeFirstCharacterSet(int budget) {

budget--;

if (budget >= 0) {

@@ -5589,15 +6012,27 @@

}

-RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,

- bool ignore_case,

- bool is_multiline,

- Handle<String> pattern,

- bool is_ascii) {

+RegExpEngine::CompilationResult RegExpEngine::Compile(

+ RegExpCompileData* data,

+ bool ignore_case,

+ bool is_multiline,

+ Handle<String> pattern,

+ Handle<String> sample_subject,

+ bool is_ascii) {

if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {

return IrregexpRegExpTooBig();

}

RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);

+ FlattenString(sample_subject);

+ int chars_sampled = 0;

+ int half_way = sample_subject->length() / 2;

ulan 2012/03/30 13:04:48 Why do we consider only the upper half of the samp

Erik Corry 2012/04/01 00:48:46 We consider a 100-character chunk in the middle of

+ for (int i = half_way; i < sample_subject->length(); i++) {

+ if (chars_sampled++ > 100) break;

ulan 2012/03/30 13:04:48 A nit: consider moving chars_sampled++ either to t

Erik Corry 2012/04/01 00:48:46 Done.

+ compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));

+ }

+ compiler.frequency_collator()->SortFrequencies();

// Wrap the body of the regexp in capture #0.

RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,

« src/jsregexp.h ('K') | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »