Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(548)

Unified Diff: src/jsregexp.cc

Issue 9965010: Regexp: Improve the speed that we scan for an initial point where a non-anchored (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: '' Created 8 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 11190)
+++ src/jsregexp.cc (working copy)
@@ -280,7 +280,8 @@
// from the source pattern.
// If compilation fails, an exception is thrown and this function
// returns false.
-bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
+bool RegExpImpl::EnsureCompiledIrregexp(
+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
#ifdef V8_INTERPRETED_REGEXP
if (compiled_code->IsByteArray()) return true;
@@ -296,7 +297,7 @@
ASSERT(compiled_code->IsSmi());
return true;
}
- return CompileIrregexp(re, is_ascii);
+ return CompileIrregexp(re, sample_subject, is_ascii);
}
@@ -316,7 +317,9 @@
}
-bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
+bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
+ Handle<String> sample_subject,
+ bool is_ascii) {
// Compile the RegExp.
Isolate* isolate = re->GetIsolate();
ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
@@ -365,6 +368,7 @@
flags.is_ignore_case(),
flags.is_multiline(),
pattern,
+ sample_subject,
is_ascii);
if (result.error_message != NULL) {
// Unable to compile regexp.
@@ -435,7 +439,7 @@
// Check the asciiness of the underlying storage.
bool is_ascii = subject->IsAsciiRepresentationUnderneath();
- if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;
+ if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
#ifdef V8_INTERPRETED_REGEXP
// Byte-code regexp needs space allocated for all its registers.
@@ -466,7 +470,7 @@
#ifndef V8_INTERPRETED_REGEXP
ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
do {
- EnsureCompiledIrregexp(regexp, is_ascii);
+ EnsureCompiledIrregexp(regexp, subject, is_ascii);
Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
NativeRegExpMacroAssembler::Result res =
NativeRegExpMacroAssembler::Match(code,
@@ -784,6 +788,65 @@
}
+class FrequencyCollator {
+ public:
+ FrequencyCollator() : total_frequencies_(0) {
+ for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
+ frequencies_[i] = CharacterFrequency(i);
+ }
+ }
+
+ void CountCharacter(int character) {
+ int index = (character & RegExpMacroAssembler::kTableMask);
+ frequencies_[index].Increment();
+ total_frequencies_++;
+ }
+
+ void SortFrequencies() {
+ typedef int (*RawComparer)(const void*, const void*);
+ memcpy(
+ &sorted_frequencies_[0], &frequencies_[0], sizeof(sorted_frequencies_));
ulan 2012/03/30 13:04:48 A nit: for consistency I would prefer breaking mem
Erik Corry 2012/04/01 00:48:46 Done.
+ qsort(&sorted_frequencies_[0],
+ RegExpMacroAssembler::kTableSize,
+ sizeof(sorted_frequencies_[0]),
+ reinterpret_cast<RawComparer>(&CharacterFrequency::Compare));
+ }
+
+ bool IsFrequent(int in_character) {
+ if (total_frequencies_ < 10) return false; // Too little statistical basis.
+ int freq = frequencies_[in_character].counter();
ulan 2012/03/30 13:04:48 Either mask the in_character or ASSERT(in_characte
Erik Corry 2012/04/01 00:48:46 Done.
+ bool popular = ((freq * 100) / total_frequencies_ > 20);
+ return popular;
ulan 2012/03/30 13:04:48 Named magic numbers become less magic :) int freq
Erik Corry 2012/04/01 00:48:46 Done.
+ }
+
+ private:
+ class CharacterFrequency {
+ public:
+ CharacterFrequency() : counter_(0), character_(-1) { }
+ explicit CharacterFrequency(int character) : counter_(0), character_(character) { }
ulan 2012/03/30 13:04:48 Long line.
Erik Corry 2012/04/01 00:48:46 Done.
+
+ static int Compare(const CharacterFrequency* a,
+ const CharacterFrequency* b) {
+ return b->counter_ - a->counter_;
+ }
+
+ void Increment() { counter_++; }
+ int counter() { return counter_; }
+ int character() { return character_; }
+
+ private:
+ int counter_;
+ int character_;
+ };
+
+
+ private:
+ CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
+ CharacterFrequency sorted_frequencies_[RegExpMacroAssembler::kTableSize];
+ int total_frequencies_;
+};
+
+
class RegExpCompiler {
public:
RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
@@ -819,6 +882,7 @@
inline bool ignore_case() { return ignore_case_; }
inline bool ascii() { return ascii_; }
+ FrequencyCollator* frequency_collator() { return &frequency_collator_; }
int current_expansion_factor() { return current_expansion_factor_; }
void set_current_expansion_factor(int value) {
@@ -837,6 +901,7 @@
bool ascii_;
bool reg_exp_too_big_;
int current_expansion_factor_;
+ FrequencyCollator frequency_collator_;
};
@@ -865,7 +930,8 @@
ignore_case_(ignore_case),
ascii_(ascii),
reg_exp_too_big_(false),
- current_expansion_factor_(1) {
+ current_expansion_factor_(1),
+ frequency_collator_() {
accept_ = new EndNode(EndNode::ACCEPT);
ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
}
@@ -2056,6 +2122,16 @@
}
+void ActionNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
ulan 2012/03/30 13:04:48 For consistency with the surrounding code consider
Erik Corry 2012/04/01 00:48:46 Done.
+ if (type_ == BEGIN_SUBMATCH) {
+ bm->SetRest(offset);
+ } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {
+ on_success()->FillInBMInfo(offset, bm, not_at_start);
+ }
+}
+
+
int AssertionNode::EatsAtLeast(int still_to_find,
int recursion_depth,
bool not_at_start) {
@@ -2072,6 +2148,14 @@
}
+void AssertionNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ // Match the behaviour of EatsAtLeast on this node.
+ if (type() == AT_START && not_at_start) return;
+ on_success()->FillInBMInfo(offset, bm, not_at_start);
+}
+
+
int BackReferenceNode::EatsAtLeast(int still_to_find,
int recursion_depth,
bool not_at_start) {
@@ -2504,6 +2588,17 @@
}
+void LoopChoiceNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool nas) {
+ if (body_can_be_zero_length_) { // || info()->visited) {
ulan 2012/03/30 13:04:48 Forgotten comment.
Erik Corry 2012/04/01 00:48:46 Done.
+ bm->SetRest(offset);
+ return;
+ }
+ // VisitMarker marker(info());
ulan 2012/03/30 13:04:48 Comment.
Erik Corry 2012/04/01 00:48:46 Done.
+ ChoiceNode::FillInBMInfo(offset, bm, nas);
+}
+
+
void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
RegExpCompiler* compiler,
int characters_filled_in,
@@ -3000,6 +3095,30 @@
}
+RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
+ RegExpCompiler* compiler) {
+ if (elms_->length() != 1) return NULL;
+ TextElement elm = elms_->at(0);
+ if (elm.type != TextElement::CHAR_CLASS) return NULL;
+ RegExpCharacterClass* node = elm.data.u_char_class;
+ ZoneList<CharacterRange>* ranges = node->ranges();
+ if (!CharacterRange::IsCanonical(ranges)) {
+ CharacterRange::Canonicalize(ranges);
+ }
+ if (node->is_negated() && ranges->length() == 0) return on_success();
+ if (ranges->length() != 1) return NULL;
+ if (ranges->at(0).from() != 0) return NULL;
+ int max_char;
+ if (compiler->ascii()) {
ulan 2012/03/30 13:04:48 A nit, I like max_char = compiler->ascii() ? Stri
Erik Corry 2012/04/01 00:48:46 Yes, but a bug in gcc means you get a linker error
+ max_char = String::kMaxAsciiCharCode;
+ } else {
+ max_char = String::kMaxUtf16CodeUnit;
+ }
+ if (ranges->at(0).to() < max_char) return NULL;
ulan 2012/03/30 13:04:48 Consider using ranges(0)->IsEverything()
Erik Corry 2012/04/01 00:48:46 Done.
+ return on_success();
+}
+
+
// Finds the fixed match length of a sequence of nodes that goes from
// this alternative and back to this choice node. If there are variable
// length nodes or other complications in the way then return a sentinel
@@ -3064,8 +3183,8 @@
int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
- bool not_at_start) {
- int preload_characters = EatsAtLeast(4, 0, not_at_start);
+ int eats_at_least) {
+ int preload_characters = eats_at_least > 4 ? 4 : eats_at_least;
ulan 2012/03/30 13:04:48 Min(4, eats_at_least);
Erik Corry 2012/04/01 00:48:46 Done.
if (compiler->macro_assembler()->CanReadUnaligned()) {
bool ascii = compiler->ascii();
if (ascii) {
@@ -3131,6 +3250,197 @@
};
+BoyerMooreLookahead::BoyerMooreLookahead(
+ int length, int map_length, RegExpCompiler* compiler)
+ : length_(length),
+ map_length_(map_length),
ulan 2012/03/30 13:04:48 the Set() method assumes that map_length is a powe
Erik Corry 2012/04/01 00:48:46 Done.
+ compiler_(compiler) {
+ if (compiler->ascii()) {
+ max_char_ = String::kMaxAsciiCharCode;
+ } else {
+ max_char_ = String::kMaxUtf16CodeUnit;
+ }
+ bitmaps_ = new ZoneList<ZoneList<bool>*>(length);
ulan 2012/03/30 13:04:48 I wonder if bit-packing (ZoneList<bool>) would imp
Erik Corry 2012/04/01 00:48:46 We normally only do this once per regexp, so I don
+ for (int i = 0; i < length; i++) {
+ bitmaps_->Add(new ZoneList<bool>(map_length));
+ ZoneList<bool>* map = bitmaps_->at(i);
+ for (int i = 0; i < map_length; i++) {
+ map->Add(false);
+ }
+ }
+}
+
+
+bool BoyerMooreLookahead::TooVague(int offset, int limit) {
+ if (bitmaps_->at(offset) == NULL) return true;
+ if (Count(offset) > limit) return true;
+ return false;
+}
+
+
+bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
+ // The characters that are very frequent in the sample subject string should
+ // not be used for Boyer-Moore operations, so lets mark those positions as
+ // being too vague.
+ for (int i = 0; i < length_; i++) {
+ if (Count(i) >= kTooManyCharacters) {
+ SetAll(i);
+ } else {
+ ZoneList<bool>* map = bitmaps_->at(i);
+ for (int ch = 0; ch < map_length_; ch++) {
+ if (map->at(ch) && compiler_->frequency_collator()->IsFrequent(ch)) {
+ SetAll(i);
+ break;
+ }
+ }
+ }
+ }
+ const int kNoPoints = -1;
+ int biggest_points = kNoPoints;
+ int biggest_interval_max = 0;
+ int biggest_interval = 0;
+ int vagueness_factor = 10;
ulan 2012/03/30 13:04:48 Could you please write a comment describing the ma
Erik Corry 2012/04/01 00:48:46 I refactored this function because it reflected fa
+ for (int initial_max_vagueness = 4;
+ initial_max_vagueness <= kTooManyCharacters;
+ initial_max_vagueness *= 2, vagueness_factor--) {
+ int max = length_ - 1;
ulan 2012/03/30 13:04:48 This 'max' can just be called 'i'. I think there a
Erik Corry 2012/04/01 00:48:46 Done.
+ while (true) {
+ while (max >= 0 && TooVague(max, initial_max_vagueness)) max--;
+ if (max < 0) break;
+ int remembered_max = max;
+ int max_vagueness = initial_max_vagueness;
+ while (max >= 0 && !TooVague(max, max_vagueness)) {
+ max--;
+ if (max_vagueness < kTooManyCharacters) max_vagueness *= 2;
ulan 2012/03/30 13:04:48 What is the rationale behind this? Nit: max_vague
Erik Corry 2012/04/01 00:48:46 All refactored away.
+ }
+ int points =
+ ((remembered_max - max) * kTooManyCharacters) * vagueness_factor;
+ if (points > biggest_points) {
+ biggest_interval_max = remembered_max;
+ biggest_interval = remembered_max - max;
ulan 2012/03/30 13:04:48 Consider maintaining (left, right) instead of (big
Erik Corry 2012/04/01 00:48:46 Done.
+ biggest_points = points;
+ }
+ if (max < 0) break;
+ }
+ }
+ if (biggest_points == kNoPoints) return false;
+ if (biggest_interval < 1) return false;
+ *to = biggest_interval_max;
+ *from = 1 + biggest_interval_max - biggest_interval;
+ return true;
+}
+
+
+static int SortChar(const char* a, const char* b) {
+ return static_cast<int>(*a) - *b;
+}
+
+
+int BoyerMooreLookahead::GetSkipTable(int max_lookahead,
+ Handle<ByteArray> boolean_skip_table) {
+ const int kSize = RegExpMacroAssembler::kTableSize;
+ char table[kSize];
+
+ for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
+ table[i] = max_lookahead + 1;
+ }
+
+ for (int i = max_lookahead; i >= 0; i--) {
+ int skip = max_lookahead - i;
ulan 2012/03/30 16:54:51 This is the crucial part. A comment explaining why
Erik Corry 2012/04/01 00:48:46 Done.
+ ZoneList<bool>* map = bitmaps_->at(i);
+ if (map == NULL) {
+ for (int j = 0; j < map_length_; j++) {
+ if (table[j] > skip) table[j] = skip;
+ }
+ break;
+ }
+ for (int j = 0; j < map_length_; j++) {
+ if (map->at(j)) {
+ if (table[j] > skip) table[j] = skip;
+ }
+ }
+ }
+
+ char sorted_table[kSize];
+ memcpy(sorted_table, table, kSize);
+ typedef int (*RawComparer)(const void*, const void*);
+ qsort(sorted_table, kSize, 1, reinterpret_cast<RawComparer>(&SortChar));
+
+ int boolean_skip_distance = sorted_table[kSize / 2];
ulan 2012/03/30 16:54:51 Instead of kSize / 2 we can take any i in [0..kSiz
Erik Corry 2012/04/01 00:48:46 Done.
+ for (int i = 0; i < kSize; i++) {
+ boolean_skip_table->set(i, table[i] >= boolean_skip_distance ? 0 : 1);
ulan 2012/03/30 13:04:48 Can (x >= y ? 0 : 1) be replaced with (x < y) ?
Erik Corry 2012/04/01 00:48:46 We don't normally allow implicit bool->int and int
+ }
+ return boolean_skip_distance;
+}
+
+
+bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
+ int min_lookahead = 0;
+ int max_lookahead = 0;
+
+ if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
+
+ bool found_single_character = false;
ulan 2012/03/30 16:54:51 Consider using a counter: character_count = 0; .
Erik Corry 2012/04/01 00:48:46 I want to bail out when I find the second characte
+ bool abandoned_search_for_single_character = false;
+ int single_character = 0;
+ for (int i = max_lookahead; i >= min_lookahead; i--) {
+ ZoneList<bool>* map = bitmaps_->at(i);
+ for (int j = 0; j < map_length_; j++) {
+ if (map->at(j)) {
+ if (found_single_character) {
+ found_single_character = false; // Found two.
+ abandoned_search_for_single_character = true;
+ break;
+ } else {
+ found_single_character = true;
+ single_character = j;
+ }
+ }
+ }
+ if (abandoned_search_for_single_character) break;
+ }
+
+ int lookahead_width = max_lookahead + 1 - min_lookahead;
+
+ if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
+ // The mask-compare can probably handle this better.
+ return false;
+ }
+
+ if (found_single_character) {
+ Label cont, again;
+ masm->Bind(&again);
+ masm->LoadCurrentCharacter(max_lookahead, &cont, true);
+ if (max_char_ > String::kMaxAsciiCharCode ||
+ map_length_ < RegExpMacroAssembler::kTableSize) {
+ masm->CheckCharacterAfterAnd(single_character,
+ RegExpMacroAssembler::kTableMask,
+ &cont);
+ } else {
+ masm->CheckCharacter(single_character, &cont);
ulan 2012/03/30 16:54:51 I think this implicitly assumes that String::kMaxA
Erik Corry 2012/04/01 00:48:46 Fixed.
+ }
+ masm->AdvanceCurrentPosition(lookahead_width);
+ masm->GoTo(&again);
+ masm->Bind(&cont);
+ return true;
+ }
+
+ Handle<ByteArray> boolean_skip_table =
+ FACTORY->NewByteArray(map_length_, TENURED);
+ int boolean_skip_distance = GetSkipTable(max_lookahead, boolean_skip_table);
+ if (boolean_skip_distance == 0) return false;
+
+ Label cont, again;
+ masm->Bind(&again);
+ masm->LoadCurrentCharacter(max_lookahead, &cont, true);
+ masm->CheckBitInTable(boolean_skip_table, &cont);
+ masm->AdvanceCurrentPosition(boolean_skip_distance);
+ masm->GoTo(&again);
+ masm->Bind(&cont);
+
+ return true;
+}
+
/* Code generation for choice nodes.
*
* We generate quick checks that do a mask and compare to eliminate a
@@ -3274,10 +3584,52 @@
int first_normal_choice = greedy_loop ? 1 : 0;
- int preload_characters =
- CalculatePreloadCharacters(compiler,
- current_trace->at_start() == Trace::FALSE);
- bool preload_is_current =
+ bool not_at_start = current_trace->at_start() == Trace::FALSE;
+ const int kEatsAtLeastNotYetInitialized = -1;
+ int eats_at_least = kEatsAtLeastNotYetInitialized;
+
+ bool skip_was_emitted = false;
+
+ // More makes code generation slower, less makes V8 benchmark score lower.
+ const int kMaxLookaheadForBoyerMoore = 8;
+
+ if (!greedy_loop && choice_count == 2) {
+ GuardedAlternative alt1 = alternatives_->at(1);
+ if (alt1.guards() == NULL ||
ulan 2012/03/30 16:54:51 This fits in one line.
Erik Corry 2012/04/01 00:48:46 Done.
Erik Corry 2012/04/01 00:48:46 Done.
+ alt1.guards()->length() == 0) {
+ RegExpNode* eats_anything_node = alt1.node();
+ if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
+ this) {
+ // At this point we know that we are at a non-greedy loop that will eat
+ // any character one at a time. Any non-anchored regexp has such a
+ // loop prepended to it in order to find where it starts. We look for
+ // a pattern of the form ...abc... where we can look 6 characters ahead
ulan 2012/03/30 16:54:51 Can we put a similar comment in description of Emi
Erik Corry 2012/04/01 00:48:46 Done.
+ // and step forwards 3 if the character is not one of abc. Abc need
+ // not be atoms, they can be any reasonably limited character class or
+ // small alternation.
+ ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
+ eats_at_least =
+ Min(kMaxLookaheadForBoyerMoore,
+ EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
+ if (eats_at_least >= 1) {
+ BoyerMooreLookahead bm(eats_at_least,
+ RegExpMacroAssembler::kTableSize,
+ compiler);
+ GuardedAlternative alt0 = alternatives_->at(0);
+ alt0.node()->FillInBMInfo(0, &bm, not_at_start);
+ skip_was_emitted = bm.EmitSkipInstructions(macro_assembler);
+ }
+ }
+ }
+ }
+
+ if (eats_at_least == kEatsAtLeastNotYetInitialized) {
+ // Save some time by looking at most one machine word ahead.
+ eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);
+ }
+ int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
+
+ bool preload_is_current = !skip_was_emitted &&
(current_trace->characters_preloaded() == preload_characters);
bool preload_has_checked_bounds = preload_is_current;
@@ -5432,6 +5784,77 @@
}
+void ChoiceNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ ZoneList<GuardedAlternative>* alts = alternatives();
+ for (int i = 0; i < alts->length(); i++) {
+ GuardedAlternative& alt = alts->at(i);
+ if (alt.guards() != NULL &&
ulan 2012/03/30 13:04:48 This fits in one line.
Erik Corry 2012/04/01 00:48:46 Done.
+ alt.guards()->length() != 0) {
+ bm->SetRest(offset); // Give up trying to fill in info.
+ return;
+ }
+ alt.node()->FillInBMInfo(offset, bm, not_at_start);
+ }
+}
+
+
+void TextNode::FillInBMInfo(
+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {
+ if (offset >= bm->length()) return;
+ int max_char = bm->max_char();
+ for (int i = 0; i < elements()->length(); i++) {
+ if (offset >= bm->length()) return;
+ TextElement text = elements()->at(i);
+ if (text.type == TextElement::ATOM) {
+ RegExpAtom* atom = text.data.u_atom;
+ for (int j = 0; j < atom->length(); j++, offset++) {
+ if (offset >= bm->length()) return;
+ uc16 character = atom->data()[j];
+ if (bm->compiler()->ignore_case()) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ int length = GetCaseIndependentLetters(
+ ISOLATE,
+ character,
+ bm->max_char() == String::kMaxAsciiCharCode,
+ chars);
+ for (int j = 0; j < length; j++) {
+ bm->Set(offset, chars[j]);
+ }
+ } else {
+ if (character <= max_char) bm->Set(offset, character);
+ }
+ }
+ } else {
+ ASSERT(text.type == TextElement::CHAR_CLASS);
+ RegExpCharacterClass* char_class = text.data.u_char_class;
+ ZoneList<CharacterRange>* ranges = char_class->ranges();
+ if (char_class->is_negated()) {
+ bm->SetAll(offset);
+ } else {
+ for (int k = 0; k < ranges->length(); k++) {
+ CharacterRange& range = ranges->at(k);
+ if (range.from() > max_char) continue;
+ int to = Min(max_char, static_cast<int>(range.to()));
+ if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) {
+ bm->SetAll(offset);
+ break;
+ }
+ for (int m = range.from(); m <= to; m++) {
ulan 2012/03/30 16:54:51 Why don't we handle ignore_case() here like in the
Erik Corry 2012/04/01 00:48:46 The character classes have had case-independence r
+ bm->Set(offset, m);
+ }
+ }
+ }
+ offset++;
+ }
+ }
+ if (offset >= bm->length()) return;
+ on_success()->FillInBMInfo(offset,
ulan 2012/03/30 13:04:48 This fits in line if you put the comment in anothe
Erik Corry 2012/04/01 00:48:46 But the comment only applies to the line it is on,
+ bm,
+ true); // Not at start after a text node.
+}
+
+
int TextNode::ComputeFirstCharacterSet(int budget) {
budget--;
if (budget >= 0) {
@@ -5589,15 +6012,27 @@
}
-RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
- bool ignore_case,
- bool is_multiline,
- Handle<String> pattern,
- bool is_ascii) {
+RegExpEngine::CompilationResult RegExpEngine::Compile(
+ RegExpCompileData* data,
+ bool ignore_case,
+ bool is_multiline,
+ Handle<String> pattern,
+ Handle<String> sample_subject,
+ bool is_ascii) {
if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
return IrregexpRegExpTooBig();
}
RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
+
+ FlattenString(sample_subject);
+ int chars_sampled = 0;
+ int half_way = sample_subject->length() / 2;
ulan 2012/03/30 13:04:48 Why do we consider only the upper half of the samp
Erik Corry 2012/04/01 00:48:46 We consider a 100-character chunk in the middle of
+ for (int i = half_way; i < sample_subject->length(); i++) {
+ if (chars_sampled++ > 100) break;
ulan 2012/03/30 13:04:48 A nit: consider moving chars_sampled++ either to t
Erik Corry 2012/04/01 00:48:46 Done.
+ compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
+ }
+ compiler.frequency_collator()->SortFrequencies();
+
// Wrap the body of the regexp in capture #0.
RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
0,
« src/jsregexp.h ('K') | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698