src/jsregexp.cc - Issue 9965010: Regexp: Improve the speed that we scan for an initial point where a non-anchored

Unified Diff: src/jsregexp.cc

Issue 9965010: Regexp: Improve the speed that we scan for an initial point where a non-anchored (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: '' Created 8 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 11190)

+++ src/jsregexp.cc (working copy)

@@ -280,7 +280,8 @@

// from the source pattern.

// If compilation fails, an exception is thrown and this function

// returns false.

-bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {

+bool RegExpImpl::EnsureCompiledIrregexp(

+ Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {

Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));

#ifdef V8_INTERPRETED_REGEXP

if (compiled_code->IsByteArray()) return true;

@@ -296,7 +297,7 @@

ASSERT(compiled_code->IsSmi());

return true;

}

- return CompileIrregexp(re, is_ascii);

+ return CompileIrregexp(re, sample_subject, is_ascii);

}

@@ -316,7 +317,9 @@

}

-bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {

+bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,

+ Handle<String> sample_subject,

+ bool is_ascii) {

// Compile the RegExp.

Isolate* isolate = re->GetIsolate();

ZoneScope zone_scope(isolate, DELETE_ON_EXIT);

@@ -365,6 +368,7 @@

flags.is_ignore_case(),

flags.is_multiline(),

pattern,

+ sample_subject,

is_ascii);

if (result.error_message != NULL) {

// Unable to compile regexp.

@@ -435,7 +439,7 @@

// Check the asciiness of the underlying storage.

bool is_ascii = subject->IsAsciiRepresentationUnderneath();

- if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1;

+ if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;

#ifdef V8_INTERPRETED_REGEXP

// Byte-code regexp needs space allocated for all its registers.

@@ -466,7 +470,7 @@

#ifndef V8_INTERPRETED_REGEXP

ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);

do {

- EnsureCompiledIrregexp(regexp, is_ascii);

+ EnsureCompiledIrregexp(regexp, subject, is_ascii);

Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);

NativeRegExpMacroAssembler::Result res =

NativeRegExpMacroAssembler::Match(code,

@@ -784,6 +788,53 @@

}

+class FrequencyCollator {

+ public:

+ FrequencyCollator() : total_samples_(0) {

+ for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {

+ frequencies_[i] = CharacterFrequency(i);

+ }

+ void CountCharacter(int character) {

+ int index = (character & RegExpMacroAssembler::kTableMask);

+ frequencies_[index].Increment();

+ total_samples_++;

+ }

+ // Does not measure in percent, but rather per-128 (the table size from the

+ // regexp macro assembler).

+ int Frequency(int in_character) {

+ ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);

+ if (total_samples_ < 1) return 1; // Division by zero.

+ int freq_in_per128 =

+ (frequencies_[in_character].counter() * 128) / total_samples_;

+ return freq_in_per128;

+ }

+ private:

+ class CharacterFrequency {

+ public:

+ CharacterFrequency() : counter_(0), character_(-1) { }

+ explicit CharacterFrequency(int character)

+ : counter_(0), character_(character) { }

+ void Increment() { counter_++; }

+ int counter() { return counter_; }

+ int character() { return character_; }

+ private:

+ int counter_;

+ int character_;

+ };

+ private:

+ CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];

+ int total_samples_;

+};

class RegExpCompiler {

public:

RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);

@@ -819,6 +870,7 @@

inline bool ignore_case() { return ignore_case_; }

inline bool ascii() { return ascii_; }

+ FrequencyCollator* frequency_collator() { return &frequency_collator_; }

int current_expansion_factor() { return current_expansion_factor_; }

void set_current_expansion_factor(int value) {

@@ -837,6 +889,7 @@

bool ascii_;

bool reg_exp_too_big_;

int current_expansion_factor_;

+ FrequencyCollator frequency_collator_;

};

@@ -865,7 +918,8 @@

ignore_case_(ignore_case),

ascii_(ascii),

reg_exp_too_big_(false),

- current_expansion_factor_(1) {

+ current_expansion_factor_(1),

+ frequency_collator_() {

accept_ = new EndNode(EndNode::ACCEPT);

ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);

}

@@ -2056,6 +2110,17 @@

}

+void ActionNode::FillInBMInfo(int offset,

+ BoyerMooreLookahead* bm,

+ bool not_at_start) {

+ if (type_ == BEGIN_SUBMATCH) {

+ bm->SetRest(offset);

+ } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {

+ on_success()->FillInBMInfo(offset, bm, not_at_start);

+ }

int AssertionNode::EatsAtLeast(int still_to_find,

int recursion_depth,

bool not_at_start) {

@@ -2072,6 +2137,14 @@

}

+void AssertionNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ // Match the behaviour of EatsAtLeast on this node.

+ if (type() == AT_START && not_at_start) return;

+ on_success()->FillInBMInfo(offset, bm, not_at_start);

int BackReferenceNode::EatsAtLeast(int still_to_find,

int recursion_depth,

bool not_at_start) {

@@ -2504,6 +2577,16 @@

}

+void LoopChoiceNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool nas) {

+ if (body_can_be_zero_length_) {

+ bm->SetRest(offset);

+ return;

+ }

+ ChoiceNode::FillInBMInfo(offset, bm, nas);

void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

RegExpCompiler* compiler,

int characters_filled_in,

@@ -3000,6 +3083,30 @@

}

+RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(

+ RegExpCompiler* compiler) {

+ if (elms_->length() != 1) return NULL;

+ TextElement elm = elms_->at(0);

+ if (elm.type != TextElement::CHAR_CLASS) return NULL;

+ RegExpCharacterClass* node = elm.data.u_char_class;

+ ZoneList<CharacterRange>* ranges = node->ranges();

+ if (!CharacterRange::IsCanonical(ranges)) {

+ CharacterRange::Canonicalize(ranges);

+ }

+ if (node->is_negated()) {

+ return ranges->length() == 0 ? on_success() : NULL;

+ }

+ if (ranges->length() != 1) return NULL;

+ uint32_t max_char;

+ if (compiler->ascii()) {

+ max_char = String::kMaxAsciiCharCode;

+ } else {

+ max_char = String::kMaxUtf16CodeUnit;

+ }

+ return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;

// Finds the fixed match length of a sequence of nodes that goes from

// this alternative and back to this choice node. If there are variable

// length nodes or other complications in the way then return a sentinel

@@ -3064,8 +3171,8 @@

int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,

- bool not_at_start) {

- int preload_characters = EatsAtLeast(4, 0, not_at_start);

+ int eats_at_least) {

+ int preload_characters = Min(4, eats_at_least);

if (compiler->macro_assembler()->CanReadUnaligned()) {

bool ascii = compiler->ascii();

if (ascii) {

@@ -3131,6 +3238,193 @@

};

+BoyerMooreLookahead::BoyerMooreLookahead(

+ int length, int map_length, RegExpCompiler* compiler)

+ : length_(length),

+ map_length_(map_length),

+ compiler_(compiler) {

+ ASSERT(IsPowerOf2(map_length));

+ if (compiler->ascii()) {

+ max_char_ = String::kMaxAsciiCharCode;

+ } else {

+ max_char_ = String::kMaxUtf16CodeUnit;

+ }

+ bitmaps_ = new ZoneList<ZoneList<bool>*>(length);

+ for (int i = 0; i < length; i++) {

+ bitmaps_->Add(new ZoneList<bool>(map_length));

+ ZoneList<bool>* map = bitmaps_->at(i);

+ for (int i = 0; i < map_length; i++) {

+ map->Add(false);

+ }

+// Find the longest range of lookahead that has the fewest number of different

+// characters that can occur at a given position. Since we are optimizing two

+// different parameters at once this is a tradeoff.

+bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {

+ int biggest_points = 0;

+ for (int max_number_of_chars = 4;

+ max_number_of_chars < kTooManyCharacters;

+ max_number_of_chars *= 2) {

+ biggest_points =

+ FindBestInterval(max_number_of_chars, biggest_points, from, to);

+ }

+ if (biggest_points == 0) return false;

+ return true;

+// Find the highest-points range between 0 and length_ where the character

+// information is not too vague. 'Too vague' means that there are more than

+// max_number_of_chars that can occur at this position. Calculates the number

+// of points as the product of width-of-the-range and

+// probability-of-finding-one-of-the-characters, where the probability is

+// calculated using the frequency distribution of the sample subject string.

+int BoyerMooreLookahead::FindBestInterval(

+ int max_number_of_chars, int old_biggest_points, int* from, int* to) {

+ int biggest_points = old_biggest_points;

+ static const int kSize = RegExpMacroAssembler::kTableSize;

+ for (int i = 0; i < length_; ) {

+ while (i < length_ && Count(i) > max_number_of_chars) i++;

+ if (i == length_) break;

+ int remembered_from = i;

+ bool union_map[kSize];

+ for (int j = 0; j < kSize; j++) union_map[j] = false;

+ while (i < length_ && Count(i) <= max_number_of_chars) {

+ ZoneList<bool>* map = bitmaps_->at(i);

+ for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);

+ i++;

+ }

+ int frequency = 0;

+ for (int j = 0; j < kSize; j++) {

+ if (union_map[j]) {

+ // Add 1 to the frequency to give a small per-character boost for

+ // the cases where our sampling is not good enough and many

+ // characters have a frequency of zero.

+ frequency += compiler_->frequency_collator()->Frequency(j) + 1;

+ }

+ // We use the probability of skipping times the distance we are skipping to

+ // judge the effectiveness of this. Actually we have a cut-off: By

+ // dividing by 2 we switch off the skipping if the probability of skipping

+ // is less than 50%. This is because the multibyte mask-and-compare

+ // skipping in quickcheck is more likely to do well on this case.

+ bool in_quickcheck_range = ((i - remembered_from < 4) ||

+ (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));

+ int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;

ulan 2012/04/02 08:12:24 This shouldn't affect the correctness but: if I r

Erik Corry 2012/04/02 09:36:22 This code is as intended. I added a few comments.

+ int points = (i - remembered_from) * probability;

+ if (points > biggest_points) {

+ *from = remembered_from;

+ *to = i - 1;

+ biggest_points = points;

+ }

+ return biggest_points;

+// Take all the characters that will not prevent a successful match if they

+// occur occur in the subject string in the range between min_lookahead and

ulan 2012/04/02 08:12:24 'occur' twice.

Erik Corry 2012/04/02 09:36:22 Done.

+// max_lookahead (inclusive) measured from the current position. If the

+// character at max_lookahead offset is not one of these characters, then we

+// can safely skip forwards by the number of characters in the range.

+int BoyerMooreLookahead::GetSkipTable(int min_lookahead,

+ int max_lookahead,

+ Handle<ByteArray> boolean_skip_table) {

+ const int kSize = RegExpMacroAssembler::kTableSize;

+ const int kSkipArrayEntry = 0;

+ const int kDontSkipArrayEntry = 1;

+ for (int i = 0; i < kSize; i++) {

+ boolean_skip_table->set(i, kSkipArrayEntry);

+ }

+ int skip = max_lookahead + 1 - min_lookahead;

+ for (int i = max_lookahead; i >= min_lookahead; i--) {

+ ZoneList<bool>* map = bitmaps_->at(i);

+ for (int j = 0; j < map_length_; j++) {

+ if (map->at(j)) {

+ boolean_skip_table->set(j, kDontSkipArrayEntry);

+ }

+ return skip;

+// See comment above on the implementation of GetSkipTable.

+bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {

+ int min_lookahead = 0;

+ int max_lookahead = 0;

+ if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;

+ bool found_single_character = false;

+ bool abandoned_search_for_single_character = false;

+ int single_character = 0;

+ for (int i = max_lookahead; i >= min_lookahead; i--) {

+ ZoneList<bool>* map = bitmaps_->at(i);

+ for (int j = 0; j < map_length_; j++) {

+ if (map->at(j)) {

+ if (found_single_character) {

+ found_single_character = false; // Found two.

+ abandoned_search_for_single_character = true;

+ break;

+ } else {

+ found_single_character = true;

+ single_character = j;

+ }

+ if (abandoned_search_for_single_character) break;

+ }

+ int lookahead_width = max_lookahead + 1 - min_lookahead;

+ if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {

+ // The mask-compare can probably handle this better.

+ return false;

+ }

+ if (found_single_character) {

+ Label cont, again;

+ masm->Bind(&again);

+ masm->LoadCurrentCharacter(max_lookahead, &cont, true);

+ if (max_char_ > map_length_) {

+ ASSERT(map_length_ == RegExpMacroAssembler::kTableSize);

+ masm->CheckCharacterAfterAnd(single_character,

+ RegExpMacroAssembler::kTableMask,

+ &cont);

+ } else {

+ masm->CheckCharacter(single_character, &cont);

+ }

+ masm->AdvanceCurrentPosition(lookahead_width);

+ masm->GoTo(&again);

+ masm->Bind(&cont);

+ return true;

+ }

+ Handle<ByteArray> boolean_skip_table =

+ FACTORY->NewByteArray(map_length_, TENURED);

+ int skip_distance = GetSkipTable(

+ min_lookahead, max_lookahead, boolean_skip_table);

+ ASSERT(skip_distance != 0);

+ Label cont, again;

+ masm->Bind(&again);

+ masm->LoadCurrentCharacter(max_lookahead, &cont, true);

+ masm->CheckBitInTable(boolean_skip_table, &cont);

+ masm->AdvanceCurrentPosition(skip_distance);

+ masm->GoTo(&again);

+ masm->Bind(&cont);

+ return true;

/* Code generation for choice nodes.

* We generate quick checks that do a mask and compare to eliminate a

@@ -3274,10 +3568,51 @@

int first_normal_choice = greedy_loop ? 1 : 0;

- int preload_characters =

- CalculatePreloadCharacters(compiler,

- current_trace->at_start() == Trace::FALSE);

- bool preload_is_current =

+ bool not_at_start = current_trace->at_start() == Trace::FALSE;

+ const int kEatsAtLeastNotYetInitialized = -1;

+ int eats_at_least = kEatsAtLeastNotYetInitialized;

+ bool skip_was_emitted = false;

+ // More makes code generation slower, less makes V8 benchmark score lower.

+ const int kMaxLookaheadForBoyerMoore = 8;

+ if (!greedy_loop && choice_count == 2) {

+ GuardedAlternative alt1 = alternatives_->at(1);

+ if (alt1.guards() == NULL || alt1.guards()->length() == 0) {

+ RegExpNode* eats_anything_node = alt1.node();

+ if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==

+ this) {

+ // At this point we know that we are at a non-greedy loop that will eat

+ // any character one at a time. Any non-anchored regexp has such a

+ // loop prepended to it in order to find where it starts. We look for

+ // a pattern of the form ...abc... where we can look 6 characters ahead

+ // and step forwards 3 if the character is not one of abc. Abc need

+ // not be atoms, they can be any reasonably limited character class or

+ // small alternation.

+ ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.

+ eats_at_least =

+ Min(kMaxLookaheadForBoyerMoore,

+ EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));

+ if (eats_at_least >= 1) {

+ BoyerMooreLookahead bm(eats_at_least,

+ RegExpMacroAssembler::kTableSize,

+ compiler);

+ GuardedAlternative alt0 = alternatives_->at(0);

+ alt0.node()->FillInBMInfo(0, &bm, not_at_start);

+ skip_was_emitted = bm.EmitSkipInstructions(macro_assembler);

+ }

+ if (eats_at_least == kEatsAtLeastNotYetInitialized) {

+ // Save some time by looking at most one machine word ahead.

+ eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);

+ }

+ int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);

+ bool preload_is_current = !skip_was_emitted &&

(current_trace->characters_preloaded() == preload_characters);

bool preload_has_checked_bounds = preload_is_current;

@@ -5432,6 +5767,76 @@

}

+void ChoiceNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ ZoneList<GuardedAlternative>* alts = alternatives();

+ for (int i = 0; i < alts->length(); i++) {

+ GuardedAlternative& alt = alts->at(i);

+ if (alt.guards() != NULL && alt.guards()->length() != 0) {

+ bm->SetRest(offset); // Give up trying to fill in info.

+ return;

+ }

+ alt.node()->FillInBMInfo(offset, bm, not_at_start);

+ }

+void TextNode::FillInBMInfo(

+ int offset, BoyerMooreLookahead* bm, bool not_at_start) {

+ if (offset >= bm->length()) return;

+ int max_char = bm->max_char();

+ for (int i = 0; i < elements()->length(); i++) {

+ if (offset >= bm->length()) return;

+ TextElement text = elements()->at(i);

+ if (text.type == TextElement::ATOM) {

+ RegExpAtom* atom = text.data.u_atom;

+ for (int j = 0; j < atom->length(); j++, offset++) {

+ if (offset >= bm->length()) return;

+ uc16 character = atom->data()[j];

+ if (bm->compiler()->ignore_case()) {

+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ int length = GetCaseIndependentLetters(

+ ISOLATE,

+ character,

+ bm->max_char() == String::kMaxAsciiCharCode,

+ chars);

+ for (int j = 0; j < length; j++) {

+ bm->Set(offset, chars[j]);

+ }

+ } else {

+ if (character <= max_char) bm->Set(offset, character);

+ }

+ } else {

+ ASSERT(text.type == TextElement::CHAR_CLASS);

+ RegExpCharacterClass* char_class = text.data.u_char_class;

+ ZoneList<CharacterRange>* ranges = char_class->ranges();

+ if (char_class->is_negated()) {

+ bm->SetAll(offset);

+ } else {

+ for (int k = 0; k < ranges->length(); k++) {

+ CharacterRange& range = ranges->at(k);

+ if (range.from() > max_char) continue;

+ int to = Min(max_char, static_cast<int>(range.to()));

+ if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) {

+ bm->SetAll(offset);

+ break;

+ }

+ for (int m = range.from(); m <= to; m++) {

+ bm->Set(offset, m);

+ }

+ offset++;

+ }

+ if (offset >= bm->length()) return;

+ on_success()->FillInBMInfo(offset,

+ bm,

+ true); // Not at start after a text node.

int TextNode::ComputeFirstCharacterSet(int budget) {

budget--;

if (budget >= 0) {

@@ -5589,15 +5994,31 @@

}

-RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,

- bool ignore_case,

- bool is_multiline,

- Handle<String> pattern,

- bool is_ascii) {

+RegExpEngine::CompilationResult RegExpEngine::Compile(

+ RegExpCompileData* data,

+ bool ignore_case,

+ bool is_multiline,

+ Handle<String> pattern,

+ Handle<String> sample_subject,

+ bool is_ascii) {

if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {

return IrregexpRegExpTooBig();

}

RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);

+ // Sample some characters from the middle of the string.

+ static const int kSampleSize = 128;

+ FlattenString(sample_subject);

+ int chars_sampled = 0;

+ int half_way = (sample_subject->length() - kSampleSize) / 2;

+ for (int i = Max(0, half_way);

+ i < sample_subject->length();

+ i++, chars_sampled++) {

+ if (chars_sampled > kSampleSize) break;

ulan 2012/04/02 08:12:24 Now we can move this into the for loop condition.

Erik Corry 2012/04/02 09:36:22 Done.

+ compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));

+ }

// Wrap the body of the regexp in capture #0.

RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,

« no previous file with comments | « src/jsregexp.h ('k') | src/x64/regexp-macro-assembler-x64.cc » ('j') | no next file with comments »