src/jsregexp.cc - Issue 13247: * Have an ASCII and a UC16 interpreter for Irregexp bytecodes -...

Unified Diff: src/jsregexp.cc

Issue 13247: * Have an ASCII and a UC16 interpreter for Irregexp bytecodes -... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 12 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 939)

+++ src/jsregexp.cc (working copy)

@@ -962,10 +962,12 @@

}

Handle<ByteArray> byte_codes = IrregexpByteCode(irregexp);

- Handle<String> two_byte_subject = CachedStringToTwoByte(subject);

+ if (!subject->IsFlat(StringShape(*subject))) {

+ FlattenString(subject);

Lasse Reichstein 2008/12/09 07:43:07 The string is also flattened in the IA32 branch (l…

+ }

rc = IrregexpInterpreter::Match(byte_codes,

- two_byte_subject,

+ subject,

offsets_vector,

previous_index);

break;

@@ -1191,7 +1193,7 @@

class RegExpCompiler {

public:

- RegExpCompiler(int capture_count, bool ignore_case);

+ RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);

int AllocateRegister() { return next_register_++; }

@@ -1215,6 +1217,7 @@

inline void DecrementRecursionDepth() { recursion_depth_--; }

inline bool ignore_case() { return ignore_case_; }

+ inline bool ascii() { return ascii_; }

private:

EndNode* accept_;

@@ -1223,6 +1226,7 @@

int recursion_depth_;

RegExpMacroAssembler* macro_assembler_;

bool ignore_case_;

+ bool ascii_;

};

@@ -1239,11 +1243,12 @@

// Attempts to compile the regexp using an Irregexp code generator. Returns

// a fixed array or a null handle depending on whether it succeeded.

-RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)

+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)

: next_register_(2 * (capture_count + 1)),

work_list_(NULL),

recursion_depth_(0),

- ignore_case_(ignore_case) {

+ ignore_case_(ignore_case),

+ ascii_(ascii) {

accept_ = new EndNode(EndNode::ACCEPT);

}

@@ -1682,7 +1687,6 @@

chars[0],

chars[1],

on_failure)) {

- ok.Unuse();

} else {

macro_assembler->CheckCharacter(chars[0], &ok);

macro_assembler->CheckNotCharacter(chars[1], on_failure);

@@ -1711,8 +1715,10 @@

RegExpCharacterClass* cc,

int cp_offset,

Label* on_failure,

- bool check_offset) {

+ bool check_offset,

+ bool ascii) {

ZoneList<CharacterRange>* ranges = cc->ranges();

+ const int max_char = ascii ? 0x7f : 0xffff;

Lasse Reichstein 2008/12/09 07:43:07 Use String::kMaxAsciiCharCode instead of 0x7f?

Label success;

@@ -1721,16 +1727,27 @@

int range_count = ranges->length();

- if (range_count == 0) {

+ int last_valid_range = range_count - 1;

+ while (last_valid_range >= 0) {

+ CharacterRange& range = ranges->at(last_valid_range);

+ if (range.from() <= max_char) {

+ break;

+ }

+ last_valid_range--;

+ }

+ if (last_valid_range < 0) {

if (!cc->is_negated()) {

+ // TODO(plesner): We can remove this when the node level does our

+ // ASCII optimizations for us.

macro_assembler->GoTo(on_failure);

}

return;

}

- if (range_count == 1 &&

+ if (last_valid_range == 0 &&

!cc->is_negated() &&

- ranges->at(0).IsEverything(0xffff)) {

+ ranges->at(0).IsEverything(max_char)) {

// This is a common case hit by non-anchored expressions.

// TODO(erikcorry): We should have a macro assembler instruction that just

// checks for end of string without loading the character.

@@ -1748,18 +1765,22 @@

macro_assembler->LoadCurrentCharacterUnchecked(cp_offset);

}

- for (int i = 0; i < range_count - 1; i++) {

+ for (int i = 0; i <= last_valid_range; i++) {

CharacterRange& range = ranges->at(i);

Label next_range;

uc16 from = range.from();

uc16 to = range.to();

+ if (from > max_char) {

+ continue;

+ }

+ if (to > max_char) to = max_char;

if (to == from) {

macro_assembler->CheckCharacter(to, char_is_in_class);

} else {

if (from != 0) {

macro_assembler->CheckCharacterLT(from, &next_range);

Lasse Reichstein 2008/12/09 07:43:07 How about a CheckCharacterRange(from, to, char_is_…

}

- if (to != 0xffff) {

+ if (to != max_char) {

macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);

} else {

macro_assembler->GoTo(char_is_in_class);

@@ -1768,10 +1789,13 @@

macro_assembler->Bind(&next_range);

}

- CharacterRange& range = ranges->at(range_count - 1);

+ CharacterRange& range = ranges->at(last_valid_range);

uc16 from = range.from();

uc16 to = range.to();

+ if (to > max_char) to = max_char;

+ ASSERT(to >= from);

if (to == from) {

if (cc->is_negated()) {

macro_assembler->CheckCharacter(to, on_failure);

@@ -1875,7 +1899,25 @@

macro_assembler->GoTo(backtrack);

return true;

}

- // First, handle straight character matches.

+ // First check for non-ASCII text.

+ // TODO(plesner): We should do this at node level.

+ if (compiler->ascii()) {

+ for (int i = element_count - 1; i >= 0; i--) {

+ TextElement elm = elms_->at(i);

+ if (elm.type == TextElement::ATOM) {

+ Vector<const uc16> quarks = elm.data.u_atom->data();

+ for (int j = quarks.length() - 1; j >= 0; j--) {

+ if (quarks[j] > 0x7f) {

Lasse Reichstein 2008/12/09 07:43:07 Use String::kMaxAsciiCharCode

+ macro_assembler->GoTo(backtrack);

+ return true;

+ }

+ }

+ } else {

+ ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);

+ }

+ }

+ }

+ // Second, handle straight character matches.

int checked_up_to = -1;

for (int i = element_count - 1; i >= 0; i--) {

TextElement elm = elms_->at(i);

@@ -1902,7 +1944,7 @@

ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);

}

- // Second, handle case independent letter matches if any.

+ // Third, handle case independent letter matches if any.

if (compiler->ignore_case()) {

for (int i = element_count - 1; i >= 0; i--) {

TextElement elm = elms_->at(i);

@@ -1930,7 +1972,8 @@

cc,

cp_offset,

backtrack,

- checked_up_to < cp_offset);

+ checked_up_to < cp_offset,

+ compiler->ascii());

if (cp_offset > checked_up_to) checked_up_to = cp_offset;

}

@@ -3611,7 +3654,7 @@

bool is_multiline,

Handle<String> pattern,

bool is_ascii) {

- RegExpCompiler compiler(input->capture_count, ignore_case);

+ RegExpCompiler compiler(input->capture_count, ignore_case, is_ascii);

// Wrap the body of the regexp in capture #0.

RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,

« no previous file with comments | « src/interpreter-irregexp.cc ('k') | no next file » | no next file with comments »