src/jsregexp.cc - Issue 12427: Merge regexp2000 back into bleeding_edge

Unified Diff: src/jsregexp.cc

Issue 12427: Merge regexp2000 back into bleeding_edge (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 12 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

===================================================================

--- src/jsregexp.cc (revision 830)

+++ src/jsregexp.cc (working copy)

@@ -25,15 +25,30 @@

// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+#define _HAS_EXCEPTIONS 0

Erik Corry 2008/11/25 11:03:52 ??

+#include <set>

Erik Corry 2008/11/25 11:03:52 Where is this coming from?

Mads Ager (chromium) 2008/11/25 21:09:41 These are still there. Please remove!

Christian Plesner Hansen 2008/11/26 06:49:56 It's in use.

#include "v8.h"

+#include "ast.h"

#include "execution.h"

#include "factory.h"

-#include "jsregexp.h"

+#include "jsregexp-inl.h"

#include "platform.h"

#include "runtime.h"

#include "top.h"

#include "compilation-cache.h"

+#include "string-stream.h"

+#include "parser.h"

+#include "assembler-irregexp.h"

+#include "regexp-macro-assembler.h"

+#include "regexp-macro-assembler-irregexp.h"

+#if defined __arm__ || defined __thumb__ || defined ARM

+// include regexp-macro-assembler-arm.h when created.

+#else // ia32

+#include "regexp-macro-assembler-ia32.h"

+#endif

+#include "interpreter-irregexp.h"

// Including pcre.h undefines DEBUG to avoid getting debug output from

// the JSCRE implementation. Make sure to redefine it in debug mode

@@ -45,12 +60,10 @@

#include "third_party/jscre/pcre.h"

#endif

namespace v8 { namespace internal {

-#define CAPTURE_INDEX 0

-#define INTERNAL_INDEX 1

static Failure* malloc_failure;

static void* JSREMalloc(size_t size) {

@@ -176,7 +189,16 @@

}

-unibrow::Predicate<unibrow::RegExpSpecialChar, 128> is_reg_exp_special_char;

+static inline void ThrowRegExpException(Handle<JSRegExp> re,

+ Handle<String> pattern,

+ Handle<String> error_text,

+ const char* message) {

+ Handle<JSArray> array = Factory::NewJSArray(2);

+ SetElement(array, 0, pattern);

+ SetElement(array, 1, error_text);

+ Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);

+ Top::Throw(*regexp_err);

Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,

@@ -186,20 +208,42 @@

Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);

bool in_cache = !cached.is_null();

Handle<Object> result;

- StringShape shape(*pattern);

if (in_cache) {

re->set_data(*cached);

result = re;

} else {

- bool is_atom = !flags.is_ignore_case();

- for (int i = 0; is_atom && i < pattern->length(shape); i++) {

- if (is_reg_exp_special_char.get(pattern->Get(shape, i)))

- is_atom = false;

+ FlattenString(pattern);

+ RegExpParseResult parse_result;

+ FlatStringReader reader(pattern);

+ if (!ParseRegExp(&reader, &parse_result)) {

+ // Throw an exception if we fail to parse the pattern.

+ ThrowRegExpException(re,

+ pattern,

+ parse_result.error,

+ "malformed_regexp");

+ return Handle<Object>();

}

- if (is_atom) {

- result = AtomCompile(re, pattern, flags);

+ RegExpAtom* atom = parse_result.tree->AsAtom();

+ if (atom != NULL && !flags.is_ignore_case()) {

+ if (parse_result.has_character_escapes) {

+ Vector<const uc16> atom_pattern = atom->data();

+ Handle<String> atom_string =

+ Factory::NewStringFromTwoByte(atom_pattern);

+ result = AtomCompile(re, pattern, flags, atom_string);

+ } else {

+ result = AtomCompile(re, pattern, flags, pattern);

+ }

} else {

- result = JsreCompile(re, pattern, flags);

+ RegExpNode* node = NULL;

+ Handle<FixedArray> irregexp_data =

+ RegExpEngine::Compile(&parse_result,

+ &node,

+ flags.is_ignore_case());

+ if (irregexp_data.is_null()) {

+ result = JscrePrepare(re, pattern, flags);

+ } else {

+ result = IrregexpPrepare(re, pattern, flags, irregexp_data);

+ }

}

Object* data = re->data();

if (data->IsFixedArray()) {

@@ -220,9 +264,11 @@

Handle<Object> index) {

switch (regexp->TypeTag()) {

case JSRegExp::JSCRE:

- return JsreExec(regexp, subject, index);

+ return JscreExec(regexp, subject, index);

case JSRegExp::ATOM:

return AtomExec(regexp, subject, index);

+ case JSRegExp::IRREGEXP:

+ return IrregexpExec(regexp, subject, index);

default:

UNREACHABLE();

return Handle<Object>();

@@ -234,9 +280,11 @@

Handle<String> subject) {

switch (regexp->TypeTag()) {

case JSRegExp::JSCRE:

- return JsreExecGlobal(regexp, subject);

+ return JscreExecGlobal(regexp, subject);

case JSRegExp::ATOM:

return AtomExecGlobal(regexp, subject);

+ case JSRegExp::IRREGEXP:

+ return IrregexpExecGlobal(regexp, subject);

default:

UNREACHABLE();

return Handle<Object>();

@@ -246,8 +294,9 @@

Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,

Handle<String> pattern,

- JSRegExp::Flags flags) {

- Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, pattern);

+ JSRegExp::Flags flags,

+ Handle<String> match_pattern) {

+ Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);

return re;

}

@@ -267,12 +316,8 @@

if (value == -1) return Factory::null_value();

Handle<FixedArray> array = Factory::NewFixedArray(2);

- array->set(0,

- Smi::FromInt(value),

- SKIP_WRITE_BARRIER);

- array->set(1,

- Smi::FromInt(value + needle->length()),

- SKIP_WRITE_BARRIER);

+ array->set(0, Smi::FromInt(value));

+ array->set(1, Smi::FromInt(value + needle->length()));

return Factory::NewJSArrayWithElements(array);

}

@@ -296,12 +341,8 @@

int end = value + needle_length;

Handle<FixedArray> array = Factory::NewFixedArray(2);

- array->set(0,

- Smi::FromInt(value),

- SKIP_WRITE_BARRIER);

- array->set(1,

- Smi::FromInt(end),

- SKIP_WRITE_BARRIER);

+ array->set(0, Smi::FromInt(value));

+ array->set(1, Smi::FromInt(end));

Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);

SetElement(result, match_count, pair);

match_count++;

@@ -312,6 +353,24 @@

}

+Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,

+ Handle<String> pattern,

+ JSRegExp::Flags flags) {

+ Handle<Object> value(Heap::undefined_value());

+ Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);

+ return re;

+Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,

+ Handle<String> pattern,

+ JSRegExp::Flags flags,

+ Handle<FixedArray> irregexp_data) {

+ Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data);

+ return re;

static inline Object* DoCompile(String* pattern,

JSRegExp::Flags flags,

unsigned* number_of_captures,

@@ -358,9 +417,13 @@

}

-Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re,

- Handle<String> pattern,

- JSRegExp::Flags flags) {

+Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {

+ ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);

+ ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());

+ Handle<String> pattern(re->Pattern());

+ JSRegExp::Flags flags = re->GetFlags();

Handle<String> two_byte_pattern = StringToTwoByte(pattern);

unsigned number_of_captures;

@@ -391,26 +454,110 @@

Handle<ByteArray> internal(

ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));

- Handle<FixedArray> value = Factory::NewFixedArray(2);

- value->set(CAPTURE_INDEX, Smi::FromInt(number_of_captures));

- value->set(INTERNAL_INDEX, *internal);

+ Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);

+ value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));

+ value->set(kJscreInternalIndex, *internal);

Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);

return re;

}

-Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp,

- int num_captures,

- Handle<String> subject,

- int previous_index,

- const uc16* two_byte_subject,

- int* offsets_vector,

- int offsets_vector_length) {

+Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp,

+ int num_captures,

+ Handle<String> two_byte_subject,

+ int previous_index,

+ int* offsets_vector,

+ int offsets_vector_length) {

+#ifdef DEBUG

+ if (FLAG_trace_regexp_bytecodes) {

+ String* pattern = regexp->Pattern();

+ PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));

+ PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString()));

+ }

+#endif

+ ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation());

+ ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject)));

+ bool rc;

+ {

Mads Ager (chromium) 2008/11/25 21:09:41 Why is the extra scope needed here?

Erik Corry 2008/11/26 12:18:36 It isn't. Removed.

+ for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {

+ offsets_vector[i] = -1;

+ }

+ LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject));

+ FixedArray* irregexp =

+ FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex));

+ int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();

+ switch (tag) {

Mads Ager (chromium) 2008/11/25 21:09:41 The indentation of this switch is off. Indent the

Erik Corry 2008/11/26 12:18:36 Done.

+ case RegExpMacroAssembler::kIA32Implementation: {

+ Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex));

+ SmartPointer<int> captures(NewArray<int>((num_captures + 1) * 2));

+ Address start_addr =

+ Handle<SeqTwoByteString>::cast(two_byte_subject)->GetCharsAddress();

+ int start_offset =

+ start_addr - reinterpret_cast<Address>(*two_byte_subject);

+ int end_offset =

+ start_offset + (two_byte_subject->length() - previous_index) * 2;

+ typedef bool testfunc(String**, int, int, int*);

+ testfunc* test = FUNCTION_CAST<testfunc*>(code->entry());

+ rc = test(two_byte_subject.location(),

+ start_offset,

+ end_offset,

+ *captures);

+ if (rc) {

+ // Capture values are relative to start_offset only.

+ for (int i = 0; i < offsets_vector_length; i++) {

+ if (offsets_vector[i] >= 0) {

+ offsets_vector[i] += previous_index;

+ }

+ break;

+ }

+ default:

Mads Ager (chromium) 2008/11/25 21:09:41 Why is this default case in the middle of the rest

Erik Corry 2008/11/26 12:18:36 Because it falls through? Lasse, please take a lo

+ case RegExpMacroAssembler::kARMImplementation:

+ UNREACHABLE();

+ rc = false;

+ break;

+ case RegExpMacroAssembler::kBytecodeImplementation: {

+ Handle<ByteArray> byte_codes = IrregexpCode(regexp);

+ rc = IrregexpInterpreter::Match(byte_codes,

+ two_byte_subject,

+ offsets_vector,

+ previous_index);

+ break;

+ }

+ if (!rc) {

+ return Factory::null_value();

+ }

+ Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));

+ // The captures come in (start, end+1) pairs.

+ for (int i = 0; i < 2 * (num_captures+1); i += 2) {

+ array->set(i, Smi::FromInt(offsets_vector[i]));

+ array->set(i+1, Smi::FromInt(offsets_vector[i+1]));

+ }

+ return Factory::NewJSArrayWithElements(array);

+Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,

+ int num_captures,

+ Handle<String> subject,

+ int previous_index,

+ const uc16* two_byte_subject,

+ int* offsets_vector,

+ int offsets_vector_length) {

int rc;

{

AssertNoAllocation a;

- ByteArray* internal = JsreInternal(regexp);

+ ByteArray* internal = JscreInternal(regexp);

const JscreRegExp* js_regexp =

reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress());

@@ -444,12 +591,8 @@

Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));

// The captures come in (start, end+1) pairs.

for (int i = 0; i < 2 * (num_captures+1); i += 2) {

- array->set(i,

- Smi::FromInt(offsets_vector[i]),

- SKIP_WRITE_BARRIER);

- array->set(i+1,

- Smi::FromInt(offsets_vector[i+1]),

- SKIP_WRITE_BARRIER);

+ array->set(i, Smi::FromInt(offsets_vector[i]));

+ array->set(i+1, Smi::FromInt(offsets_vector[i+1]));

}

return Factory::NewJSArrayWithElements(array);

}

@@ -457,8 +600,8 @@

class OffsetsVector {

public:

- inline OffsetsVector(int num_captures) {

- offsets_vector_length_ = (num_captures + 1) * 3;

+ inline OffsetsVector(int num_registers) :

Mads Ager (chromium) 2008/11/25 21:09:41 Move : to the next line and do a four space indent

+ offsets_vector_length_(num_registers) {

if (offsets_vector_length_ > kStaticOffsetsVectorSize) {

vector_ = NewArray<int>(offsets_vector_length_);

} else {

@@ -487,7 +630,7 @@

private:

int* vector_;

int offsets_vector_length_;

- static const int kStaticOffsetsVectorSize = 30;

+ static const int kStaticOffsetsVectorSize = 50;

static int static_offsets_vector_[kStaticOffsetsVectorSize];

};

@@ -496,34 +639,127 @@

OffsetsVector::kStaticOffsetsVectorSize];

-Handle<Object> RegExpImpl::JsreExec(Handle<JSRegExp> regexp,

+Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,

Handle<String> subject,

Mads Ager (chromium) 2008/11/25 21:09:41 Indentation is off.

Erik Corry 2008/11/26 12:18:36 Fixed.

Handle<Object> index) {

+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);

+ ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());

// Prepare space for the return values.

- int num_captures = JsreCapture(regexp);

+ int number_of_registers = IrregexpNumberOfRegisters(regexp);

+ OffsetsVector offsets(number_of_registers);

- OffsetsVector offsets(num_captures);

+ int num_captures = IrregexpNumberOfCaptures(regexp);

int previous_index = static_cast<int>(DoubleToInteger(index->Number()));

Handle<String> subject16 = CachedStringToTwoByte(subject);

- Handle<Object> result(JsreExecOnce(regexp, num_captures, subject,

- previous_index,

- subject16->GetTwoByteData(),

- offsets.vector(), offsets.length()));

+ Handle<Object> result(

+ IrregexpExecOnce(regexp,

+ num_captures,

+ subject16,

+ previous_index,

+ offsets.vector(),

+ offsets.length()));

+ return result;

+Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,

+ Handle<String> subject,

+ Handle<Object> index) {

+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);

+ if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {

+ Handle<Object> compile_result = JscreCompile(regexp);

+ if (compile_result.is_null()) return compile_result;

+ }

+ ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());

+ int num_captures = JscreNumberOfCaptures(regexp);

+ OffsetsVector offsets((num_captures + 1) * 3);

+ int previous_index = static_cast<int>(DoubleToInteger(index->Number()));

+ Handle<String> subject16 = CachedStringToTwoByte(subject);

+ Handle<Object> result(JscreExecOnce(regexp,

+ num_captures,

+ subject,

+ previous_index,

+ subject16->GetTwoByteData(),

+ offsets.vector(),

+ offsets.length()));

return result;

}

-Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp,

- Handle<String> subject) {

+Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,

+ Handle<String> subject) {

+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);

+ ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());

// Prepare space for the return values.

- int num_captures = JsreCapture(regexp);

+ int number_of_registers = IrregexpNumberOfRegisters(regexp);

+ OffsetsVector offsets(number_of_registers);

- OffsetsVector offsets(num_captures);

+ int previous_index = 0;

+ Handle<JSArray> result = Factory::NewJSArray(0);

+ int i = 0;

+ Handle<Object> matches;

+ Handle<String> subject16 = CachedStringToTwoByte(subject);

+ do {

+ if (previous_index > subject->length() || previous_index < 0) {

+ // Per ECMA-262 15.10.6.2, if the previous index is greater than the

+ // string length, there is no match.

+ matches = Factory::null_value();

+ } else {

+ matches = IrregexpExecOnce(regexp,

+ IrregexpNumberOfCaptures(regexp),

+ subject16,

+ previous_index,

+ offsets.vector(),

+ offsets.length());

+ if (matches->IsJSArray()) {

+ SetElement(result, i, matches);

+ i++;

+ previous_index = offsets.vector()[1];

+ if (offsets.vector()[0] == offsets.vector()[1]) {

+ previous_index++;

+ }

+ } while (matches->IsJSArray());

+ // If we exited the loop with an exception, throw it.

+ if (matches->IsNull()) { // Exited loop normally.

Mads Ager (chromium) 2008/11/25 21:09:41 Move these comments to new lines before the return

Erik Corry 2008/11/26 12:18:36 I'm not sure why that's any better, but fixed. Wh

+ return result;

+ } else { // Exited loop with the exception in matches.

+ return matches;

+ }

+Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,

+ Handle<String> subject) {

+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);

+ if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {

+ Handle<Object> compile_result = JscreCompile(regexp);

+ if (compile_result.is_null()) return compile_result;

+ }

+ ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());

+ // Prepare space for the return values.

+ int num_captures = JscreNumberOfCaptures(regexp);

+ OffsetsVector offsets((num_captures + 1) * 3);

int previous_index = 0;

Handle<JSArray> result = Factory::NewJSArray(0);

@@ -538,9 +774,13 @@

// string length, there is no match.

matches = Factory::null_value();

} else {

- matches = JsreExecOnce(regexp, num_captures, subject, previous_index,

- subject16->GetTwoByteData(),

- offsets.vector(), offsets.length());

+ matches = JscreExecOnce(regexp,

+ num_captures,

+ subject,

+ previous_index,

+ subject16->GetTwoByteData(),

+ offsets.vector(),

+ offsets.length());

if (matches->IsJSArray()) {

SetElement(result, i, matches);

@@ -562,15 +802,1798 @@

}

-int RegExpImpl::JsreCapture(Handle<JSRegExp> re) {

+int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {

FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));

- return Smi::cast(value->get(CAPTURE_INDEX))->value();

+ return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->

+ value();

Mads Ager (chromium) 2008/11/25 21:09:41 This should fit on the previous line.

Erik Corry 2008/11/26 12:18:36 Done.

}

-ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) {

+ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {

FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));

- return ByteArray::cast(value->get(INTERNAL_INDEX));

+ return ByteArray::cast(value->get(kJscreInternalIndex));

}

+int RegExpImpl::IrregexpNumberOfCaptures(Handle<JSRegExp> re) {

+ FixedArray* value =

+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));

+ return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value();

+int RegExpImpl::IrregexpNumberOfRegisters(Handle<JSRegExp> re) {

+ FixedArray* value =

+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));

+ return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value();

+Handle<ByteArray> RegExpImpl::IrregexpCode(Handle<JSRegExp> re) {

+ FixedArray* value =

+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));

+ return Handle<ByteArray>(ByteArray::cast(value->get(kIrregexpCodeIndex)));

+// -------------------------------------------------------------------

+// New regular expression engine

Erik Corry 2008/11/25 11:03:52 Probably should mention "Irregexp"

Erik Corry 2008/11/26 12:18:36 Fixed.

+void RegExpTree::AppendToText(RegExpText* text) {

+ UNREACHABLE();

+void RegExpAtom::AppendToText(RegExpText* text) {

+ text->AddElement(TextElement::Atom(this));

+void RegExpCharacterClass::AppendToText(RegExpText* text) {

+ text->AddElement(TextElement::CharClass(this));

+void RegExpText::AppendToText(RegExpText* text) {

+ for (int i = 0; i < elements()->length(); i++)

+ text->AddElement(elements()->at(i));

+TextElement TextElement::Atom(RegExpAtom* atom) {

+ TextElement result = TextElement(ATOM);

+ result.data.u_atom = atom;

+ return result;

+TextElement TextElement::CharClass(

+ RegExpCharacterClass* char_class) {

+ TextElement result = TextElement(CHAR_CLASS);

+ result.data.u_char_class = char_class;

+ return result;

+class RegExpCompiler {

+ public:

+ RegExpCompiler(int capture_count, bool ignore_case);

+ int AllocateRegister() { return next_register_++; }

+ Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,

+ RegExpNode* start,

+ int capture_count);

+ inline void AddWork(RegExpNode* node) { work_list_->Add(node); }

+ static const int kImplementationOffset = 0;

+ static const int kNumberOfRegistersOffset = 0;

+ static const int kCodeOffset = 1;

+ RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }

+ EndNode* accept() { return accept_; }

+ EndNode* backtrack() { return backtrack_; }

+ static const int kMaxRecursion = 100;

+ inline int recursion_depth() { return recursion_depth_; }

+ inline void IncrementRecursionDepth() { recursion_depth_++; }

+ inline void DecrementRecursionDepth() { recursion_depth_--; }

+ inline bool is_case_independent() { return is_case_independent_; }

+ private:

+ EndNode* accept_;

+ EndNode* backtrack_;

+ int next_register_;

+ List<RegExpNode*>* work_list_;

+ int recursion_depth_;

+ RegExpMacroAssembler* macro_assembler_;

+ bool is_case_independent_;

+};

+// Attempts to compile the regexp using an Irregexp code generator. Returns

+// a fixed array or a null handle depending on whether it succeeded.

+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)

+ : next_register_(2 * (capture_count + 1)),

Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.

Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.

+ work_list_(NULL),

+ recursion_depth_(0),

+ is_case_independent_(ignore_case) {

+ accept_ = new EndNode(EndNode::ACCEPT);

+ backtrack_ = new EndNode(EndNode::BACKTRACK);

+Handle<FixedArray> RegExpCompiler::Assemble(

+ RegExpMacroAssembler* macro_assembler,

+ RegExpNode* start,

+ int capture_count) {

+ if (!FLAG_attempt_case_independent && is_case_independent_) {

+ return Handle<FixedArray>::null();

+ }

+ macro_assembler_ = macro_assembler;

+ List <RegExpNode*> work_list(0);

+ work_list_ = &work_list;

+ Label fail;

+ macro_assembler->PushBacktrack(&fail);

+ if (!start->GoTo(this)) {

+ fail.Unuse();

+ return Handle<FixedArray>::null();

+ }

+ while (!work_list.is_empty()) {

+ if (!work_list.RemoveLast()->GoTo(this)) {

+ fail.Unuse();

+ return Handle<FixedArray>::null();

+ }

+ macro_assembler->Bind(&fail);

+ macro_assembler->Fail();

+ Handle<FixedArray> array =

+ Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);

+ array->set(RegExpImpl::kIrregexpImplementationIndex,

+ Smi::FromInt(macro_assembler->Implementation()));

+ array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,

+ Smi::FromInt(next_register_));

+ array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,

+ Smi::FromInt(capture_count));

+ Handle<Object> code = macro_assembler->GetCode();

+ array->set(RegExpImpl::kIrregexpCodeIndex, *code);

+ work_list_ = NULL;

+ return array;

+bool RegExpNode::GoTo(RegExpCompiler* compiler) {

+ // TODO(erikcorry): Implement support.

+ if (info_.follows_word_interest ||

+ info_.follows_newline_interest ||

+ info_.follows_start_interest) {

+ return false;

+ }

+ if (label_.is_bound()) {

+ compiler->macro_assembler()->GoTo(&label_);

+ return true;

+ } else {

Mads Ager (chromium) 2008/11/25 21:09:41 Most of this nesting could go away since all of th

Christian Plesner Hansen 2008/11/26 06:49:56 Nesting aids readability: you can trivially see wh

+ if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) {

+ compiler->macro_assembler()->GoTo(&label_);

+ compiler->AddWork(this);

+ return true;

+ } else {

+ compiler->IncrementRecursionDepth();

+ bool how_it_went = Emit(compiler);

+ compiler->DecrementRecursionDepth();

+ return how_it_went;

+ }

+bool EndNode::GoTo(RegExpCompiler* compiler) {

+ if (info()->follows_word_interest ||

+ info()->follows_newline_interest ||

+ info()->follows_start_interest) {

+ return false;

+ }

+ if (!label()->is_bound()) {

+ Bind(compiler->macro_assembler());

+ }

+ switch (action_) {

+ case ACCEPT:

+ compiler->macro_assembler()->Succeed();

+ break;

Mads Ager (chromium) 2008/11/25 21:09:41 Indent the break.

Erik Corry 2008/11/26 12:18:36 Fixed.

+ case BACKTRACK:

+ compiler->macro_assembler()->Backtrack();

+ break;

Mads Ager (chromium) 2008/11/25 21:09:41 Indent the break.

Erik Corry 2008/11/26 12:18:36 Fixed.

+ }

+ return true;

+Label* RegExpNode::label() {

+ return &label_;

+bool EndNode::Emit(RegExpCompiler* compiler) {

+ RegExpMacroAssembler* macro = compiler->macro_assembler();

+ switch (action_) {

+ case ACCEPT:

+ Bind(macro);

+ macro->Succeed();

+ return true;

+ case BACKTRACK:

+ Bind(macro);

+ macro->Backtrack();

+ return true;

+ }

+ return false;

+void GuardedAlternative::AddGuard(Guard* guard) {

+ if (guards_ == NULL)

+ guards_ = new ZoneList<Guard*>(1);

+ guards_->Add(guard);

+ActionNode* ActionNode::StoreRegister(int reg,

+ int val,

+ RegExpNode* on_success) {

+ ActionNode* result = new ActionNode(STORE_REGISTER, on_success);

+ result->data_.u_store_register.reg = reg;

+ result->data_.u_store_register.value = val;

+ return result;

+ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {

+ ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);

+ result->data_.u_increment_register.reg = reg;

+ return result;

+ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {

+ ActionNode* result = new ActionNode(STORE_POSITION, on_success);

+ result->data_.u_position_register.reg = reg;

+ return result;

+ActionNode* ActionNode::SavePosition(int reg, RegExpNode* on_success) {

+ ActionNode* result = new ActionNode(SAVE_POSITION, on_success);

+ result->data_.u_position_register.reg = reg;

+ return result;

+ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) {

+ ActionNode* result = new ActionNode(RESTORE_POSITION, on_success);

+ result->data_.u_position_register.reg = reg;

+ return result;

+ActionNode* ActionNode::BeginSubmatch(int reg, RegExpNode* on_success) {

+ ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);

+ result->data_.u_submatch_stack_pointer_register.reg = reg;

+ return result;

+ActionNode* ActionNode::EscapeSubmatch(int reg, RegExpNode* on_success) {

+ ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success);

+ result->data_.u_submatch_stack_pointer_register.reg = reg;

+ return result;

+#define DEFINE_ACCEPT(Type) \

+ void Type##Node::Accept(NodeVisitor* visitor) { \

+ visitor->Visit##Type(this); \

+ }

+FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)

+#undef DEFINE_ACCEPT

+// -------------------------------------------------------------------

+// Emit code.

+void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,

+ Guard* guard,

+ Label* on_failure) {

+ switch (guard->op()) {

+ case Guard::LT:

+ macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure);

+ break;

+ case Guard::GEQ:

+ macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure);

+ break;

+ }

+static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;

+static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

+static inline void EmitAtomNonLetters(

+ RegExpMacroAssembler* macro_assembler,

+ TextElement elm,

+ Vector<const uc16> quarks,

+ Label* on_failure,

+ int cp_offset) {

+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ for (int i = quarks.length() - 1; i >= 0; i--) {

+ uc16 c = quarks[i];

+ int length = uncanonicalize.get(c, '\0', chars);

+ if (length <= 1) {

+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);

+ macro_assembler->CheckNotCharacter(c, on_failure);

+ }

+static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,

+ uc16 c1,

+ uc16 c2,

+ Label* on_failure) {

+ uc16 exor = c1 ^ c2;

+ // Check whether exor has only one bit set.

+ if (((exor - 1) & exor) == 0) {

Mads Ager (chromium) 2008/11/25 21:09:41 The if-else could be replaced by two ifs that each

Erik Corry 2008/11/26 12:18:36 Fixed.

+ // If c1 and c2 differ only by one bit.

+ // Ecma262UnCanonicalize always gives the highest number last.

+ ASSERT(c2 > c1);

+ macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);

+ return true;

+ } else {

+ ASSERT(c2 > c1);

+ uc16 diff = c2 - c1;

+ if (((diff - 1) & diff) == 0 && c1 >= diff) {

+ // If the characters differ by 2^n but don't differ by one bit then

+ // subtract the difference from the found character, then do the or

+ // trick. We avoid the theoretical case where negative numbers are

+ // involved in order to simplify code generation.

+ macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,

+ diff,

+ on_failure);

+ return true;

+ }

+ return false;

+static inline void EmitAtomLetters(

+ RegExpMacroAssembler* macro_assembler,

+ TextElement elm,

+ Vector<const uc16> quarks,

+ Label* on_failure,

+ int cp_offset) {

+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ for (int i = quarks.length() - 1; i >= 0; i--) {

+ uc16 c = quarks[i];

+ int length = uncanonicalize.get(c, '\0', chars);

+ if (length <= 1) continue;

+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);

+ Label ok;

+ ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

+ switch (length) {

+ case 2: {

+ if (ShortCutEmitCharacterPair(macro_assembler,

+ chars[0],

+ chars[1],

+ on_failure)) {

+ ok.Unuse();

+ } else {

+ macro_assembler->CheckCharacter(chars[0], &ok);

+ macro_assembler->CheckNotCharacter(chars[1], on_failure);

+ macro_assembler->Bind(&ok);

+ }

+ break;

+ }

+ case 4:

+ macro_assembler->CheckCharacter(chars[3], &ok);

+ // Fall through!

+ case 3:

+ macro_assembler->CheckCharacter(chars[0], &ok);

+ macro_assembler->CheckCharacter(chars[1], &ok);

+ macro_assembler->CheckNotCharacter(chars[2], on_failure);

+ macro_assembler->Bind(&ok);

+ break;

+ default:

+ UNREACHABLE();

+ break;

+ }

+static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

+ RegExpCharacterClass* cc,

+ int cp_offset,

+ Label* on_failure) {

+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);

+ cp_offset++;

+ ZoneList<CharacterRange>* ranges = cc->ranges();

+ Label success;

+ Label *char_is_in_class =

Erik Corry 2008/11/25 11:03:52 Misplaced asterisk.

Erik Corry 2008/11/26 12:18:36 Fixed.

+ cc->is_negated() ? on_failure : &success;

+ int range_count = ranges->length();

+ if (range_count == 0) {

+ if (!cc->is_negated()) {

+ macro_assembler->GoTo(on_failure);

+ }

+ return;

+ }

+ for (int i = 0; i < range_count - 1; i++) {

+ CharacterRange& range = ranges->at(i);

+ Label next_range;

+ uc16 from = range.from();

+ uc16 to = range.to();

+ if (to == from) {

+ macro_assembler->CheckCharacter(to, char_is_in_class);

+ } else {

+ if (from != 0) {

+ macro_assembler->CheckCharacterLT(from, &next_range);

+ }

+ if (to != 0xffff) {

+ macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);

+ } else {

+ macro_assembler->GoTo(char_is_in_class);

+ }

+ macro_assembler->Bind(&next_range);

+ }

+ CharacterRange& range = ranges->at(range_count - 1);

+ uc16 from = range.from();

+ uc16 to = range.to();

+ if (to == from) {

+ if (cc->is_negated()) {

+ macro_assembler->CheckCharacter(to, on_failure);

+ } else {

+ macro_assembler->CheckNotCharacter(to, on_failure);

+ }

+ } else {

+ if (from != 0) {

+ if (!cc->is_negated()) {

+ macro_assembler->CheckCharacterLT(from, on_failure);

+ } else {

+ macro_assembler->CheckCharacterLT(from, &success);

+ }

+ if (to != 0xffff) {

+ if (!cc->is_negated()) {

+ macro_assembler->CheckCharacterGT(to, on_failure);

+ } else {

+ macro_assembler->CheckCharacterLT(to + 1, on_failure);

+ }

+ } else {

+ if (cc->is_negated()) {

+ macro_assembler->GoTo(on_failure);

+ }

+ macro_assembler->Bind(&success);

+bool TextNode::Emit(RegExpCompiler* compiler) {

+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

+ Bind(macro_assembler);

+ int element_count = elms_->length();

+ int cp_offset = 0;

+ // First, handle straight character matches.

+ for (int i = 0; i < element_count; i++) {

+ TextElement elm = elms_->at(i);

+ if (elm.type == TextElement::ATOM) {

+ Vector<const uc16> quarks = elm.data.u_atom->data();

+ if (!compiler->is_case_independent()) {

+ macro_assembler->CheckCharacters(quarks,

+ cp_offset,

+ on_failure_->label());

+ } else {

+ EmitAtomNonLetters(macro_assembler,

+ elm,

+ quarks,

+ on_failure_->label(),

+ cp_offset);

+ }

+ cp_offset += quarks.length();

+ } else {

+ ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);

+ cp_offset++;

+ }

+ // Second, handle case independent letter matches if any.

+ if (compiler->is_case_independent()) {

+ cp_offset = 0;

+ for (int i = 0; i < element_count; i++) {

+ TextElement elm = elms_->at(i);

+ if (elm.type == TextElement::ATOM) {

+ Vector<const uc16> quarks = elm.data.u_atom->data();

+ EmitAtomLetters(macro_assembler,

+ elm,

+ quarks,

+ on_failure_->label(),

+ cp_offset);

+ cp_offset += quarks.length();

+ } else {

+ cp_offset++;

+ }

+ // If the fast character matches passed then do the character classes.

+ cp_offset = 0;

+ for (int i = 0; i < element_count; i++) {

+ TextElement elm = elms_->at(i);

+ if (elm.type == TextElement::CHAR_CLASS) {

+ RegExpCharacterClass* cc = elm.data.u_char_class;

+ EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label());

+ cp_offset ++;

+ } else {

+ cp_offset += elm.data.u_atom->data().length();

+ }

+ compiler->AddWork(on_failure_);

+ macro_assembler->AdvanceCurrentPosition(cp_offset);

+ return on_success()->GoTo(compiler);

+bool ChoiceNode::Emit(RegExpCompiler* compiler) {

+ int choice_count = alternatives_->length();

+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

+ Bind(macro_assembler);

+ // For now we just call all choices one after the other. The idea ultimately

+ // is to use the Dispatch table to try only the relevant ones.

+ int i;

Mads Ager (chromium) 2008/11/25 21:09:41 I don't see how the i is needed after the for loop

Erik Corry 2008/11/26 12:18:36 Fixed.

+ for (i = 0; i < choice_count - 1; i++) {

+ GuardedAlternative alternative = alternatives_->at(i);

+ Label after;

+ Label after_no_pop_cp;

+ ZoneList<Guard*>* guards = alternative.guards();

+ if (guards != NULL) {

+ int guard_count = guards->length();

+ for (int j = 0; j < guard_count; j++) {

+ GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp);

+ }

+ macro_assembler->PushCurrentPosition();

+ macro_assembler->PushBacktrack(&after);

+ if (!alternative.node()->GoTo(compiler)) {

+ after.Unuse();

+ after_no_pop_cp.Unuse();

+ return false;

+ }

+ macro_assembler->Bind(&after);

+ macro_assembler->PopCurrentPosition();

+ macro_assembler->Bind(&after_no_pop_cp);

+ }

+ GuardedAlternative alternative = alternatives_->at(i);

+ ZoneList<Guard*>* guards = alternative.guards();

+ if (guards != NULL) {

+ int guard_count = guards->length();

+ for (int j = 0; j < guard_count; j++) {

+ GenerateGuard(macro_assembler, guards->at(j), on_failure_->label());

+ }

+ if (!on_failure_->IsBacktrack()) {

+ ASSERT_NOT_NULL(on_failure_ -> label());

+ macro_assembler->PushBacktrack(on_failure_->label());

+ compiler->AddWork(on_failure_);

+ }

+ if (!alternative.node()->GoTo(compiler)) {

+ return false;

+ }

+ return true;

+bool ActionNode::Emit(RegExpCompiler* compiler) {

+ RegExpMacroAssembler* macro = compiler->macro_assembler();

+ Bind(macro);

+ switch (type_) {

+ case STORE_REGISTER:

+ macro->SetRegister(data_.u_store_register.reg,

+ data_.u_store_register.value);

+ break;

+ case INCREMENT_REGISTER: {

+ Label undo;

+ macro->PushBacktrack(&undo);

+ macro->AdvanceRegister(data_.u_increment_register.reg, 1);

+ bool ok = on_success()->GoTo(compiler);

+ if (!ok) {

+ undo.Unuse();

+ return false;

+ }

+ macro->Bind(&undo);

+ macro->AdvanceRegister(data_.u_increment_register.reg, -1);

+ macro->Backtrack();

+ break;

+ }

+ case STORE_POSITION: {

+ Label undo;

+ macro->PushRegister(data_.u_position_register.reg);

+ macro->PushBacktrack(&undo);

+ macro->WriteCurrentPositionToRegister(data_.u_position_register.reg);

+ bool ok = on_success()->GoTo(compiler);

+ if (!ok) {

+ undo.Unuse();

+ return false;

+ }

+ macro->Bind(&undo);

+ macro->PopRegister(data_.u_position_register.reg);

+ macro->Backtrack();

+ break;

+ }

+ case SAVE_POSITION:

+ macro->WriteCurrentPositionToRegister(

+ data_.u_position_register.reg);

+ break;

+ case RESTORE_POSITION:

+ macro->ReadCurrentPositionFromRegister(

+ data_.u_position_register.reg);

+ break;

+ case BEGIN_SUBMATCH:

+ macro->WriteStackPointerToRegister(

+ data_.u_submatch_stack_pointer_register.reg);

+ break;

+ case ESCAPE_SUBMATCH:

+ macro->ReadStackPointerFromRegister(

+ data_.u_submatch_stack_pointer_register.reg);

+ break;

+ default:

+ UNREACHABLE();

+ return false;

+ }

+ return on_success()->GoTo(compiler);

+bool BackReferenceNode::Emit(RegExpCompiler* compiler) {

+ RegExpMacroAssembler* macro = compiler->macro_assembler();

+ Bind(macro);

+ // Check whether the registers are uninitialized and always

+ // succeed if they are.

+ macro->IfRegisterLT(start_reg_, 0, on_success()->label());

+ macro->IfRegisterLT(end_reg_, 0, on_success()->label());

+ ASSERT_EQ(start_reg_ + 1, end_reg_);

+ macro->CheckNotBackReference(start_reg_, on_failure_->label());

+ return on_success()->GoTo(compiler);

+// -------------------------------------------------------------------

+// Dot/dotty output

+#ifdef DEBUG

+class DotPrinter: public NodeVisitor {

+ public:

+ DotPrinter() : stream_(&alloc_) { }

+ void PrintNode(const char* label, RegExpNode* node);

+ void Visit(RegExpNode* node);

+ void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure);

+ StringStream* stream() { return &stream_; }

+#define DECLARE_VISIT(Type) \

+ virtual void Visit##Type(Type##Node* that);

+FOR_EACH_NODE_TYPE(DECLARE_VISIT)

+#undef DECLARE_VISIT

+ private:

+ HeapStringAllocator alloc_;

+ StringStream stream_;

+ std::set<RegExpNode*> seen_;

+};

+void DotPrinter::PrintNode(const char* label, RegExpNode* node) {

+ stream()->Add("digraph G {\n graph [label=\"");

+ for (int i = 0; label[i]; i++) {

+ switch (label[i]) {

+ case '\\':

+ stream()->Add("\\\\");

+ break;

+ case '"':

+ stream()->Add("\"");

+ break;

+ default:

+ stream()->Put(label[i]);

+ break;

+ }

+ stream()->Add("\"];\n");

+ Visit(node);

+ stream()->Add("}\n");

+ printf("%s", *(stream()->ToCString()));

+void DotPrinter::Visit(RegExpNode* node) {

+ if (seen_.find(node) != seen_.end())

+ return;

+ seen_.insert(node);

+ node->Accept(this);

+void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {

+ if (on_failure->IsBacktrack()) return;

+ stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);

+ Visit(on_failure);

+class TableEntryBodyPrinter {

+ public:

+ TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)

+ : stream_(stream), choice_(choice) { }

Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.

Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.

+ void Call(uc16 from, DispatchTable::Entry entry) {

+ OutSet* out_set = entry.out_set();

+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

+ if (out_set->Get(i)) {

+ stream()->Add(" n%p:s%io%i -> n%p;\n",

+ choice(),

+ from,

+ i,

+ choice()->alternatives()->at(i).node());

+ }

+ private:

+ StringStream* stream() { return stream_; }

+ ChoiceNode* choice() { return choice_; }

+ StringStream* stream_;

+ ChoiceNode* choice_;

+};

+class TableEntryHeaderPrinter {

+ public:

+ explicit TableEntryHeaderPrinter(StringStream* stream)

+ : first_(true), stream_(stream) { }

Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.

Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.

+ void Call(uc16 from, DispatchTable::Entry entry) {

+ if (first_) {

+ first_ = false;

+ } else {

+ stream()->Add("|");

+ }

+ stream()->Add("{\\%k-\\%k|{", from, entry.to());

+ OutSet* out_set = entry.out_set();

+ int priority = 0;

+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

+ if (out_set->Get(i)) {

+ if (priority > 0) stream()->Add("|");

+ stream()->Add("<s%io%i> %i", from, i, priority);

+ priority++;

+ }

+ stream()->Add("}}");

+ }

+ private:

+ bool first_;

+ StringStream* stream() { return stream_; }

+ StringStream* stream_;

+};

+void DotPrinter::VisitChoice(ChoiceNode* that) {

+ stream()->Add(" n%p [shape=Mrecord, label=\"", that);

+ TableEntryHeaderPrinter header_printer(stream());

+ that->table()->ForEach(&header_printer);

+ stream()->Add("\"]\n", that);

+ TableEntryBodyPrinter body_printer(stream(), that);

+ that->table()->ForEach(&body_printer);

+ PrintOnFailure(that, that->on_failure());

+ for (int i = 0; i < that->alternatives()->length(); i++) {

+ GuardedAlternative alt = that->alternatives()->at(i);

+ alt.node()->Accept(this);

+ }

+void DotPrinter::VisitText(TextNode* that) {

+ stream()->Add(" n%p [label=\"", that);

+ for (int i = 0; i < that->elements()->length(); i++) {

+ if (i > 0) stream()->Add(" ");

+ TextElement elm = that->elements()->at(i);

+ switch (elm.type) {

+ case TextElement::ATOM: {

+ stream()->Add("'%w'", elm.data.u_atom->data());

+ break;

+ }

+ case TextElement::CHAR_CLASS: {

+ RegExpCharacterClass* node = elm.data.u_char_class;

+ stream()->Add("[");

+ if (node->is_negated())

+ stream()->Add("^");

+ for (int j = 0; j < node->ranges()->length(); j++) {

+ CharacterRange range = node->ranges()->at(j);

+ stream()->Add("%k-%k", range.from(), range.to());

+ }

+ stream()->Add("]");

+ break;

+ }

+ default:

+ UNREACHABLE();

+ }

+ stream()->Add("\", shape=box, peripheries=2];\n");

+ stream()->Add(" n%p -> n%p;\n", that, that->on_success());

+ Visit(that->on_success());

+ PrintOnFailure(that, that->on_failure());

+void DotPrinter::VisitBackReference(BackReferenceNode* that) {

+ stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",

+ that,

+ that->start_register(),

+ that->end_register());

+ stream()->Add(" n%p -> n%p;\n", that, that->on_success());

+ Visit(that->on_success());

+ PrintOnFailure(that, that->on_failure());

+void DotPrinter::VisitEnd(EndNode* that) {

+ stream()->Add(" n%p [style=bold, shape=point];\n", that);

+void DotPrinter::VisitAction(ActionNode* that) {

+ stream()->Add(" n%p [", that);

+ switch (that->type_) {

+ case ActionNode::STORE_REGISTER:

+ stream()->Add("label=\"$%i:=%i\", shape=octagon",

+ that->data_.u_store_register.reg,

+ that->data_.u_store_register.value);

+ break;

+ case ActionNode::INCREMENT_REGISTER:

+ stream()->Add("label=\"$%i++\", shape=octagon",

+ that->data_.u_increment_register.reg);

+ break;

+ case ActionNode::STORE_POSITION:

+ stream()->Add("label=\"$%i:=$pos\", shape=octagon",

+ that->data_.u_position_register.reg);

+ break;

+ case ActionNode::SAVE_POSITION:

+ stream()->Add("label=\"$%i:=$pos\", shape=octagon",

+ that->data_.u_position_register.reg);

+ break;

+ case ActionNode::RESTORE_POSITION:

+ stream()->Add("label=\"$pos:=$%i\", shape=octagon",

+ that->data_.u_position_register.reg);

+ break;

+ case ActionNode::BEGIN_SUBMATCH:

+ stream()->Add("label=\"begin\", shape=septagon");

+ break;

+ case ActionNode::ESCAPE_SUBMATCH:

+ stream()->Add("label=\"escape\", shape=septagon");

+ break;

+ }

+ stream()->Add("];\n");

+ stream()->Add(" n%p -> n%p;\n", that, that->on_success());

+ Visit(that->on_success());

+class DispatchTableDumper {

+ public:

+ explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }

+ void Call(uc16 key, DispatchTable::Entry entry);

+ StringStream* stream() { return stream_; }

+ private:

+ StringStream* stream_;

+};

+void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {

+ stream()->Add("[%k-%k]: {", key, entry.to());

+ OutSet* set = entry.out_set();

+ bool first = true;

+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

+ if (set->Get(i)) {

+ if (first) {

+ first = false;

+ } else {

+ stream()->Add(", ");

+ }

+ stream()->Add("%i", i);

+ }

+ stream()->Add("}\n");

+void DispatchTable::Dump() {

+ HeapStringAllocator alloc;

+ StringStream stream(&alloc);

+ DispatchTableDumper dumper(&stream);

+ tree()->ForEach(&dumper);

+ OS::PrintError("%s", *stream.ToCString());

+void RegExpEngine::DotPrint(const char* label, RegExpNode* node) {

+ DotPrinter printer;

+ printer.PrintNode(label, node);

+#endif // DEBUG

+// -------------------------------------------------------------------

+// Tree to graph conversion

+RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);

+ elms->Add(TextElement::Atom(this));

+ return new TextNode(elms, on_success, on_failure);

+RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ return new TextNode(elements(), on_success, on_failure);

+RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);

+ elms->Add(TextElement::CharClass(this));

+ return new TextNode(elms, on_success, on_failure);

+RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ ZoneList<RegExpTree*>* alternatives = this->alternatives();

+ int length = alternatives->length();

+ ChoiceNode* result = new ChoiceNode(length, on_failure);

+ for (int i = 0; i < length; i++) {

+ GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,

+ on_success,

+ on_failure));

+ result->AddAlternative(alternative);

+ }

+ return result;

+RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ return ToNode(min(),

+ max(),

+ is_greedy(),

+ body(),

+ compiler,

+ on_success,

+ on_failure);

+RegExpNode* RegExpQuantifier::ToNode(int min,

+ int max,

+ bool is_greedy,

+ RegExpTree* body,

+ RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ // x{f, t} becomes this:

+ //

+ // (r++)<-.

+ // | `

+ // | (x)

+ // v ^

+ // (r=0)-->(?)---/ [if r < t]

+ // |

+ // [if r >= f] \----> ...

+ //

+ // TODO(someone): clear captures on repetition and handle empty

+ // matches.

+ bool has_min = min > 0;

+ bool has_max = max < RegExpQuantifier::kInfinity;

+ bool needs_counter = has_min || has_max;

+ int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;

+ ChoiceNode* center = new ChoiceNode(2, on_failure);

+ RegExpNode* loop_return = needs_counter

+ ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))

+ : static_cast<RegExpNode*>(center);

+ RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure);

+ GuardedAlternative body_alt(body_node);

+ if (has_max) {

+ Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);

+ body_alt.AddGuard(body_guard);

+ }

+ GuardedAlternative rest_alt(on_success);

+ if (has_min) {

+ Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);

+ rest_alt.AddGuard(rest_guard);

+ }

+ if (is_greedy) {

+ center->AddAlternative(body_alt);

+ center->AddAlternative(rest_alt);

+ } else {

+ center->AddAlternative(rest_alt);

+ center->AddAlternative(body_alt);

+ }

+ if (needs_counter) {

+ return ActionNode::StoreRegister(reg_ctr, 0, center);

+ } else {

+ return center;

+ }

+RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ NodeInfo info;

+ switch (type()) {

+ case START_OF_LINE:

+ info.follows_newline_interest = true;

+ break;

+ case START_OF_INPUT:

+ info.follows_start_interest = true;

+ break;

+ case BOUNDARY: case NON_BOUNDARY:

+ info.follows_word_interest = true;

+ break;

+ case END_OF_LINE: case END_OF_INPUT:

+ // This is wrong but has the effect of making the compiler abort.

+ info.follows_start_interest = true;

+ }

+ return on_success->PropagateInterest(&info);

+RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ return new BackReferenceNode(RegExpCapture::StartRegister(index()),

+ RegExpCapture::EndRegister(index()),

+ on_success,

+ on_failure);

+RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ return on_success;

+RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ int stack_pointer_register = compiler->AllocateRegister();

+ int position_register = compiler->AllocateRegister();

+ if (is_positive()) {

+ // begin submatch scope

+ // $reg = $pos

+ // if [body]

+ // then

+ // $pos = $reg

+ // escape submatch scope (drop all backtracks created in scope)

+ // succeed

+ // else

+ // end submatch scope (nothing to clean up, just exit the scope)

+ // fail

+ return ActionNode::BeginSubmatch(

+ stack_pointer_register,

+ ActionNode::SavePosition(

+ position_register,

+ body()->ToNode(

+ compiler,

+ ActionNode::RestorePosition(

+ position_register,

+ ActionNode::EscapeSubmatch(stack_pointer_register,

+ on_success)),

+ on_failure)));

+ } else {

+ // begin submatch scope

+ // try

+ // first if (body)

+ // then

+ // escape submatch scope

+ // fail

+ // else

+ // backtrack

+ // second

+ // end submatch scope

+ // restore current position

+ // succeed

+ ChoiceNode* try_node =

+ new ChoiceNode(1, ActionNode::RestorePosition(position_register,

+ on_success));

+ RegExpNode* body_node = body()->ToNode(

+ compiler,

+ ActionNode::EscapeSubmatch(stack_pointer_register, on_failure),

+ compiler->backtrack());

+ GuardedAlternative body_alt(body_node);

+ try_node->AddAlternative(body_alt);

+ return ActionNode::BeginSubmatch(stack_pointer_register,

+ ActionNode::SavePosition(

+ position_register,

+ try_node));

+ }

+RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ return ToNode(body(), index(), compiler, on_success, on_failure);

+RegExpNode* RegExpCapture::ToNode(RegExpTree* body,

+ int index,

+ RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ int start_reg = RegExpCapture::StartRegister(index);

+ int end_reg = RegExpCapture::EndRegister(index);

+ RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);

+ RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure);

+ return ActionNode::StorePosition(start_reg, body_node);

+RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpNode* on_failure) {

+ ZoneList<RegExpTree*>* children = nodes();

+ RegExpNode* current = on_success;

+ for (int i = children->length() - 1; i >= 0; i--) {

+ current = children->at(i)->ToNode(compiler, current, on_failure);

+ }

+ return current;

+static const int kSpaceRangeCount = 20;

+static const uc16 kSpaceRanges[kSpaceRangeCount] = {

+ 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680,

+ 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,

+ 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000

+};

+static const int kWordRangeCount = 8;

+static const uc16 kWordRanges[kWordRangeCount] = {

+ '0', '9', 'A', 'Z', '_', '_', 'a', 'z'

+};

+static const int kDigitRangeCount = 2;

+static const uc16 kDigitRanges[kDigitRangeCount] = {

+ '0', '9'

+};

+static const int kLineTerminatorRangeCount = 6;

+static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {

+ 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029

+};

+static void AddClass(const uc16* elmv,

+ int elmc,

+ ZoneList<CharacterRange>* ranges) {

+ for (int i = 0; i < elmc; i += 2) {

+ ASSERT(elmv[i] <= elmv[i + 1]);

+ ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));

+ }

+static void AddClassNegated(const uc16 *elmv,

+ int elmc,

+ ZoneList<CharacterRange>* ranges) {

+ ASSERT(elmv[0] != 0x0000);

+ ASSERT(elmv[elmc-1] != 0xFFFF);

+ uc16 last = 0x0000;

+ for (int i = 0; i < elmc; i += 2) {

+ ASSERT(last <= elmv[i] - 1);

+ ASSERT(elmv[i] <= elmv[i + 1]);

+ ranges->Add(CharacterRange(last, elmv[i] - 1));

+ last = elmv[i + 1] + 1;

+ }

+ ranges->Add(CharacterRange(last, 0xFFFF));

+void CharacterRange::AddClassEscape(uc16 type,

+ ZoneList<CharacterRange>* ranges) {

+ switch (type) {

+ case 's':

+ AddClass(kSpaceRanges, kSpaceRangeCount, ranges);

+ break;

+ case 'S':

+ AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);

+ break;

+ case 'w':

+ AddClass(kWordRanges, kWordRangeCount, ranges);

+ break;

+ case 'W':

+ AddClassNegated(kWordRanges, kWordRangeCount, ranges);

+ break;

+ case 'd':

+ AddClass(kDigitRanges, kDigitRangeCount, ranges);

+ break;

+ case 'D':

+ AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);

+ break;

+ case '.':

+ AddClassNegated(kLineTerminatorRanges,

+ kLineTerminatorRangeCount,

+ ranges);

+ break;

+ // This is not a character range as defined by the spec but a

+ // convenient shorthand for a character class that matches any

+ // character.

+ case '*':

+ ranges->Add(CharacterRange::Everything());

+ break;

+ default:

+ UNREACHABLE();

+ }

+void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {

+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ if (IsSingleton()) {

+ // If this is a singleton we just expand the one character.

+ int length = uncanonicalize.get(from(), '\0', chars);

+ for (int i = 0; i < length; i++) {

+ uc32 chr = chars[i];

+ if (chr != from()) {

+ ranges->Add(CharacterRange::Singleton(chars[i]));

+ }

+ } else if (from() <= kRangeCanonicalizeMax

+ && to() <= kRangeCanonicalizeMax) {

Mads Ager (chromium) 2008/11/25 21:09:41 Indentation is off.

Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.

+ // If this is a range we expand the characters block by block,

+ // expanding contiguous subranges (blocks) one at a time.

+ // The approach is as follows. For a given start character we

+ // look up the block that contains it, for instance 'a' if the

+ // start character is 'c'. A block is characterized by the property

+ // that all characters uncanonicalize in the same way as the first

+ // element, except that each entry in the result is incremented

+ // by the distance from the first element. So a-z is a block

+ // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter

+ // uncanonicalizes to ['a' + k, 'A' + k].

+ // Once we've found the start point we look up its uncanonicalization

+ // and produce a range for each element. For instance for [c-f]

+ // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only

+ // add a range if it is not already contained in the input, so [c-f]

+ // will be skipped but [C-F] will be added. If this range is not

+ // completely contained in a block we do this for all the blocks

+ // covered by the range.

+ unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

+ // First, look up the block that contains the 'from' character.

+ int length = canonrange.get(from(), '\0', range);

+ if (length == 0) {

+ range[0] = from();

+ } else {

+ ASSERT_EQ(1, length);

+ }

+ int pos = from();

+ // The start of the current block. Note that except for the first

+ // iteration 'start' is always equal to 'pos'.

+ int start;

+ // If it is not the start point of a block the entry contains the

+ // offset of the character from the start point.

+ if ((range[0] & kStartMarker) == 0) {

+ start = pos - range[0];

+ } else {

+ start = pos;

+ }

+ // Then we add the ranges on at a time, incrementing the current

+ // position to be after the last block each time. The position

+ // always points to the start of a block.

+ while (pos < to()) {

+ length = canonrange.get(start, '\0', range);

+ if (length == 0) {

+ range[0] = start;

+ } else {

+ ASSERT_EQ(1, length);

+ }

+ ASSERT((range[0] & kStartMarker) != 0);

+ // The start point of a block contains the distance to the end

+ // of the range.

+ int block_end = start + (range[0] & kPayloadMask) - 1;

+ int end = (block_end > to()) ? to() : block_end;

+ length = uncanonicalize.get(start, '\0', range);

+ for (int i = 0; i < length; i++) {

+ uc32 c = range[i];

+ uc16 range_from = c + (pos - start);

+ uc16 range_to = c + (end - start);

+ if (!(from() <= range_from && range_to <= to()))

+ ranges->Add(CharacterRange(range_from, range_to));

+ }

+ start = pos = block_end + 1;

+ }

+ } else {

+ // TODO(plesner) when we've fixed the 2^11 bug in unibrow.

+ }

+// -------------------------------------------------------------------

+// Interest propagation

+RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {

+ for (int i = 0; i < siblings_.length(); i++) {

+ RegExpNode* sibling = siblings_.Get(i);

+ if (sibling->info()->SameInterests(info))

+ return sibling;

+ }

+ return NULL;

+template <class C>

+static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {

+ RegExpNode* sibling = node->GetSibling(info);

+ if (sibling != NULL) return sibling;

+ node->EnsureSiblings();

+ sibling = new C(*node);

+ sibling->info()->AdoptInterests(info);

+ node->AddSibling(sibling);

+ return sibling;

+RegExpNode* ActionNode::PropagateInterest(NodeInfo* info) {

+ RegExpNode* sibling = GetSibling(info);

+ if (sibling != NULL) return sibling;

+ EnsureSiblings();

+ ActionNode* action = new ActionNode(*this);

+ action->info()->AdoptInterests(info);

+ AddSibling(action);

+ action->set_on_success(action->on_success()->PropagateInterest(info));

+ return action;

+RegExpNode* ChoiceNode::PropagateInterest(NodeInfo* info) {

+ RegExpNode* sibling = GetSibling(info);

+ if (sibling != NULL) return sibling;

+ EnsureSiblings();

+ ChoiceNode* choice = new ChoiceNode(*this);

+ choice->info()->AdoptInterests(info);

+ AddSibling(choice);

+ ZoneList<GuardedAlternative>* old_alternatives = alternatives();

+ int count = old_alternatives->length();

+ choice->alternatives_ = new ZoneList<GuardedAlternative>(count);

+ for (int i = 0; i < count; i++) {

+ GuardedAlternative alternative = old_alternatives->at(i);

+ alternative.set_node(alternative.node()->PropagateInterest(info));

+ choice->alternatives()->Add(alternative);

+ }

+ return choice;

+RegExpNode* EndNode::PropagateInterest(NodeInfo* info) {

+ return PropagateToEndpoint(this, info);

+RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) {

+ return PropagateToEndpoint(this, info);

+RegExpNode* TextNode::PropagateInterest(NodeInfo* info) {

+ return PropagateToEndpoint(this, info);

+// -------------------------------------------------------------------

+// Splay tree

+OutSet* OutSet::Extend(unsigned value) {

+ if (Get(value))

+ return this;

+ if (successors() != NULL) {

+ for (int i = 0; i < successors()->length(); i++) {

+ OutSet* successor = successors()->at(i);

+ if (successor->Get(value))

+ return successor;

+ }

+ } else {

+ successors_ = new ZoneList<OutSet*>(2);

+ }

+ OutSet* result = new OutSet(first_, remaining_);

+ result->Set(value);

+ successors()->Add(result);

+ return result;

+void OutSet::Set(unsigned value) {

+ if (value < kFirstLimit) {

+ first_ |= (1 << value);

+ } else {

+ if (remaining_ == NULL)

+ remaining_ = new ZoneList<unsigned>(1);

+ if (remaining_->is_empty() || !remaining_->Contains(value))

+ remaining_->Add(value);

+ }

+bool OutSet::Get(unsigned value) {

+ if (value < kFirstLimit) {

+ return (first_ & (1 << value)) != 0;

+ } else if (remaining_ == NULL) {

+ return false;

+ } else {

+ return remaining_->Contains(value);

+ }

+const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;

+const DispatchTable::Entry DispatchTable::Config::kNoValue;

+void DispatchTable::AddRange(CharacterRange full_range, int value) {

+ CharacterRange current = full_range;

+ if (tree()->is_empty()) {

+ // If this is the first range we just insert into the table.

+ ZoneSplayTree<Config>::Locator loc;

+ ASSERT_RESULT(tree()->Insert(current.from(), &loc));

+ loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));

+ return;

+ }

+ // First see if there is a range to the left of this one that

+ // overlaps.

+ ZoneSplayTree<Config>::Locator loc;

+ if (tree()->FindGreatestLessThan(current.from(), &loc)) {

+ Entry* entry = &loc.value();

+ // If we've found a range that overlaps with this one, and it

+ // starts strictly to the left of this one, we have to fix it

+ // because the following code only handles ranges that start on

+ // or after the start point of the range we're adding.

+ if (entry->from() < current.from() && entry->to() >= current.from()) {

+ // Snap the overlapping range in half around the start point of

+ // the range we're adding.

+ CharacterRange left(entry->from(), current.from() - 1);

+ CharacterRange right(current.from(), entry->to());

+ // The left part of the overlapping range doesn't overlap.

+ // Truncate the whole entry to be just the left part.

+ entry->set_to(left.to());

+ // The right part is the one that overlaps. We add this part

+ // to the map and let the next step deal with merging it with

+ // the range we're adding.

+ ZoneSplayTree<Config>::Locator loc;

+ ASSERT_RESULT(tree()->Insert(right.from(), &loc));

+ loc.set_value(Entry(right.from(),

+ right.to(),

+ entry->out_set()));

+ }

+ while (current.is_valid()) {

+ if (tree()->FindLeastGreaterThan(current.from(), &loc) &&

+ (loc.value().from() <= current.to()) &&

+ (loc.value().to() >= current.from())) {

+ Entry* entry = &loc.value();

+ // We have overlap. If there is space between the start point of

+ // the range we're adding and where the overlapping range starts

+ // then we have to add a range covering just that space.

+ if (current.from() < entry->from()) {

+ ZoneSplayTree<Config>::Locator ins;

+ ASSERT_RESULT(tree()->Insert(current.from(), &ins));

+ ins.set_value(Entry(current.from(),

+ entry->from() - 1,

+ empty()->Extend(value)));

+ current.set_from(entry->from());

+ }

+ ASSERT_EQ(current.from(), entry->from());

+ // If the overlapping range extends beyond the one we want to add

+ // we have to snap the right part off and add it separately.

+ if (entry->to() > current.to()) {

+ ZoneSplayTree<Config>::Locator ins;

+ ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));

+ ins.set_value(Entry(current.to() + 1,

+ entry->to(),

+ entry->out_set()));

+ entry->set_to(current.to());

+ }

+ ASSERT(entry->to() <= current.to());

+ // The overlapping range is now completely contained by the range

+ // we're adding so we can just update it and move the start point

+ // of the range we're adding just past it.

+ entry->AddValue(value);

+ // Bail out if the last interval ended at 0xFFFF since otherwise

+ // adding 1 will wrap around to 0.

+ if (entry->to() == 0xFFFF)

+ break;

+ ASSERT(entry->to() + 1 > current.from());

+ current.set_from(entry->to() + 1);

+ } else {

+ // There is no overlap so we can just add the range

+ ZoneSplayTree<Config>::Locator ins;

+ ASSERT_RESULT(tree()->Insert(current.from(), &ins));

+ ins.set_value(Entry(current.from(),

+ current.to(),

+ empty()->Extend(value)));

+ break;

+ }

+OutSet* DispatchTable::Get(uc16 value) {

+ ZoneSplayTree<Config>::Locator loc;

+ if (!tree()->FindGreatestLessThan(value, &loc))

+ return empty();

+ Entry* entry = &loc.value();

+ if (value <= entry->to())

+ return entry->out_set();

+ else

+ return empty();

+// -------------------------------------------------------------------

+// Analysis

+void Analysis::EnsureAnalyzed(RegExpNode* that) {

+ if (that->info()->been_analyzed || that->info()->being_analyzed)

+ return;

+ that->info()->being_analyzed = true;

+ that->Accept(this);

+ that->info()->being_analyzed = false;

+ that->info()->been_analyzed = true;

+void Analysis::VisitEnd(EndNode* that) {

+ // nothing to do

+void Analysis::VisitText(TextNode* that) {

+ EnsureAnalyzed(that->on_success());

+ EnsureAnalyzed(that->on_failure());

+void Analysis::VisitAction(ActionNode* that) {

+ RegExpNode* next = that->on_success();

+ EnsureAnalyzed(next);

+ that->info()->determine_newline = next->info()->prev_determine_newline();

+ that->info()->determine_word = next->info()->prev_determine_word();

+ that->info()->determine_start = next->info()->prev_determine_start();

+void Analysis::VisitChoice(ChoiceNode* that) {

+ NodeInfo* info = that->info();

+ for (int i = 0; i < that->alternatives()->length(); i++) {

+ RegExpNode* node = that->alternatives()->at(i).node();

+ EnsureAnalyzed(node);

+ info->determine_newline |= node->info()->prev_determine_newline();

+ info->determine_word |= node->info()->prev_determine_word();

+ info->determine_start |= node->info()->prev_determine_start();

+ }

+ if (!that->table_calculated()) {

+ DispatchTableConstructor cons(that->table());

+ cons.BuildTable(that);

+ }

+ EnsureAnalyzed(that->on_failure());

+void Analysis::VisitBackReference(BackReferenceNode* that) {

+ EnsureAnalyzed(that->on_success());

+ EnsureAnalyzed(that->on_failure());

+// -------------------------------------------------------------------

+// Dispatch table construction

+void DispatchTableConstructor::VisitEnd(EndNode* that) {

+ AddRange(CharacterRange::Everything());

+void DispatchTableConstructor::BuildTable(ChoiceNode* node) {

+ ASSERT(!node->table_calculated());

+ node->set_being_calculated(true);

+ ZoneList<GuardedAlternative>* alternatives = node->alternatives();

+ for (int i = 0; i < alternatives->length(); i++) {

+ set_choice_index(i);

+ alternatives->at(i).node()->Accept(this);

+ }

+ node->set_being_calculated(false);

+ node->set_table_calculated(true);

+class AddDispatchRange {

+ public:

+ explicit AddDispatchRange(DispatchTableConstructor* constructor)

+ : constructor_(constructor) { }

+ void Call(uc32 from, DispatchTable::Entry entry);

+ private:

+ DispatchTableConstructor* constructor_;

+};

+void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {

+ CharacterRange range(from, entry.to());

+ constructor_->AddRange(range);

+void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {

+ if (node->being_calculated())

+ return;

+ if (!node->table_calculated()) {

+ DispatchTableConstructor constructor(node->table());

+ constructor.BuildTable(node);

+ }

+ ASSERT(node->table_calculated());

+ AddDispatchRange adder(this);

+ node->table()->ForEach(&adder);

+void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {

+ // TODO(160): Find the node that we refer back to and propagate its start

+ // set back to here. For now we just accept anything.

+ AddRange(CharacterRange::Everything());

+static int CompareRangeByFrom(const CharacterRange* a,

+ const CharacterRange* b) {

+ return Spaceship<uc16>(a->from(), b->from());

+void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {

+ ranges->Sort(CompareRangeByFrom);

+ uc16 last = 0;

+ for (int i = 0; i < ranges->length(); i++) {

+ CharacterRange range = ranges->at(i);

+ if (last < range.from())

+ AddRange(CharacterRange(last, range.from() - 1));

+ if (range.to() >= last) {

+ if (range.to() == 0xFFFF) {

+ return;

+ } else {

+ last = range.to() + 1;

+ }

+ AddRange(CharacterRange(last, 0xFFFF));

+void DispatchTableConstructor::VisitText(TextNode* that) {

+ TextElement elm = that->elements()->at(0);

+ switch (elm.type) {

+ case TextElement::ATOM: {

+ uc16 c = elm.data.u_atom->data()[0];

+ AddRange(CharacterRange(c, c));

+ break;

+ }

+ case TextElement::CHAR_CLASS: {

+ RegExpCharacterClass* tree = elm.data.u_char_class;

+ ZoneList<CharacterRange>* ranges = tree->ranges();

+ if (tree->is_negated()) {

+ AddInverse(ranges);

+ } else {

+ for (int i = 0; i < ranges->length(); i++)

+ AddRange(ranges->at(i));

+ }

+ break;

+ }

+ default: {

+ UNIMPLEMENTED();

+ }

+void DispatchTableConstructor::VisitAction(ActionNode* that) {

+ that->on_success()->Accept(this);

+Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,

+ RegExpNode** node_return,

+ bool ignore_case) {

+ RegExpCompiler compiler(input->capture_count, ignore_case);

+ // Wrap the body of the regexp in capture #0.

+ RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,

+ 0,

+ &compiler,

+ compiler.accept(),

+ compiler.backtrack());

+ // Add a .*? at the beginning, outside the body capture.

+ // Note: We could choose to not add this if the regexp is anchored at

+ // the start of the input but I'm not sure how best to do that and

+ // since we don't even handle ^ yet I'm saving that optimization for

+ // later.

+ RegExpNode* node = RegExpQuantifier::ToNode(0,

+ RegExpQuantifier::kInfinity,

+ false,

+ new RegExpCharacterClass('*'),

+ &compiler,

+ captured_body,

+ compiler.backtrack());

+ if (node_return != NULL) *node_return = node;

+ Analysis analysis;

+ analysis.EnsureAnalyzed(node);

+ if (!FLAG_irregexp) {

+ return Handle<FixedArray>::null();

+ }

+#if !(defined ARM || defined __arm__ || defined __thumb__)

+ if (FLAG_irregexp_native) { // Flag only checked in IA32 mode.

+ // TODO(lrn) Move compilation to a later point in the life-cycle

+ // of the RegExp. We don't know the type of input string yet.

+ // For now, always assume two-byte strings.

+ RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16,

+ (input->capture_count + 1) * 2,

+ ignore_case);

+ return compiler.Assemble(&macro_assembler,

+ node,

+ input->capture_count);

+ }

+#endif

+ byte codes[1024];

+ IrregexpAssembler assembler(Vector<byte>(codes, 1024));

+ RegExpMacroAssemblerIrregexp macro_assembler(&assembler);

+ return compiler.Assemble(&macro_assembler,

+ node,

+ input->capture_count);

}} // namespace v8::internal

« src/jsregexp.h ('K') | « src/jsregexp.h ('k') | src/jsregexp-inl.h » ('j') | src/jsregexp-inl.h » ('J')