Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(178)

Unified Diff: src/jsregexp.cc

Issue 12427: Merge regexp2000 back into bleeding_edge (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/jsregexp.cc
===================================================================
--- src/jsregexp.cc (revision 830)
+++ src/jsregexp.cc (working copy)
@@ -25,15 +25,30 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#define _HAS_EXCEPTIONS 0
Erik Corry 2008/11/25 11:03:52 ??
+#include <set>
Erik Corry 2008/11/25 11:03:52 Where is this coming from?
Mads Ager (chromium) 2008/11/25 21:09:41 These are still there. Please remove!
Christian Plesner Hansen 2008/11/26 06:49:56 It's in use.
+
#include "v8.h"
+#include "ast.h"
#include "execution.h"
#include "factory.h"
-#include "jsregexp.h"
+#include "jsregexp-inl.h"
#include "platform.h"
#include "runtime.h"
#include "top.h"
#include "compilation-cache.h"
+#include "string-stream.h"
+#include "parser.h"
+#include "assembler-irregexp.h"
+#include "regexp-macro-assembler.h"
+#include "regexp-macro-assembler-irregexp.h"
+#if defined __arm__ || defined __thumb__ || defined ARM
+// include regexp-macro-assembler-arm.h when created.
+#else // ia32
+#include "regexp-macro-assembler-ia32.h"
+#endif
+#include "interpreter-irregexp.h"
// Including pcre.h undefines DEBUG to avoid getting debug output from
// the JSCRE implementation. Make sure to redefine it in debug mode
@@ -45,12 +60,10 @@
#include "third_party/jscre/pcre.h"
#endif
+
namespace v8 { namespace internal {
-#define CAPTURE_INDEX 0
-#define INTERNAL_INDEX 1
-
static Failure* malloc_failure;
static void* JSREMalloc(size_t size) {
@@ -176,7 +189,16 @@
}
-unibrow::Predicate<unibrow::RegExpSpecialChar, 128> is_reg_exp_special_char;
+static inline void ThrowRegExpException(Handle<JSRegExp> re,
+ Handle<String> pattern,
+ Handle<String> error_text,
+ const char* message) {
+ Handle<JSArray> array = Factory::NewJSArray(2);
+ SetElement(array, 0, pattern);
+ SetElement(array, 1, error_text);
+ Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
+ Top::Throw(*regexp_err);
+}
Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
@@ -186,20 +208,42 @@
Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
bool in_cache = !cached.is_null();
Handle<Object> result;
- StringShape shape(*pattern);
if (in_cache) {
re->set_data(*cached);
result = re;
} else {
- bool is_atom = !flags.is_ignore_case();
- for (int i = 0; is_atom && i < pattern->length(shape); i++) {
- if (is_reg_exp_special_char.get(pattern->Get(shape, i)))
- is_atom = false;
+ FlattenString(pattern);
+ RegExpParseResult parse_result;
+ FlatStringReader reader(pattern);
+ if (!ParseRegExp(&reader, &parse_result)) {
+ // Throw an exception if we fail to parse the pattern.
+ ThrowRegExpException(re,
+ pattern,
+ parse_result.error,
+ "malformed_regexp");
+ return Handle<Object>();
}
- if (is_atom) {
- result = AtomCompile(re, pattern, flags);
+ RegExpAtom* atom = parse_result.tree->AsAtom();
+ if (atom != NULL && !flags.is_ignore_case()) {
+ if (parse_result.has_character_escapes) {
+ Vector<const uc16> atom_pattern = atom->data();
+ Handle<String> atom_string =
+ Factory::NewStringFromTwoByte(atom_pattern);
+ result = AtomCompile(re, pattern, flags, atom_string);
+ } else {
+ result = AtomCompile(re, pattern, flags, pattern);
+ }
} else {
- result = JsreCompile(re, pattern, flags);
+ RegExpNode* node = NULL;
+ Handle<FixedArray> irregexp_data =
+ RegExpEngine::Compile(&parse_result,
+ &node,
+ flags.is_ignore_case());
+ if (irregexp_data.is_null()) {
+ result = JscrePrepare(re, pattern, flags);
+ } else {
+ result = IrregexpPrepare(re, pattern, flags, irregexp_data);
+ }
}
Object* data = re->data();
if (data->IsFixedArray()) {
@@ -220,9 +264,11 @@
Handle<Object> index) {
switch (regexp->TypeTag()) {
case JSRegExp::JSCRE:
- return JsreExec(regexp, subject, index);
+ return JscreExec(regexp, subject, index);
case JSRegExp::ATOM:
return AtomExec(regexp, subject, index);
+ case JSRegExp::IRREGEXP:
+ return IrregexpExec(regexp, subject, index);
default:
UNREACHABLE();
return Handle<Object>();
@@ -234,9 +280,11 @@
Handle<String> subject) {
switch (regexp->TypeTag()) {
case JSRegExp::JSCRE:
- return JsreExecGlobal(regexp, subject);
+ return JscreExecGlobal(regexp, subject);
case JSRegExp::ATOM:
return AtomExecGlobal(regexp, subject);
+ case JSRegExp::IRREGEXP:
+ return IrregexpExecGlobal(regexp, subject);
default:
UNREACHABLE();
return Handle<Object>();
@@ -246,8 +294,9 @@
Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
Handle<String> pattern,
- JSRegExp::Flags flags) {
- Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, pattern);
+ JSRegExp::Flags flags,
+ Handle<String> match_pattern) {
+ Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
return re;
}
@@ -267,12 +316,8 @@
if (value == -1) return Factory::null_value();
Handle<FixedArray> array = Factory::NewFixedArray(2);
- array->set(0,
- Smi::FromInt(value),
- SKIP_WRITE_BARRIER);
- array->set(1,
- Smi::FromInt(value + needle->length()),
- SKIP_WRITE_BARRIER);
+ array->set(0, Smi::FromInt(value));
+ array->set(1, Smi::FromInt(value + needle->length()));
return Factory::NewJSArrayWithElements(array);
}
@@ -296,12 +341,8 @@
int end = value + needle_length;
Handle<FixedArray> array = Factory::NewFixedArray(2);
- array->set(0,
- Smi::FromInt(value),
- SKIP_WRITE_BARRIER);
- array->set(1,
- Smi::FromInt(end),
- SKIP_WRITE_BARRIER);
+ array->set(0, Smi::FromInt(value));
+ array->set(1, Smi::FromInt(end));
Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
SetElement(result, match_count, pair);
match_count++;
@@ -312,6 +353,24 @@
}
+Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
+ Handle<String> pattern,
+ JSRegExp::Flags flags) {
+ Handle<Object> value(Heap::undefined_value());
+ Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
+ return re;
+}
+
+
+Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
+ Handle<String> pattern,
+ JSRegExp::Flags flags,
+ Handle<FixedArray> irregexp_data) {
+ Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data);
+ return re;
+}
+
+
static inline Object* DoCompile(String* pattern,
JSRegExp::Flags flags,
unsigned* number_of_captures,
@@ -358,9 +417,13 @@
}
-Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re,
- Handle<String> pattern,
- JSRegExp::Flags flags) {
+Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
+ ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
+ ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
+
+ Handle<String> pattern(re->Pattern());
+ JSRegExp::Flags flags = re->GetFlags();
+
Handle<String> two_byte_pattern = StringToTwoByte(pattern);
unsigned number_of_captures;
@@ -391,26 +454,110 @@
Handle<ByteArray> internal(
ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
- Handle<FixedArray> value = Factory::NewFixedArray(2);
- value->set(CAPTURE_INDEX, Smi::FromInt(number_of_captures));
- value->set(INTERNAL_INDEX, *internal);
+ Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
+ value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
+ value->set(kJscreInternalIndex, *internal);
Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
return re;
}
-Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp,
- int num_captures,
- Handle<String> subject,
- int previous_index,
- const uc16* two_byte_subject,
- int* offsets_vector,
- int offsets_vector_length) {
+Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp,
+ int num_captures,
+ Handle<String> two_byte_subject,
+ int previous_index,
+ int* offsets_vector,
+ int offsets_vector_length) {
+#ifdef DEBUG
+ if (FLAG_trace_regexp_bytecodes) {
+ String* pattern = regexp->Pattern();
+ PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
+ PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString()));
+ }
+#endif
+ ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation());
+ ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject)));
+ bool rc;
+ {
Mads Ager (chromium) 2008/11/25 21:09:41 Why is the extra scope needed here?
Erik Corry 2008/11/26 12:18:36 It isn't. Removed.
+ for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
+ offsets_vector[i] = -1;
+ }
+
+ LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject));
+
+ FixedArray* irregexp =
+ FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex));
+ int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
+
+ switch (tag) {
Mads Ager (chromium) 2008/11/25 21:09:41 The indentation of this switch is off. Indent the
Erik Corry 2008/11/26 12:18:36 Done.
+ case RegExpMacroAssembler::kIA32Implementation: {
+ Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex));
+ SmartPointer<int> captures(NewArray<int>((num_captures + 1) * 2));
+ Address start_addr =
+ Handle<SeqTwoByteString>::cast(two_byte_subject)->GetCharsAddress();
+ int start_offset =
+ start_addr - reinterpret_cast<Address>(*two_byte_subject);
+ int end_offset =
+ start_offset + (two_byte_subject->length() - previous_index) * 2;
+ typedef bool testfunc(String**, int, int, int*);
+ testfunc* test = FUNCTION_CAST<testfunc*>(code->entry());
+ rc = test(two_byte_subject.location(),
+ start_offset,
+ end_offset,
+ *captures);
+ if (rc) {
+ // Capture values are relative to start_offset only.
+ for (int i = 0; i < offsets_vector_length; i++) {
+ if (offsets_vector[i] >= 0) {
+ offsets_vector[i] += previous_index;
+ }
+ }
+ }
+ break;
+ }
+ default:
Mads Ager (chromium) 2008/11/25 21:09:41 Why is this default case in the middle of the rest
Erik Corry 2008/11/26 12:18:36 Because it falls through? Lasse, please take a lo
+ case RegExpMacroAssembler::kARMImplementation:
+ UNREACHABLE();
+ rc = false;
+ break;
+ case RegExpMacroAssembler::kBytecodeImplementation: {
+ Handle<ByteArray> byte_codes = IrregexpCode(regexp);
+
+ rc = IrregexpInterpreter::Match(byte_codes,
+ two_byte_subject,
+ offsets_vector,
+ previous_index);
+ break;
+ }
+ }
+ }
+
+ if (!rc) {
+ return Factory::null_value();
+ }
+
+ Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
+ // The captures come in (start, end+1) pairs.
+ for (int i = 0; i < 2 * (num_captures+1); i += 2) {
+ array->set(i, Smi::FromInt(offsets_vector[i]));
+ array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
+ }
+ return Factory::NewJSArrayWithElements(array);
+}
+
+
+Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
+ int num_captures,
+ Handle<String> subject,
+ int previous_index,
+ const uc16* two_byte_subject,
+ int* offsets_vector,
+ int offsets_vector_length) {
int rc;
{
AssertNoAllocation a;
- ByteArray* internal = JsreInternal(regexp);
+ ByteArray* internal = JscreInternal(regexp);
const JscreRegExp* js_regexp =
reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress());
@@ -444,12 +591,8 @@
Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
// The captures come in (start, end+1) pairs.
for (int i = 0; i < 2 * (num_captures+1); i += 2) {
- array->set(i,
- Smi::FromInt(offsets_vector[i]),
- SKIP_WRITE_BARRIER);
- array->set(i+1,
- Smi::FromInt(offsets_vector[i+1]),
- SKIP_WRITE_BARRIER);
+ array->set(i, Smi::FromInt(offsets_vector[i]));
+ array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
}
return Factory::NewJSArrayWithElements(array);
}
@@ -457,8 +600,8 @@
class OffsetsVector {
public:
- inline OffsetsVector(int num_captures) {
- offsets_vector_length_ = (num_captures + 1) * 3;
+ inline OffsetsVector(int num_registers) :
Mads Ager (chromium) 2008/11/25 21:09:41 Move : to the next line and do a four space indent
+ offsets_vector_length_(num_registers) {
if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
vector_ = NewArray<int>(offsets_vector_length_);
} else {
@@ -487,7 +630,7 @@
private:
int* vector_;
int offsets_vector_length_;
- static const int kStaticOffsetsVectorSize = 30;
+ static const int kStaticOffsetsVectorSize = 50;
static int static_offsets_vector_[kStaticOffsetsVectorSize];
};
@@ -496,34 +639,127 @@
OffsetsVector::kStaticOffsetsVectorSize];
-Handle<Object> RegExpImpl::JsreExec(Handle<JSRegExp> regexp,
+Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
Handle<String> subject,
Mads Ager (chromium) 2008/11/25 21:09:41 Indentation is off.
Erik Corry 2008/11/26 12:18:36 Fixed.
Handle<Object> index) {
+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
+ ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());
+
// Prepare space for the return values.
- int num_captures = JsreCapture(regexp);
+ int number_of_registers = IrregexpNumberOfRegisters(regexp);
+ OffsetsVector offsets(number_of_registers);
- OffsetsVector offsets(num_captures);
+ int num_captures = IrregexpNumberOfCaptures(regexp);
int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
Handle<String> subject16 = CachedStringToTwoByte(subject);
- Handle<Object> result(JsreExecOnce(regexp, num_captures, subject,
- previous_index,
- subject16->GetTwoByteData(),
- offsets.vector(), offsets.length()));
+ Handle<Object> result(
+ IrregexpExecOnce(regexp,
+ num_captures,
+ subject16,
+ previous_index,
+ offsets.vector(),
+ offsets.length()));
+ return result;
+}
+
+Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
+ Handle<String> subject,
+ Handle<Object> index) {
+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
+ if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
+ Handle<Object> compile_result = JscreCompile(regexp);
+ if (compile_result.is_null()) return compile_result;
+ }
+ ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
+
+ int num_captures = JscreNumberOfCaptures(regexp);
+
+ OffsetsVector offsets((num_captures + 1) * 3);
+
+ int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
+
+ Handle<String> subject16 = CachedStringToTwoByte(subject);
+
+ Handle<Object> result(JscreExecOnce(regexp,
+ num_captures,
+ subject,
+ previous_index,
+ subject16->GetTwoByteData(),
+ offsets.vector(),
+ offsets.length()));
+
return result;
}
-Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp,
- Handle<String> subject) {
+Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
+ Handle<String> subject) {
+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
+ ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());
+
// Prepare space for the return values.
- int num_captures = JsreCapture(regexp);
+ int number_of_registers = IrregexpNumberOfRegisters(regexp);
+ OffsetsVector offsets(number_of_registers);
- OffsetsVector offsets(num_captures);
+ int previous_index = 0;
+ Handle<JSArray> result = Factory::NewJSArray(0);
+ int i = 0;
+ Handle<Object> matches;
+
+ Handle<String> subject16 = CachedStringToTwoByte(subject);
+
+ do {
+ if (previous_index > subject->length() || previous_index < 0) {
+ // Per ECMA-262 15.10.6.2, if the previous index is greater than the
+ // string length, there is no match.
+ matches = Factory::null_value();
+ } else {
+ matches = IrregexpExecOnce(regexp,
+ IrregexpNumberOfCaptures(regexp),
+ subject16,
+ previous_index,
+ offsets.vector(),
+ offsets.length());
+
+ if (matches->IsJSArray()) {
+ SetElement(result, i, matches);
+ i++;
+ previous_index = offsets.vector()[1];
+ if (offsets.vector()[0] == offsets.vector()[1]) {
+ previous_index++;
+ }
+ }
+ }
+ } while (matches->IsJSArray());
+
+ // If we exited the loop with an exception, throw it.
+ if (matches->IsNull()) { // Exited loop normally.
Mads Ager (chromium) 2008/11/25 21:09:41 Move these comments to new lines before the return
Erik Corry 2008/11/26 12:18:36 I'm not sure why that's any better, but fixed. Wh
+ return result;
+ } else { // Exited loop with the exception in matches.
+ return matches;
+ }
+}
+
+
+Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
+ Handle<String> subject) {
+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
+ if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
+ Handle<Object> compile_result = JscreCompile(regexp);
+ if (compile_result.is_null()) return compile_result;
+ }
+ ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
+
+ // Prepare space for the return values.
+ int num_captures = JscreNumberOfCaptures(regexp);
+
+ OffsetsVector offsets((num_captures + 1) * 3);
+
int previous_index = 0;
Handle<JSArray> result = Factory::NewJSArray(0);
@@ -538,9 +774,13 @@
// string length, there is no match.
matches = Factory::null_value();
} else {
- matches = JsreExecOnce(regexp, num_captures, subject, previous_index,
- subject16->GetTwoByteData(),
- offsets.vector(), offsets.length());
+ matches = JscreExecOnce(regexp,
+ num_captures,
+ subject,
+ previous_index,
+ subject16->GetTwoByteData(),
+ offsets.vector(),
+ offsets.length());
if (matches->IsJSArray()) {
SetElement(result, i, matches);
@@ -562,15 +802,1798 @@
}
-int RegExpImpl::JsreCapture(Handle<JSRegExp> re) {
+int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
- return Smi::cast(value->get(CAPTURE_INDEX))->value();
+ return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->
+ value();
Mads Ager (chromium) 2008/11/25 21:09:41 This should fit on the previous line.
Erik Corry 2008/11/26 12:18:36 Done.
}
-ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) {
+ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
- return ByteArray::cast(value->get(INTERNAL_INDEX));
+ return ByteArray::cast(value->get(kJscreInternalIndex));
}
+
+int RegExpImpl::IrregexpNumberOfCaptures(Handle<JSRegExp> re) {
+ FixedArray* value =
+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
+ return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value();
+}
+
+
+int RegExpImpl::IrregexpNumberOfRegisters(Handle<JSRegExp> re) {
+ FixedArray* value =
+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
+ return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value();
+}
+
+
+Handle<ByteArray> RegExpImpl::IrregexpCode(Handle<JSRegExp> re) {
+ FixedArray* value =
+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
+ return Handle<ByteArray>(ByteArray::cast(value->get(kIrregexpCodeIndex)));
+}
+
+
+// -------------------------------------------------------------------
+// New regular expression engine
Erik Corry 2008/11/25 11:03:52 Probably should mention "Irregexp"
Erik Corry 2008/11/26 12:18:36 Fixed.
+
+
+void RegExpTree::AppendToText(RegExpText* text) {
+ UNREACHABLE();
+}
+
+
+void RegExpAtom::AppendToText(RegExpText* text) {
+ text->AddElement(TextElement::Atom(this));
+}
+
+
+void RegExpCharacterClass::AppendToText(RegExpText* text) {
+ text->AddElement(TextElement::CharClass(this));
+}
+
+
+void RegExpText::AppendToText(RegExpText* text) {
+ for (int i = 0; i < elements()->length(); i++)
+ text->AddElement(elements()->at(i));
+}
+
+
+TextElement TextElement::Atom(RegExpAtom* atom) {
+ TextElement result = TextElement(ATOM);
+ result.data.u_atom = atom;
+ return result;
+}
+
+
+TextElement TextElement::CharClass(
+ RegExpCharacterClass* char_class) {
+ TextElement result = TextElement(CHAR_CLASS);
+ result.data.u_char_class = char_class;
+ return result;
+}
+
+
+class RegExpCompiler {
+ public:
+ RegExpCompiler(int capture_count, bool ignore_case);
+
+ int AllocateRegister() { return next_register_++; }
+
+ Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
+ RegExpNode* start,
+ int capture_count);
+
+ inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
+
+ static const int kImplementationOffset = 0;
+ static const int kNumberOfRegistersOffset = 0;
+ static const int kCodeOffset = 1;
+
+ RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
+ EndNode* accept() { return accept_; }
+ EndNode* backtrack() { return backtrack_; }
+
+ static const int kMaxRecursion = 100;
+ inline int recursion_depth() { return recursion_depth_; }
+ inline void IncrementRecursionDepth() { recursion_depth_++; }
+ inline void DecrementRecursionDepth() { recursion_depth_--; }
+
+ inline bool is_case_independent() { return is_case_independent_; }
+
+ private:
+ EndNode* accept_;
+ EndNode* backtrack_;
+ int next_register_;
+ List<RegExpNode*>* work_list_;
+ int recursion_depth_;
+ RegExpMacroAssembler* macro_assembler_;
+ bool is_case_independent_;
+};
+
+
+// Attempts to compile the regexp using an Irregexp code generator. Returns
+// a fixed array or a null handle depending on whether it succeeded.
+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)
+ : next_register_(2 * (capture_count + 1)),
Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.
Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.
+ work_list_(NULL),
+ recursion_depth_(0),
+ is_case_independent_(ignore_case) {
+ accept_ = new EndNode(EndNode::ACCEPT);
+ backtrack_ = new EndNode(EndNode::BACKTRACK);
+}
+
+
+Handle<FixedArray> RegExpCompiler::Assemble(
+ RegExpMacroAssembler* macro_assembler,
+ RegExpNode* start,
+ int capture_count) {
+ if (!FLAG_attempt_case_independent && is_case_independent_) {
+ return Handle<FixedArray>::null();
+ }
+ macro_assembler_ = macro_assembler;
+ List <RegExpNode*> work_list(0);
+ work_list_ = &work_list;
+ Label fail;
+ macro_assembler->PushBacktrack(&fail);
+ if (!start->GoTo(this)) {
+ fail.Unuse();
+ return Handle<FixedArray>::null();
+ }
+ while (!work_list.is_empty()) {
+ if (!work_list.RemoveLast()->GoTo(this)) {
+ fail.Unuse();
+ return Handle<FixedArray>::null();
+ }
+ }
+ macro_assembler->Bind(&fail);
+ macro_assembler->Fail();
+ Handle<FixedArray> array =
+ Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
+ array->set(RegExpImpl::kIrregexpImplementationIndex,
+ Smi::FromInt(macro_assembler->Implementation()));
+ array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
+ Smi::FromInt(next_register_));
+ array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
+ Smi::FromInt(capture_count));
+ Handle<Object> code = macro_assembler->GetCode();
+ array->set(RegExpImpl::kIrregexpCodeIndex, *code);
+ work_list_ = NULL;
+ return array;
+}
+
+
+bool RegExpNode::GoTo(RegExpCompiler* compiler) {
+ // TODO(erikcorry): Implement support.
+ if (info_.follows_word_interest ||
+ info_.follows_newline_interest ||
+ info_.follows_start_interest) {
+ return false;
+ }
+ if (label_.is_bound()) {
+ compiler->macro_assembler()->GoTo(&label_);
+ return true;
+ } else {
Mads Ager (chromium) 2008/11/25 21:09:41 Most of this nesting could go away since all of th
Christian Plesner Hansen 2008/11/26 06:49:56 Nesting aids readability: you can trivially see wh
+ if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) {
+ compiler->macro_assembler()->GoTo(&label_);
+ compiler->AddWork(this);
+ return true;
+ } else {
+ compiler->IncrementRecursionDepth();
+ bool how_it_went = Emit(compiler);
+ compiler->DecrementRecursionDepth();
+ return how_it_went;
+ }
+ }
+}
+
+
+bool EndNode::GoTo(RegExpCompiler* compiler) {
+ if (info()->follows_word_interest ||
+ info()->follows_newline_interest ||
+ info()->follows_start_interest) {
+ return false;
+ }
+ if (!label()->is_bound()) {
+ Bind(compiler->macro_assembler());
+ }
+ switch (action_) {
+ case ACCEPT:
+ compiler->macro_assembler()->Succeed();
+ break;
Mads Ager (chromium) 2008/11/25 21:09:41 Indent the break.
Erik Corry 2008/11/26 12:18:36 Fixed.
+ case BACKTRACK:
+ compiler->macro_assembler()->Backtrack();
+ break;
Mads Ager (chromium) 2008/11/25 21:09:41 Indent the break.
Erik Corry 2008/11/26 12:18:36 Fixed.
+ }
+ return true;
+}
+
+
+Label* RegExpNode::label() {
+ return &label_;
+}
+
+
+bool EndNode::Emit(RegExpCompiler* compiler) {
+ RegExpMacroAssembler* macro = compiler->macro_assembler();
+ switch (action_) {
+ case ACCEPT:
+ Bind(macro);
+ macro->Succeed();
+ return true;
+ case BACKTRACK:
+ Bind(macro);
+ macro->Backtrack();
+ return true;
+ }
+ return false;
+}
+
+
+void GuardedAlternative::AddGuard(Guard* guard) {
+ if (guards_ == NULL)
+ guards_ = new ZoneList<Guard*>(1);
+ guards_->Add(guard);
+}
+
+
+ActionNode* ActionNode::StoreRegister(int reg,
+ int val,
+ RegExpNode* on_success) {
+ ActionNode* result = new ActionNode(STORE_REGISTER, on_success);
+ result->data_.u_store_register.reg = reg;
+ result->data_.u_store_register.value = val;
+ return result;
+}
+
+
+ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
+ ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
+ result->data_.u_increment_register.reg = reg;
+ return result;
+}
+
+
+ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {
+ ActionNode* result = new ActionNode(STORE_POSITION, on_success);
+ result->data_.u_position_register.reg = reg;
+ return result;
+}
+
+
+ActionNode* ActionNode::SavePosition(int reg, RegExpNode* on_success) {
+ ActionNode* result = new ActionNode(SAVE_POSITION, on_success);
+ result->data_.u_position_register.reg = reg;
+ return result;
+}
+
+
+ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) {
+ ActionNode* result = new ActionNode(RESTORE_POSITION, on_success);
+ result->data_.u_position_register.reg = reg;
+ return result;
+}
+
+
+ActionNode* ActionNode::BeginSubmatch(int reg, RegExpNode* on_success) {
+ ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
+ result->data_.u_submatch_stack_pointer_register.reg = reg;
+ return result;
+}
+
+
+ActionNode* ActionNode::EscapeSubmatch(int reg, RegExpNode* on_success) {
+ ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success);
+ result->data_.u_submatch_stack_pointer_register.reg = reg;
+ return result;
+}
+
+
+#define DEFINE_ACCEPT(Type) \
+ void Type##Node::Accept(NodeVisitor* visitor) { \
+ visitor->Visit##Type(this); \
+ }
+FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
+#undef DEFINE_ACCEPT
+
+
+// -------------------------------------------------------------------
+// Emit code.
+
+
+void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
+ Guard* guard,
+ Label* on_failure) {
+ switch (guard->op()) {
+ case Guard::LT:
+ macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure);
+ break;
+ case Guard::GEQ:
+ macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure);
+ break;
+ }
+}
+
+
+static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
+static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
+
+
+static inline void EmitAtomNonLetters(
+ RegExpMacroAssembler* macro_assembler,
+ TextElement elm,
+ Vector<const uc16> quarks,
+ Label* on_failure,
+ int cp_offset) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ for (int i = quarks.length() - 1; i >= 0; i--) {
+ uc16 c = quarks[i];
+ int length = uncanonicalize.get(c, '\0', chars);
+ if (length <= 1) {
+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
+ macro_assembler->CheckNotCharacter(c, on_failure);
+ }
+ }
+}
+
+
+static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
+ uc16 c1,
+ uc16 c2,
+ Label* on_failure) {
+ uc16 exor = c1 ^ c2;
+ // Check whether exor has only one bit set.
+ if (((exor - 1) & exor) == 0) {
Mads Ager (chromium) 2008/11/25 21:09:41 The if-else could be replaced by two ifs that each
Erik Corry 2008/11/26 12:18:36 Fixed.
+ // If c1 and c2 differ only by one bit.
+ // Ecma262UnCanonicalize always gives the highest number last.
+ ASSERT(c2 > c1);
+ macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);
+ return true;
+ } else {
+ ASSERT(c2 > c1);
+ uc16 diff = c2 - c1;
+ if (((diff - 1) & diff) == 0 && c1 >= diff) {
+ // If the characters differ by 2^n but don't differ by one bit then
+ // subtract the difference from the found character, then do the or
+ // trick. We avoid the theoretical case where negative numbers are
+ // involved in order to simplify code generation.
+ macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,
+ diff,
+ on_failure);
+ return true;
+ }
+ }
+ return false;
+}
+
+
+static inline void EmitAtomLetters(
+ RegExpMacroAssembler* macro_assembler,
+ TextElement elm,
+ Vector<const uc16> quarks,
+ Label* on_failure,
+ int cp_offset) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ for (int i = quarks.length() - 1; i >= 0; i--) {
+ uc16 c = quarks[i];
+ int length = uncanonicalize.get(c, '\0', chars);
+ if (length <= 1) continue;
+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
+ Label ok;
+ ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
+ switch (length) {
+ case 2: {
+ if (ShortCutEmitCharacterPair(macro_assembler,
+ chars[0],
+ chars[1],
+ on_failure)) {
+ ok.Unuse();
+ } else {
+ macro_assembler->CheckCharacter(chars[0], &ok);
+ macro_assembler->CheckNotCharacter(chars[1], on_failure);
+ macro_assembler->Bind(&ok);
+ }
+ break;
+ }
+ case 4:
+ macro_assembler->CheckCharacter(chars[3], &ok);
+ // Fall through!
+ case 3:
+ macro_assembler->CheckCharacter(chars[0], &ok);
+ macro_assembler->CheckCharacter(chars[1], &ok);
+ macro_assembler->CheckNotCharacter(chars[2], on_failure);
+ macro_assembler->Bind(&ok);
+ break;
+ default:
+ UNREACHABLE();
+ break;
+ }
+ }
+}
+
+
+static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
+ RegExpCharacterClass* cc,
+ int cp_offset,
+ Label* on_failure) {
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
+ cp_offset++;
+
+ ZoneList<CharacterRange>* ranges = cc->ranges();
+
+ Label success;
+
+ Label *char_is_in_class =
Erik Corry 2008/11/25 11:03:52 Misplaced asterisk.
Erik Corry 2008/11/26 12:18:36 Fixed.
+ cc->is_negated() ? on_failure : &success;
+
+ int range_count = ranges->length();
+
+ if (range_count == 0) {
+ if (!cc->is_negated()) {
+ macro_assembler->GoTo(on_failure);
+ }
+ return;
+ }
+
+ for (int i = 0; i < range_count - 1; i++) {
+ CharacterRange& range = ranges->at(i);
+ Label next_range;
+ uc16 from = range.from();
+ uc16 to = range.to();
+ if (to == from) {
+ macro_assembler->CheckCharacter(to, char_is_in_class);
+ } else {
+ if (from != 0) {
+ macro_assembler->CheckCharacterLT(from, &next_range);
+ }
+ if (to != 0xffff) {
+ macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
+ } else {
+ macro_assembler->GoTo(char_is_in_class);
+ }
+ }
+ macro_assembler->Bind(&next_range);
+ }
+
+ CharacterRange& range = ranges->at(range_count - 1);
+ uc16 from = range.from();
+ uc16 to = range.to();
+
+ if (to == from) {
+ if (cc->is_negated()) {
+ macro_assembler->CheckCharacter(to, on_failure);
+ } else {
+ macro_assembler->CheckNotCharacter(to, on_failure);
+ }
+ } else {
+ if (from != 0) {
+ if (!cc->is_negated()) {
+ macro_assembler->CheckCharacterLT(from, on_failure);
+ } else {
+ macro_assembler->CheckCharacterLT(from, &success);
+ }
+ }
+ if (to != 0xffff) {
+ if (!cc->is_negated()) {
+ macro_assembler->CheckCharacterGT(to, on_failure);
+ } else {
+ macro_assembler->CheckCharacterLT(to + 1, on_failure);
+ }
+ } else {
+ if (cc->is_negated()) {
+ macro_assembler->GoTo(on_failure);
+ }
+ }
+ }
+ macro_assembler->Bind(&success);
+}
+
+
+
+bool TextNode::Emit(RegExpCompiler* compiler) {
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+ Bind(macro_assembler);
+ int element_count = elms_->length();
+ int cp_offset = 0;
+ // First, handle straight character matches.
+ for (int i = 0; i < element_count; i++) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::ATOM) {
+ Vector<const uc16> quarks = elm.data.u_atom->data();
+ if (!compiler->is_case_independent()) {
+ macro_assembler->CheckCharacters(quarks,
+ cp_offset,
+ on_failure_->label());
+ } else {
+ EmitAtomNonLetters(macro_assembler,
+ elm,
+ quarks,
+ on_failure_->label(),
+ cp_offset);
+ }
+ cp_offset += quarks.length();
+ } else {
+ ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
+ cp_offset++;
+ }
+ }
+ // Second, handle case independent letter matches if any.
+ if (compiler->is_case_independent()) {
+ cp_offset = 0;
+ for (int i = 0; i < element_count; i++) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::ATOM) {
+ Vector<const uc16> quarks = elm.data.u_atom->data();
+ EmitAtomLetters(macro_assembler,
+ elm,
+ quarks,
+ on_failure_->label(),
+ cp_offset);
+ cp_offset += quarks.length();
+ } else {
+ cp_offset++;
+ }
+ }
+ }
+ // If the fast character matches passed then do the character classes.
+ cp_offset = 0;
+ for (int i = 0; i < element_count; i++) {
+ TextElement elm = elms_->at(i);
+ if (elm.type == TextElement::CHAR_CLASS) {
+ RegExpCharacterClass* cc = elm.data.u_char_class;
+ EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label());
+ cp_offset ++;
+ } else {
+ cp_offset += elm.data.u_atom->data().length();
+ }
+ }
+
+ compiler->AddWork(on_failure_);
+ macro_assembler->AdvanceCurrentPosition(cp_offset);
+ return on_success()->GoTo(compiler);
+}
+
+
+bool ChoiceNode::Emit(RegExpCompiler* compiler) {
+ int choice_count = alternatives_->length();
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
+ Bind(macro_assembler);
+ // For now we just call all choices one after the other. The idea ultimately
+ // is to use the Dispatch table to try only the relevant ones.
+ int i;
Mads Ager (chromium) 2008/11/25 21:09:41 I don't see how the i is needed after the for loop
Erik Corry 2008/11/26 12:18:36 Fixed.
+ for (i = 0; i < choice_count - 1; i++) {
+ GuardedAlternative alternative = alternatives_->at(i);
+ Label after;
+ Label after_no_pop_cp;
+ ZoneList<Guard*>* guards = alternative.guards();
+ if (guards != NULL) {
+ int guard_count = guards->length();
+ for (int j = 0; j < guard_count; j++) {
+ GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp);
+ }
+ }
+ macro_assembler->PushCurrentPosition();
+ macro_assembler->PushBacktrack(&after);
+ if (!alternative.node()->GoTo(compiler)) {
+ after.Unuse();
+ after_no_pop_cp.Unuse();
+ return false;
+ }
+ macro_assembler->Bind(&after);
+ macro_assembler->PopCurrentPosition();
+ macro_assembler->Bind(&after_no_pop_cp);
+ }
+ GuardedAlternative alternative = alternatives_->at(i);
+ ZoneList<Guard*>* guards = alternative.guards();
+ if (guards != NULL) {
+ int guard_count = guards->length();
+ for (int j = 0; j < guard_count; j++) {
+ GenerateGuard(macro_assembler, guards->at(j), on_failure_->label());
+ }
+ }
+ if (!on_failure_->IsBacktrack()) {
+ ASSERT_NOT_NULL(on_failure_ -> label());
+ macro_assembler->PushBacktrack(on_failure_->label());
+ compiler->AddWork(on_failure_);
+ }
+ if (!alternative.node()->GoTo(compiler)) {
+ return false;
+ }
+ return true;
+}
+
+
+bool ActionNode::Emit(RegExpCompiler* compiler) {
+ RegExpMacroAssembler* macro = compiler->macro_assembler();
+ Bind(macro);
+ switch (type_) {
+ case STORE_REGISTER:
+ macro->SetRegister(data_.u_store_register.reg,
+ data_.u_store_register.value);
+ break;
+ case INCREMENT_REGISTER: {
+ Label undo;
+ macro->PushBacktrack(&undo);
+ macro->AdvanceRegister(data_.u_increment_register.reg, 1);
+ bool ok = on_success()->GoTo(compiler);
+ if (!ok) {
+ undo.Unuse();
+ return false;
+ }
+ macro->Bind(&undo);
+ macro->AdvanceRegister(data_.u_increment_register.reg, -1);
+ macro->Backtrack();
+ break;
+ }
+ case STORE_POSITION: {
+ Label undo;
+ macro->PushRegister(data_.u_position_register.reg);
+ macro->PushBacktrack(&undo);
+ macro->WriteCurrentPositionToRegister(data_.u_position_register.reg);
+ bool ok = on_success()->GoTo(compiler);
+ if (!ok) {
+ undo.Unuse();
+ return false;
+ }
+ macro->Bind(&undo);
+ macro->PopRegister(data_.u_position_register.reg);
+ macro->Backtrack();
+ break;
+ }
+ case SAVE_POSITION:
+ macro->WriteCurrentPositionToRegister(
+ data_.u_position_register.reg);
+ break;
+ case RESTORE_POSITION:
+ macro->ReadCurrentPositionFromRegister(
+ data_.u_position_register.reg);
+ break;
+ case BEGIN_SUBMATCH:
+ macro->WriteStackPointerToRegister(
+ data_.u_submatch_stack_pointer_register.reg);
+ break;
+ case ESCAPE_SUBMATCH:
+ macro->ReadStackPointerFromRegister(
+ data_.u_submatch_stack_pointer_register.reg);
+ break;
+ default:
+ UNREACHABLE();
+ return false;
+ }
+ return on_success()->GoTo(compiler);
+}
+
+
+bool BackReferenceNode::Emit(RegExpCompiler* compiler) {
+ RegExpMacroAssembler* macro = compiler->macro_assembler();
+ Bind(macro);
+ // Check whether the registers are uninitialized and always
+ // succeed if they are.
+ macro->IfRegisterLT(start_reg_, 0, on_success()->label());
+ macro->IfRegisterLT(end_reg_, 0, on_success()->label());
+ ASSERT_EQ(start_reg_ + 1, end_reg_);
+ macro->CheckNotBackReference(start_reg_, on_failure_->label());
+ return on_success()->GoTo(compiler);
+}
+
+
+// -------------------------------------------------------------------
+// Dot/dotty output
+
+
+#ifdef DEBUG
+
+
+class DotPrinter: public NodeVisitor {
+ public:
+ DotPrinter() : stream_(&alloc_) { }
+ void PrintNode(const char* label, RegExpNode* node);
+ void Visit(RegExpNode* node);
+ void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure);
+ StringStream* stream() { return &stream_; }
+#define DECLARE_VISIT(Type) \
+ virtual void Visit##Type(Type##Node* that);
+FOR_EACH_NODE_TYPE(DECLARE_VISIT)
+#undef DECLARE_VISIT
+ private:
+ HeapStringAllocator alloc_;
+ StringStream stream_;
+ std::set<RegExpNode*> seen_;
+};
+
+
+void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
+ stream()->Add("digraph G {\n graph [label=\"");
+ for (int i = 0; label[i]; i++) {
+ switch (label[i]) {
+ case '\\':
+ stream()->Add("\\\\");
+ break;
+ case '"':
+ stream()->Add("\"");
+ break;
+ default:
+ stream()->Put(label[i]);
+ break;
+ }
+ }
+ stream()->Add("\"];\n");
+ Visit(node);
+ stream()->Add("}\n");
+ printf("%s", *(stream()->ToCString()));
+}
+
+
+void DotPrinter::Visit(RegExpNode* node) {
+ if (seen_.find(node) != seen_.end())
+ return;
+ seen_.insert(node);
+ node->Accept(this);
+}
+
+
+void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
+ if (on_failure->IsBacktrack()) return;
+ stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
+ Visit(on_failure);
+}
+
+
+class TableEntryBodyPrinter {
+ public:
+ TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
+ : stream_(stream), choice_(choice) { }
Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.
Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.
+ void Call(uc16 from, DispatchTable::Entry entry) {
+ OutSet* out_set = entry.out_set();
+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
+ if (out_set->Get(i)) {
+ stream()->Add(" n%p:s%io%i -> n%p;\n",
+ choice(),
+ from,
+ i,
+ choice()->alternatives()->at(i).node());
+ }
+ }
+ }
+ private:
+ StringStream* stream() { return stream_; }
+ ChoiceNode* choice() { return choice_; }
+ StringStream* stream_;
+ ChoiceNode* choice_;
+};
+
+
+class TableEntryHeaderPrinter {
+ public:
+ explicit TableEntryHeaderPrinter(StringStream* stream)
+ : first_(true), stream_(stream) { }
Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.
Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.
+ void Call(uc16 from, DispatchTable::Entry entry) {
+ if (first_) {
+ first_ = false;
+ } else {
+ stream()->Add("|");
+ }
+ stream()->Add("{\\%k-\\%k|{", from, entry.to());
+ OutSet* out_set = entry.out_set();
+ int priority = 0;
+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
+ if (out_set->Get(i)) {
+ if (priority > 0) stream()->Add("|");
+ stream()->Add("<s%io%i> %i", from, i, priority);
+ priority++;
+ }
+ }
+ stream()->Add("}}");
+ }
+ private:
+ bool first_;
+ StringStream* stream() { return stream_; }
+ StringStream* stream_;
+};
+
+
+void DotPrinter::VisitChoice(ChoiceNode* that) {
+ stream()->Add(" n%p [shape=Mrecord, label=\"", that);
+ TableEntryHeaderPrinter header_printer(stream());
+ that->table()->ForEach(&header_printer);
+ stream()->Add("\"]\n", that);
+ TableEntryBodyPrinter body_printer(stream(), that);
+ that->table()->ForEach(&body_printer);
+ PrintOnFailure(that, that->on_failure());
+ for (int i = 0; i < that->alternatives()->length(); i++) {
+ GuardedAlternative alt = that->alternatives()->at(i);
+ alt.node()->Accept(this);
+ }
+}
+
+
+void DotPrinter::VisitText(TextNode* that) {
+ stream()->Add(" n%p [label=\"", that);
+ for (int i = 0; i < that->elements()->length(); i++) {
+ if (i > 0) stream()->Add(" ");
+ TextElement elm = that->elements()->at(i);
+ switch (elm.type) {
+ case TextElement::ATOM: {
+ stream()->Add("'%w'", elm.data.u_atom->data());
+ break;
+ }
+ case TextElement::CHAR_CLASS: {
+ RegExpCharacterClass* node = elm.data.u_char_class;
+ stream()->Add("[");
+ if (node->is_negated())
+ stream()->Add("^");
+ for (int j = 0; j < node->ranges()->length(); j++) {
+ CharacterRange range = node->ranges()->at(j);
+ stream()->Add("%k-%k", range.from(), range.to());
+ }
+ stream()->Add("]");
+ break;
+ }
+ default:
+ UNREACHABLE();
+ }
+ }
+ stream()->Add("\", shape=box, peripheries=2];\n");
+ stream()->Add(" n%p -> n%p;\n", that, that->on_success());
+ Visit(that->on_success());
+ PrintOnFailure(that, that->on_failure());
+}
+
+
+void DotPrinter::VisitBackReference(BackReferenceNode* that) {
+ stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
+ that,
+ that->start_register(),
+ that->end_register());
+ stream()->Add(" n%p -> n%p;\n", that, that->on_success());
+ Visit(that->on_success());
+ PrintOnFailure(that, that->on_failure());
+}
+
+
+void DotPrinter::VisitEnd(EndNode* that) {
+ stream()->Add(" n%p [style=bold, shape=point];\n", that);
+}
+
+
+void DotPrinter::VisitAction(ActionNode* that) {
+ stream()->Add(" n%p [", that);
+ switch (that->type_) {
+ case ActionNode::STORE_REGISTER:
+ stream()->Add("label=\"$%i:=%i\", shape=octagon",
+ that->data_.u_store_register.reg,
+ that->data_.u_store_register.value);
+ break;
+ case ActionNode::INCREMENT_REGISTER:
+ stream()->Add("label=\"$%i++\", shape=octagon",
+ that->data_.u_increment_register.reg);
+ break;
+ case ActionNode::STORE_POSITION:
+ stream()->Add("label=\"$%i:=$pos\", shape=octagon",
+ that->data_.u_position_register.reg);
+ break;
+ case ActionNode::SAVE_POSITION:
+ stream()->Add("label=\"$%i:=$pos\", shape=octagon",
+ that->data_.u_position_register.reg);
+ break;
+ case ActionNode::RESTORE_POSITION:
+ stream()->Add("label=\"$pos:=$%i\", shape=octagon",
+ that->data_.u_position_register.reg);
+ break;
+ case ActionNode::BEGIN_SUBMATCH:
+ stream()->Add("label=\"begin\", shape=septagon");
+ break;
+ case ActionNode::ESCAPE_SUBMATCH:
+ stream()->Add("label=\"escape\", shape=septagon");
+ break;
+ }
+ stream()->Add("];\n");
+ stream()->Add(" n%p -> n%p;\n", that, that->on_success());
+ Visit(that->on_success());
+}
+
+
+class DispatchTableDumper {
+ public:
+ explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
+ void Call(uc16 key, DispatchTable::Entry entry);
+ StringStream* stream() { return stream_; }
+ private:
+ StringStream* stream_;
+};
+
+
+void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
+ stream()->Add("[%k-%k]: {", key, entry.to());
+ OutSet* set = entry.out_set();
+ bool first = true;
+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
+ if (set->Get(i)) {
+ if (first) {
+ first = false;
+ } else {
+ stream()->Add(", ");
+ }
+ stream()->Add("%i", i);
+ }
+ }
+ stream()->Add("}\n");
+}
+
+
+void DispatchTable::Dump() {
+ HeapStringAllocator alloc;
+ StringStream stream(&alloc);
+ DispatchTableDumper dumper(&stream);
+ tree()->ForEach(&dumper);
+ OS::PrintError("%s", *stream.ToCString());
+}
+
+
+void RegExpEngine::DotPrint(const char* label, RegExpNode* node) {
+ DotPrinter printer;
+ printer.PrintNode(label, node);
+}
+
+
+#endif // DEBUG
+
+
+// -------------------------------------------------------------------
+// Tree to graph conversion
+
+
+RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
+ elms->Add(TextElement::Atom(this));
+ return new TextNode(elms, on_success, on_failure);
+}
+
+
+RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ return new TextNode(elements(), on_success, on_failure);
+}
+
+
+RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
+ elms->Add(TextElement::CharClass(this));
+ return new TextNode(elms, on_success, on_failure);
+}
+
+
+RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+ ChoiceNode* result = new ChoiceNode(length, on_failure);
+ for (int i = 0; i < length; i++) {
+ GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
+ on_success,
+ on_failure));
+ result->AddAlternative(alternative);
+ }
+ return result;
+}
+
+
+RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ return ToNode(min(),
+ max(),
+ is_greedy(),
+ body(),
+ compiler,
+ on_success,
+ on_failure);
+}
+
+
+RegExpNode* RegExpQuantifier::ToNode(int min,
+ int max,
+ bool is_greedy,
+ RegExpTree* body,
+ RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ // x{f, t} becomes this:
+ //
+ // (r++)<-.
+ // | `
+ // | (x)
+ // v ^
+ // (r=0)-->(?)---/ [if r < t]
+ // |
+ // [if r >= f] \----> ...
+ //
+ //
+ // TODO(someone): clear captures on repetition and handle empty
+ // matches.
+ bool has_min = min > 0;
+ bool has_max = max < RegExpQuantifier::kInfinity;
+ bool needs_counter = has_min || has_max;
+ int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
+ ChoiceNode* center = new ChoiceNode(2, on_failure);
+ RegExpNode* loop_return = needs_counter
+ ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
+ : static_cast<RegExpNode*>(center);
+ RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure);
+ GuardedAlternative body_alt(body_node);
+ if (has_max) {
+ Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
+ body_alt.AddGuard(body_guard);
+ }
+ GuardedAlternative rest_alt(on_success);
+ if (has_min) {
+ Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
+ rest_alt.AddGuard(rest_guard);
+ }
+ if (is_greedy) {
+ center->AddAlternative(body_alt);
+ center->AddAlternative(rest_alt);
+ } else {
+ center->AddAlternative(rest_alt);
+ center->AddAlternative(body_alt);
+ }
+ if (needs_counter) {
+ return ActionNode::StoreRegister(reg_ctr, 0, center);
+ } else {
+ return center;
+ }
+}
+
+
+RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ NodeInfo info;
+ switch (type()) {
+ case START_OF_LINE:
+ info.follows_newline_interest = true;
+ break;
+ case START_OF_INPUT:
+ info.follows_start_interest = true;
+ break;
+ case BOUNDARY: case NON_BOUNDARY:
+ info.follows_word_interest = true;
+ break;
+ case END_OF_LINE: case END_OF_INPUT:
+ // This is wrong but has the effect of making the compiler abort.
+ info.follows_start_interest = true;
+ }
+ return on_success->PropagateInterest(&info);
+}
+
+
+RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ return new BackReferenceNode(RegExpCapture::StartRegister(index()),
+ RegExpCapture::EndRegister(index()),
+ on_success,
+ on_failure);
+}
+
+
+RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ return on_success;
+}
+
+
+RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ int stack_pointer_register = compiler->AllocateRegister();
+ int position_register = compiler->AllocateRegister();
+ if (is_positive()) {
+ // begin submatch scope
+ // $reg = $pos
+ // if [body]
+ // then
+ // $pos = $reg
+ // escape submatch scope (drop all backtracks created in scope)
+ // succeed
+ // else
+ // end submatch scope (nothing to clean up, just exit the scope)
+ // fail
+ return ActionNode::BeginSubmatch(
+ stack_pointer_register,
+ ActionNode::SavePosition(
+ position_register,
+ body()->ToNode(
+ compiler,
+ ActionNode::RestorePosition(
+ position_register,
+ ActionNode::EscapeSubmatch(stack_pointer_register,
+ on_success)),
+ on_failure)));
+ } else {
+ // begin submatch scope
+ // try
+ // first if (body)
+ // then
+ // escape submatch scope
+ // fail
+ // else
+ // backtrack
+ // second
+ // end submatch scope
+ // restore current position
+ // succeed
+ ChoiceNode* try_node =
+ new ChoiceNode(1, ActionNode::RestorePosition(position_register,
+ on_success));
+ RegExpNode* body_node = body()->ToNode(
+ compiler,
+ ActionNode::EscapeSubmatch(stack_pointer_register, on_failure),
+ compiler->backtrack());
+ GuardedAlternative body_alt(body_node);
+ try_node->AddAlternative(body_alt);
+ return ActionNode::BeginSubmatch(stack_pointer_register,
+ ActionNode::SavePosition(
+ position_register,
+ try_node));
+ }
+}
+
+
+RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ return ToNode(body(), index(), compiler, on_success, on_failure);
+}
+
+
+RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
+ int index,
+ RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ int start_reg = RegExpCapture::StartRegister(index);
+ int end_reg = RegExpCapture::EndRegister(index);
+ RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);
+ RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure);
+ return ActionNode::StorePosition(start_reg, body_node);
+}
+
+
+RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ RegExpNode* on_failure) {
+ ZoneList<RegExpTree*>* children = nodes();
+ RegExpNode* current = on_success;
+ for (int i = children->length() - 1; i >= 0; i--) {
+ current = children->at(i)->ToNode(compiler, current, on_failure);
+ }
+ return current;
+}
+
+
+static const int kSpaceRangeCount = 20;
+static const uc16 kSpaceRanges[kSpaceRangeCount] = {
+ 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680,
+ 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
+ 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000
+};
+
+
+static const int kWordRangeCount = 8;
+static const uc16 kWordRanges[kWordRangeCount] = {
+ '0', '9', 'A', 'Z', '_', '_', 'a', 'z'
+};
+
+
+static const int kDigitRangeCount = 2;
+static const uc16 kDigitRanges[kDigitRangeCount] = {
+ '0', '9'
+};
+
+
+static const int kLineTerminatorRangeCount = 6;
+static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {
+ 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029
+};
+
+
+static void AddClass(const uc16* elmv,
+ int elmc,
+ ZoneList<CharacterRange>* ranges) {
+ for (int i = 0; i < elmc; i += 2) {
+ ASSERT(elmv[i] <= elmv[i + 1]);
+ ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
+ }
+}
+
+
+static void AddClassNegated(const uc16 *elmv,
+ int elmc,
+ ZoneList<CharacterRange>* ranges) {
+ ASSERT(elmv[0] != 0x0000);
+ ASSERT(elmv[elmc-1] != 0xFFFF);
+ uc16 last = 0x0000;
+ for (int i = 0; i < elmc; i += 2) {
+ ASSERT(last <= elmv[i] - 1);
+ ASSERT(elmv[i] <= elmv[i + 1]);
+ ranges->Add(CharacterRange(last, elmv[i] - 1));
+ last = elmv[i + 1] + 1;
+ }
+ ranges->Add(CharacterRange(last, 0xFFFF));
+}
+
+
+void CharacterRange::AddClassEscape(uc16 type,
+ ZoneList<CharacterRange>* ranges) {
+ switch (type) {
+ case 's':
+ AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
+ break;
+ case 'S':
+ AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
+ break;
+ case 'w':
+ AddClass(kWordRanges, kWordRangeCount, ranges);
+ break;
+ case 'W':
+ AddClassNegated(kWordRanges, kWordRangeCount, ranges);
+ break;
+ case 'd':
+ AddClass(kDigitRanges, kDigitRangeCount, ranges);
+ break;
+ case 'D':
+ AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
+ break;
+ case '.':
+ AddClassNegated(kLineTerminatorRanges,
+ kLineTerminatorRangeCount,
+ ranges);
+ break;
+ // This is not a character range as defined by the spec but a
+ // convenient shorthand for a character class that matches any
+ // character.
+ case '*':
+ ranges->Add(CharacterRange::Everything());
+ break;
+ default:
+ UNREACHABLE();
+ }
+}
+
+
+void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ if (IsSingleton()) {
+ // If this is a singleton we just expand the one character.
+ int length = uncanonicalize.get(from(), '\0', chars);
+ for (int i = 0; i < length; i++) {
+ uc32 chr = chars[i];
+ if (chr != from()) {
+ ranges->Add(CharacterRange::Singleton(chars[i]));
+ }
+ }
+ } else if (from() <= kRangeCanonicalizeMax
+ && to() <= kRangeCanonicalizeMax) {
Mads Ager (chromium) 2008/11/25 21:09:41 Indentation is off.
Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.
+ // If this is a range we expand the characters block by block,
+ // expanding contiguous subranges (blocks) one at a time.
+ // The approach is as follows. For a given start character we
+ // look up the block that contains it, for instance 'a' if the
+ // start character is 'c'. A block is characterized by the property
+ // that all characters uncanonicalize in the same way as the first
+ // element, except that each entry in the result is incremented
+ // by the distance from the first element. So a-z is a block
+ // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
+ // uncanonicalizes to ['a' + k, 'A' + k].
+ // Once we've found the start point we look up its uncanonicalization
+ // and produce a range for each element. For instance for [c-f]
+ // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
+ // add a range if it is not already contained in the input, so [c-f]
+ // will be skipped but [C-F] will be added. If this range is not
+ // completely contained in a block we do this for all the blocks
+ // covered by the range.
+ unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
+ // First, look up the block that contains the 'from' character.
+ int length = canonrange.get(from(), '\0', range);
+ if (length == 0) {
+ range[0] = from();
+ } else {
+ ASSERT_EQ(1, length);
+ }
+ int pos = from();
+ // The start of the current block. Note that except for the first
+ // iteration 'start' is always equal to 'pos'.
+ int start;
+ // If it is not the start point of a block the entry contains the
+ // offset of the character from the start point.
+ if ((range[0] & kStartMarker) == 0) {
+ start = pos - range[0];
+ } else {
+ start = pos;
+ }
+ // Then we add the ranges on at a time, incrementing the current
+ // position to be after the last block each time. The position
+ // always points to the start of a block.
+ while (pos < to()) {
+ length = canonrange.get(start, '\0', range);
+ if (length == 0) {
+ range[0] = start;
+ } else {
+ ASSERT_EQ(1, length);
+ }
+ ASSERT((range[0] & kStartMarker) != 0);
+ // The start point of a block contains the distance to the end
+ // of the range.
+ int block_end = start + (range[0] & kPayloadMask) - 1;
+ int end = (block_end > to()) ? to() : block_end;
+ length = uncanonicalize.get(start, '\0', range);
+ for (int i = 0; i < length; i++) {
+ uc32 c = range[i];
+ uc16 range_from = c + (pos - start);
+ uc16 range_to = c + (end - start);
+ if (!(from() <= range_from && range_to <= to()))
+ ranges->Add(CharacterRange(range_from, range_to));
+ }
+ start = pos = block_end + 1;
+ }
+ } else {
+ // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
+ }
+}
+
+
+// -------------------------------------------------------------------
+// Interest propagation
+
+
+RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {
+ for (int i = 0; i < siblings_.length(); i++) {
+ RegExpNode* sibling = siblings_.Get(i);
+ if (sibling->info()->SameInterests(info))
+ return sibling;
+ }
+ return NULL;
+}
+
+
+template <class C>
+static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
+ RegExpNode* sibling = node->GetSibling(info);
+ if (sibling != NULL) return sibling;
+ node->EnsureSiblings();
+ sibling = new C(*node);
+ sibling->info()->AdoptInterests(info);
+ node->AddSibling(sibling);
+ return sibling;
+}
+
+
+RegExpNode* ActionNode::PropagateInterest(NodeInfo* info) {
+ RegExpNode* sibling = GetSibling(info);
+ if (sibling != NULL) return sibling;
+ EnsureSiblings();
+ ActionNode* action = new ActionNode(*this);
+ action->info()->AdoptInterests(info);
+ AddSibling(action);
+ action->set_on_success(action->on_success()->PropagateInterest(info));
+ return action;
+}
+
+
+RegExpNode* ChoiceNode::PropagateInterest(NodeInfo* info) {
+ RegExpNode* sibling = GetSibling(info);
+ if (sibling != NULL) return sibling;
+ EnsureSiblings();
+ ChoiceNode* choice = new ChoiceNode(*this);
+ choice->info()->AdoptInterests(info);
+ AddSibling(choice);
+ ZoneList<GuardedAlternative>* old_alternatives = alternatives();
+ int count = old_alternatives->length();
+ choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
+ for (int i = 0; i < count; i++) {
+ GuardedAlternative alternative = old_alternatives->at(i);
+ alternative.set_node(alternative.node()->PropagateInterest(info));
+ choice->alternatives()->Add(alternative);
+ }
+ return choice;
+}
+
+
+RegExpNode* EndNode::PropagateInterest(NodeInfo* info) {
+ return PropagateToEndpoint(this, info);
+}
+
+
+RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) {
+ return PropagateToEndpoint(this, info);
+}
+
+
+RegExpNode* TextNode::PropagateInterest(NodeInfo* info) {
+ return PropagateToEndpoint(this, info);
+}
+
+
+// -------------------------------------------------------------------
+// Splay tree
+
+
+OutSet* OutSet::Extend(unsigned value) {
+ if (Get(value))
+ return this;
+ if (successors() != NULL) {
+ for (int i = 0; i < successors()->length(); i++) {
+ OutSet* successor = successors()->at(i);
+ if (successor->Get(value))
+ return successor;
+ }
+ } else {
+ successors_ = new ZoneList<OutSet*>(2);
+ }
+ OutSet* result = new OutSet(first_, remaining_);
+ result->Set(value);
+ successors()->Add(result);
+ return result;
+}
+
+
+void OutSet::Set(unsigned value) {
+ if (value < kFirstLimit) {
+ first_ |= (1 << value);
+ } else {
+ if (remaining_ == NULL)
+ remaining_ = new ZoneList<unsigned>(1);
+ if (remaining_->is_empty() || !remaining_->Contains(value))
+ remaining_->Add(value);
+ }
+}
+
+
+bool OutSet::Get(unsigned value) {
+ if (value < kFirstLimit) {
+ return (first_ & (1 << value)) != 0;
+ } else if (remaining_ == NULL) {
+ return false;
+ } else {
+ return remaining_->Contains(value);
+ }
+}
+
+
+const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
+const DispatchTable::Entry DispatchTable::Config::kNoValue;
+
+
+void DispatchTable::AddRange(CharacterRange full_range, int value) {
+ CharacterRange current = full_range;
+ if (tree()->is_empty()) {
+ // If this is the first range we just insert into the table.
+ ZoneSplayTree<Config>::Locator loc;
+ ASSERT_RESULT(tree()->Insert(current.from(), &loc));
+ loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
+ return;
+ }
+ // First see if there is a range to the left of this one that
+ // overlaps.
+ ZoneSplayTree<Config>::Locator loc;
+ if (tree()->FindGreatestLessThan(current.from(), &loc)) {
+ Entry* entry = &loc.value();
+ // If we've found a range that overlaps with this one, and it
+ // starts strictly to the left of this one, we have to fix it
+ // because the following code only handles ranges that start on
+ // or after the start point of the range we're adding.
+ if (entry->from() < current.from() && entry->to() >= current.from()) {
+ // Snap the overlapping range in half around the start point of
+ // the range we're adding.
+ CharacterRange left(entry->from(), current.from() - 1);
+ CharacterRange right(current.from(), entry->to());
+ // The left part of the overlapping range doesn't overlap.
+ // Truncate the whole entry to be just the left part.
+ entry->set_to(left.to());
+ // The right part is the one that overlaps. We add this part
+ // to the map and let the next step deal with merging it with
+ // the range we're adding.
+ ZoneSplayTree<Config>::Locator loc;
+ ASSERT_RESULT(tree()->Insert(right.from(), &loc));
+ loc.set_value(Entry(right.from(),
+ right.to(),
+ entry->out_set()));
+ }
+ }
+ while (current.is_valid()) {
+ if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
+ (loc.value().from() <= current.to()) &&
+ (loc.value().to() >= current.from())) {
+ Entry* entry = &loc.value();
+ // We have overlap. If there is space between the start point of
+ // the range we're adding and where the overlapping range starts
+ // then we have to add a range covering just that space.
+ if (current.from() < entry->from()) {
+ ZoneSplayTree<Config>::Locator ins;
+ ASSERT_RESULT(tree()->Insert(current.from(), &ins));
+ ins.set_value(Entry(current.from(),
+ entry->from() - 1,
+ empty()->Extend(value)));
+ current.set_from(entry->from());
+ }
+ ASSERT_EQ(current.from(), entry->from());
+ // If the overlapping range extends beyond the one we want to add
+ // we have to snap the right part off and add it separately.
+ if (entry->to() > current.to()) {
+ ZoneSplayTree<Config>::Locator ins;
+ ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
+ ins.set_value(Entry(current.to() + 1,
+ entry->to(),
+ entry->out_set()));
+ entry->set_to(current.to());
+ }
+ ASSERT(entry->to() <= current.to());
+ // The overlapping range is now completely contained by the range
+ // we're adding so we can just update it and move the start point
+ // of the range we're adding just past it.
+ entry->AddValue(value);
+ // Bail out if the last interval ended at 0xFFFF since otherwise
+ // adding 1 will wrap around to 0.
+ if (entry->to() == 0xFFFF)
+ break;
+ ASSERT(entry->to() + 1 > current.from());
+ current.set_from(entry->to() + 1);
+ } else {
+ // There is no overlap so we can just add the range
+ ZoneSplayTree<Config>::Locator ins;
+ ASSERT_RESULT(tree()->Insert(current.from(), &ins));
+ ins.set_value(Entry(current.from(),
+ current.to(),
+ empty()->Extend(value)));
+ break;
+ }
+ }
+}
+
+
+OutSet* DispatchTable::Get(uc16 value) {
+ ZoneSplayTree<Config>::Locator loc;
+ if (!tree()->FindGreatestLessThan(value, &loc))
+ return empty();
+ Entry* entry = &loc.value();
+ if (value <= entry->to())
+ return entry->out_set();
+ else
+ return empty();
+}
+
+
+// -------------------------------------------------------------------
+// Analysis
+
+
+void Analysis::EnsureAnalyzed(RegExpNode* that) {
+ if (that->info()->been_analyzed || that->info()->being_analyzed)
+ return;
+ that->info()->being_analyzed = true;
+ that->Accept(this);
+ that->info()->being_analyzed = false;
+ that->info()->been_analyzed = true;
+}
+
+
+void Analysis::VisitEnd(EndNode* that) {
+ // nothing to do
+}
+
+
+void Analysis::VisitText(TextNode* that) {
+ EnsureAnalyzed(that->on_success());
+ EnsureAnalyzed(that->on_failure());
+}
+
+
+void Analysis::VisitAction(ActionNode* that) {
+ RegExpNode* next = that->on_success();
+ EnsureAnalyzed(next);
+ that->info()->determine_newline = next->info()->prev_determine_newline();
+ that->info()->determine_word = next->info()->prev_determine_word();
+ that->info()->determine_start = next->info()->prev_determine_start();
+}
+
+
+void Analysis::VisitChoice(ChoiceNode* that) {
+ NodeInfo* info = that->info();
+ for (int i = 0; i < that->alternatives()->length(); i++) {
+ RegExpNode* node = that->alternatives()->at(i).node();
+ EnsureAnalyzed(node);
+ info->determine_newline |= node->info()->prev_determine_newline();
+ info->determine_word |= node->info()->prev_determine_word();
+ info->determine_start |= node->info()->prev_determine_start();
+ }
+ if (!that->table_calculated()) {
+ DispatchTableConstructor cons(that->table());
+ cons.BuildTable(that);
+ }
+ EnsureAnalyzed(that->on_failure());
+}
+
+
+void Analysis::VisitBackReference(BackReferenceNode* that) {
+ EnsureAnalyzed(that->on_success());
+ EnsureAnalyzed(that->on_failure());
+}
+
+
+// -------------------------------------------------------------------
+// Dispatch table construction
+
+
+void DispatchTableConstructor::VisitEnd(EndNode* that) {
+ AddRange(CharacterRange::Everything());
+}
+
+
+void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
+ ASSERT(!node->table_calculated());
+ node->set_being_calculated(true);
+ ZoneList<GuardedAlternative>* alternatives = node->alternatives();
+ for (int i = 0; i < alternatives->length(); i++) {
+ set_choice_index(i);
+ alternatives->at(i).node()->Accept(this);
+ }
+ node->set_being_calculated(false);
+ node->set_table_calculated(true);
+}
+
+
+class AddDispatchRange {
+ public:
+ explicit AddDispatchRange(DispatchTableConstructor* constructor)
+ : constructor_(constructor) { }
+ void Call(uc32 from, DispatchTable::Entry entry);
+ private:
+ DispatchTableConstructor* constructor_;
+};
+
+
+void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
+ CharacterRange range(from, entry.to());
+ constructor_->AddRange(range);
+}
+
+
+void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
+ if (node->being_calculated())
+ return;
+ if (!node->table_calculated()) {
+ DispatchTableConstructor constructor(node->table());
+ constructor.BuildTable(node);
+ }
+ ASSERT(node->table_calculated());
+ AddDispatchRange adder(this);
+ node->table()->ForEach(&adder);
+}
+
+
+void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
+ // TODO(160): Find the node that we refer back to and propagate its start
+ // set back to here. For now we just accept anything.
+ AddRange(CharacterRange::Everything());
+}
+
+
+
+static int CompareRangeByFrom(const CharacterRange* a,
+ const CharacterRange* b) {
+ return Spaceship<uc16>(a->from(), b->from());
+}
+
+
+void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
+ ranges->Sort(CompareRangeByFrom);
+ uc16 last = 0;
+ for (int i = 0; i < ranges->length(); i++) {
+ CharacterRange range = ranges->at(i);
+ if (last < range.from())
+ AddRange(CharacterRange(last, range.from() - 1));
+ if (range.to() >= last) {
+ if (range.to() == 0xFFFF) {
+ return;
+ } else {
+ last = range.to() + 1;
+ }
+ }
+ }
+ AddRange(CharacterRange(last, 0xFFFF));
+}
+
+
+void DispatchTableConstructor::VisitText(TextNode* that) {
+ TextElement elm = that->elements()->at(0);
+ switch (elm.type) {
+ case TextElement::ATOM: {
+ uc16 c = elm.data.u_atom->data()[0];
+ AddRange(CharacterRange(c, c));
+ break;
+ }
+ case TextElement::CHAR_CLASS: {
+ RegExpCharacterClass* tree = elm.data.u_char_class;
+ ZoneList<CharacterRange>* ranges = tree->ranges();
+ if (tree->is_negated()) {
+ AddInverse(ranges);
+ } else {
+ for (int i = 0; i < ranges->length(); i++)
+ AddRange(ranges->at(i));
+ }
+ break;
+ }
+ default: {
+ UNIMPLEMENTED();
+ }
+ }
+}
+
+
+void DispatchTableConstructor::VisitAction(ActionNode* that) {
+ that->on_success()->Accept(this);
+}
+
+
+Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
+ RegExpNode** node_return,
+ bool ignore_case) {
+ RegExpCompiler compiler(input->capture_count, ignore_case);
+ // Wrap the body of the regexp in capture #0.
+ RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
+ 0,
+ &compiler,
+ compiler.accept(),
+ compiler.backtrack());
+ // Add a .*? at the beginning, outside the body capture.
+ // Note: We could choose to not add this if the regexp is anchored at
+ // the start of the input but I'm not sure how best to do that and
+ // since we don't even handle ^ yet I'm saving that optimization for
+ // later.
+ RegExpNode* node = RegExpQuantifier::ToNode(0,
+ RegExpQuantifier::kInfinity,
+ false,
+ new RegExpCharacterClass('*'),
+ &compiler,
+ captured_body,
+ compiler.backtrack());
+ if (node_return != NULL) *node_return = node;
+ Analysis analysis;
+ analysis.EnsureAnalyzed(node);
+
+ if (!FLAG_irregexp) {
+ return Handle<FixedArray>::null();
+ }
+
+#if !(defined ARM || defined __arm__ || defined __thumb__)
+ if (FLAG_irregexp_native) { // Flag only checked in IA32 mode.
+ // TODO(lrn) Move compilation to a later point in the life-cycle
+ // of the RegExp. We don't know the type of input string yet.
+ // For now, always assume two-byte strings.
+ RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16,
+ (input->capture_count + 1) * 2,
+ ignore_case);
+ return compiler.Assemble(&macro_assembler,
+ node,
+ input->capture_count);
+ }
+#endif
+ byte codes[1024];
+ IrregexpAssembler assembler(Vector<byte>(codes, 1024));
+ RegExpMacroAssemblerIrregexp macro_assembler(&assembler);
+ return compiler.Assemble(&macro_assembler,
+ node,
+ input->capture_count);
+}
+
+
}} // namespace v8::internal

Powered by Google App Engine
This is Rietveld 408576698