Index: src/jsregexp.cc |
=================================================================== |
--- src/jsregexp.cc (revision 830) |
+++ src/jsregexp.cc (working copy) |
@@ -25,15 +25,30 @@ |
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
+#define _HAS_EXCEPTIONS 0 |
Erik Corry
2008/11/25 11:03:52
??
|
+#include <set> |
Erik Corry
2008/11/25 11:03:52
Where is this coming from?
Mads Ager (chromium)
2008/11/25 21:09:41
These are still there. Please remove!
Christian Plesner Hansen
2008/11/26 06:49:56
It's in use.
|
+ |
#include "v8.h" |
+#include "ast.h" |
#include "execution.h" |
#include "factory.h" |
-#include "jsregexp.h" |
+#include "jsregexp-inl.h" |
#include "platform.h" |
#include "runtime.h" |
#include "top.h" |
#include "compilation-cache.h" |
+#include "string-stream.h" |
+#include "parser.h" |
+#include "assembler-irregexp.h" |
+#include "regexp-macro-assembler.h" |
+#include "regexp-macro-assembler-irregexp.h" |
+#if defined __arm__ || defined __thumb__ || defined ARM |
+// include regexp-macro-assembler-arm.h when created. |
+#else // ia32 |
+#include "regexp-macro-assembler-ia32.h" |
+#endif |
+#include "interpreter-irregexp.h" |
// Including pcre.h undefines DEBUG to avoid getting debug output from |
// the JSCRE implementation. Make sure to redefine it in debug mode |
@@ -45,12 +60,10 @@ |
#include "third_party/jscre/pcre.h" |
#endif |
+ |
namespace v8 { namespace internal { |
-#define CAPTURE_INDEX 0 |
-#define INTERNAL_INDEX 1 |
- |
static Failure* malloc_failure; |
static void* JSREMalloc(size_t size) { |
@@ -176,7 +189,16 @@ |
} |
-unibrow::Predicate<unibrow::RegExpSpecialChar, 128> is_reg_exp_special_char; |
+static inline void ThrowRegExpException(Handle<JSRegExp> re, |
+ Handle<String> pattern, |
+ Handle<String> error_text, |
+ const char* message) { |
+ Handle<JSArray> array = Factory::NewJSArray(2); |
+ SetElement(array, 0, pattern); |
+ SetElement(array, 1, error_text); |
+ Handle<Object> regexp_err = Factory::NewSyntaxError(message, array); |
+ Top::Throw(*regexp_err); |
+} |
Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, |
@@ -186,20 +208,42 @@ |
Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags); |
bool in_cache = !cached.is_null(); |
Handle<Object> result; |
- StringShape shape(*pattern); |
if (in_cache) { |
re->set_data(*cached); |
result = re; |
} else { |
- bool is_atom = !flags.is_ignore_case(); |
- for (int i = 0; is_atom && i < pattern->length(shape); i++) { |
- if (is_reg_exp_special_char.get(pattern->Get(shape, i))) |
- is_atom = false; |
+ FlattenString(pattern); |
+ RegExpParseResult parse_result; |
+ FlatStringReader reader(pattern); |
+ if (!ParseRegExp(&reader, &parse_result)) { |
+ // Throw an exception if we fail to parse the pattern. |
+ ThrowRegExpException(re, |
+ pattern, |
+ parse_result.error, |
+ "malformed_regexp"); |
+ return Handle<Object>(); |
} |
- if (is_atom) { |
- result = AtomCompile(re, pattern, flags); |
+ RegExpAtom* atom = parse_result.tree->AsAtom(); |
+ if (atom != NULL && !flags.is_ignore_case()) { |
+ if (parse_result.has_character_escapes) { |
+ Vector<const uc16> atom_pattern = atom->data(); |
+ Handle<String> atom_string = |
+ Factory::NewStringFromTwoByte(atom_pattern); |
+ result = AtomCompile(re, pattern, flags, atom_string); |
+ } else { |
+ result = AtomCompile(re, pattern, flags, pattern); |
+ } |
} else { |
- result = JsreCompile(re, pattern, flags); |
+ RegExpNode* node = NULL; |
+ Handle<FixedArray> irregexp_data = |
+ RegExpEngine::Compile(&parse_result, |
+ &node, |
+ flags.is_ignore_case()); |
+ if (irregexp_data.is_null()) { |
+ result = JscrePrepare(re, pattern, flags); |
+ } else { |
+ result = IrregexpPrepare(re, pattern, flags, irregexp_data); |
+ } |
} |
Object* data = re->data(); |
if (data->IsFixedArray()) { |
@@ -220,9 +264,11 @@ |
Handle<Object> index) { |
switch (regexp->TypeTag()) { |
case JSRegExp::JSCRE: |
- return JsreExec(regexp, subject, index); |
+ return JscreExec(regexp, subject, index); |
case JSRegExp::ATOM: |
return AtomExec(regexp, subject, index); |
+ case JSRegExp::IRREGEXP: |
+ return IrregexpExec(regexp, subject, index); |
default: |
UNREACHABLE(); |
return Handle<Object>(); |
@@ -234,9 +280,11 @@ |
Handle<String> subject) { |
switch (regexp->TypeTag()) { |
case JSRegExp::JSCRE: |
- return JsreExecGlobal(regexp, subject); |
+ return JscreExecGlobal(regexp, subject); |
case JSRegExp::ATOM: |
return AtomExecGlobal(regexp, subject); |
+ case JSRegExp::IRREGEXP: |
+ return IrregexpExecGlobal(regexp, subject); |
default: |
UNREACHABLE(); |
return Handle<Object>(); |
@@ -246,8 +294,9 @@ |
Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re, |
Handle<String> pattern, |
- JSRegExp::Flags flags) { |
- Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, pattern); |
+ JSRegExp::Flags flags, |
+ Handle<String> match_pattern) { |
+ Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern); |
return re; |
} |
@@ -267,12 +316,8 @@ |
if (value == -1) return Factory::null_value(); |
Handle<FixedArray> array = Factory::NewFixedArray(2); |
- array->set(0, |
- Smi::FromInt(value), |
- SKIP_WRITE_BARRIER); |
- array->set(1, |
- Smi::FromInt(value + needle->length()), |
- SKIP_WRITE_BARRIER); |
+ array->set(0, Smi::FromInt(value)); |
+ array->set(1, Smi::FromInt(value + needle->length())); |
return Factory::NewJSArrayWithElements(array); |
} |
@@ -296,12 +341,8 @@ |
int end = value + needle_length; |
Handle<FixedArray> array = Factory::NewFixedArray(2); |
- array->set(0, |
- Smi::FromInt(value), |
- SKIP_WRITE_BARRIER); |
- array->set(1, |
- Smi::FromInt(end), |
- SKIP_WRITE_BARRIER); |
+ array->set(0, Smi::FromInt(value)); |
+ array->set(1, Smi::FromInt(end)); |
Handle<JSArray> pair = Factory::NewJSArrayWithElements(array); |
SetElement(result, match_count, pair); |
match_count++; |
@@ -312,6 +353,24 @@ |
} |
+Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re, |
+ Handle<String> pattern, |
+ JSRegExp::Flags flags) { |
+ Handle<Object> value(Heap::undefined_value()); |
+ Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value); |
+ return re; |
+} |
+ |
+ |
+Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re, |
+ Handle<String> pattern, |
+ JSRegExp::Flags flags, |
+ Handle<FixedArray> irregexp_data) { |
+ Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data); |
+ return re; |
+} |
+ |
+ |
static inline Object* DoCompile(String* pattern, |
JSRegExp::Flags flags, |
unsigned* number_of_captures, |
@@ -358,9 +417,13 @@ |
} |
-Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re, |
- Handle<String> pattern, |
- JSRegExp::Flags flags) { |
+Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) { |
+ ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE); |
+ ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()); |
+ |
+ Handle<String> pattern(re->Pattern()); |
+ JSRegExp::Flags flags = re->GetFlags(); |
+ |
Handle<String> two_byte_pattern = StringToTwoByte(pattern); |
unsigned number_of_captures; |
@@ -391,26 +454,110 @@ |
Handle<ByteArray> internal( |
ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code))); |
- Handle<FixedArray> value = Factory::NewFixedArray(2); |
- value->set(CAPTURE_INDEX, Smi::FromInt(number_of_captures)); |
- value->set(INTERNAL_INDEX, *internal); |
+ Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength); |
+ value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures)); |
+ value->set(kJscreInternalIndex, *internal); |
Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value); |
return re; |
} |
-Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp, |
- int num_captures, |
- Handle<String> subject, |
- int previous_index, |
- const uc16* two_byte_subject, |
- int* offsets_vector, |
- int offsets_vector_length) { |
+Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp, |
+ int num_captures, |
+ Handle<String> two_byte_subject, |
+ int previous_index, |
+ int* offsets_vector, |
+ int offsets_vector_length) { |
+#ifdef DEBUG |
+ if (FLAG_trace_regexp_bytecodes) { |
+ String* pattern = regexp->Pattern(); |
+ PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString())); |
+ PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString())); |
+ } |
+#endif |
+ ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation()); |
+ ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject))); |
+ bool rc; |
+ { |
Mads Ager (chromium)
2008/11/25 21:09:41
Why is the extra scope needed here?
Erik Corry
2008/11/26 12:18:36
It isn't. Removed.
|
+ for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) { |
+ offsets_vector[i] = -1; |
+ } |
+ |
+ LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject)); |
+ |
+ FixedArray* irregexp = |
+ FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex)); |
+ int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value(); |
+ |
+ switch (tag) { |
Mads Ager (chromium)
2008/11/25 21:09:41
The indentation of this switch is off. Indent the
Erik Corry
2008/11/26 12:18:36
Done.
|
+ case RegExpMacroAssembler::kIA32Implementation: { |
+ Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex)); |
+ SmartPointer<int> captures(NewArray<int>((num_captures + 1) * 2)); |
+ Address start_addr = |
+ Handle<SeqTwoByteString>::cast(two_byte_subject)->GetCharsAddress(); |
+ int start_offset = |
+ start_addr - reinterpret_cast<Address>(*two_byte_subject); |
+ int end_offset = |
+ start_offset + (two_byte_subject->length() - previous_index) * 2; |
+ typedef bool testfunc(String**, int, int, int*); |
+ testfunc* test = FUNCTION_CAST<testfunc*>(code->entry()); |
+ rc = test(two_byte_subject.location(), |
+ start_offset, |
+ end_offset, |
+ *captures); |
+ if (rc) { |
+ // Capture values are relative to start_offset only. |
+ for (int i = 0; i < offsets_vector_length; i++) { |
+ if (offsets_vector[i] >= 0) { |
+ offsets_vector[i] += previous_index; |
+ } |
+ } |
+ } |
+ break; |
+ } |
+ default: |
Mads Ager (chromium)
2008/11/25 21:09:41
Why is this default case in the middle of the rest
Erik Corry
2008/11/26 12:18:36
Because it falls through? Lasse, please take a lo
|
+ case RegExpMacroAssembler::kARMImplementation: |
+ UNREACHABLE(); |
+ rc = false; |
+ break; |
+ case RegExpMacroAssembler::kBytecodeImplementation: { |
+ Handle<ByteArray> byte_codes = IrregexpCode(regexp); |
+ |
+ rc = IrregexpInterpreter::Match(byte_codes, |
+ two_byte_subject, |
+ offsets_vector, |
+ previous_index); |
+ break; |
+ } |
+ } |
+ } |
+ |
+ if (!rc) { |
+ return Factory::null_value(); |
+ } |
+ |
+ Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1)); |
+ // The captures come in (start, end+1) pairs. |
+ for (int i = 0; i < 2 * (num_captures+1); i += 2) { |
+ array->set(i, Smi::FromInt(offsets_vector[i])); |
+ array->set(i+1, Smi::FromInt(offsets_vector[i+1])); |
+ } |
+ return Factory::NewJSArrayWithElements(array); |
+} |
+ |
+ |
+Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp, |
+ int num_captures, |
+ Handle<String> subject, |
+ int previous_index, |
+ const uc16* two_byte_subject, |
+ int* offsets_vector, |
+ int offsets_vector_length) { |
int rc; |
{ |
AssertNoAllocation a; |
- ByteArray* internal = JsreInternal(regexp); |
+ ByteArray* internal = JscreInternal(regexp); |
const JscreRegExp* js_regexp = |
reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress()); |
@@ -444,12 +591,8 @@ |
Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1)); |
// The captures come in (start, end+1) pairs. |
for (int i = 0; i < 2 * (num_captures+1); i += 2) { |
- array->set(i, |
- Smi::FromInt(offsets_vector[i]), |
- SKIP_WRITE_BARRIER); |
- array->set(i+1, |
- Smi::FromInt(offsets_vector[i+1]), |
- SKIP_WRITE_BARRIER); |
+ array->set(i, Smi::FromInt(offsets_vector[i])); |
+ array->set(i+1, Smi::FromInt(offsets_vector[i+1])); |
} |
return Factory::NewJSArrayWithElements(array); |
} |
@@ -457,8 +600,8 @@ |
class OffsetsVector { |
public: |
- inline OffsetsVector(int num_captures) { |
- offsets_vector_length_ = (num_captures + 1) * 3; |
+ inline OffsetsVector(int num_registers) : |
Mads Ager (chromium)
2008/11/25 21:09:41
Move : to the next line and do a four space indent
|
+ offsets_vector_length_(num_registers) { |
if (offsets_vector_length_ > kStaticOffsetsVectorSize) { |
vector_ = NewArray<int>(offsets_vector_length_); |
} else { |
@@ -487,7 +630,7 @@ |
private: |
int* vector_; |
int offsets_vector_length_; |
- static const int kStaticOffsetsVectorSize = 30; |
+ static const int kStaticOffsetsVectorSize = 50; |
static int static_offsets_vector_[kStaticOffsetsVectorSize]; |
}; |
@@ -496,34 +639,127 @@ |
OffsetsVector::kStaticOffsetsVectorSize]; |
-Handle<Object> RegExpImpl::JsreExec(Handle<JSRegExp> regexp, |
+Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp, |
Handle<String> subject, |
Mads Ager (chromium)
2008/11/25 21:09:41
Indentation is off.
Erik Corry
2008/11/26 12:18:36
Fixed.
|
Handle<Object> index) { |
+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); |
+ ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined()); |
+ |
// Prepare space for the return values. |
- int num_captures = JsreCapture(regexp); |
+ int number_of_registers = IrregexpNumberOfRegisters(regexp); |
+ OffsetsVector offsets(number_of_registers); |
- OffsetsVector offsets(num_captures); |
+ int num_captures = IrregexpNumberOfCaptures(regexp); |
int previous_index = static_cast<int>(DoubleToInteger(index->Number())); |
Handle<String> subject16 = CachedStringToTwoByte(subject); |
- Handle<Object> result(JsreExecOnce(regexp, num_captures, subject, |
- previous_index, |
- subject16->GetTwoByteData(), |
- offsets.vector(), offsets.length())); |
+ Handle<Object> result( |
+ IrregexpExecOnce(regexp, |
+ num_captures, |
+ subject16, |
+ previous_index, |
+ offsets.vector(), |
+ offsets.length())); |
+ return result; |
+} |
+ |
+Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp, |
+ Handle<String> subject, |
+ Handle<Object> index) { |
+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE); |
+ if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) { |
+ Handle<Object> compile_result = JscreCompile(regexp); |
+ if (compile_result.is_null()) return compile_result; |
+ } |
+ ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray()); |
+ |
+ int num_captures = JscreNumberOfCaptures(regexp); |
+ |
+ OffsetsVector offsets((num_captures + 1) * 3); |
+ |
+ int previous_index = static_cast<int>(DoubleToInteger(index->Number())); |
+ |
+ Handle<String> subject16 = CachedStringToTwoByte(subject); |
+ |
+ Handle<Object> result(JscreExecOnce(regexp, |
+ num_captures, |
+ subject, |
+ previous_index, |
+ subject16->GetTwoByteData(), |
+ offsets.vector(), |
+ offsets.length())); |
+ |
return result; |
} |
-Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp, |
- Handle<String> subject) { |
+Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp, |
+ Handle<String> subject) { |
+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP); |
+ ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined()); |
+ |
// Prepare space for the return values. |
- int num_captures = JsreCapture(regexp); |
+ int number_of_registers = IrregexpNumberOfRegisters(regexp); |
+ OffsetsVector offsets(number_of_registers); |
- OffsetsVector offsets(num_captures); |
+ int previous_index = 0; |
+ Handle<JSArray> result = Factory::NewJSArray(0); |
+ int i = 0; |
+ Handle<Object> matches; |
+ |
+ Handle<String> subject16 = CachedStringToTwoByte(subject); |
+ |
+ do { |
+ if (previous_index > subject->length() || previous_index < 0) { |
+ // Per ECMA-262 15.10.6.2, if the previous index is greater than the |
+ // string length, there is no match. |
+ matches = Factory::null_value(); |
+ } else { |
+ matches = IrregexpExecOnce(regexp, |
+ IrregexpNumberOfCaptures(regexp), |
+ subject16, |
+ previous_index, |
+ offsets.vector(), |
+ offsets.length()); |
+ |
+ if (matches->IsJSArray()) { |
+ SetElement(result, i, matches); |
+ i++; |
+ previous_index = offsets.vector()[1]; |
+ if (offsets.vector()[0] == offsets.vector()[1]) { |
+ previous_index++; |
+ } |
+ } |
+ } |
+ } while (matches->IsJSArray()); |
+ |
+ // If we exited the loop with an exception, throw it. |
+ if (matches->IsNull()) { // Exited loop normally. |
Mads Ager (chromium)
2008/11/25 21:09:41
Move these comments to new lines before the return
Erik Corry
2008/11/26 12:18:36
I'm not sure why that's any better, but fixed. Wh
|
+ return result; |
+ } else { // Exited loop with the exception in matches. |
+ return matches; |
+ } |
+} |
+ |
+ |
+Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp, |
+ Handle<String> subject) { |
+ ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE); |
+ if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) { |
+ Handle<Object> compile_result = JscreCompile(regexp); |
+ if (compile_result.is_null()) return compile_result; |
+ } |
+ ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray()); |
+ |
+ // Prepare space for the return values. |
+ int num_captures = JscreNumberOfCaptures(regexp); |
+ |
+ OffsetsVector offsets((num_captures + 1) * 3); |
+ |
int previous_index = 0; |
Handle<JSArray> result = Factory::NewJSArray(0); |
@@ -538,9 +774,13 @@ |
// string length, there is no match. |
matches = Factory::null_value(); |
} else { |
- matches = JsreExecOnce(regexp, num_captures, subject, previous_index, |
- subject16->GetTwoByteData(), |
- offsets.vector(), offsets.length()); |
+ matches = JscreExecOnce(regexp, |
+ num_captures, |
+ subject, |
+ previous_index, |
+ subject16->GetTwoByteData(), |
+ offsets.vector(), |
+ offsets.length()); |
if (matches->IsJSArray()) { |
SetElement(result, i, matches); |
@@ -562,15 +802,1798 @@ |
} |
-int RegExpImpl::JsreCapture(Handle<JSRegExp> re) { |
+int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) { |
FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); |
- return Smi::cast(value->get(CAPTURE_INDEX))->value(); |
+ return Smi::cast(value->get(kJscreNumberOfCapturesIndex))-> |
+ value(); |
Mads Ager (chromium)
2008/11/25 21:09:41
This should fit on the previous line.
Erik Corry
2008/11/26 12:18:36
Done.
|
} |
-ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) { |
+ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) { |
FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); |
- return ByteArray::cast(value->get(INTERNAL_INDEX)); |
+ return ByteArray::cast(value->get(kJscreInternalIndex)); |
} |
+ |
+int RegExpImpl::IrregexpNumberOfCaptures(Handle<JSRegExp> re) { |
+ FixedArray* value = |
+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)); |
+ return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value(); |
+} |
+ |
+ |
+int RegExpImpl::IrregexpNumberOfRegisters(Handle<JSRegExp> re) { |
+ FixedArray* value = |
+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)); |
+ return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value(); |
+} |
+ |
+ |
+Handle<ByteArray> RegExpImpl::IrregexpCode(Handle<JSRegExp> re) { |
+ FixedArray* value = |
+ FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex)); |
+ return Handle<ByteArray>(ByteArray::cast(value->get(kIrregexpCodeIndex))); |
+} |
+ |
+ |
+// ------------------------------------------------------------------- |
+// New regular expression engine |
Erik Corry
2008/11/25 11:03:52
Probably should mention "Irregexp"
Erik Corry
2008/11/26 12:18:36
Fixed.
|
+ |
+ |
+void RegExpTree::AppendToText(RegExpText* text) { |
+ UNREACHABLE(); |
+} |
+ |
+ |
+void RegExpAtom::AppendToText(RegExpText* text) { |
+ text->AddElement(TextElement::Atom(this)); |
+} |
+ |
+ |
+void RegExpCharacterClass::AppendToText(RegExpText* text) { |
+ text->AddElement(TextElement::CharClass(this)); |
+} |
+ |
+ |
+void RegExpText::AppendToText(RegExpText* text) { |
+ for (int i = 0; i < elements()->length(); i++) |
+ text->AddElement(elements()->at(i)); |
+} |
+ |
+ |
+TextElement TextElement::Atom(RegExpAtom* atom) { |
+ TextElement result = TextElement(ATOM); |
+ result.data.u_atom = atom; |
+ return result; |
+} |
+ |
+ |
+TextElement TextElement::CharClass( |
+ RegExpCharacterClass* char_class) { |
+ TextElement result = TextElement(CHAR_CLASS); |
+ result.data.u_char_class = char_class; |
+ return result; |
+} |
+ |
+ |
+class RegExpCompiler { |
+ public: |
+ RegExpCompiler(int capture_count, bool ignore_case); |
+ |
+ int AllocateRegister() { return next_register_++; } |
+ |
+ Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler, |
+ RegExpNode* start, |
+ int capture_count); |
+ |
+ inline void AddWork(RegExpNode* node) { work_list_->Add(node); } |
+ |
+ static const int kImplementationOffset = 0; |
+ static const int kNumberOfRegistersOffset = 0; |
+ static const int kCodeOffset = 1; |
+ |
+ RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } |
+ EndNode* accept() { return accept_; } |
+ EndNode* backtrack() { return backtrack_; } |
+ |
+ static const int kMaxRecursion = 100; |
+ inline int recursion_depth() { return recursion_depth_; } |
+ inline void IncrementRecursionDepth() { recursion_depth_++; } |
+ inline void DecrementRecursionDepth() { recursion_depth_--; } |
+ |
+ inline bool is_case_independent() { return is_case_independent_; } |
+ |
+ private: |
+ EndNode* accept_; |
+ EndNode* backtrack_; |
+ int next_register_; |
+ List<RegExpNode*>* work_list_; |
+ int recursion_depth_; |
+ RegExpMacroAssembler* macro_assembler_; |
+ bool is_case_independent_; |
+}; |
+ |
+ |
+// Attempts to compile the regexp using an Irregexp code generator. Returns |
+// a fixed array or a null handle depending on whether it succeeded. |
+RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case) |
+ : next_register_(2 * (capture_count + 1)), |
Mads Ager (chromium)
2008/11/25 21:09:41
Four space indent.
Christian Plesner Hansen
2008/11/26 06:49:56
Fixed.
|
+ work_list_(NULL), |
+ recursion_depth_(0), |
+ is_case_independent_(ignore_case) { |
+ accept_ = new EndNode(EndNode::ACCEPT); |
+ backtrack_ = new EndNode(EndNode::BACKTRACK); |
+} |
+ |
+ |
+Handle<FixedArray> RegExpCompiler::Assemble( |
+ RegExpMacroAssembler* macro_assembler, |
+ RegExpNode* start, |
+ int capture_count) { |
+ if (!FLAG_attempt_case_independent && is_case_independent_) { |
+ return Handle<FixedArray>::null(); |
+ } |
+ macro_assembler_ = macro_assembler; |
+ List <RegExpNode*> work_list(0); |
+ work_list_ = &work_list; |
+ Label fail; |
+ macro_assembler->PushBacktrack(&fail); |
+ if (!start->GoTo(this)) { |
+ fail.Unuse(); |
+ return Handle<FixedArray>::null(); |
+ } |
+ while (!work_list.is_empty()) { |
+ if (!work_list.RemoveLast()->GoTo(this)) { |
+ fail.Unuse(); |
+ return Handle<FixedArray>::null(); |
+ } |
+ } |
+ macro_assembler->Bind(&fail); |
+ macro_assembler->Fail(); |
+ Handle<FixedArray> array = |
+ Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength); |
+ array->set(RegExpImpl::kIrregexpImplementationIndex, |
+ Smi::FromInt(macro_assembler->Implementation())); |
+ array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex, |
+ Smi::FromInt(next_register_)); |
+ array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex, |
+ Smi::FromInt(capture_count)); |
+ Handle<Object> code = macro_assembler->GetCode(); |
+ array->set(RegExpImpl::kIrregexpCodeIndex, *code); |
+ work_list_ = NULL; |
+ return array; |
+} |
+ |
+ |
+bool RegExpNode::GoTo(RegExpCompiler* compiler) { |
+ // TODO(erikcorry): Implement support. |
+ if (info_.follows_word_interest || |
+ info_.follows_newline_interest || |
+ info_.follows_start_interest) { |
+ return false; |
+ } |
+ if (label_.is_bound()) { |
+ compiler->macro_assembler()->GoTo(&label_); |
+ return true; |
+ } else { |
Mads Ager (chromium)
2008/11/25 21:09:41
Most of this nesting could go away since all of th
Christian Plesner Hansen
2008/11/26 06:49:56
Nesting aids readability: you can trivially see wh
|
+ if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) { |
+ compiler->macro_assembler()->GoTo(&label_); |
+ compiler->AddWork(this); |
+ return true; |
+ } else { |
+ compiler->IncrementRecursionDepth(); |
+ bool how_it_went = Emit(compiler); |
+ compiler->DecrementRecursionDepth(); |
+ return how_it_went; |
+ } |
+ } |
+} |
+ |
+ |
+bool EndNode::GoTo(RegExpCompiler* compiler) { |
+ if (info()->follows_word_interest || |
+ info()->follows_newline_interest || |
+ info()->follows_start_interest) { |
+ return false; |
+ } |
+ if (!label()->is_bound()) { |
+ Bind(compiler->macro_assembler()); |
+ } |
+ switch (action_) { |
+ case ACCEPT: |
+ compiler->macro_assembler()->Succeed(); |
+ break; |
Mads Ager (chromium)
2008/11/25 21:09:41
Indent the break.
Erik Corry
2008/11/26 12:18:36
Fixed.
|
+ case BACKTRACK: |
+ compiler->macro_assembler()->Backtrack(); |
+ break; |
Mads Ager (chromium)
2008/11/25 21:09:41
Indent the break.
Erik Corry
2008/11/26 12:18:36
Fixed.
|
+ } |
+ return true; |
+} |
+ |
+ |
+Label* RegExpNode::label() { |
+ return &label_; |
+} |
+ |
+ |
+bool EndNode::Emit(RegExpCompiler* compiler) { |
+ RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ switch (action_) { |
+ case ACCEPT: |
+ Bind(macro); |
+ macro->Succeed(); |
+ return true; |
+ case BACKTRACK: |
+ Bind(macro); |
+ macro->Backtrack(); |
+ return true; |
+ } |
+ return false; |
+} |
+ |
+ |
+void GuardedAlternative::AddGuard(Guard* guard) { |
+ if (guards_ == NULL) |
+ guards_ = new ZoneList<Guard*>(1); |
+ guards_->Add(guard); |
+} |
+ |
+ |
+ActionNode* ActionNode::StoreRegister(int reg, |
+ int val, |
+ RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(STORE_REGISTER, on_success); |
+ result->data_.u_store_register.reg = reg; |
+ result->data_.u_store_register.value = val; |
+ return result; |
+} |
+ |
+ |
+ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success); |
+ result->data_.u_increment_register.reg = reg; |
+ return result; |
+} |
+ |
+ |
+ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(STORE_POSITION, on_success); |
+ result->data_.u_position_register.reg = reg; |
+ return result; |
+} |
+ |
+ |
+ActionNode* ActionNode::SavePosition(int reg, RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(SAVE_POSITION, on_success); |
+ result->data_.u_position_register.reg = reg; |
+ return result; |
+} |
+ |
+ |
+ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(RESTORE_POSITION, on_success); |
+ result->data_.u_position_register.reg = reg; |
+ return result; |
+} |
+ |
+ |
+ActionNode* ActionNode::BeginSubmatch(int reg, RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success); |
+ result->data_.u_submatch_stack_pointer_register.reg = reg; |
+ return result; |
+} |
+ |
+ |
+ActionNode* ActionNode::EscapeSubmatch(int reg, RegExpNode* on_success) { |
+ ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success); |
+ result->data_.u_submatch_stack_pointer_register.reg = reg; |
+ return result; |
+} |
+ |
+ |
+#define DEFINE_ACCEPT(Type) \ |
+ void Type##Node::Accept(NodeVisitor* visitor) { \ |
+ visitor->Visit##Type(this); \ |
+ } |
+FOR_EACH_NODE_TYPE(DEFINE_ACCEPT) |
+#undef DEFINE_ACCEPT |
+ |
+ |
+// ------------------------------------------------------------------- |
+// Emit code. |
+ |
+ |
+void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler, |
+ Guard* guard, |
+ Label* on_failure) { |
+ switch (guard->op()) { |
+ case Guard::LT: |
+ macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure); |
+ break; |
+ case Guard::GEQ: |
+ macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure); |
+ break; |
+ } |
+} |
+ |
+ |
+static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; |
+static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
+ |
+ |
+static inline void EmitAtomNonLetters( |
+ RegExpMacroAssembler* macro_assembler, |
+ TextElement elm, |
+ Vector<const uc16> quarks, |
+ Label* on_failure, |
+ int cp_offset) { |
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
+ for (int i = quarks.length() - 1; i >= 0; i--) { |
+ uc16 c = quarks[i]; |
+ int length = uncanonicalize.get(c, '\0', chars); |
+ if (length <= 1) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
+ macro_assembler->CheckNotCharacter(c, on_failure); |
+ } |
+ } |
+} |
+ |
+ |
+static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, |
+ uc16 c1, |
+ uc16 c2, |
+ Label* on_failure) { |
+ uc16 exor = c1 ^ c2; |
+ // Check whether exor has only one bit set. |
+ if (((exor - 1) & exor) == 0) { |
Mads Ager (chromium)
2008/11/25 21:09:41
The if-else could be replaced by two ifs that each
Erik Corry
2008/11/26 12:18:36
Fixed.
|
+ // If c1 and c2 differ only by one bit. |
+ // Ecma262UnCanonicalize always gives the highest number last. |
+ ASSERT(c2 > c1); |
+ macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure); |
+ return true; |
+ } else { |
+ ASSERT(c2 > c1); |
+ uc16 diff = c2 - c1; |
+ if (((diff - 1) & diff) == 0 && c1 >= diff) { |
+ // If the characters differ by 2^n but don't differ by one bit then |
+ // subtract the difference from the found character, then do the or |
+ // trick. We avoid the theoretical case where negative numbers are |
+ // involved in order to simplify code generation. |
+ macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff, |
+ diff, |
+ on_failure); |
+ return true; |
+ } |
+ } |
+ return false; |
+} |
+ |
+ |
+static inline void EmitAtomLetters( |
+ RegExpMacroAssembler* macro_assembler, |
+ TextElement elm, |
+ Vector<const uc16> quarks, |
+ Label* on_failure, |
+ int cp_offset) { |
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
+ for (int i = quarks.length() - 1; i >= 0; i--) { |
+ uc16 c = quarks[i]; |
+ int length = uncanonicalize.get(c, '\0', chars); |
+ if (length <= 1) continue; |
+ macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure); |
+ Label ok; |
+ ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
+ switch (length) { |
+ case 2: { |
+ if (ShortCutEmitCharacterPair(macro_assembler, |
+ chars[0], |
+ chars[1], |
+ on_failure)) { |
+ ok.Unuse(); |
+ } else { |
+ macro_assembler->CheckCharacter(chars[0], &ok); |
+ macro_assembler->CheckNotCharacter(chars[1], on_failure); |
+ macro_assembler->Bind(&ok); |
+ } |
+ break; |
+ } |
+ case 4: |
+ macro_assembler->CheckCharacter(chars[3], &ok); |
+ // Fall through! |
+ case 3: |
+ macro_assembler->CheckCharacter(chars[0], &ok); |
+ macro_assembler->CheckCharacter(chars[1], &ok); |
+ macro_assembler->CheckNotCharacter(chars[2], on_failure); |
+ macro_assembler->Bind(&ok); |
+ break; |
+ default: |
+ UNREACHABLE(); |
+ break; |
+ } |
+ } |
+} |
+ |
+ |
+static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
+ RegExpCharacterClass* cc, |
+ int cp_offset, |
+ Label* on_failure) { |
+ macro_assembler->LoadCurrentCharacter(cp_offset, on_failure); |
+ cp_offset++; |
+ |
+ ZoneList<CharacterRange>* ranges = cc->ranges(); |
+ |
+ Label success; |
+ |
+ Label *char_is_in_class = |
Erik Corry
2008/11/25 11:03:52
Misplaced asterisk.
Erik Corry
2008/11/26 12:18:36
Fixed.
|
+ cc->is_negated() ? on_failure : &success; |
+ |
+ int range_count = ranges->length(); |
+ |
+ if (range_count == 0) { |
+ if (!cc->is_negated()) { |
+ macro_assembler->GoTo(on_failure); |
+ } |
+ return; |
+ } |
+ |
+ for (int i = 0; i < range_count - 1; i++) { |
+ CharacterRange& range = ranges->at(i); |
+ Label next_range; |
+ uc16 from = range.from(); |
+ uc16 to = range.to(); |
+ if (to == from) { |
+ macro_assembler->CheckCharacter(to, char_is_in_class); |
+ } else { |
+ if (from != 0) { |
+ macro_assembler->CheckCharacterLT(from, &next_range); |
+ } |
+ if (to != 0xffff) { |
+ macro_assembler->CheckCharacterLT(to + 1, char_is_in_class); |
+ } else { |
+ macro_assembler->GoTo(char_is_in_class); |
+ } |
+ } |
+ macro_assembler->Bind(&next_range); |
+ } |
+ |
+ CharacterRange& range = ranges->at(range_count - 1); |
+ uc16 from = range.from(); |
+ uc16 to = range.to(); |
+ |
+ if (to == from) { |
+ if (cc->is_negated()) { |
+ macro_assembler->CheckCharacter(to, on_failure); |
+ } else { |
+ macro_assembler->CheckNotCharacter(to, on_failure); |
+ } |
+ } else { |
+ if (from != 0) { |
+ if (!cc->is_negated()) { |
+ macro_assembler->CheckCharacterLT(from, on_failure); |
+ } else { |
+ macro_assembler->CheckCharacterLT(from, &success); |
+ } |
+ } |
+ if (to != 0xffff) { |
+ if (!cc->is_negated()) { |
+ macro_assembler->CheckCharacterGT(to, on_failure); |
+ } else { |
+ macro_assembler->CheckCharacterLT(to + 1, on_failure); |
+ } |
+ } else { |
+ if (cc->is_negated()) { |
+ macro_assembler->GoTo(on_failure); |
+ } |
+ } |
+ } |
+ macro_assembler->Bind(&success); |
+} |
+ |
+ |
+ |
+bool TextNode::Emit(RegExpCompiler* compiler) { |
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
+ Bind(macro_assembler); |
+ int element_count = elms_->length(); |
+ int cp_offset = 0; |
+ // First, handle straight character matches. |
+ for (int i = 0; i < element_count; i++) { |
+ TextElement elm = elms_->at(i); |
+ if (elm.type == TextElement::ATOM) { |
+ Vector<const uc16> quarks = elm.data.u_atom->data(); |
+ if (!compiler->is_case_independent()) { |
+ macro_assembler->CheckCharacters(quarks, |
+ cp_offset, |
+ on_failure_->label()); |
+ } else { |
+ EmitAtomNonLetters(macro_assembler, |
+ elm, |
+ quarks, |
+ on_failure_->label(), |
+ cp_offset); |
+ } |
+ cp_offset += quarks.length(); |
+ } else { |
+ ASSERT_EQ(elm.type, TextElement::CHAR_CLASS); |
+ cp_offset++; |
+ } |
+ } |
+ // Second, handle case independent letter matches if any. |
+ if (compiler->is_case_independent()) { |
+ cp_offset = 0; |
+ for (int i = 0; i < element_count; i++) { |
+ TextElement elm = elms_->at(i); |
+ if (elm.type == TextElement::ATOM) { |
+ Vector<const uc16> quarks = elm.data.u_atom->data(); |
+ EmitAtomLetters(macro_assembler, |
+ elm, |
+ quarks, |
+ on_failure_->label(), |
+ cp_offset); |
+ cp_offset += quarks.length(); |
+ } else { |
+ cp_offset++; |
+ } |
+ } |
+ } |
+ // If the fast character matches passed then do the character classes. |
+ cp_offset = 0; |
+ for (int i = 0; i < element_count; i++) { |
+ TextElement elm = elms_->at(i); |
+ if (elm.type == TextElement::CHAR_CLASS) { |
+ RegExpCharacterClass* cc = elm.data.u_char_class; |
+ EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label()); |
+ cp_offset ++; |
+ } else { |
+ cp_offset += elm.data.u_atom->data().length(); |
+ } |
+ } |
+ |
+ compiler->AddWork(on_failure_); |
+ macro_assembler->AdvanceCurrentPosition(cp_offset); |
+ return on_success()->GoTo(compiler); |
+} |
+ |
+ |
+bool ChoiceNode::Emit(RegExpCompiler* compiler) { |
+ int choice_count = alternatives_->length(); |
+ RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
+ Bind(macro_assembler); |
+ // For now we just call all choices one after the other. The idea ultimately |
+ // is to use the Dispatch table to try only the relevant ones. |
+ int i; |
Mads Ager (chromium)
2008/11/25 21:09:41
I don't see how the i is needed after the for loop
Erik Corry
2008/11/26 12:18:36
Fixed.
|
+ for (i = 0; i < choice_count - 1; i++) { |
+ GuardedAlternative alternative = alternatives_->at(i); |
+ Label after; |
+ Label after_no_pop_cp; |
+ ZoneList<Guard*>* guards = alternative.guards(); |
+ if (guards != NULL) { |
+ int guard_count = guards->length(); |
+ for (int j = 0; j < guard_count; j++) { |
+ GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp); |
+ } |
+ } |
+ macro_assembler->PushCurrentPosition(); |
+ macro_assembler->PushBacktrack(&after); |
+ if (!alternative.node()->GoTo(compiler)) { |
+ after.Unuse(); |
+ after_no_pop_cp.Unuse(); |
+ return false; |
+ } |
+ macro_assembler->Bind(&after); |
+ macro_assembler->PopCurrentPosition(); |
+ macro_assembler->Bind(&after_no_pop_cp); |
+ } |
+ GuardedAlternative alternative = alternatives_->at(i); |
+ ZoneList<Guard*>* guards = alternative.guards(); |
+ if (guards != NULL) { |
+ int guard_count = guards->length(); |
+ for (int j = 0; j < guard_count; j++) { |
+ GenerateGuard(macro_assembler, guards->at(j), on_failure_->label()); |
+ } |
+ } |
+ if (!on_failure_->IsBacktrack()) { |
+ ASSERT_NOT_NULL(on_failure_ -> label()); |
+ macro_assembler->PushBacktrack(on_failure_->label()); |
+ compiler->AddWork(on_failure_); |
+ } |
+ if (!alternative.node()->GoTo(compiler)) { |
+ return false; |
+ } |
+ return true; |
+} |
+ |
+ |
+bool ActionNode::Emit(RegExpCompiler* compiler) { |
+ RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ Bind(macro); |
+ switch (type_) { |
+ case STORE_REGISTER: |
+ macro->SetRegister(data_.u_store_register.reg, |
+ data_.u_store_register.value); |
+ break; |
+ case INCREMENT_REGISTER: { |
+ Label undo; |
+ macro->PushBacktrack(&undo); |
+ macro->AdvanceRegister(data_.u_increment_register.reg, 1); |
+ bool ok = on_success()->GoTo(compiler); |
+ if (!ok) { |
+ undo.Unuse(); |
+ return false; |
+ } |
+ macro->Bind(&undo); |
+ macro->AdvanceRegister(data_.u_increment_register.reg, -1); |
+ macro->Backtrack(); |
+ break; |
+ } |
+ case STORE_POSITION: { |
+ Label undo; |
+ macro->PushRegister(data_.u_position_register.reg); |
+ macro->PushBacktrack(&undo); |
+ macro->WriteCurrentPositionToRegister(data_.u_position_register.reg); |
+ bool ok = on_success()->GoTo(compiler); |
+ if (!ok) { |
+ undo.Unuse(); |
+ return false; |
+ } |
+ macro->Bind(&undo); |
+ macro->PopRegister(data_.u_position_register.reg); |
+ macro->Backtrack(); |
+ break; |
+ } |
+ case SAVE_POSITION: |
+ macro->WriteCurrentPositionToRegister( |
+ data_.u_position_register.reg); |
+ break; |
+ case RESTORE_POSITION: |
+ macro->ReadCurrentPositionFromRegister( |
+ data_.u_position_register.reg); |
+ break; |
+ case BEGIN_SUBMATCH: |
+ macro->WriteStackPointerToRegister( |
+ data_.u_submatch_stack_pointer_register.reg); |
+ break; |
+ case ESCAPE_SUBMATCH: |
+ macro->ReadStackPointerFromRegister( |
+ data_.u_submatch_stack_pointer_register.reg); |
+ break; |
+ default: |
+ UNREACHABLE(); |
+ return false; |
+ } |
+ return on_success()->GoTo(compiler); |
+} |
+ |
+ |
+bool BackReferenceNode::Emit(RegExpCompiler* compiler) { |
+ RegExpMacroAssembler* macro = compiler->macro_assembler(); |
+ Bind(macro); |
+ // Check whether the registers are uninitialized and always |
+ // succeed if they are. |
+ macro->IfRegisterLT(start_reg_, 0, on_success()->label()); |
+ macro->IfRegisterLT(end_reg_, 0, on_success()->label()); |
+ ASSERT_EQ(start_reg_ + 1, end_reg_); |
+ macro->CheckNotBackReference(start_reg_, on_failure_->label()); |
+ return on_success()->GoTo(compiler); |
+} |
+ |
+ |
+// ------------------------------------------------------------------- |
+// Dot/dotty output |
+ |
+ |
+#ifdef DEBUG |
+ |
+ |
+class DotPrinter: public NodeVisitor { |
+ public: |
+ DotPrinter() : stream_(&alloc_) { } |
+ void PrintNode(const char* label, RegExpNode* node); |
+ void Visit(RegExpNode* node); |
+ void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure); |
+ StringStream* stream() { return &stream_; } |
+#define DECLARE_VISIT(Type) \ |
+ virtual void Visit##Type(Type##Node* that); |
+FOR_EACH_NODE_TYPE(DECLARE_VISIT) |
+#undef DECLARE_VISIT |
+ private: |
+ HeapStringAllocator alloc_; |
+ StringStream stream_; |
+ std::set<RegExpNode*> seen_; |
+}; |
+ |
+ |
+void DotPrinter::PrintNode(const char* label, RegExpNode* node) { |
+ stream()->Add("digraph G {\n graph [label=\""); |
+ for (int i = 0; label[i]; i++) { |
+ switch (label[i]) { |
+ case '\\': |
+ stream()->Add("\\\\"); |
+ break; |
+ case '"': |
+ stream()->Add("\""); |
+ break; |
+ default: |
+ stream()->Put(label[i]); |
+ break; |
+ } |
+ } |
+ stream()->Add("\"];\n"); |
+ Visit(node); |
+ stream()->Add("}\n"); |
+ printf("%s", *(stream()->ToCString())); |
+} |
+ |
+ |
+void DotPrinter::Visit(RegExpNode* node) { |
+ if (seen_.find(node) != seen_.end()) |
+ return; |
+ seen_.insert(node); |
+ node->Accept(this); |
+} |
+ |
+ |
+void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) { |
+ if (on_failure->IsBacktrack()) return; |
+ stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure); |
+ Visit(on_failure); |
+} |
+ |
+ |
+class TableEntryBodyPrinter { |
+ public: |
+ TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice) |
+ : stream_(stream), choice_(choice) { } |
Mads Ager (chromium)
2008/11/25 21:09:41
Four space indent.
Christian Plesner Hansen
2008/11/26 06:49:56
Fixed.
|
+ void Call(uc16 from, DispatchTable::Entry entry) { |
+ OutSet* out_set = entry.out_set(); |
+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { |
+ if (out_set->Get(i)) { |
+ stream()->Add(" n%p:s%io%i -> n%p;\n", |
+ choice(), |
+ from, |
+ i, |
+ choice()->alternatives()->at(i).node()); |
+ } |
+ } |
+ } |
+ private: |
+ StringStream* stream() { return stream_; } |
+ ChoiceNode* choice() { return choice_; } |
+ StringStream* stream_; |
+ ChoiceNode* choice_; |
+}; |
+ |
+ |
+class TableEntryHeaderPrinter { |
+ public: |
+ explicit TableEntryHeaderPrinter(StringStream* stream) |
+ : first_(true), stream_(stream) { } |
Mads Ager (chromium)
2008/11/25 21:09:41
Four space indent.
Christian Plesner Hansen
2008/11/26 06:49:56
Fixed.
|
+ void Call(uc16 from, DispatchTable::Entry entry) { |
+ if (first_) { |
+ first_ = false; |
+ } else { |
+ stream()->Add("|"); |
+ } |
+ stream()->Add("{\\%k-\\%k|{", from, entry.to()); |
+ OutSet* out_set = entry.out_set(); |
+ int priority = 0; |
+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { |
+ if (out_set->Get(i)) { |
+ if (priority > 0) stream()->Add("|"); |
+ stream()->Add("<s%io%i> %i", from, i, priority); |
+ priority++; |
+ } |
+ } |
+ stream()->Add("}}"); |
+ } |
+ private: |
+ bool first_; |
+ StringStream* stream() { return stream_; } |
+ StringStream* stream_; |
+}; |
+ |
+ |
+void DotPrinter::VisitChoice(ChoiceNode* that) { |
+ stream()->Add(" n%p [shape=Mrecord, label=\"", that); |
+ TableEntryHeaderPrinter header_printer(stream()); |
+ that->table()->ForEach(&header_printer); |
+ stream()->Add("\"]\n", that); |
+ TableEntryBodyPrinter body_printer(stream(), that); |
+ that->table()->ForEach(&body_printer); |
+ PrintOnFailure(that, that->on_failure()); |
+ for (int i = 0; i < that->alternatives()->length(); i++) { |
+ GuardedAlternative alt = that->alternatives()->at(i); |
+ alt.node()->Accept(this); |
+ } |
+} |
+ |
+ |
+void DotPrinter::VisitText(TextNode* that) { |
+ stream()->Add(" n%p [label=\"", that); |
+ for (int i = 0; i < that->elements()->length(); i++) { |
+ if (i > 0) stream()->Add(" "); |
+ TextElement elm = that->elements()->at(i); |
+ switch (elm.type) { |
+ case TextElement::ATOM: { |
+ stream()->Add("'%w'", elm.data.u_atom->data()); |
+ break; |
+ } |
+ case TextElement::CHAR_CLASS: { |
+ RegExpCharacterClass* node = elm.data.u_char_class; |
+ stream()->Add("["); |
+ if (node->is_negated()) |
+ stream()->Add("^"); |
+ for (int j = 0; j < node->ranges()->length(); j++) { |
+ CharacterRange range = node->ranges()->at(j); |
+ stream()->Add("%k-%k", range.from(), range.to()); |
+ } |
+ stream()->Add("]"); |
+ break; |
+ } |
+ default: |
+ UNREACHABLE(); |
+ } |
+ } |
+ stream()->Add("\", shape=box, peripheries=2];\n"); |
+ stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
+ Visit(that->on_success()); |
+ PrintOnFailure(that, that->on_failure()); |
+} |
+ |
+ |
+void DotPrinter::VisitBackReference(BackReferenceNode* that) { |
+ stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n", |
+ that, |
+ that->start_register(), |
+ that->end_register()); |
+ stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
+ Visit(that->on_success()); |
+ PrintOnFailure(that, that->on_failure()); |
+} |
+ |
+ |
+void DotPrinter::VisitEnd(EndNode* that) { |
+ stream()->Add(" n%p [style=bold, shape=point];\n", that); |
+} |
+ |
+ |
+void DotPrinter::VisitAction(ActionNode* that) { |
+ stream()->Add(" n%p [", that); |
+ switch (that->type_) { |
+ case ActionNode::STORE_REGISTER: |
+ stream()->Add("label=\"$%i:=%i\", shape=octagon", |
+ that->data_.u_store_register.reg, |
+ that->data_.u_store_register.value); |
+ break; |
+ case ActionNode::INCREMENT_REGISTER: |
+ stream()->Add("label=\"$%i++\", shape=octagon", |
+ that->data_.u_increment_register.reg); |
+ break; |
+ case ActionNode::STORE_POSITION: |
+ stream()->Add("label=\"$%i:=$pos\", shape=octagon", |
+ that->data_.u_position_register.reg); |
+ break; |
+ case ActionNode::SAVE_POSITION: |
+ stream()->Add("label=\"$%i:=$pos\", shape=octagon", |
+ that->data_.u_position_register.reg); |
+ break; |
+ case ActionNode::RESTORE_POSITION: |
+ stream()->Add("label=\"$pos:=$%i\", shape=octagon", |
+ that->data_.u_position_register.reg); |
+ break; |
+ case ActionNode::BEGIN_SUBMATCH: |
+ stream()->Add("label=\"begin\", shape=septagon"); |
+ break; |
+ case ActionNode::ESCAPE_SUBMATCH: |
+ stream()->Add("label=\"escape\", shape=septagon"); |
+ break; |
+ } |
+ stream()->Add("];\n"); |
+ stream()->Add(" n%p -> n%p;\n", that, that->on_success()); |
+ Visit(that->on_success()); |
+} |
+ |
+ |
+class DispatchTableDumper { |
+ public: |
+ explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { } |
+ void Call(uc16 key, DispatchTable::Entry entry); |
+ StringStream* stream() { return stream_; } |
+ private: |
+ StringStream* stream_; |
+}; |
+ |
+ |
+void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) { |
+ stream()->Add("[%k-%k]: {", key, entry.to()); |
+ OutSet* set = entry.out_set(); |
+ bool first = true; |
+ for (unsigned i = 0; i < OutSet::kFirstLimit; i++) { |
+ if (set->Get(i)) { |
+ if (first) { |
+ first = false; |
+ } else { |
+ stream()->Add(", "); |
+ } |
+ stream()->Add("%i", i); |
+ } |
+ } |
+ stream()->Add("}\n"); |
+} |
+ |
+ |
+void DispatchTable::Dump() { |
+ HeapStringAllocator alloc; |
+ StringStream stream(&alloc); |
+ DispatchTableDumper dumper(&stream); |
+ tree()->ForEach(&dumper); |
+ OS::PrintError("%s", *stream.ToCString()); |
+} |
+ |
+ |
+void RegExpEngine::DotPrint(const char* label, RegExpNode* node) { |
+ DotPrinter printer; |
+ printer.PrintNode(label, node); |
+} |
+ |
+ |
+#endif // DEBUG |
+ |
+ |
+// ------------------------------------------------------------------- |
+// Tree to graph conversion |
+ |
+ |
+RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
+ elms->Add(TextElement::Atom(this)); |
+ return new TextNode(elms, on_success, on_failure); |
+} |
+ |
+ |
+RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ return new TextNode(elements(), on_success, on_failure); |
+} |
+ |
+ |
+RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); |
+ elms->Add(TextElement::CharClass(this)); |
+ return new TextNode(elms, on_success, on_failure); |
+} |
+ |
+ |
+RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
+ int length = alternatives->length(); |
+ ChoiceNode* result = new ChoiceNode(length, on_failure); |
+ for (int i = 0; i < length; i++) { |
+ GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, |
+ on_success, |
+ on_failure)); |
+ result->AddAlternative(alternative); |
+ } |
+ return result; |
+} |
+ |
+ |
+RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ return ToNode(min(), |
+ max(), |
+ is_greedy(), |
+ body(), |
+ compiler, |
+ on_success, |
+ on_failure); |
+} |
+ |
+ |
+RegExpNode* RegExpQuantifier::ToNode(int min, |
+ int max, |
+ bool is_greedy, |
+ RegExpTree* body, |
+ RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ // x{f, t} becomes this: |
+ // |
+ // (r++)<-. |
+ // | ` |
+ // | (x) |
+ // v ^ |
+ // (r=0)-->(?)---/ [if r < t] |
+ // | |
+ // [if r >= f] \----> ... |
+ // |
+ // |
+ // TODO(someone): clear captures on repetition and handle empty |
+ // matches. |
+ bool has_min = min > 0; |
+ bool has_max = max < RegExpQuantifier::kInfinity; |
+ bool needs_counter = has_min || has_max; |
+ int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1; |
+ ChoiceNode* center = new ChoiceNode(2, on_failure); |
+ RegExpNode* loop_return = needs_counter |
+ ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center)) |
+ : static_cast<RegExpNode*>(center); |
+ RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure); |
+ GuardedAlternative body_alt(body_node); |
+ if (has_max) { |
+ Guard* body_guard = new Guard(reg_ctr, Guard::LT, max); |
+ body_alt.AddGuard(body_guard); |
+ } |
+ GuardedAlternative rest_alt(on_success); |
+ if (has_min) { |
+ Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min); |
+ rest_alt.AddGuard(rest_guard); |
+ } |
+ if (is_greedy) { |
+ center->AddAlternative(body_alt); |
+ center->AddAlternative(rest_alt); |
+ } else { |
+ center->AddAlternative(rest_alt); |
+ center->AddAlternative(body_alt); |
+ } |
+ if (needs_counter) { |
+ return ActionNode::StoreRegister(reg_ctr, 0, center); |
+ } else { |
+ return center; |
+ } |
+} |
+ |
+ |
+RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ NodeInfo info; |
+ switch (type()) { |
+ case START_OF_LINE: |
+ info.follows_newline_interest = true; |
+ break; |
+ case START_OF_INPUT: |
+ info.follows_start_interest = true; |
+ break; |
+ case BOUNDARY: case NON_BOUNDARY: |
+ info.follows_word_interest = true; |
+ break; |
+ case END_OF_LINE: case END_OF_INPUT: |
+ // This is wrong but has the effect of making the compiler abort. |
+ info.follows_start_interest = true; |
+ } |
+ return on_success->PropagateInterest(&info); |
+} |
+ |
+ |
+RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ return new BackReferenceNode(RegExpCapture::StartRegister(index()), |
+ RegExpCapture::EndRegister(index()), |
+ on_success, |
+ on_failure); |
+} |
+ |
+ |
+RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ return on_success; |
+} |
+ |
+ |
+RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ int stack_pointer_register = compiler->AllocateRegister(); |
+ int position_register = compiler->AllocateRegister(); |
+ if (is_positive()) { |
+ // begin submatch scope |
+ // $reg = $pos |
+ // if [body] |
+ // then |
+ // $pos = $reg |
+ // escape submatch scope (drop all backtracks created in scope) |
+ // succeed |
+ // else |
+ // end submatch scope (nothing to clean up, just exit the scope) |
+ // fail |
+ return ActionNode::BeginSubmatch( |
+ stack_pointer_register, |
+ ActionNode::SavePosition( |
+ position_register, |
+ body()->ToNode( |
+ compiler, |
+ ActionNode::RestorePosition( |
+ position_register, |
+ ActionNode::EscapeSubmatch(stack_pointer_register, |
+ on_success)), |
+ on_failure))); |
+ } else { |
+ // begin submatch scope |
+ // try |
+ // first if (body) |
+ // then |
+ // escape submatch scope |
+ // fail |
+ // else |
+ // backtrack |
+ // second |
+ // end submatch scope |
+ // restore current position |
+ // succeed |
+ ChoiceNode* try_node = |
+ new ChoiceNode(1, ActionNode::RestorePosition(position_register, |
+ on_success)); |
+ RegExpNode* body_node = body()->ToNode( |
+ compiler, |
+ ActionNode::EscapeSubmatch(stack_pointer_register, on_failure), |
+ compiler->backtrack()); |
+ GuardedAlternative body_alt(body_node); |
+ try_node->AddAlternative(body_alt); |
+ return ActionNode::BeginSubmatch(stack_pointer_register, |
+ ActionNode::SavePosition( |
+ position_register, |
+ try_node)); |
+ } |
+} |
+ |
+ |
+RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ return ToNode(body(), index(), compiler, on_success, on_failure); |
+} |
+ |
+ |
+RegExpNode* RegExpCapture::ToNode(RegExpTree* body, |
+ int index, |
+ RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ int start_reg = RegExpCapture::StartRegister(index); |
+ int end_reg = RegExpCapture::EndRegister(index); |
+ RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success); |
+ RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure); |
+ return ActionNode::StorePosition(start_reg, body_node); |
+} |
+ |
+ |
+RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpNode* on_failure) { |
+ ZoneList<RegExpTree*>* children = nodes(); |
+ RegExpNode* current = on_success; |
+ for (int i = children->length() - 1; i >= 0; i--) { |
+ current = children->at(i)->ToNode(compiler, current, on_failure); |
+ } |
+ return current; |
+} |
+ |
+ |
+static const int kSpaceRangeCount = 20; |
+static const uc16 kSpaceRanges[kSpaceRangeCount] = { |
+ 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680, |
+ 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029, |
+ 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 |
+}; |
+ |
+ |
+static const int kWordRangeCount = 8; |
+static const uc16 kWordRanges[kWordRangeCount] = { |
+ '0', '9', 'A', 'Z', '_', '_', 'a', 'z' |
+}; |
+ |
+ |
+static const int kDigitRangeCount = 2; |
+static const uc16 kDigitRanges[kDigitRangeCount] = { |
+ '0', '9' |
+}; |
+ |
+ |
+static const int kLineTerminatorRangeCount = 6; |
+static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { |
+ 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 |
+}; |
+ |
+ |
+static void AddClass(const uc16* elmv, |
+ int elmc, |
+ ZoneList<CharacterRange>* ranges) { |
+ for (int i = 0; i < elmc; i += 2) { |
+ ASSERT(elmv[i] <= elmv[i + 1]); |
+ ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); |
+ } |
+} |
+ |
+ |
+static void AddClassNegated(const uc16 *elmv, |
+ int elmc, |
+ ZoneList<CharacterRange>* ranges) { |
+ ASSERT(elmv[0] != 0x0000); |
+ ASSERT(elmv[elmc-1] != 0xFFFF); |
+ uc16 last = 0x0000; |
+ for (int i = 0; i < elmc; i += 2) { |
+ ASSERT(last <= elmv[i] - 1); |
+ ASSERT(elmv[i] <= elmv[i + 1]); |
+ ranges->Add(CharacterRange(last, elmv[i] - 1)); |
+ last = elmv[i + 1] + 1; |
+ } |
+ ranges->Add(CharacterRange(last, 0xFFFF)); |
+} |
+ |
+ |
+void CharacterRange::AddClassEscape(uc16 type, |
+ ZoneList<CharacterRange>* ranges) { |
+ switch (type) { |
+ case 's': |
+ AddClass(kSpaceRanges, kSpaceRangeCount, ranges); |
+ break; |
+ case 'S': |
+ AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges); |
+ break; |
+ case 'w': |
+ AddClass(kWordRanges, kWordRangeCount, ranges); |
+ break; |
+ case 'W': |
+ AddClassNegated(kWordRanges, kWordRangeCount, ranges); |
+ break; |
+ case 'd': |
+ AddClass(kDigitRanges, kDigitRangeCount, ranges); |
+ break; |
+ case 'D': |
+ AddClassNegated(kDigitRanges, kDigitRangeCount, ranges); |
+ break; |
+ case '.': |
+ AddClassNegated(kLineTerminatorRanges, |
+ kLineTerminatorRangeCount, |
+ ranges); |
+ break; |
+ // This is not a character range as defined by the spec but a |
+ // convenient shorthand for a character class that matches any |
+ // character. |
+ case '*': |
+ ranges->Add(CharacterRange::Everything()); |
+ break; |
+ default: |
+ UNREACHABLE(); |
+ } |
+} |
+ |
+ |
+void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) { |
+ unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
+ if (IsSingleton()) { |
+ // If this is a singleton we just expand the one character. |
+ int length = uncanonicalize.get(from(), '\0', chars); |
+ for (int i = 0; i < length; i++) { |
+ uc32 chr = chars[i]; |
+ if (chr != from()) { |
+ ranges->Add(CharacterRange::Singleton(chars[i])); |
+ } |
+ } |
+ } else if (from() <= kRangeCanonicalizeMax |
+ && to() <= kRangeCanonicalizeMax) { |
Mads Ager (chromium)
2008/11/25 21:09:41
Indentation is off.
Christian Plesner Hansen
2008/11/26 06:49:56
Fixed.
|
+ // If this is a range we expand the characters block by block, |
+ // expanding contiguous subranges (blocks) one at a time. |
+ // The approach is as follows. For a given start character we |
+ // look up the block that contains it, for instance 'a' if the |
+ // start character is 'c'. A block is characterized by the property |
+ // that all characters uncanonicalize in the same way as the first |
+ // element, except that each entry in the result is incremented |
+ // by the distance from the first element. So a-z is a block |
+ // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter |
+ // uncanonicalizes to ['a' + k, 'A' + k]. |
+ // Once we've found the start point we look up its uncanonicalization |
+ // and produce a range for each element. For instance for [c-f] |
+ // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only |
+ // add a range if it is not already contained in the input, so [c-f] |
+ // will be skipped but [C-F] will be added. If this range is not |
+ // completely contained in a block we do this for all the blocks |
+ // covered by the range. |
+ unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
+ // First, look up the block that contains the 'from' character. |
+ int length = canonrange.get(from(), '\0', range); |
+ if (length == 0) { |
+ range[0] = from(); |
+ } else { |
+ ASSERT_EQ(1, length); |
+ } |
+ int pos = from(); |
+ // The start of the current block. Note that except for the first |
+ // iteration 'start' is always equal to 'pos'. |
+ int start; |
+ // If it is not the start point of a block the entry contains the |
+ // offset of the character from the start point. |
+ if ((range[0] & kStartMarker) == 0) { |
+ start = pos - range[0]; |
+ } else { |
+ start = pos; |
+ } |
+ // Then we add the ranges on at a time, incrementing the current |
+ // position to be after the last block each time. The position |
+ // always points to the start of a block. |
+ while (pos < to()) { |
+ length = canonrange.get(start, '\0', range); |
+ if (length == 0) { |
+ range[0] = start; |
+ } else { |
+ ASSERT_EQ(1, length); |
+ } |
+ ASSERT((range[0] & kStartMarker) != 0); |
+ // The start point of a block contains the distance to the end |
+ // of the range. |
+ int block_end = start + (range[0] & kPayloadMask) - 1; |
+ int end = (block_end > to()) ? to() : block_end; |
+ length = uncanonicalize.get(start, '\0', range); |
+ for (int i = 0; i < length; i++) { |
+ uc32 c = range[i]; |
+ uc16 range_from = c + (pos - start); |
+ uc16 range_to = c + (end - start); |
+ if (!(from() <= range_from && range_to <= to())) |
+ ranges->Add(CharacterRange(range_from, range_to)); |
+ } |
+ start = pos = block_end + 1; |
+ } |
+ } else { |
+ // TODO(plesner) when we've fixed the 2^11 bug in unibrow. |
+ } |
+} |
+ |
+ |
+// ------------------------------------------------------------------- |
+// Interest propagation |
+ |
+ |
+RegExpNode* RegExpNode::GetSibling(NodeInfo* info) { |
+ for (int i = 0; i < siblings_.length(); i++) { |
+ RegExpNode* sibling = siblings_.Get(i); |
+ if (sibling->info()->SameInterests(info)) |
+ return sibling; |
+ } |
+ return NULL; |
+} |
+ |
+ |
+template <class C> |
+static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) { |
+ RegExpNode* sibling = node->GetSibling(info); |
+ if (sibling != NULL) return sibling; |
+ node->EnsureSiblings(); |
+ sibling = new C(*node); |
+ sibling->info()->AdoptInterests(info); |
+ node->AddSibling(sibling); |
+ return sibling; |
+} |
+ |
+ |
+RegExpNode* ActionNode::PropagateInterest(NodeInfo* info) { |
+ RegExpNode* sibling = GetSibling(info); |
+ if (sibling != NULL) return sibling; |
+ EnsureSiblings(); |
+ ActionNode* action = new ActionNode(*this); |
+ action->info()->AdoptInterests(info); |
+ AddSibling(action); |
+ action->set_on_success(action->on_success()->PropagateInterest(info)); |
+ return action; |
+} |
+ |
+ |
+RegExpNode* ChoiceNode::PropagateInterest(NodeInfo* info) { |
+ RegExpNode* sibling = GetSibling(info); |
+ if (sibling != NULL) return sibling; |
+ EnsureSiblings(); |
+ ChoiceNode* choice = new ChoiceNode(*this); |
+ choice->info()->AdoptInterests(info); |
+ AddSibling(choice); |
+ ZoneList<GuardedAlternative>* old_alternatives = alternatives(); |
+ int count = old_alternatives->length(); |
+ choice->alternatives_ = new ZoneList<GuardedAlternative>(count); |
+ for (int i = 0; i < count; i++) { |
+ GuardedAlternative alternative = old_alternatives->at(i); |
+ alternative.set_node(alternative.node()->PropagateInterest(info)); |
+ choice->alternatives()->Add(alternative); |
+ } |
+ return choice; |
+} |
+ |
+ |
+RegExpNode* EndNode::PropagateInterest(NodeInfo* info) { |
+ return PropagateToEndpoint(this, info); |
+} |
+ |
+ |
+RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) { |
+ return PropagateToEndpoint(this, info); |
+} |
+ |
+ |
+RegExpNode* TextNode::PropagateInterest(NodeInfo* info) { |
+ return PropagateToEndpoint(this, info); |
+} |
+ |
+ |
+// ------------------------------------------------------------------- |
+// Splay tree |
+ |
+ |
+OutSet* OutSet::Extend(unsigned value) { |
+ if (Get(value)) |
+ return this; |
+ if (successors() != NULL) { |
+ for (int i = 0; i < successors()->length(); i++) { |
+ OutSet* successor = successors()->at(i); |
+ if (successor->Get(value)) |
+ return successor; |
+ } |
+ } else { |
+ successors_ = new ZoneList<OutSet*>(2); |
+ } |
+ OutSet* result = new OutSet(first_, remaining_); |
+ result->Set(value); |
+ successors()->Add(result); |
+ return result; |
+} |
+ |
+ |
+void OutSet::Set(unsigned value) { |
+ if (value < kFirstLimit) { |
+ first_ |= (1 << value); |
+ } else { |
+ if (remaining_ == NULL) |
+ remaining_ = new ZoneList<unsigned>(1); |
+ if (remaining_->is_empty() || !remaining_->Contains(value)) |
+ remaining_->Add(value); |
+ } |
+} |
+ |
+ |
+bool OutSet::Get(unsigned value) { |
+ if (value < kFirstLimit) { |
+ return (first_ & (1 << value)) != 0; |
+ } else if (remaining_ == NULL) { |
+ return false; |
+ } else { |
+ return remaining_->Contains(value); |
+ } |
+} |
+ |
+ |
+const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar; |
+const DispatchTable::Entry DispatchTable::Config::kNoValue; |
+ |
+ |
+void DispatchTable::AddRange(CharacterRange full_range, int value) { |
+ CharacterRange current = full_range; |
+ if (tree()->is_empty()) { |
+ // If this is the first range we just insert into the table. |
+ ZoneSplayTree<Config>::Locator loc; |
+ ASSERT_RESULT(tree()->Insert(current.from(), &loc)); |
+ loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value))); |
+ return; |
+ } |
+ // First see if there is a range to the left of this one that |
+ // overlaps. |
+ ZoneSplayTree<Config>::Locator loc; |
+ if (tree()->FindGreatestLessThan(current.from(), &loc)) { |
+ Entry* entry = &loc.value(); |
+ // If we've found a range that overlaps with this one, and it |
+ // starts strictly to the left of this one, we have to fix it |
+ // because the following code only handles ranges that start on |
+ // or after the start point of the range we're adding. |
+ if (entry->from() < current.from() && entry->to() >= current.from()) { |
+ // Snap the overlapping range in half around the start point of |
+ // the range we're adding. |
+ CharacterRange left(entry->from(), current.from() - 1); |
+ CharacterRange right(current.from(), entry->to()); |
+ // The left part of the overlapping range doesn't overlap. |
+ // Truncate the whole entry to be just the left part. |
+ entry->set_to(left.to()); |
+ // The right part is the one that overlaps. We add this part |
+ // to the map and let the next step deal with merging it with |
+ // the range we're adding. |
+ ZoneSplayTree<Config>::Locator loc; |
+ ASSERT_RESULT(tree()->Insert(right.from(), &loc)); |
+ loc.set_value(Entry(right.from(), |
+ right.to(), |
+ entry->out_set())); |
+ } |
+ } |
+ while (current.is_valid()) { |
+ if (tree()->FindLeastGreaterThan(current.from(), &loc) && |
+ (loc.value().from() <= current.to()) && |
+ (loc.value().to() >= current.from())) { |
+ Entry* entry = &loc.value(); |
+ // We have overlap. If there is space between the start point of |
+ // the range we're adding and where the overlapping range starts |
+ // then we have to add a range covering just that space. |
+ if (current.from() < entry->from()) { |
+ ZoneSplayTree<Config>::Locator ins; |
+ ASSERT_RESULT(tree()->Insert(current.from(), &ins)); |
+ ins.set_value(Entry(current.from(), |
+ entry->from() - 1, |
+ empty()->Extend(value))); |
+ current.set_from(entry->from()); |
+ } |
+ ASSERT_EQ(current.from(), entry->from()); |
+ // If the overlapping range extends beyond the one we want to add |
+ // we have to snap the right part off and add it separately. |
+ if (entry->to() > current.to()) { |
+ ZoneSplayTree<Config>::Locator ins; |
+ ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins)); |
+ ins.set_value(Entry(current.to() + 1, |
+ entry->to(), |
+ entry->out_set())); |
+ entry->set_to(current.to()); |
+ } |
+ ASSERT(entry->to() <= current.to()); |
+ // The overlapping range is now completely contained by the range |
+ // we're adding so we can just update it and move the start point |
+ // of the range we're adding just past it. |
+ entry->AddValue(value); |
+ // Bail out if the last interval ended at 0xFFFF since otherwise |
+ // adding 1 will wrap around to 0. |
+ if (entry->to() == 0xFFFF) |
+ break; |
+ ASSERT(entry->to() + 1 > current.from()); |
+ current.set_from(entry->to() + 1); |
+ } else { |
+ // There is no overlap so we can just add the range |
+ ZoneSplayTree<Config>::Locator ins; |
+ ASSERT_RESULT(tree()->Insert(current.from(), &ins)); |
+ ins.set_value(Entry(current.from(), |
+ current.to(), |
+ empty()->Extend(value))); |
+ break; |
+ } |
+ } |
+} |
+ |
+ |
+OutSet* DispatchTable::Get(uc16 value) { |
+ ZoneSplayTree<Config>::Locator loc; |
+ if (!tree()->FindGreatestLessThan(value, &loc)) |
+ return empty(); |
+ Entry* entry = &loc.value(); |
+ if (value <= entry->to()) |
+ return entry->out_set(); |
+ else |
+ return empty(); |
+} |
+ |
+ |
+// ------------------------------------------------------------------- |
+// Analysis |
+ |
+ |
+void Analysis::EnsureAnalyzed(RegExpNode* that) { |
+ if (that->info()->been_analyzed || that->info()->being_analyzed) |
+ return; |
+ that->info()->being_analyzed = true; |
+ that->Accept(this); |
+ that->info()->being_analyzed = false; |
+ that->info()->been_analyzed = true; |
+} |
+ |
+ |
+void Analysis::VisitEnd(EndNode* that) { |
+ // nothing to do |
+} |
+ |
+ |
+void Analysis::VisitText(TextNode* that) { |
+ EnsureAnalyzed(that->on_success()); |
+ EnsureAnalyzed(that->on_failure()); |
+} |
+ |
+ |
+void Analysis::VisitAction(ActionNode* that) { |
+ RegExpNode* next = that->on_success(); |
+ EnsureAnalyzed(next); |
+ that->info()->determine_newline = next->info()->prev_determine_newline(); |
+ that->info()->determine_word = next->info()->prev_determine_word(); |
+ that->info()->determine_start = next->info()->prev_determine_start(); |
+} |
+ |
+ |
+void Analysis::VisitChoice(ChoiceNode* that) { |
+ NodeInfo* info = that->info(); |
+ for (int i = 0; i < that->alternatives()->length(); i++) { |
+ RegExpNode* node = that->alternatives()->at(i).node(); |
+ EnsureAnalyzed(node); |
+ info->determine_newline |= node->info()->prev_determine_newline(); |
+ info->determine_word |= node->info()->prev_determine_word(); |
+ info->determine_start |= node->info()->prev_determine_start(); |
+ } |
+ if (!that->table_calculated()) { |
+ DispatchTableConstructor cons(that->table()); |
+ cons.BuildTable(that); |
+ } |
+ EnsureAnalyzed(that->on_failure()); |
+} |
+ |
+ |
+void Analysis::VisitBackReference(BackReferenceNode* that) { |
+ EnsureAnalyzed(that->on_success()); |
+ EnsureAnalyzed(that->on_failure()); |
+} |
+ |
+ |
+// ------------------------------------------------------------------- |
+// Dispatch table construction |
+ |
+ |
+void DispatchTableConstructor::VisitEnd(EndNode* that) { |
+ AddRange(CharacterRange::Everything()); |
+} |
+ |
+ |
+void DispatchTableConstructor::BuildTable(ChoiceNode* node) { |
+ ASSERT(!node->table_calculated()); |
+ node->set_being_calculated(true); |
+ ZoneList<GuardedAlternative>* alternatives = node->alternatives(); |
+ for (int i = 0; i < alternatives->length(); i++) { |
+ set_choice_index(i); |
+ alternatives->at(i).node()->Accept(this); |
+ } |
+ node->set_being_calculated(false); |
+ node->set_table_calculated(true); |
+} |
+ |
+ |
+class AddDispatchRange { |
+ public: |
+ explicit AddDispatchRange(DispatchTableConstructor* constructor) |
+ : constructor_(constructor) { } |
+ void Call(uc32 from, DispatchTable::Entry entry); |
+ private: |
+ DispatchTableConstructor* constructor_; |
+}; |
+ |
+ |
+void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) { |
+ CharacterRange range(from, entry.to()); |
+ constructor_->AddRange(range); |
+} |
+ |
+ |
+void DispatchTableConstructor::VisitChoice(ChoiceNode* node) { |
+ if (node->being_calculated()) |
+ return; |
+ if (!node->table_calculated()) { |
+ DispatchTableConstructor constructor(node->table()); |
+ constructor.BuildTable(node); |
+ } |
+ ASSERT(node->table_calculated()); |
+ AddDispatchRange adder(this); |
+ node->table()->ForEach(&adder); |
+} |
+ |
+ |
+void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) { |
+ // TODO(160): Find the node that we refer back to and propagate its start |
+ // set back to here. For now we just accept anything. |
+ AddRange(CharacterRange::Everything()); |
+} |
+ |
+ |
+ |
+static int CompareRangeByFrom(const CharacterRange* a, |
+ const CharacterRange* b) { |
+ return Spaceship<uc16>(a->from(), b->from()); |
+} |
+ |
+ |
+void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) { |
+ ranges->Sort(CompareRangeByFrom); |
+ uc16 last = 0; |
+ for (int i = 0; i < ranges->length(); i++) { |
+ CharacterRange range = ranges->at(i); |
+ if (last < range.from()) |
+ AddRange(CharacterRange(last, range.from() - 1)); |
+ if (range.to() >= last) { |
+ if (range.to() == 0xFFFF) { |
+ return; |
+ } else { |
+ last = range.to() + 1; |
+ } |
+ } |
+ } |
+ AddRange(CharacterRange(last, 0xFFFF)); |
+} |
+ |
+ |
+void DispatchTableConstructor::VisitText(TextNode* that) { |
+ TextElement elm = that->elements()->at(0); |
+ switch (elm.type) { |
+ case TextElement::ATOM: { |
+ uc16 c = elm.data.u_atom->data()[0]; |
+ AddRange(CharacterRange(c, c)); |
+ break; |
+ } |
+ case TextElement::CHAR_CLASS: { |
+ RegExpCharacterClass* tree = elm.data.u_char_class; |
+ ZoneList<CharacterRange>* ranges = tree->ranges(); |
+ if (tree->is_negated()) { |
+ AddInverse(ranges); |
+ } else { |
+ for (int i = 0; i < ranges->length(); i++) |
+ AddRange(ranges->at(i)); |
+ } |
+ break; |
+ } |
+ default: { |
+ UNIMPLEMENTED(); |
+ } |
+ } |
+} |
+ |
+ |
+void DispatchTableConstructor::VisitAction(ActionNode* that) { |
+ that->on_success()->Accept(this); |
+} |
+ |
+ |
+Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input, |
+ RegExpNode** node_return, |
+ bool ignore_case) { |
+ RegExpCompiler compiler(input->capture_count, ignore_case); |
+ // Wrap the body of the regexp in capture #0. |
+ RegExpNode* captured_body = RegExpCapture::ToNode(input->tree, |
+ 0, |
+ &compiler, |
+ compiler.accept(), |
+ compiler.backtrack()); |
+ // Add a .*? at the beginning, outside the body capture. |
+ // Note: We could choose to not add this if the regexp is anchored at |
+ // the start of the input but I'm not sure how best to do that and |
+ // since we don't even handle ^ yet I'm saving that optimization for |
+ // later. |
+ RegExpNode* node = RegExpQuantifier::ToNode(0, |
+ RegExpQuantifier::kInfinity, |
+ false, |
+ new RegExpCharacterClass('*'), |
+ &compiler, |
+ captured_body, |
+ compiler.backtrack()); |
+ if (node_return != NULL) *node_return = node; |
+ Analysis analysis; |
+ analysis.EnsureAnalyzed(node); |
+ |
+ if (!FLAG_irregexp) { |
+ return Handle<FixedArray>::null(); |
+ } |
+ |
+#if !(defined ARM || defined __arm__ || defined __thumb__) |
+ if (FLAG_irregexp_native) { // Flag only checked in IA32 mode. |
+ // TODO(lrn) Move compilation to a later point in the life-cycle |
+ // of the RegExp. We don't know the type of input string yet. |
+ // For now, always assume two-byte strings. |
+ RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16, |
+ (input->capture_count + 1) * 2, |
+ ignore_case); |
+ return compiler.Assemble(¯o_assembler, |
+ node, |
+ input->capture_count); |
+ } |
+#endif |
+ byte codes[1024]; |
+ IrregexpAssembler assembler(Vector<byte>(codes, 1024)); |
+ RegExpMacroAssemblerIrregexp macro_assembler(&assembler); |
+ return compiler.Assemble(¯o_assembler, |
+ node, |
+ input->capture_count); |
+} |
+ |
+ |
}} // namespace v8::internal |