Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(359)

Side by Side Diff: src/jsregexp.cc

Issue 12427: Merge regexp2000 back into bleeding_edge (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 12 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution. 11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its 12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived 13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission. 14 // from this software without specific prior written permission.
15 // 15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 #define _HAS_EXCEPTIONS 0
Erik Corry 2008/11/25 11:03:52 ??
29 #include <set>
Erik Corry 2008/11/25 11:03:52 Where is this coming from?
Mads Ager (chromium) 2008/11/25 21:09:41 These are still there. Please remove!
Christian Plesner Hansen 2008/11/26 06:49:56 It's in use.
30
28 #include "v8.h" 31 #include "v8.h"
29 32
33 #include "ast.h"
30 #include "execution.h" 34 #include "execution.h"
31 #include "factory.h" 35 #include "factory.h"
32 #include "jsregexp.h" 36 #include "jsregexp-inl.h"
33 #include "platform.h" 37 #include "platform.h"
34 #include "runtime.h" 38 #include "runtime.h"
35 #include "top.h" 39 #include "top.h"
36 #include "compilation-cache.h" 40 #include "compilation-cache.h"
41 #include "string-stream.h"
42 #include "parser.h"
43 #include "assembler-irregexp.h"
44 #include "regexp-macro-assembler.h"
45 #include "regexp-macro-assembler-irregexp.h"
46 #if defined __arm__ || defined __thumb__ || defined ARM
47 // include regexp-macro-assembler-arm.h when created.
48 #else // ia32
49 #include "regexp-macro-assembler-ia32.h"
50 #endif
51 #include "interpreter-irregexp.h"
37 52
38 // Including pcre.h undefines DEBUG to avoid getting debug output from 53 // Including pcre.h undefines DEBUG to avoid getting debug output from
39 // the JSCRE implementation. Make sure to redefine it in debug mode 54 // the JSCRE implementation. Make sure to redefine it in debug mode
40 // after having included the header file. 55 // after having included the header file.
41 #ifdef DEBUG 56 #ifdef DEBUG
42 #include "third_party/jscre/pcre.h" 57 #include "third_party/jscre/pcre.h"
43 #define DEBUG 58 #define DEBUG
44 #else 59 #else
45 #include "third_party/jscre/pcre.h" 60 #include "third_party/jscre/pcre.h"
46 #endif 61 #endif
47 62
63
48 namespace v8 { namespace internal { 64 namespace v8 { namespace internal {
49 65
50 66
51 #define CAPTURE_INDEX 0
52 #define INTERNAL_INDEX 1
53
54 static Failure* malloc_failure; 67 static Failure* malloc_failure;
55 68
56 static void* JSREMalloc(size_t size) { 69 static void* JSREMalloc(size_t size) {
57 Object* obj = Heap::AllocateByteArray(size); 70 Object* obj = Heap::AllocateByteArray(size);
58 71
59 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile 72 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
60 // will return NULL to the caller, performs GC there. 73 // will return NULL to the caller, performs GC there.
61 // Also pass failure information to the caller. 74 // Also pass failure information to the caller.
62 if (obj->IsFailure()) { 75 if (obj->IsFailure()) {
63 malloc_failure = Failure::cast(obj); 76 malloc_failure = Failure::cast(obj);
(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after
169 break; 182 break;
170 case 'm': 183 case 'm':
171 flags |= JSRegExp::MULTILINE; 184 flags |= JSRegExp::MULTILINE;
172 break; 185 break;
173 } 186 }
174 } 187 }
175 return JSRegExp::Flags(flags); 188 return JSRegExp::Flags(flags);
176 } 189 }
177 190
178 191
179 unibrow::Predicate<unibrow::RegExpSpecialChar, 128> is_reg_exp_special_char; 192 static inline void ThrowRegExpException(Handle<JSRegExp> re,
193 Handle<String> pattern,
194 Handle<String> error_text,
195 const char* message) {
196 Handle<JSArray> array = Factory::NewJSArray(2);
197 SetElement(array, 0, pattern);
198 SetElement(array, 1, error_text);
199 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
200 Top::Throw(*regexp_err);
201 }
180 202
181 203
182 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 204 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
183 Handle<String> pattern, 205 Handle<String> pattern,
184 Handle<String> flag_str) { 206 Handle<String> flag_str) {
185 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 207 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
186 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags); 208 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
187 bool in_cache = !cached.is_null(); 209 bool in_cache = !cached.is_null();
188 Handle<Object> result; 210 Handle<Object> result;
189 StringShape shape(*pattern);
190 if (in_cache) { 211 if (in_cache) {
191 re->set_data(*cached); 212 re->set_data(*cached);
192 result = re; 213 result = re;
193 } else { 214 } else {
194 bool is_atom = !flags.is_ignore_case(); 215 FlattenString(pattern);
195 for (int i = 0; is_atom && i < pattern->length(shape); i++) { 216 RegExpParseResult parse_result;
196 if (is_reg_exp_special_char.get(pattern->Get(shape, i))) 217 FlatStringReader reader(pattern);
197 is_atom = false; 218 if (!ParseRegExp(&reader, &parse_result)) {
219 // Throw an exception if we fail to parse the pattern.
220 ThrowRegExpException(re,
221 pattern,
222 parse_result.error,
223 "malformed_regexp");
224 return Handle<Object>();
198 } 225 }
199 if (is_atom) { 226 RegExpAtom* atom = parse_result.tree->AsAtom();
200 result = AtomCompile(re, pattern, flags); 227 if (atom != NULL && !flags.is_ignore_case()) {
228 if (parse_result.has_character_escapes) {
229 Vector<const uc16> atom_pattern = atom->data();
230 Handle<String> atom_string =
231 Factory::NewStringFromTwoByte(atom_pattern);
232 result = AtomCompile(re, pattern, flags, atom_string);
233 } else {
234 result = AtomCompile(re, pattern, flags, pattern);
235 }
201 } else { 236 } else {
202 result = JsreCompile(re, pattern, flags); 237 RegExpNode* node = NULL;
238 Handle<FixedArray> irregexp_data =
239 RegExpEngine::Compile(&parse_result,
240 &node,
241 flags.is_ignore_case());
242 if (irregexp_data.is_null()) {
243 result = JscrePrepare(re, pattern, flags);
244 } else {
245 result = IrregexpPrepare(re, pattern, flags, irregexp_data);
246 }
203 } 247 }
204 Object* data = re->data(); 248 Object* data = re->data();
205 if (data->IsFixedArray()) { 249 if (data->IsFixedArray()) {
206 // If compilation succeeded then the data is set on the regexp 250 // If compilation succeeded then the data is set on the regexp
207 // and we can store it in the cache. 251 // and we can store it in the cache.
208 Handle<FixedArray> data(FixedArray::cast(re->data())); 252 Handle<FixedArray> data(FixedArray::cast(re->data()));
209 CompilationCache::PutRegExp(pattern, flags, data); 253 CompilationCache::PutRegExp(pattern, flags, data);
210 } 254 }
211 } 255 }
212 256
213 LOG(RegExpCompileEvent(re, in_cache)); 257 LOG(RegExpCompileEvent(re, in_cache));
214 return result; 258 return result;
215 } 259 }
216 260
217 261
218 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 262 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
219 Handle<String> subject, 263 Handle<String> subject,
220 Handle<Object> index) { 264 Handle<Object> index) {
221 switch (regexp->TypeTag()) { 265 switch (regexp->TypeTag()) {
222 case JSRegExp::JSCRE: 266 case JSRegExp::JSCRE:
223 return JsreExec(regexp, subject, index); 267 return JscreExec(regexp, subject, index);
224 case JSRegExp::ATOM: 268 case JSRegExp::ATOM:
225 return AtomExec(regexp, subject, index); 269 return AtomExec(regexp, subject, index);
270 case JSRegExp::IRREGEXP:
271 return IrregexpExec(regexp, subject, index);
226 default: 272 default:
227 UNREACHABLE(); 273 UNREACHABLE();
228 return Handle<Object>(); 274 return Handle<Object>();
229 } 275 }
230 } 276 }
231 277
232 278
233 Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp, 279 Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
234 Handle<String> subject) { 280 Handle<String> subject) {
235 switch (regexp->TypeTag()) { 281 switch (regexp->TypeTag()) {
236 case JSRegExp::JSCRE: 282 case JSRegExp::JSCRE:
237 return JsreExecGlobal(regexp, subject); 283 return JscreExecGlobal(regexp, subject);
238 case JSRegExp::ATOM: 284 case JSRegExp::ATOM:
239 return AtomExecGlobal(regexp, subject); 285 return AtomExecGlobal(regexp, subject);
286 case JSRegExp::IRREGEXP:
287 return IrregexpExecGlobal(regexp, subject);
240 default: 288 default:
241 UNREACHABLE(); 289 UNREACHABLE();
242 return Handle<Object>(); 290 return Handle<Object>();
243 } 291 }
244 } 292 }
245 293
246 294
247 Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re, 295 Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
248 Handle<String> pattern, 296 Handle<String> pattern,
249 JSRegExp::Flags flags) { 297 JSRegExp::Flags flags,
250 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, pattern); 298 Handle<String> match_pattern) {
299 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
251 return re; 300 return re;
252 } 301 }
253 302
254 303
255 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 304 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
256 Handle<String> subject, 305 Handle<String> subject,
257 Handle<Object> index) { 306 Handle<Object> index) {
258 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex))); 307 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
259 308
260 uint32_t start_index; 309 uint32_t start_index;
261 if (!Array::IndexFromObject(*index, &start_index)) { 310 if (!Array::IndexFromObject(*index, &start_index)) {
262 return Handle<Smi>(Smi::FromInt(-1)); 311 return Handle<Smi>(Smi::FromInt(-1));
263 } 312 }
264 313
265 LOG(RegExpExecEvent(re, start_index, subject)); 314 LOG(RegExpExecEvent(re, start_index, subject));
266 int value = Runtime::StringMatch(subject, needle, start_index); 315 int value = Runtime::StringMatch(subject, needle, start_index);
267 if (value == -1) return Factory::null_value(); 316 if (value == -1) return Factory::null_value();
268 317
269 Handle<FixedArray> array = Factory::NewFixedArray(2); 318 Handle<FixedArray> array = Factory::NewFixedArray(2);
270 array->set(0, 319 array->set(0, Smi::FromInt(value));
271 Smi::FromInt(value), 320 array->set(1, Smi::FromInt(value + needle->length()));
272 SKIP_WRITE_BARRIER);
273 array->set(1,
274 Smi::FromInt(value + needle->length()),
275 SKIP_WRITE_BARRIER);
276 return Factory::NewJSArrayWithElements(array); 321 return Factory::NewJSArrayWithElements(array);
277 } 322 }
278 323
279 324
280 Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re, 325 Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
281 Handle<String> subject) { 326 Handle<String> subject) {
282 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex))); 327 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
283 Handle<JSArray> result = Factory::NewJSArray(1); 328 Handle<JSArray> result = Factory::NewJSArray(1);
284 int index = 0; 329 int index = 0;
285 int match_count = 0; 330 int match_count = 0;
286 int subject_length = subject->length(); 331 int subject_length = subject->length();
287 int needle_length = needle->length(); 332 int needle_length = needle->length();
288 while (true) { 333 while (true) {
289 LOG(RegExpExecEvent(re, index, subject)); 334 LOG(RegExpExecEvent(re, index, subject));
290 int value = -1; 335 int value = -1;
291 if (index + needle_length <= subject_length) { 336 if (index + needle_length <= subject_length) {
292 value = Runtime::StringMatch(subject, needle, index); 337 value = Runtime::StringMatch(subject, needle, index);
293 } 338 }
294 if (value == -1) break; 339 if (value == -1) break;
295 HandleScope scope; 340 HandleScope scope;
296 int end = value + needle_length; 341 int end = value + needle_length;
297 342
298 Handle<FixedArray> array = Factory::NewFixedArray(2); 343 Handle<FixedArray> array = Factory::NewFixedArray(2);
299 array->set(0, 344 array->set(0, Smi::FromInt(value));
300 Smi::FromInt(value), 345 array->set(1, Smi::FromInt(end));
301 SKIP_WRITE_BARRIER);
302 array->set(1,
303 Smi::FromInt(end),
304 SKIP_WRITE_BARRIER);
305 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array); 346 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
306 SetElement(result, match_count, pair); 347 SetElement(result, match_count, pair);
307 match_count++; 348 match_count++;
308 index = end; 349 index = end;
309 if (needle_length == 0) index++; 350 if (needle_length == 0) index++;
310 } 351 }
311 return result; 352 return result;
312 } 353 }
313 354
314 355
356 Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
357 Handle<String> pattern,
358 JSRegExp::Flags flags) {
359 Handle<Object> value(Heap::undefined_value());
360 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
361 return re;
362 }
363
364
365 Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
366 Handle<String> pattern,
367 JSRegExp::Flags flags,
368 Handle<FixedArray> irregexp_data) {
369 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data);
370 return re;
371 }
372
373
315 static inline Object* DoCompile(String* pattern, 374 static inline Object* DoCompile(String* pattern,
316 JSRegExp::Flags flags, 375 JSRegExp::Flags flags,
317 unsigned* number_of_captures, 376 unsigned* number_of_captures,
318 const char** error_message, 377 const char** error_message,
319 JscreRegExp** code) { 378 JscreRegExp** code) {
320 JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case() 379 JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
321 ? JSRegExpIgnoreCase 380 ? JSRegExpIgnoreCase
322 : JSRegExpDoNotIgnoreCase; 381 : JSRegExpDoNotIgnoreCase;
323 JSRegExpMultilineOption multiline_option = flags.is_multiline() 382 JSRegExpMultilineOption multiline_option = flags.is_multiline()
324 ? JSRegExpMultiline 383 ? JSRegExpMultiline
(...skipping 26 matching lines...) Expand all
351 const char** error_message, 410 const char** error_message,
352 JscreRegExp** code) { 411 JscreRegExp** code) {
353 CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern, 412 CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern,
354 flags, 413 flags,
355 number_of_captures, 414 number_of_captures,
356 error_message, 415 error_message,
357 code)); 416 code));
358 } 417 }
359 418
360 419
361 Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re, 420 Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
362 Handle<String> pattern, 421 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
363 JSRegExp::Flags flags) { 422 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
423
424 Handle<String> pattern(re->Pattern());
425 JSRegExp::Flags flags = re->GetFlags();
426
364 Handle<String> two_byte_pattern = StringToTwoByte(pattern); 427 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
365 428
366 unsigned number_of_captures; 429 unsigned number_of_captures;
367 const char* error_message = NULL; 430 const char* error_message = NULL;
368 431
369 JscreRegExp* code = NULL; 432 JscreRegExp* code = NULL;
370 FlattenString(pattern); 433 FlattenString(pattern);
371 434
372 CompileWithRetryAfterGC(two_byte_pattern, 435 CompileWithRetryAfterGC(two_byte_pattern,
373 flags, 436 flags,
(...skipping 10 matching lines...) Expand all
384 Handle<Object> regexp_err = 447 Handle<Object> regexp_err =
385 Factory::NewSyntaxError("malformed_regexp", array); 448 Factory::NewSyntaxError("malformed_regexp", array);
386 Top::Throw(*regexp_err); 449 Top::Throw(*regexp_err);
387 return Handle<Object>(); 450 return Handle<Object>();
388 } 451 }
389 452
390 // Convert the return address to a ByteArray pointer. 453 // Convert the return address to a ByteArray pointer.
391 Handle<ByteArray> internal( 454 Handle<ByteArray> internal(
392 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code))); 455 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
393 456
394 Handle<FixedArray> value = Factory::NewFixedArray(2); 457 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
395 value->set(CAPTURE_INDEX, Smi::FromInt(number_of_captures)); 458 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
396 value->set(INTERNAL_INDEX, *internal); 459 value->set(kJscreInternalIndex, *internal);
397 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value); 460 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
398 461
399 return re; 462 return re;
400 } 463 }
401 464
402 465
403 Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp, 466 Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp,
404 int num_captures, 467 int num_captures,
405 Handle<String> subject, 468 Handle<String> two_byte_subject,
406 int previous_index, 469 int previous_index,
407 const uc16* two_byte_subject, 470 int* offsets_vector,
408 int* offsets_vector, 471 int offsets_vector_length) {
409 int offsets_vector_length) { 472 #ifdef DEBUG
473 if (FLAG_trace_regexp_bytecodes) {
474 String* pattern = regexp->Pattern();
475 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
476 PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString()));
477 }
478 #endif
479 ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation());
480 ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject)));
481 bool rc;
482 {
Mads Ager (chromium) 2008/11/25 21:09:41 Why is the extra scope needed here?
Erik Corry 2008/11/26 12:18:36 It isn't. Removed.
483 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
484 offsets_vector[i] = -1;
485 }
486
487 LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject));
488
489 FixedArray* irregexp =
490 FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex));
491 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
492
493 switch (tag) {
Mads Ager (chromium) 2008/11/25 21:09:41 The indentation of this switch is off. Indent the
Erik Corry 2008/11/26 12:18:36 Done.
494 case RegExpMacroAssembler::kIA32Implementation: {
495 Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex));
496 SmartPointer<int> captures(NewArray<int>((num_captures + 1) * 2));
497 Address start_addr =
498 Handle<SeqTwoByteString>::cast(two_byte_subject)->GetCharsAddress();
499 int start_offset =
500 start_addr - reinterpret_cast<Address>(*two_byte_subject);
501 int end_offset =
502 start_offset + (two_byte_subject->length() - previous_index) * 2;
503 typedef bool testfunc(String**, int, int, int*);
504 testfunc* test = FUNCTION_CAST<testfunc*>(code->entry());
505 rc = test(two_byte_subject.location(),
506 start_offset,
507 end_offset,
508 *captures);
509 if (rc) {
510 // Capture values are relative to start_offset only.
511 for (int i = 0; i < offsets_vector_length; i++) {
512 if (offsets_vector[i] >= 0) {
513 offsets_vector[i] += previous_index;
514 }
515 }
516 }
517 break;
518 }
519 default:
Mads Ager (chromium) 2008/11/25 21:09:41 Why is this default case in the middle of the rest
Erik Corry 2008/11/26 12:18:36 Because it falls through? Lasse, please take a lo
520 case RegExpMacroAssembler::kARMImplementation:
521 UNREACHABLE();
522 rc = false;
523 break;
524 case RegExpMacroAssembler::kBytecodeImplementation: {
525 Handle<ByteArray> byte_codes = IrregexpCode(regexp);
526
527 rc = IrregexpInterpreter::Match(byte_codes,
528 two_byte_subject,
529 offsets_vector,
530 previous_index);
531 break;
532 }
533 }
534 }
535
536 if (!rc) {
537 return Factory::null_value();
538 }
539
540 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
541 // The captures come in (start, end+1) pairs.
542 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
543 array->set(i, Smi::FromInt(offsets_vector[i]));
544 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
545 }
546 return Factory::NewJSArrayWithElements(array);
547 }
548
549
550 Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
551 int num_captures,
552 Handle<String> subject,
553 int previous_index,
554 const uc16* two_byte_subject,
555 int* offsets_vector,
556 int offsets_vector_length) {
410 int rc; 557 int rc;
411 { 558 {
412 AssertNoAllocation a; 559 AssertNoAllocation a;
413 ByteArray* internal = JsreInternal(regexp); 560 ByteArray* internal = JscreInternal(regexp);
414 const JscreRegExp* js_regexp = 561 const JscreRegExp* js_regexp =
415 reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress()); 562 reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress());
416 563
417 LOG(RegExpExecEvent(regexp, previous_index, subject)); 564 LOG(RegExpExecEvent(regexp, previous_index, subject));
418 565
419 rc = jsRegExpExecute(js_regexp, 566 rc = jsRegExpExecute(js_regexp,
420 two_byte_subject, 567 two_byte_subject,
421 subject->length(), 568 subject->length(),
422 previous_index, 569 previous_index,
423 offsets_vector, 570 offsets_vector,
(...skipping 13 matching lines...) Expand all
437 Handle<Object> code(Smi::FromInt(rc)); 584 Handle<Object> code(Smi::FromInt(rc));
438 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code }; 585 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
439 Handle<Object> regexp_err( 586 Handle<Object> regexp_err(
440 Factory::NewTypeError("jsre_error", HandleVector(args, 2))); 587 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
441 return Handle<Object>(Top::Throw(*regexp_err)); 588 return Handle<Object>(Top::Throw(*regexp_err));
442 } 589 }
443 590
444 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1)); 591 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
445 // The captures come in (start, end+1) pairs. 592 // The captures come in (start, end+1) pairs.
446 for (int i = 0; i < 2 * (num_captures+1); i += 2) { 593 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
447 array->set(i, 594 array->set(i, Smi::FromInt(offsets_vector[i]));
448 Smi::FromInt(offsets_vector[i]), 595 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
449 SKIP_WRITE_BARRIER);
450 array->set(i+1,
451 Smi::FromInt(offsets_vector[i+1]),
452 SKIP_WRITE_BARRIER);
453 } 596 }
454 return Factory::NewJSArrayWithElements(array); 597 return Factory::NewJSArrayWithElements(array);
455 } 598 }
456 599
457 600
458 class OffsetsVector { 601 class OffsetsVector {
459 public: 602 public:
460 inline OffsetsVector(int num_captures) { 603 inline OffsetsVector(int num_registers) :
Mads Ager (chromium) 2008/11/25 21:09:41 Move : to the next line and do a four space indent
461 offsets_vector_length_ = (num_captures + 1) * 3; 604 offsets_vector_length_(num_registers) {
462 if (offsets_vector_length_ > kStaticOffsetsVectorSize) { 605 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
463 vector_ = NewArray<int>(offsets_vector_length_); 606 vector_ = NewArray<int>(offsets_vector_length_);
464 } else { 607 } else {
465 vector_ = static_offsets_vector_; 608 vector_ = static_offsets_vector_;
466 } 609 }
467 } 610 }
468 611
469 612
470 inline ~OffsetsVector() { 613 inline ~OffsetsVector() {
471 if (offsets_vector_length_ > kStaticOffsetsVectorSize) { 614 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
472 DeleteArray(vector_); 615 DeleteArray(vector_);
473 vector_ = NULL; 616 vector_ = NULL;
474 } 617 }
475 } 618 }
476 619
477 620
478 inline int* vector() { 621 inline int* vector() {
479 return vector_; 622 return vector_;
480 } 623 }
481 624
482 625
483 inline int length() { 626 inline int length() {
484 return offsets_vector_length_; 627 return offsets_vector_length_;
485 } 628 }
486 629
487 private: 630 private:
488 int* vector_; 631 int* vector_;
489 int offsets_vector_length_; 632 int offsets_vector_length_;
490 static const int kStaticOffsetsVectorSize = 30; 633 static const int kStaticOffsetsVectorSize = 50;
491 static int static_offsets_vector_[kStaticOffsetsVectorSize]; 634 static int static_offsets_vector_[kStaticOffsetsVectorSize];
492 }; 635 };
493 636
494 637
495 int OffsetsVector::static_offsets_vector_[ 638 int OffsetsVector::static_offsets_vector_[
496 OffsetsVector::kStaticOffsetsVectorSize]; 639 OffsetsVector::kStaticOffsetsVectorSize];
497 640
498 641
499 Handle<Object> RegExpImpl::JsreExec(Handle<JSRegExp> regexp, 642 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
500 Handle<String> subject, 643 Handle<String> subject,
Mads Ager (chromium) 2008/11/25 21:09:41 Indentation is off.
Erik Corry 2008/11/26 12:18:36 Fixed.
501 Handle<Object> index) { 644 Handle<Object> index) {
645 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
646 ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());
647
502 // Prepare space for the return values. 648 // Prepare space for the return values.
503 int num_captures = JsreCapture(regexp); 649 int number_of_registers = IrregexpNumberOfRegisters(regexp);
650 OffsetsVector offsets(number_of_registers);
504 651
505 OffsetsVector offsets(num_captures); 652 int num_captures = IrregexpNumberOfCaptures(regexp);
506 653
507 int previous_index = static_cast<int>(DoubleToInteger(index->Number())); 654 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
508 655
509 Handle<String> subject16 = CachedStringToTwoByte(subject); 656 Handle<String> subject16 = CachedStringToTwoByte(subject);
510 657
511 Handle<Object> result(JsreExecOnce(regexp, num_captures, subject, 658 Handle<Object> result(
512 previous_index, 659 IrregexpExecOnce(regexp,
513 subject16->GetTwoByteData(), 660 num_captures,
514 offsets.vector(), offsets.length())); 661 subject16,
662 previous_index,
663 offsets.vector(),
664 offsets.length()));
665 return result;
666 }
667
668
669 Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
670 Handle<String> subject,
671 Handle<Object> index) {
672 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
673 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
674 Handle<Object> compile_result = JscreCompile(regexp);
675 if (compile_result.is_null()) return compile_result;
676 }
677 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
678
679 int num_captures = JscreNumberOfCaptures(regexp);
680
681 OffsetsVector offsets((num_captures + 1) * 3);
682
683 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
684
685 Handle<String> subject16 = CachedStringToTwoByte(subject);
686
687 Handle<Object> result(JscreExecOnce(regexp,
688 num_captures,
689 subject,
690 previous_index,
691 subject16->GetTwoByteData(),
692 offsets.vector(),
693 offsets.length()));
515 694
516 return result; 695 return result;
517 } 696 }
518 697
519 698
520 Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp, 699 Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
521 Handle<String> subject) { 700 Handle<String> subject) {
701 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
702 ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());
703
522 // Prepare space for the return values. 704 // Prepare space for the return values.
523 int num_captures = JsreCapture(regexp); 705 int number_of_registers = IrregexpNumberOfRegisters(regexp);
524 706 OffsetsVector offsets(number_of_registers);
525 OffsetsVector offsets(num_captures);
526 707
527 int previous_index = 0; 708 int previous_index = 0;
528 709
529 Handle<JSArray> result = Factory::NewJSArray(0); 710 Handle<JSArray> result = Factory::NewJSArray(0);
711 int i = 0;
712 Handle<Object> matches;
713
714 Handle<String> subject16 = CachedStringToTwoByte(subject);
715
716 do {
717 if (previous_index > subject->length() || previous_index < 0) {
718 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
719 // string length, there is no match.
720 matches = Factory::null_value();
721 } else {
722 matches = IrregexpExecOnce(regexp,
723 IrregexpNumberOfCaptures(regexp),
724 subject16,
725 previous_index,
726 offsets.vector(),
727 offsets.length());
728
729 if (matches->IsJSArray()) {
730 SetElement(result, i, matches);
731 i++;
732 previous_index = offsets.vector()[1];
733 if (offsets.vector()[0] == offsets.vector()[1]) {
734 previous_index++;
735 }
736 }
737 }
738 } while (matches->IsJSArray());
739
740 // If we exited the loop with an exception, throw it.
741 if (matches->IsNull()) { // Exited loop normally.
Mads Ager (chromium) 2008/11/25 21:09:41 Move these comments to new lines before the return
Erik Corry 2008/11/26 12:18:36 I'm not sure why that's any better, but fixed. Wh
742 return result;
743 } else { // Exited loop with the exception in matches.
744 return matches;
745 }
746 }
747
748
749 Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
750 Handle<String> subject) {
751 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
752 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
753 Handle<Object> compile_result = JscreCompile(regexp);
754 if (compile_result.is_null()) return compile_result;
755 }
756 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
757
758 // Prepare space for the return values.
759 int num_captures = JscreNumberOfCaptures(regexp);
760
761 OffsetsVector offsets((num_captures + 1) * 3);
762
763 int previous_index = 0;
764
765 Handle<JSArray> result = Factory::NewJSArray(0);
530 int i = 0; 766 int i = 0;
531 Handle<Object> matches; 767 Handle<Object> matches;
532 768
533 Handle<String> subject16 = CachedStringToTwoByte(subject); 769 Handle<String> subject16 = CachedStringToTwoByte(subject);
534 770
535 do { 771 do {
536 if (previous_index > subject->length() || previous_index < 0) { 772 if (previous_index > subject->length() || previous_index < 0) {
537 // Per ECMA-262 15.10.6.2, if the previous index is greater than the 773 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
538 // string length, there is no match. 774 // string length, there is no match.
539 matches = Factory::null_value(); 775 matches = Factory::null_value();
540 } else { 776 } else {
541 matches = JsreExecOnce(regexp, num_captures, subject, previous_index, 777 matches = JscreExecOnce(regexp,
542 subject16->GetTwoByteData(), 778 num_captures,
543 offsets.vector(), offsets.length()); 779 subject,
780 previous_index,
781 subject16->GetTwoByteData(),
782 offsets.vector(),
783 offsets.length());
544 784
545 if (matches->IsJSArray()) { 785 if (matches->IsJSArray()) {
546 SetElement(result, i, matches); 786 SetElement(result, i, matches);
547 i++; 787 i++;
548 previous_index = offsets.vector()[1]; 788 previous_index = offsets.vector()[1];
549 if (offsets.vector()[0] == offsets.vector()[1]) { 789 if (offsets.vector()[0] == offsets.vector()[1]) {
550 previous_index++; 790 previous_index++;
551 } 791 }
552 } 792 }
553 } 793 }
554 } while (matches->IsJSArray()); 794 } while (matches->IsJSArray());
555 795
556 // If we exited the loop with an exception, throw it. 796 // If we exited the loop with an exception, throw it.
557 if (matches->IsNull()) { // Exited loop normally. 797 if (matches->IsNull()) { // Exited loop normally.
558 return result; 798 return result;
559 } else { // Exited loop with the exception in matches. 799 } else { // Exited loop with the exception in matches.
560 return matches; 800 return matches;
561 } 801 }
562 } 802 }
563 803
564 804
565 int RegExpImpl::JsreCapture(Handle<JSRegExp> re) { 805 int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
566 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); 806 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
567 return Smi::cast(value->get(CAPTURE_INDEX))->value(); 807 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->
568 } 808 value();
Mads Ager (chromium) 2008/11/25 21:09:41 This should fit on the previous line.
Erik Corry 2008/11/26 12:18:36 Done.
569 809 }
570 810
571 ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) { 811
812 ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
572 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); 813 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
573 return ByteArray::cast(value->get(INTERNAL_INDEX)); 814 return ByteArray::cast(value->get(kJscreInternalIndex));
574 } 815 }
816
817
818 int RegExpImpl::IrregexpNumberOfCaptures(Handle<JSRegExp> re) {
819 FixedArray* value =
820 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
821 return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value();
822 }
823
824
825 int RegExpImpl::IrregexpNumberOfRegisters(Handle<JSRegExp> re) {
826 FixedArray* value =
827 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
828 return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value();
829 }
830
831
832 Handle<ByteArray> RegExpImpl::IrregexpCode(Handle<JSRegExp> re) {
833 FixedArray* value =
834 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
835 return Handle<ByteArray>(ByteArray::cast(value->get(kIrregexpCodeIndex)));
836 }
837
838
839 // -------------------------------------------------------------------
840 // New regular expression engine
Erik Corry 2008/11/25 11:03:52 Probably should mention "Irregexp"
Erik Corry 2008/11/26 12:18:36 Fixed.
841
842
843 void RegExpTree::AppendToText(RegExpText* text) {
844 UNREACHABLE();
845 }
846
847
848 void RegExpAtom::AppendToText(RegExpText* text) {
849 text->AddElement(TextElement::Atom(this));
850 }
851
852
853 void RegExpCharacterClass::AppendToText(RegExpText* text) {
854 text->AddElement(TextElement::CharClass(this));
855 }
856
857
858 void RegExpText::AppendToText(RegExpText* text) {
859 for (int i = 0; i < elements()->length(); i++)
860 text->AddElement(elements()->at(i));
861 }
862
863
864 TextElement TextElement::Atom(RegExpAtom* atom) {
865 TextElement result = TextElement(ATOM);
866 result.data.u_atom = atom;
867 return result;
868 }
869
870
871 TextElement TextElement::CharClass(
872 RegExpCharacterClass* char_class) {
873 TextElement result = TextElement(CHAR_CLASS);
874 result.data.u_char_class = char_class;
875 return result;
876 }
877
878
879 class RegExpCompiler {
880 public:
881 RegExpCompiler(int capture_count, bool ignore_case);
882
883 int AllocateRegister() { return next_register_++; }
884
885 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
886 RegExpNode* start,
887 int capture_count);
888
889 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
890
891 static const int kImplementationOffset = 0;
892 static const int kNumberOfRegistersOffset = 0;
893 static const int kCodeOffset = 1;
894
895 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
896 EndNode* accept() { return accept_; }
897 EndNode* backtrack() { return backtrack_; }
898
899 static const int kMaxRecursion = 100;
900 inline int recursion_depth() { return recursion_depth_; }
901 inline void IncrementRecursionDepth() { recursion_depth_++; }
902 inline void DecrementRecursionDepth() { recursion_depth_--; }
903
904 inline bool is_case_independent() { return is_case_independent_; }
905
906 private:
907 EndNode* accept_;
908 EndNode* backtrack_;
909 int next_register_;
910 List<RegExpNode*>* work_list_;
911 int recursion_depth_;
912 RegExpMacroAssembler* macro_assembler_;
913 bool is_case_independent_;
914 };
915
916
917 // Attempts to compile the regexp using an Irregexp code generator. Returns
918 // a fixed array or a null handle depending on whether it succeeded.
919 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)
920 : next_register_(2 * (capture_count + 1)),
Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.
Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.
921 work_list_(NULL),
922 recursion_depth_(0),
923 is_case_independent_(ignore_case) {
924 accept_ = new EndNode(EndNode::ACCEPT);
925 backtrack_ = new EndNode(EndNode::BACKTRACK);
926 }
927
928
929 Handle<FixedArray> RegExpCompiler::Assemble(
930 RegExpMacroAssembler* macro_assembler,
931 RegExpNode* start,
932 int capture_count) {
933 if (!FLAG_attempt_case_independent && is_case_independent_) {
934 return Handle<FixedArray>::null();
935 }
936 macro_assembler_ = macro_assembler;
937 List <RegExpNode*> work_list(0);
938 work_list_ = &work_list;
939 Label fail;
940 macro_assembler->PushBacktrack(&fail);
941 if (!start->GoTo(this)) {
942 fail.Unuse();
943 return Handle<FixedArray>::null();
944 }
945 while (!work_list.is_empty()) {
946 if (!work_list.RemoveLast()->GoTo(this)) {
947 fail.Unuse();
948 return Handle<FixedArray>::null();
949 }
950 }
951 macro_assembler->Bind(&fail);
952 macro_assembler->Fail();
953 Handle<FixedArray> array =
954 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
955 array->set(RegExpImpl::kIrregexpImplementationIndex,
956 Smi::FromInt(macro_assembler->Implementation()));
957 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
958 Smi::FromInt(next_register_));
959 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
960 Smi::FromInt(capture_count));
961 Handle<Object> code = macro_assembler->GetCode();
962 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
963 work_list_ = NULL;
964 return array;
965 }
966
967
968 bool RegExpNode::GoTo(RegExpCompiler* compiler) {
969 // TODO(erikcorry): Implement support.
970 if (info_.follows_word_interest ||
971 info_.follows_newline_interest ||
972 info_.follows_start_interest) {
973 return false;
974 }
975 if (label_.is_bound()) {
976 compiler->macro_assembler()->GoTo(&label_);
977 return true;
978 } else {
Mads Ager (chromium) 2008/11/25 21:09:41 Most of this nesting could go away since all of th
Christian Plesner Hansen 2008/11/26 06:49:56 Nesting aids readability: you can trivially see wh
979 if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) {
980 compiler->macro_assembler()->GoTo(&label_);
981 compiler->AddWork(this);
982 return true;
983 } else {
984 compiler->IncrementRecursionDepth();
985 bool how_it_went = Emit(compiler);
986 compiler->DecrementRecursionDepth();
987 return how_it_went;
988 }
989 }
990 }
991
992
993 bool EndNode::GoTo(RegExpCompiler* compiler) {
994 if (info()->follows_word_interest ||
995 info()->follows_newline_interest ||
996 info()->follows_start_interest) {
997 return false;
998 }
999 if (!label()->is_bound()) {
1000 Bind(compiler->macro_assembler());
1001 }
1002 switch (action_) {
1003 case ACCEPT:
1004 compiler->macro_assembler()->Succeed();
1005 break;
Mads Ager (chromium) 2008/11/25 21:09:41 Indent the break.
Erik Corry 2008/11/26 12:18:36 Fixed.
1006 case BACKTRACK:
1007 compiler->macro_assembler()->Backtrack();
1008 break;
Mads Ager (chromium) 2008/11/25 21:09:41 Indent the break.
Erik Corry 2008/11/26 12:18:36 Fixed.
1009 }
1010 return true;
1011 }
1012
1013
1014 Label* RegExpNode::label() {
1015 return &label_;
1016 }
1017
1018
1019 bool EndNode::Emit(RegExpCompiler* compiler) {
1020 RegExpMacroAssembler* macro = compiler->macro_assembler();
1021 switch (action_) {
1022 case ACCEPT:
1023 Bind(macro);
1024 macro->Succeed();
1025 return true;
1026 case BACKTRACK:
1027 Bind(macro);
1028 macro->Backtrack();
1029 return true;
1030 }
1031 return false;
1032 }
1033
1034
1035 void GuardedAlternative::AddGuard(Guard* guard) {
1036 if (guards_ == NULL)
1037 guards_ = new ZoneList<Guard*>(1);
1038 guards_->Add(guard);
1039 }
1040
1041
1042 ActionNode* ActionNode::StoreRegister(int reg,
1043 int val,
1044 RegExpNode* on_success) {
1045 ActionNode* result = new ActionNode(STORE_REGISTER, on_success);
1046 result->data_.u_store_register.reg = reg;
1047 result->data_.u_store_register.value = val;
1048 return result;
1049 }
1050
1051
1052 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1053 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1054 result->data_.u_increment_register.reg = reg;
1055 return result;
1056 }
1057
1058
1059 ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {
1060 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1061 result->data_.u_position_register.reg = reg;
1062 return result;
1063 }
1064
1065
1066 ActionNode* ActionNode::SavePosition(int reg, RegExpNode* on_success) {
1067 ActionNode* result = new ActionNode(SAVE_POSITION, on_success);
1068 result->data_.u_position_register.reg = reg;
1069 return result;
1070 }
1071
1072
1073 ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) {
1074 ActionNode* result = new ActionNode(RESTORE_POSITION, on_success);
1075 result->data_.u_position_register.reg = reg;
1076 return result;
1077 }
1078
1079
1080 ActionNode* ActionNode::BeginSubmatch(int reg, RegExpNode* on_success) {
1081 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1082 result->data_.u_submatch_stack_pointer_register.reg = reg;
1083 return result;
1084 }
1085
1086
1087 ActionNode* ActionNode::EscapeSubmatch(int reg, RegExpNode* on_success) {
1088 ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success);
1089 result->data_.u_submatch_stack_pointer_register.reg = reg;
1090 return result;
1091 }
1092
1093
1094 #define DEFINE_ACCEPT(Type) \
1095 void Type##Node::Accept(NodeVisitor* visitor) { \
1096 visitor->Visit##Type(this); \
1097 }
1098 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1099 #undef DEFINE_ACCEPT
1100
1101
1102 // -------------------------------------------------------------------
1103 // Emit code.
1104
1105
1106 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1107 Guard* guard,
1108 Label* on_failure) {
1109 switch (guard->op()) {
1110 case Guard::LT:
1111 macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure);
1112 break;
1113 case Guard::GEQ:
1114 macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure);
1115 break;
1116 }
1117 }
1118
1119
1120 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1121 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1122
1123
1124 static inline void EmitAtomNonLetters(
1125 RegExpMacroAssembler* macro_assembler,
1126 TextElement elm,
1127 Vector<const uc16> quarks,
1128 Label* on_failure,
1129 int cp_offset) {
1130 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1131 for (int i = quarks.length() - 1; i >= 0; i--) {
1132 uc16 c = quarks[i];
1133 int length = uncanonicalize.get(c, '\0', chars);
1134 if (length <= 1) {
1135 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
1136 macro_assembler->CheckNotCharacter(c, on_failure);
1137 }
1138 }
1139 }
1140
1141
1142 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1143 uc16 c1,
1144 uc16 c2,
1145 Label* on_failure) {
1146 uc16 exor = c1 ^ c2;
1147 // Check whether exor has only one bit set.
1148 if (((exor - 1) & exor) == 0) {
Mads Ager (chromium) 2008/11/25 21:09:41 The if-else could be replaced by two ifs that each
Erik Corry 2008/11/26 12:18:36 Fixed.
1149 // If c1 and c2 differ only by one bit.
1150 // Ecma262UnCanonicalize always gives the highest number last.
1151 ASSERT(c2 > c1);
1152 macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);
1153 return true;
1154 } else {
1155 ASSERT(c2 > c1);
1156 uc16 diff = c2 - c1;
1157 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1158 // If the characters differ by 2^n but don't differ by one bit then
1159 // subtract the difference from the found character, then do the or
1160 // trick. We avoid the theoretical case where negative numbers are
1161 // involved in order to simplify code generation.
1162 macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,
1163 diff,
1164 on_failure);
1165 return true;
1166 }
1167 }
1168 return false;
1169 }
1170
1171
1172 static inline void EmitAtomLetters(
1173 RegExpMacroAssembler* macro_assembler,
1174 TextElement elm,
1175 Vector<const uc16> quarks,
1176 Label* on_failure,
1177 int cp_offset) {
1178 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1179 for (int i = quarks.length() - 1; i >= 0; i--) {
1180 uc16 c = quarks[i];
1181 int length = uncanonicalize.get(c, '\0', chars);
1182 if (length <= 1) continue;
1183 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
1184 Label ok;
1185 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1186 switch (length) {
1187 case 2: {
1188 if (ShortCutEmitCharacterPair(macro_assembler,
1189 chars[0],
1190 chars[1],
1191 on_failure)) {
1192 ok.Unuse();
1193 } else {
1194 macro_assembler->CheckCharacter(chars[0], &ok);
1195 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1196 macro_assembler->Bind(&ok);
1197 }
1198 break;
1199 }
1200 case 4:
1201 macro_assembler->CheckCharacter(chars[3], &ok);
1202 // Fall through!
1203 case 3:
1204 macro_assembler->CheckCharacter(chars[0], &ok);
1205 macro_assembler->CheckCharacter(chars[1], &ok);
1206 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1207 macro_assembler->Bind(&ok);
1208 break;
1209 default:
1210 UNREACHABLE();
1211 break;
1212 }
1213 }
1214 }
1215
1216
1217 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1218 RegExpCharacterClass* cc,
1219 int cp_offset,
1220 Label* on_failure) {
1221 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
1222 cp_offset++;
1223
1224 ZoneList<CharacterRange>* ranges = cc->ranges();
1225
1226 Label success;
1227
1228 Label *char_is_in_class =
Erik Corry 2008/11/25 11:03:52 Misplaced asterisk.
Erik Corry 2008/11/26 12:18:36 Fixed.
1229 cc->is_negated() ? on_failure : &success;
1230
1231 int range_count = ranges->length();
1232
1233 if (range_count == 0) {
1234 if (!cc->is_negated()) {
1235 macro_assembler->GoTo(on_failure);
1236 }
1237 return;
1238 }
1239
1240 for (int i = 0; i < range_count - 1; i++) {
1241 CharacterRange& range = ranges->at(i);
1242 Label next_range;
1243 uc16 from = range.from();
1244 uc16 to = range.to();
1245 if (to == from) {
1246 macro_assembler->CheckCharacter(to, char_is_in_class);
1247 } else {
1248 if (from != 0) {
1249 macro_assembler->CheckCharacterLT(from, &next_range);
1250 }
1251 if (to != 0xffff) {
1252 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1253 } else {
1254 macro_assembler->GoTo(char_is_in_class);
1255 }
1256 }
1257 macro_assembler->Bind(&next_range);
1258 }
1259
1260 CharacterRange& range = ranges->at(range_count - 1);
1261 uc16 from = range.from();
1262 uc16 to = range.to();
1263
1264 if (to == from) {
1265 if (cc->is_negated()) {
1266 macro_assembler->CheckCharacter(to, on_failure);
1267 } else {
1268 macro_assembler->CheckNotCharacter(to, on_failure);
1269 }
1270 } else {
1271 if (from != 0) {
1272 if (!cc->is_negated()) {
1273 macro_assembler->CheckCharacterLT(from, on_failure);
1274 } else {
1275 macro_assembler->CheckCharacterLT(from, &success);
1276 }
1277 }
1278 if (to != 0xffff) {
1279 if (!cc->is_negated()) {
1280 macro_assembler->CheckCharacterGT(to, on_failure);
1281 } else {
1282 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1283 }
1284 } else {
1285 if (cc->is_negated()) {
1286 macro_assembler->GoTo(on_failure);
1287 }
1288 }
1289 }
1290 macro_assembler->Bind(&success);
1291 }
1292
1293
1294
1295 bool TextNode::Emit(RegExpCompiler* compiler) {
1296 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1297 Bind(macro_assembler);
1298 int element_count = elms_->length();
1299 int cp_offset = 0;
1300 // First, handle straight character matches.
1301 for (int i = 0; i < element_count; i++) {
1302 TextElement elm = elms_->at(i);
1303 if (elm.type == TextElement::ATOM) {
1304 Vector<const uc16> quarks = elm.data.u_atom->data();
1305 if (!compiler->is_case_independent()) {
1306 macro_assembler->CheckCharacters(quarks,
1307 cp_offset,
1308 on_failure_->label());
1309 } else {
1310 EmitAtomNonLetters(macro_assembler,
1311 elm,
1312 quarks,
1313 on_failure_->label(),
1314 cp_offset);
1315 }
1316 cp_offset += quarks.length();
1317 } else {
1318 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
1319 cp_offset++;
1320 }
1321 }
1322 // Second, handle case independent letter matches if any.
1323 if (compiler->is_case_independent()) {
1324 cp_offset = 0;
1325 for (int i = 0; i < element_count; i++) {
1326 TextElement elm = elms_->at(i);
1327 if (elm.type == TextElement::ATOM) {
1328 Vector<const uc16> quarks = elm.data.u_atom->data();
1329 EmitAtomLetters(macro_assembler,
1330 elm,
1331 quarks,
1332 on_failure_->label(),
1333 cp_offset);
1334 cp_offset += quarks.length();
1335 } else {
1336 cp_offset++;
1337 }
1338 }
1339 }
1340 // If the fast character matches passed then do the character classes.
1341 cp_offset = 0;
1342 for (int i = 0; i < element_count; i++) {
1343 TextElement elm = elms_->at(i);
1344 if (elm.type == TextElement::CHAR_CLASS) {
1345 RegExpCharacterClass* cc = elm.data.u_char_class;
1346 EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label());
1347 cp_offset ++;
1348 } else {
1349 cp_offset += elm.data.u_atom->data().length();
1350 }
1351 }
1352
1353 compiler->AddWork(on_failure_);
1354 macro_assembler->AdvanceCurrentPosition(cp_offset);
1355 return on_success()->GoTo(compiler);
1356 }
1357
1358
1359 bool ChoiceNode::Emit(RegExpCompiler* compiler) {
1360 int choice_count = alternatives_->length();
1361 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1362 Bind(macro_assembler);
1363 // For now we just call all choices one after the other. The idea ultimately
1364 // is to use the Dispatch table to try only the relevant ones.
1365 int i;
Mads Ager (chromium) 2008/11/25 21:09:41 I don't see how the i is needed after the for loop
Erik Corry 2008/11/26 12:18:36 Fixed.
1366 for (i = 0; i < choice_count - 1; i++) {
1367 GuardedAlternative alternative = alternatives_->at(i);
1368 Label after;
1369 Label after_no_pop_cp;
1370 ZoneList<Guard*>* guards = alternative.guards();
1371 if (guards != NULL) {
1372 int guard_count = guards->length();
1373 for (int j = 0; j < guard_count; j++) {
1374 GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp);
1375 }
1376 }
1377 macro_assembler->PushCurrentPosition();
1378 macro_assembler->PushBacktrack(&after);
1379 if (!alternative.node()->GoTo(compiler)) {
1380 after.Unuse();
1381 after_no_pop_cp.Unuse();
1382 return false;
1383 }
1384 macro_assembler->Bind(&after);
1385 macro_assembler->PopCurrentPosition();
1386 macro_assembler->Bind(&after_no_pop_cp);
1387 }
1388 GuardedAlternative alternative = alternatives_->at(i);
1389 ZoneList<Guard*>* guards = alternative.guards();
1390 if (guards != NULL) {
1391 int guard_count = guards->length();
1392 for (int j = 0; j < guard_count; j++) {
1393 GenerateGuard(macro_assembler, guards->at(j), on_failure_->label());
1394 }
1395 }
1396 if (!on_failure_->IsBacktrack()) {
1397 ASSERT_NOT_NULL(on_failure_ -> label());
1398 macro_assembler->PushBacktrack(on_failure_->label());
1399 compiler->AddWork(on_failure_);
1400 }
1401 if (!alternative.node()->GoTo(compiler)) {
1402 return false;
1403 }
1404 return true;
1405 }
1406
1407
1408 bool ActionNode::Emit(RegExpCompiler* compiler) {
1409 RegExpMacroAssembler* macro = compiler->macro_assembler();
1410 Bind(macro);
1411 switch (type_) {
1412 case STORE_REGISTER:
1413 macro->SetRegister(data_.u_store_register.reg,
1414 data_.u_store_register.value);
1415 break;
1416 case INCREMENT_REGISTER: {
1417 Label undo;
1418 macro->PushBacktrack(&undo);
1419 macro->AdvanceRegister(data_.u_increment_register.reg, 1);
1420 bool ok = on_success()->GoTo(compiler);
1421 if (!ok) {
1422 undo.Unuse();
1423 return false;
1424 }
1425 macro->Bind(&undo);
1426 macro->AdvanceRegister(data_.u_increment_register.reg, -1);
1427 macro->Backtrack();
1428 break;
1429 }
1430 case STORE_POSITION: {
1431 Label undo;
1432 macro->PushRegister(data_.u_position_register.reg);
1433 macro->PushBacktrack(&undo);
1434 macro->WriteCurrentPositionToRegister(data_.u_position_register.reg);
1435 bool ok = on_success()->GoTo(compiler);
1436 if (!ok) {
1437 undo.Unuse();
1438 return false;
1439 }
1440 macro->Bind(&undo);
1441 macro->PopRegister(data_.u_position_register.reg);
1442 macro->Backtrack();
1443 break;
1444 }
1445 case SAVE_POSITION:
1446 macro->WriteCurrentPositionToRegister(
1447 data_.u_position_register.reg);
1448 break;
1449 case RESTORE_POSITION:
1450 macro->ReadCurrentPositionFromRegister(
1451 data_.u_position_register.reg);
1452 break;
1453 case BEGIN_SUBMATCH:
1454 macro->WriteStackPointerToRegister(
1455 data_.u_submatch_stack_pointer_register.reg);
1456 break;
1457 case ESCAPE_SUBMATCH:
1458 macro->ReadStackPointerFromRegister(
1459 data_.u_submatch_stack_pointer_register.reg);
1460 break;
1461 default:
1462 UNREACHABLE();
1463 return false;
1464 }
1465 return on_success()->GoTo(compiler);
1466 }
1467
1468
1469 bool BackReferenceNode::Emit(RegExpCompiler* compiler) {
1470 RegExpMacroAssembler* macro = compiler->macro_assembler();
1471 Bind(macro);
1472 // Check whether the registers are uninitialized and always
1473 // succeed if they are.
1474 macro->IfRegisterLT(start_reg_, 0, on_success()->label());
1475 macro->IfRegisterLT(end_reg_, 0, on_success()->label());
1476 ASSERT_EQ(start_reg_ + 1, end_reg_);
1477 macro->CheckNotBackReference(start_reg_, on_failure_->label());
1478 return on_success()->GoTo(compiler);
1479 }
1480
1481
1482 // -------------------------------------------------------------------
1483 // Dot/dotty output
1484
1485
1486 #ifdef DEBUG
1487
1488
1489 class DotPrinter: public NodeVisitor {
1490 public:
1491 DotPrinter() : stream_(&alloc_) { }
1492 void PrintNode(const char* label, RegExpNode* node);
1493 void Visit(RegExpNode* node);
1494 void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure);
1495 StringStream* stream() { return &stream_; }
1496 #define DECLARE_VISIT(Type) \
1497 virtual void Visit##Type(Type##Node* that);
1498 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1499 #undef DECLARE_VISIT
1500 private:
1501 HeapStringAllocator alloc_;
1502 StringStream stream_;
1503 std::set<RegExpNode*> seen_;
1504 };
1505
1506
1507 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
1508 stream()->Add("digraph G {\n graph [label=\"");
1509 for (int i = 0; label[i]; i++) {
1510 switch (label[i]) {
1511 case '\\':
1512 stream()->Add("\\\\");
1513 break;
1514 case '"':
1515 stream()->Add("\"");
1516 break;
1517 default:
1518 stream()->Put(label[i]);
1519 break;
1520 }
1521 }
1522 stream()->Add("\"];\n");
1523 Visit(node);
1524 stream()->Add("}\n");
1525 printf("%s", *(stream()->ToCString()));
1526 }
1527
1528
1529 void DotPrinter::Visit(RegExpNode* node) {
1530 if (seen_.find(node) != seen_.end())
1531 return;
1532 seen_.insert(node);
1533 node->Accept(this);
1534 }
1535
1536
1537 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
1538 if (on_failure->IsBacktrack()) return;
1539 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
1540 Visit(on_failure);
1541 }
1542
1543
1544 class TableEntryBodyPrinter {
1545 public:
1546 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
1547 : stream_(stream), choice_(choice) { }
Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.
Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.
1548 void Call(uc16 from, DispatchTable::Entry entry) {
1549 OutSet* out_set = entry.out_set();
1550 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1551 if (out_set->Get(i)) {
1552 stream()->Add(" n%p:s%io%i -> n%p;\n",
1553 choice(),
1554 from,
1555 i,
1556 choice()->alternatives()->at(i).node());
1557 }
1558 }
1559 }
1560 private:
1561 StringStream* stream() { return stream_; }
1562 ChoiceNode* choice() { return choice_; }
1563 StringStream* stream_;
1564 ChoiceNode* choice_;
1565 };
1566
1567
1568 class TableEntryHeaderPrinter {
1569 public:
1570 explicit TableEntryHeaderPrinter(StringStream* stream)
1571 : first_(true), stream_(stream) { }
Mads Ager (chromium) 2008/11/25 21:09:41 Four space indent.
Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.
1572 void Call(uc16 from, DispatchTable::Entry entry) {
1573 if (first_) {
1574 first_ = false;
1575 } else {
1576 stream()->Add("|");
1577 }
1578 stream()->Add("{\\%k-\\%k|{", from, entry.to());
1579 OutSet* out_set = entry.out_set();
1580 int priority = 0;
1581 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1582 if (out_set->Get(i)) {
1583 if (priority > 0) stream()->Add("|");
1584 stream()->Add("<s%io%i> %i", from, i, priority);
1585 priority++;
1586 }
1587 }
1588 stream()->Add("}}");
1589 }
1590 private:
1591 bool first_;
1592 StringStream* stream() { return stream_; }
1593 StringStream* stream_;
1594 };
1595
1596
1597 void DotPrinter::VisitChoice(ChoiceNode* that) {
1598 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
1599 TableEntryHeaderPrinter header_printer(stream());
1600 that->table()->ForEach(&header_printer);
1601 stream()->Add("\"]\n", that);
1602 TableEntryBodyPrinter body_printer(stream(), that);
1603 that->table()->ForEach(&body_printer);
1604 PrintOnFailure(that, that->on_failure());
1605 for (int i = 0; i < that->alternatives()->length(); i++) {
1606 GuardedAlternative alt = that->alternatives()->at(i);
1607 alt.node()->Accept(this);
1608 }
1609 }
1610
1611
1612 void DotPrinter::VisitText(TextNode* that) {
1613 stream()->Add(" n%p [label=\"", that);
1614 for (int i = 0; i < that->elements()->length(); i++) {
1615 if (i > 0) stream()->Add(" ");
1616 TextElement elm = that->elements()->at(i);
1617 switch (elm.type) {
1618 case TextElement::ATOM: {
1619 stream()->Add("'%w'", elm.data.u_atom->data());
1620 break;
1621 }
1622 case TextElement::CHAR_CLASS: {
1623 RegExpCharacterClass* node = elm.data.u_char_class;
1624 stream()->Add("[");
1625 if (node->is_negated())
1626 stream()->Add("^");
1627 for (int j = 0; j < node->ranges()->length(); j++) {
1628 CharacterRange range = node->ranges()->at(j);
1629 stream()->Add("%k-%k", range.from(), range.to());
1630 }
1631 stream()->Add("]");
1632 break;
1633 }
1634 default:
1635 UNREACHABLE();
1636 }
1637 }
1638 stream()->Add("\", shape=box, peripheries=2];\n");
1639 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1640 Visit(that->on_success());
1641 PrintOnFailure(that, that->on_failure());
1642 }
1643
1644
1645 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
1646 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
1647 that,
1648 that->start_register(),
1649 that->end_register());
1650 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1651 Visit(that->on_success());
1652 PrintOnFailure(that, that->on_failure());
1653 }
1654
1655
1656 void DotPrinter::VisitEnd(EndNode* that) {
1657 stream()->Add(" n%p [style=bold, shape=point];\n", that);
1658 }
1659
1660
1661 void DotPrinter::VisitAction(ActionNode* that) {
1662 stream()->Add(" n%p [", that);
1663 switch (that->type_) {
1664 case ActionNode::STORE_REGISTER:
1665 stream()->Add("label=\"$%i:=%i\", shape=octagon",
1666 that->data_.u_store_register.reg,
1667 that->data_.u_store_register.value);
1668 break;
1669 case ActionNode::INCREMENT_REGISTER:
1670 stream()->Add("label=\"$%i++\", shape=octagon",
1671 that->data_.u_increment_register.reg);
1672 break;
1673 case ActionNode::STORE_POSITION:
1674 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
1675 that->data_.u_position_register.reg);
1676 break;
1677 case ActionNode::SAVE_POSITION:
1678 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
1679 that->data_.u_position_register.reg);
1680 break;
1681 case ActionNode::RESTORE_POSITION:
1682 stream()->Add("label=\"$pos:=$%i\", shape=octagon",
1683 that->data_.u_position_register.reg);
1684 break;
1685 case ActionNode::BEGIN_SUBMATCH:
1686 stream()->Add("label=\"begin\", shape=septagon");
1687 break;
1688 case ActionNode::ESCAPE_SUBMATCH:
1689 stream()->Add("label=\"escape\", shape=septagon");
1690 break;
1691 }
1692 stream()->Add("];\n");
1693 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1694 Visit(that->on_success());
1695 }
1696
1697
1698 class DispatchTableDumper {
1699 public:
1700 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
1701 void Call(uc16 key, DispatchTable::Entry entry);
1702 StringStream* stream() { return stream_; }
1703 private:
1704 StringStream* stream_;
1705 };
1706
1707
1708 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
1709 stream()->Add("[%k-%k]: {", key, entry.to());
1710 OutSet* set = entry.out_set();
1711 bool first = true;
1712 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1713 if (set->Get(i)) {
1714 if (first) {
1715 first = false;
1716 } else {
1717 stream()->Add(", ");
1718 }
1719 stream()->Add("%i", i);
1720 }
1721 }
1722 stream()->Add("}\n");
1723 }
1724
1725
1726 void DispatchTable::Dump() {
1727 HeapStringAllocator alloc;
1728 StringStream stream(&alloc);
1729 DispatchTableDumper dumper(&stream);
1730 tree()->ForEach(&dumper);
1731 OS::PrintError("%s", *stream.ToCString());
1732 }
1733
1734
1735 void RegExpEngine::DotPrint(const char* label, RegExpNode* node) {
1736 DotPrinter printer;
1737 printer.PrintNode(label, node);
1738 }
1739
1740
1741 #endif // DEBUG
1742
1743
1744 // -------------------------------------------------------------------
1745 // Tree to graph conversion
1746
1747
1748 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
1749 RegExpNode* on_success,
1750 RegExpNode* on_failure) {
1751 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
1752 elms->Add(TextElement::Atom(this));
1753 return new TextNode(elms, on_success, on_failure);
1754 }
1755
1756
1757 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
1758 RegExpNode* on_success,
1759 RegExpNode* on_failure) {
1760 return new TextNode(elements(), on_success, on_failure);
1761 }
1762
1763
1764 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
1765 RegExpNode* on_success,
1766 RegExpNode* on_failure) {
1767 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
1768 elms->Add(TextElement::CharClass(this));
1769 return new TextNode(elms, on_success, on_failure);
1770 }
1771
1772
1773 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
1774 RegExpNode* on_success,
1775 RegExpNode* on_failure) {
1776 ZoneList<RegExpTree*>* alternatives = this->alternatives();
1777 int length = alternatives->length();
1778 ChoiceNode* result = new ChoiceNode(length, on_failure);
1779 for (int i = 0; i < length; i++) {
1780 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
1781 on_success,
1782 on_failure));
1783 result->AddAlternative(alternative);
1784 }
1785 return result;
1786 }
1787
1788
1789 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
1790 RegExpNode* on_success,
1791 RegExpNode* on_failure) {
1792 return ToNode(min(),
1793 max(),
1794 is_greedy(),
1795 body(),
1796 compiler,
1797 on_success,
1798 on_failure);
1799 }
1800
1801
1802 RegExpNode* RegExpQuantifier::ToNode(int min,
1803 int max,
1804 bool is_greedy,
1805 RegExpTree* body,
1806 RegExpCompiler* compiler,
1807 RegExpNode* on_success,
1808 RegExpNode* on_failure) {
1809 // x{f, t} becomes this:
1810 //
1811 // (r++)<-.
1812 // | `
1813 // | (x)
1814 // v ^
1815 // (r=0)-->(?)---/ [if r < t]
1816 // |
1817 // [if r >= f] \----> ...
1818 //
1819 //
1820 // TODO(someone): clear captures on repetition and handle empty
1821 // matches.
1822 bool has_min = min > 0;
1823 bool has_max = max < RegExpQuantifier::kInfinity;
1824 bool needs_counter = has_min || has_max;
1825 int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
1826 ChoiceNode* center = new ChoiceNode(2, on_failure);
1827 RegExpNode* loop_return = needs_counter
1828 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
1829 : static_cast<RegExpNode*>(center);
1830 RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure);
1831 GuardedAlternative body_alt(body_node);
1832 if (has_max) {
1833 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
1834 body_alt.AddGuard(body_guard);
1835 }
1836 GuardedAlternative rest_alt(on_success);
1837 if (has_min) {
1838 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
1839 rest_alt.AddGuard(rest_guard);
1840 }
1841 if (is_greedy) {
1842 center->AddAlternative(body_alt);
1843 center->AddAlternative(rest_alt);
1844 } else {
1845 center->AddAlternative(rest_alt);
1846 center->AddAlternative(body_alt);
1847 }
1848 if (needs_counter) {
1849 return ActionNode::StoreRegister(reg_ctr, 0, center);
1850 } else {
1851 return center;
1852 }
1853 }
1854
1855
1856 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
1857 RegExpNode* on_success,
1858 RegExpNode* on_failure) {
1859 NodeInfo info;
1860 switch (type()) {
1861 case START_OF_LINE:
1862 info.follows_newline_interest = true;
1863 break;
1864 case START_OF_INPUT:
1865 info.follows_start_interest = true;
1866 break;
1867 case BOUNDARY: case NON_BOUNDARY:
1868 info.follows_word_interest = true;
1869 break;
1870 case END_OF_LINE: case END_OF_INPUT:
1871 // This is wrong but has the effect of making the compiler abort.
1872 info.follows_start_interest = true;
1873 }
1874 return on_success->PropagateInterest(&info);
1875 }
1876
1877
1878 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
1879 RegExpNode* on_success,
1880 RegExpNode* on_failure) {
1881 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
1882 RegExpCapture::EndRegister(index()),
1883 on_success,
1884 on_failure);
1885 }
1886
1887
1888 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
1889 RegExpNode* on_success,
1890 RegExpNode* on_failure) {
1891 return on_success;
1892 }
1893
1894
1895 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
1896 RegExpNode* on_success,
1897 RegExpNode* on_failure) {
1898 int stack_pointer_register = compiler->AllocateRegister();
1899 int position_register = compiler->AllocateRegister();
1900 if (is_positive()) {
1901 // begin submatch scope
1902 // $reg = $pos
1903 // if [body]
1904 // then
1905 // $pos = $reg
1906 // escape submatch scope (drop all backtracks created in scope)
1907 // succeed
1908 // else
1909 // end submatch scope (nothing to clean up, just exit the scope)
1910 // fail
1911 return ActionNode::BeginSubmatch(
1912 stack_pointer_register,
1913 ActionNode::SavePosition(
1914 position_register,
1915 body()->ToNode(
1916 compiler,
1917 ActionNode::RestorePosition(
1918 position_register,
1919 ActionNode::EscapeSubmatch(stack_pointer_register,
1920 on_success)),
1921 on_failure)));
1922 } else {
1923 // begin submatch scope
1924 // try
1925 // first if (body)
1926 // then
1927 // escape submatch scope
1928 // fail
1929 // else
1930 // backtrack
1931 // second
1932 // end submatch scope
1933 // restore current position
1934 // succeed
1935 ChoiceNode* try_node =
1936 new ChoiceNode(1, ActionNode::RestorePosition(position_register,
1937 on_success));
1938 RegExpNode* body_node = body()->ToNode(
1939 compiler,
1940 ActionNode::EscapeSubmatch(stack_pointer_register, on_failure),
1941 compiler->backtrack());
1942 GuardedAlternative body_alt(body_node);
1943 try_node->AddAlternative(body_alt);
1944 return ActionNode::BeginSubmatch(stack_pointer_register,
1945 ActionNode::SavePosition(
1946 position_register,
1947 try_node));
1948 }
1949 }
1950
1951
1952 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
1953 RegExpNode* on_success,
1954 RegExpNode* on_failure) {
1955 return ToNode(body(), index(), compiler, on_success, on_failure);
1956 }
1957
1958
1959 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
1960 int index,
1961 RegExpCompiler* compiler,
1962 RegExpNode* on_success,
1963 RegExpNode* on_failure) {
1964 int start_reg = RegExpCapture::StartRegister(index);
1965 int end_reg = RegExpCapture::EndRegister(index);
1966 RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);
1967 RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure);
1968 return ActionNode::StorePosition(start_reg, body_node);
1969 }
1970
1971
1972 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
1973 RegExpNode* on_success,
1974 RegExpNode* on_failure) {
1975 ZoneList<RegExpTree*>* children = nodes();
1976 RegExpNode* current = on_success;
1977 for (int i = children->length() - 1; i >= 0; i--) {
1978 current = children->at(i)->ToNode(compiler, current, on_failure);
1979 }
1980 return current;
1981 }
1982
1983
1984 static const int kSpaceRangeCount = 20;
1985 static const uc16 kSpaceRanges[kSpaceRangeCount] = {
1986 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680,
1987 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
1988 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000
1989 };
1990
1991
1992 static const int kWordRangeCount = 8;
1993 static const uc16 kWordRanges[kWordRangeCount] = {
1994 '0', '9', 'A', 'Z', '_', '_', 'a', 'z'
1995 };
1996
1997
1998 static const int kDigitRangeCount = 2;
1999 static const uc16 kDigitRanges[kDigitRangeCount] = {
2000 '0', '9'
2001 };
2002
2003
2004 static const int kLineTerminatorRangeCount = 6;
2005 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {
2006 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029
2007 };
2008
2009
2010 static void AddClass(const uc16* elmv,
2011 int elmc,
2012 ZoneList<CharacterRange>* ranges) {
2013 for (int i = 0; i < elmc; i += 2) {
2014 ASSERT(elmv[i] <= elmv[i + 1]);
2015 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
2016 }
2017 }
2018
2019
2020 static void AddClassNegated(const uc16 *elmv,
2021 int elmc,
2022 ZoneList<CharacterRange>* ranges) {
2023 ASSERT(elmv[0] != 0x0000);
2024 ASSERT(elmv[elmc-1] != 0xFFFF);
2025 uc16 last = 0x0000;
2026 for (int i = 0; i < elmc; i += 2) {
2027 ASSERT(last <= elmv[i] - 1);
2028 ASSERT(elmv[i] <= elmv[i + 1]);
2029 ranges->Add(CharacterRange(last, elmv[i] - 1));
2030 last = elmv[i + 1] + 1;
2031 }
2032 ranges->Add(CharacterRange(last, 0xFFFF));
2033 }
2034
2035
2036 void CharacterRange::AddClassEscape(uc16 type,
2037 ZoneList<CharacterRange>* ranges) {
2038 switch (type) {
2039 case 's':
2040 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
2041 break;
2042 case 'S':
2043 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
2044 break;
2045 case 'w':
2046 AddClass(kWordRanges, kWordRangeCount, ranges);
2047 break;
2048 case 'W':
2049 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
2050 break;
2051 case 'd':
2052 AddClass(kDigitRanges, kDigitRangeCount, ranges);
2053 break;
2054 case 'D':
2055 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
2056 break;
2057 case '.':
2058 AddClassNegated(kLineTerminatorRanges,
2059 kLineTerminatorRangeCount,
2060 ranges);
2061 break;
2062 // This is not a character range as defined by the spec but a
2063 // convenient shorthand for a character class that matches any
2064 // character.
2065 case '*':
2066 ranges->Add(CharacterRange::Everything());
2067 break;
2068 default:
2069 UNREACHABLE();
2070 }
2071 }
2072
2073
2074 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
2075 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2076 if (IsSingleton()) {
2077 // If this is a singleton we just expand the one character.
2078 int length = uncanonicalize.get(from(), '\0', chars);
2079 for (int i = 0; i < length; i++) {
2080 uc32 chr = chars[i];
2081 if (chr != from()) {
2082 ranges->Add(CharacterRange::Singleton(chars[i]));
2083 }
2084 }
2085 } else if (from() <= kRangeCanonicalizeMax
2086 && to() <= kRangeCanonicalizeMax) {
Mads Ager (chromium) 2008/11/25 21:09:41 Indentation is off.
Christian Plesner Hansen 2008/11/26 06:49:56 Fixed.
2087 // If this is a range we expand the characters block by block,
2088 // expanding contiguous subranges (blocks) one at a time.
2089 // The approach is as follows. For a given start character we
2090 // look up the block that contains it, for instance 'a' if the
2091 // start character is 'c'. A block is characterized by the property
2092 // that all characters uncanonicalize in the same way as the first
2093 // element, except that each entry in the result is incremented
2094 // by the distance from the first element. So a-z is a block
2095 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
2096 // uncanonicalizes to ['a' + k, 'A' + k].
2097 // Once we've found the start point we look up its uncanonicalization
2098 // and produce a range for each element. For instance for [c-f]
2099 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
2100 // add a range if it is not already contained in the input, so [c-f]
2101 // will be skipped but [C-F] will be added. If this range is not
2102 // completely contained in a block we do this for all the blocks
2103 // covered by the range.
2104 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2105 // First, look up the block that contains the 'from' character.
2106 int length = canonrange.get(from(), '\0', range);
2107 if (length == 0) {
2108 range[0] = from();
2109 } else {
2110 ASSERT_EQ(1, length);
2111 }
2112 int pos = from();
2113 // The start of the current block. Note that except for the first
2114 // iteration 'start' is always equal to 'pos'.
2115 int start;
2116 // If it is not the start point of a block the entry contains the
2117 // offset of the character from the start point.
2118 if ((range[0] & kStartMarker) == 0) {
2119 start = pos - range[0];
2120 } else {
2121 start = pos;
2122 }
2123 // Then we add the ranges on at a time, incrementing the current
2124 // position to be after the last block each time. The position
2125 // always points to the start of a block.
2126 while (pos < to()) {
2127 length = canonrange.get(start, '\0', range);
2128 if (length == 0) {
2129 range[0] = start;
2130 } else {
2131 ASSERT_EQ(1, length);
2132 }
2133 ASSERT((range[0] & kStartMarker) != 0);
2134 // The start point of a block contains the distance to the end
2135 // of the range.
2136 int block_end = start + (range[0] & kPayloadMask) - 1;
2137 int end = (block_end > to()) ? to() : block_end;
2138 length = uncanonicalize.get(start, '\0', range);
2139 for (int i = 0; i < length; i++) {
2140 uc32 c = range[i];
2141 uc16 range_from = c + (pos - start);
2142 uc16 range_to = c + (end - start);
2143 if (!(from() <= range_from && range_to <= to()))
2144 ranges->Add(CharacterRange(range_from, range_to));
2145 }
2146 start = pos = block_end + 1;
2147 }
2148 } else {
2149 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
2150 }
2151 }
2152
2153
2154 // -------------------------------------------------------------------
2155 // Interest propagation
2156
2157
2158 RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {
2159 for (int i = 0; i < siblings_.length(); i++) {
2160 RegExpNode* sibling = siblings_.Get(i);
2161 if (sibling->info()->SameInterests(info))
2162 return sibling;
2163 }
2164 return NULL;
2165 }
2166
2167
2168 template <class C>
2169 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
2170 RegExpNode* sibling = node->GetSibling(info);
2171 if (sibling != NULL) return sibling;
2172 node->EnsureSiblings();
2173 sibling = new C(*node);
2174 sibling->info()->AdoptInterests(info);
2175 node->AddSibling(sibling);
2176 return sibling;
2177 }
2178
2179
2180 RegExpNode* ActionNode::PropagateInterest(NodeInfo* info) {
2181 RegExpNode* sibling = GetSibling(info);
2182 if (sibling != NULL) return sibling;
2183 EnsureSiblings();
2184 ActionNode* action = new ActionNode(*this);
2185 action->info()->AdoptInterests(info);
2186 AddSibling(action);
2187 action->set_on_success(action->on_success()->PropagateInterest(info));
2188 return action;
2189 }
2190
2191
2192 RegExpNode* ChoiceNode::PropagateInterest(NodeInfo* info) {
2193 RegExpNode* sibling = GetSibling(info);
2194 if (sibling != NULL) return sibling;
2195 EnsureSiblings();
2196 ChoiceNode* choice = new ChoiceNode(*this);
2197 choice->info()->AdoptInterests(info);
2198 AddSibling(choice);
2199 ZoneList<GuardedAlternative>* old_alternatives = alternatives();
2200 int count = old_alternatives->length();
2201 choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
2202 for (int i = 0; i < count; i++) {
2203 GuardedAlternative alternative = old_alternatives->at(i);
2204 alternative.set_node(alternative.node()->PropagateInterest(info));
2205 choice->alternatives()->Add(alternative);
2206 }
2207 return choice;
2208 }
2209
2210
2211 RegExpNode* EndNode::PropagateInterest(NodeInfo* info) {
2212 return PropagateToEndpoint(this, info);
2213 }
2214
2215
2216 RegExpNode* BackReferenceNode::PropagateInterest(NodeInfo* info) {
2217 return PropagateToEndpoint(this, info);
2218 }
2219
2220
2221 RegExpNode* TextNode::PropagateInterest(NodeInfo* info) {
2222 return PropagateToEndpoint(this, info);
2223 }
2224
2225
2226 // -------------------------------------------------------------------
2227 // Splay tree
2228
2229
2230 OutSet* OutSet::Extend(unsigned value) {
2231 if (Get(value))
2232 return this;
2233 if (successors() != NULL) {
2234 for (int i = 0; i < successors()->length(); i++) {
2235 OutSet* successor = successors()->at(i);
2236 if (successor->Get(value))
2237 return successor;
2238 }
2239 } else {
2240 successors_ = new ZoneList<OutSet*>(2);
2241 }
2242 OutSet* result = new OutSet(first_, remaining_);
2243 result->Set(value);
2244 successors()->Add(result);
2245 return result;
2246 }
2247
2248
2249 void OutSet::Set(unsigned value) {
2250 if (value < kFirstLimit) {
2251 first_ |= (1 << value);
2252 } else {
2253 if (remaining_ == NULL)
2254 remaining_ = new ZoneList<unsigned>(1);
2255 if (remaining_->is_empty() || !remaining_->Contains(value))
2256 remaining_->Add(value);
2257 }
2258 }
2259
2260
2261 bool OutSet::Get(unsigned value) {
2262 if (value < kFirstLimit) {
2263 return (first_ & (1 << value)) != 0;
2264 } else if (remaining_ == NULL) {
2265 return false;
2266 } else {
2267 return remaining_->Contains(value);
2268 }
2269 }
2270
2271
2272 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
2273 const DispatchTable::Entry DispatchTable::Config::kNoValue;
2274
2275
2276 void DispatchTable::AddRange(CharacterRange full_range, int value) {
2277 CharacterRange current = full_range;
2278 if (tree()->is_empty()) {
2279 // If this is the first range we just insert into the table.
2280 ZoneSplayTree<Config>::Locator loc;
2281 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
2282 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
2283 return;
2284 }
2285 // First see if there is a range to the left of this one that
2286 // overlaps.
2287 ZoneSplayTree<Config>::Locator loc;
2288 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
2289 Entry* entry = &loc.value();
2290 // If we've found a range that overlaps with this one, and it
2291 // starts strictly to the left of this one, we have to fix it
2292 // because the following code only handles ranges that start on
2293 // or after the start point of the range we're adding.
2294 if (entry->from() < current.from() && entry->to() >= current.from()) {
2295 // Snap the overlapping range in half around the start point of
2296 // the range we're adding.
2297 CharacterRange left(entry->from(), current.from() - 1);
2298 CharacterRange right(current.from(), entry->to());
2299 // The left part of the overlapping range doesn't overlap.
2300 // Truncate the whole entry to be just the left part.
2301 entry->set_to(left.to());
2302 // The right part is the one that overlaps. We add this part
2303 // to the map and let the next step deal with merging it with
2304 // the range we're adding.
2305 ZoneSplayTree<Config>::Locator loc;
2306 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
2307 loc.set_value(Entry(right.from(),
2308 right.to(),
2309 entry->out_set()));
2310 }
2311 }
2312 while (current.is_valid()) {
2313 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
2314 (loc.value().from() <= current.to()) &&
2315 (loc.value().to() >= current.from())) {
2316 Entry* entry = &loc.value();
2317 // We have overlap. If there is space between the start point of
2318 // the range we're adding and where the overlapping range starts
2319 // then we have to add a range covering just that space.
2320 if (current.from() < entry->from()) {
2321 ZoneSplayTree<Config>::Locator ins;
2322 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
2323 ins.set_value(Entry(current.from(),
2324 entry->from() - 1,
2325 empty()->Extend(value)));
2326 current.set_from(entry->from());
2327 }
2328 ASSERT_EQ(current.from(), entry->from());
2329 // If the overlapping range extends beyond the one we want to add
2330 // we have to snap the right part off and add it separately.
2331 if (entry->to() > current.to()) {
2332 ZoneSplayTree<Config>::Locator ins;
2333 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
2334 ins.set_value(Entry(current.to() + 1,
2335 entry->to(),
2336 entry->out_set()));
2337 entry->set_to(current.to());
2338 }
2339 ASSERT(entry->to() <= current.to());
2340 // The overlapping range is now completely contained by the range
2341 // we're adding so we can just update it and move the start point
2342 // of the range we're adding just past it.
2343 entry->AddValue(value);
2344 // Bail out if the last interval ended at 0xFFFF since otherwise
2345 // adding 1 will wrap around to 0.
2346 if (entry->to() == 0xFFFF)
2347 break;
2348 ASSERT(entry->to() + 1 > current.from());
2349 current.set_from(entry->to() + 1);
2350 } else {
2351 // There is no overlap so we can just add the range
2352 ZoneSplayTree<Config>::Locator ins;
2353 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
2354 ins.set_value(Entry(current.from(),
2355 current.to(),
2356 empty()->Extend(value)));
2357 break;
2358 }
2359 }
2360 }
2361
2362
2363 OutSet* DispatchTable::Get(uc16 value) {
2364 ZoneSplayTree<Config>::Locator loc;
2365 if (!tree()->FindGreatestLessThan(value, &loc))
2366 return empty();
2367 Entry* entry = &loc.value();
2368 if (value <= entry->to())
2369 return entry->out_set();
2370 else
2371 return empty();
2372 }
2373
2374
2375 // -------------------------------------------------------------------
2376 // Analysis
2377
2378
2379 void Analysis::EnsureAnalyzed(RegExpNode* that) {
2380 if (that->info()->been_analyzed || that->info()->being_analyzed)
2381 return;
2382 that->info()->being_analyzed = true;
2383 that->Accept(this);
2384 that->info()->being_analyzed = false;
2385 that->info()->been_analyzed = true;
2386 }
2387
2388
2389 void Analysis::VisitEnd(EndNode* that) {
2390 // nothing to do
2391 }
2392
2393
2394 void Analysis::VisitText(TextNode* that) {
2395 EnsureAnalyzed(that->on_success());
2396 EnsureAnalyzed(that->on_failure());
2397 }
2398
2399
2400 void Analysis::VisitAction(ActionNode* that) {
2401 RegExpNode* next = that->on_success();
2402 EnsureAnalyzed(next);
2403 that->info()->determine_newline = next->info()->prev_determine_newline();
2404 that->info()->determine_word = next->info()->prev_determine_word();
2405 that->info()->determine_start = next->info()->prev_determine_start();
2406 }
2407
2408
2409 void Analysis::VisitChoice(ChoiceNode* that) {
2410 NodeInfo* info = that->info();
2411 for (int i = 0; i < that->alternatives()->length(); i++) {
2412 RegExpNode* node = that->alternatives()->at(i).node();
2413 EnsureAnalyzed(node);
2414 info->determine_newline |= node->info()->prev_determine_newline();
2415 info->determine_word |= node->info()->prev_determine_word();
2416 info->determine_start |= node->info()->prev_determine_start();
2417 }
2418 if (!that->table_calculated()) {
2419 DispatchTableConstructor cons(that->table());
2420 cons.BuildTable(that);
2421 }
2422 EnsureAnalyzed(that->on_failure());
2423 }
2424
2425
2426 void Analysis::VisitBackReference(BackReferenceNode* that) {
2427 EnsureAnalyzed(that->on_success());
2428 EnsureAnalyzed(that->on_failure());
2429 }
2430
2431
2432 // -------------------------------------------------------------------
2433 // Dispatch table construction
2434
2435
2436 void DispatchTableConstructor::VisitEnd(EndNode* that) {
2437 AddRange(CharacterRange::Everything());
2438 }
2439
2440
2441 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
2442 ASSERT(!node->table_calculated());
2443 node->set_being_calculated(true);
2444 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
2445 for (int i = 0; i < alternatives->length(); i++) {
2446 set_choice_index(i);
2447 alternatives->at(i).node()->Accept(this);
2448 }
2449 node->set_being_calculated(false);
2450 node->set_table_calculated(true);
2451 }
2452
2453
2454 class AddDispatchRange {
2455 public:
2456 explicit AddDispatchRange(DispatchTableConstructor* constructor)
2457 : constructor_(constructor) { }
2458 void Call(uc32 from, DispatchTable::Entry entry);
2459 private:
2460 DispatchTableConstructor* constructor_;
2461 };
2462
2463
2464 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
2465 CharacterRange range(from, entry.to());
2466 constructor_->AddRange(range);
2467 }
2468
2469
2470 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
2471 if (node->being_calculated())
2472 return;
2473 if (!node->table_calculated()) {
2474 DispatchTableConstructor constructor(node->table());
2475 constructor.BuildTable(node);
2476 }
2477 ASSERT(node->table_calculated());
2478 AddDispatchRange adder(this);
2479 node->table()->ForEach(&adder);
2480 }
2481
2482
2483 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
2484 // TODO(160): Find the node that we refer back to and propagate its start
2485 // set back to here. For now we just accept anything.
2486 AddRange(CharacterRange::Everything());
2487 }
2488
2489
2490
2491 static int CompareRangeByFrom(const CharacterRange* a,
2492 const CharacterRange* b) {
2493 return Spaceship<uc16>(a->from(), b->from());
2494 }
2495
2496
2497 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
2498 ranges->Sort(CompareRangeByFrom);
2499 uc16 last = 0;
2500 for (int i = 0; i < ranges->length(); i++) {
2501 CharacterRange range = ranges->at(i);
2502 if (last < range.from())
2503 AddRange(CharacterRange(last, range.from() - 1));
2504 if (range.to() >= last) {
2505 if (range.to() == 0xFFFF) {
2506 return;
2507 } else {
2508 last = range.to() + 1;
2509 }
2510 }
2511 }
2512 AddRange(CharacterRange(last, 0xFFFF));
2513 }
2514
2515
2516 void DispatchTableConstructor::VisitText(TextNode* that) {
2517 TextElement elm = that->elements()->at(0);
2518 switch (elm.type) {
2519 case TextElement::ATOM: {
2520 uc16 c = elm.data.u_atom->data()[0];
2521 AddRange(CharacterRange(c, c));
2522 break;
2523 }
2524 case TextElement::CHAR_CLASS: {
2525 RegExpCharacterClass* tree = elm.data.u_char_class;
2526 ZoneList<CharacterRange>* ranges = tree->ranges();
2527 if (tree->is_negated()) {
2528 AddInverse(ranges);
2529 } else {
2530 for (int i = 0; i < ranges->length(); i++)
2531 AddRange(ranges->at(i));
2532 }
2533 break;
2534 }
2535 default: {
2536 UNIMPLEMENTED();
2537 }
2538 }
2539 }
2540
2541
2542 void DispatchTableConstructor::VisitAction(ActionNode* that) {
2543 that->on_success()->Accept(this);
2544 }
2545
2546
2547 Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
2548 RegExpNode** node_return,
2549 bool ignore_case) {
2550 RegExpCompiler compiler(input->capture_count, ignore_case);
2551 // Wrap the body of the regexp in capture #0.
2552 RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
2553 0,
2554 &compiler,
2555 compiler.accept(),
2556 compiler.backtrack());
2557 // Add a .*? at the beginning, outside the body capture.
2558 // Note: We could choose to not add this if the regexp is anchored at
2559 // the start of the input but I'm not sure how best to do that and
2560 // since we don't even handle ^ yet I'm saving that optimization for
2561 // later.
2562 RegExpNode* node = RegExpQuantifier::ToNode(0,
2563 RegExpQuantifier::kInfinity,
2564 false,
2565 new RegExpCharacterClass('*'),
2566 &compiler,
2567 captured_body,
2568 compiler.backtrack());
2569 if (node_return != NULL) *node_return = node;
2570 Analysis analysis;
2571 analysis.EnsureAnalyzed(node);
2572
2573 if (!FLAG_irregexp) {
2574 return Handle<FixedArray>::null();
2575 }
2576
2577 #if !(defined ARM || defined __arm__ || defined __thumb__)
2578 if (FLAG_irregexp_native) { // Flag only checked in IA32 mode.
2579 // TODO(lrn) Move compilation to a later point in the life-cycle
2580 // of the RegExp. We don't know the type of input string yet.
2581 // For now, always assume two-byte strings.
2582 RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16,
2583 (input->capture_count + 1) * 2,
2584 ignore_case);
2585 return compiler.Assemble(&macro_assembler,
2586 node,
2587 input->capture_count);
2588 }
2589 #endif
2590 byte codes[1024];
2591 IrregexpAssembler assembler(Vector<byte>(codes, 1024));
2592 RegExpMacroAssemblerIrregexp macro_assembler(&assembler);
2593 return compiler.Assemble(&macro_assembler,
2594 node,
2595 input->capture_count);
2596 }
2597
575 2598
576 }} // namespace v8::internal 2599 }} // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698