OLD | NEW |
---|---|
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/v8.h" | 5 #include "src/v8.h" |
6 | 6 |
7 #include "src/ast.h" | 7 #include "src/ast.h" |
8 #include "src/base/platform/platform.h" | 8 #include "src/base/platform/platform.h" |
9 #include "src/compilation-cache.h" | 9 #include "src/compilation-cache.h" |
10 #include "src/compiler.h" | 10 #include "src/compiler.h" |
(...skipping 272 matching lines...) | |
283 if (index + needle_len > subject->length()) { | 283 if (index + needle_len > subject->length()) { |
284 return RegExpImpl::RE_FAILURE; | 284 return RegExpImpl::RE_FAILURE; |
285 } | 285 } |
286 | 286 |
287 for (int i = 0; i < output_size; i += 2) { | 287 for (int i = 0; i < output_size; i += 2) { |
288 String::FlatContent needle_content = needle->GetFlatContent(); | 288 String::FlatContent needle_content = needle->GetFlatContent(); |
289 String::FlatContent subject_content = subject->GetFlatContent(); | 289 String::FlatContent subject_content = subject->GetFlatContent(); |
290 DCHECK(needle_content.IsFlat()); | 290 DCHECK(needle_content.IsFlat()); |
291 DCHECK(subject_content.IsFlat()); | 291 DCHECK(subject_content.IsFlat()); |
292 // dispatch on type of strings | 292 // dispatch on type of strings |
293 index = (needle_content.IsAscii() | 293 index = |
294 ? (subject_content.IsAscii() | 294 (needle_content.IsOneByte() |
295 ? SearchString(isolate, | 295 ? (subject_content.IsOneByte() |
296 subject_content.ToOneByteVector(), | 296 ? SearchString(isolate, subject_content.ToOneByteVector(), |
297 needle_content.ToOneByteVector(), | 297 needle_content.ToOneByteVector(), index) |
298 index) | 298 : SearchString(isolate, subject_content.ToUC16Vector(), |
299 : SearchString(isolate, | 299 needle_content.ToOneByteVector(), index)) |
300 subject_content.ToUC16Vector(), | 300 : (subject_content.IsOneByte() |
301 needle_content.ToOneByteVector(), | 301 ? SearchString(isolate, subject_content.ToOneByteVector(), |
302 index)) | 302 needle_content.ToUC16Vector(), index) |
303 : (subject_content.IsAscii() | 303 : SearchString(isolate, subject_content.ToUC16Vector(), |
304 ? SearchString(isolate, | 304 needle_content.ToUC16Vector(), index))); |
305 subject_content.ToOneByteVector(), | |
306 needle_content.ToUC16Vector(), | |
307 index) | |
308 : SearchString(isolate, | |
309 subject_content.ToUC16Vector(), | |
310 needle_content.ToUC16Vector(), | |
311 index))); | |
312 if (index == -1) { | 305 if (index == -1) { |
313 return i / 2; // Return number of matches. | 306 return i / 2; // Return number of matches. |
314 } else { | 307 } else { |
315 output[i] = index; | 308 output[i] = index; |
316 output[i+1] = index + needle_len; | 309 output[i+1] = index + needle_len; |
317 index += needle_len; | 310 index += needle_len; |
318 } | 311 } |
319 } | 312 } |
320 return output_size / 2; | 313 return output_size / 2; |
321 } | 314 } |
(...skipping 17 matching lines...) | |
339 SealHandleScope shs(isolate); | 332 SealHandleScope shs(isolate); |
340 FixedArray* array = FixedArray::cast(last_match_info->elements()); | 333 FixedArray* array = FixedArray::cast(last_match_info->elements()); |
341 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); | 334 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); |
342 return last_match_info; | 335 return last_match_info; |
343 } | 336 } |
344 | 337 |
345 | 338 |
346 // Irregexp implementation. | 339 // Irregexp implementation. |
347 | 340 |
348 // Ensures that the regexp object contains a compiled version of the | 341 // Ensures that the regexp object contains a compiled version of the |
349 // source for either ASCII or non-ASCII strings. | 342 // source for either one-byte or two-byte subject strings. |
350 // If the compiled version doesn't already exist, it is compiled | 343 // If the compiled version doesn't already exist, it is compiled |
351 // from the source pattern. | 344 // from the source pattern. |
352 // If compilation fails, an exception is thrown and this function | 345 // If compilation fails, an exception is thrown and this function |
353 // returns false. | 346 // returns false. |
354 bool RegExpImpl::EnsureCompiledIrregexp( | 347 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, |
355 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { | 348 Handle<String> sample_subject, |
356 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); | 349 bool is_one_byte) { |
| 350 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); |
357 #ifdef V8_INTERPRETED_REGEXP | 351 #ifdef V8_INTERPRETED_REGEXP |
358 if (compiled_code->IsByteArray()) return true; | 352 if (compiled_code->IsByteArray()) return true; |
359 #else // V8_INTERPRETED_REGEXP (RegExp native code) | 353 #else // V8_INTERPRETED_REGEXP (RegExp native code) |
360 if (compiled_code->IsCode()) return true; | 354 if (compiled_code->IsCode()) return true; |
361 #endif | 355 #endif |
362 // We could potentially have marked this as flushable, but have kept | 356 // We could potentially have marked this as flushable, but have kept |
363 // a saved version if we did not flush it yet. | 357 // a saved version if we did not flush it yet. |
364 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); | 358 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); |
365 if (saved_code->IsCode()) { | 359 if (saved_code->IsCode()) { |
366 // Reinstate the code in the original place. | 360 // Reinstate the code in the original place. |
367 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); | 361 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code); |
368 DCHECK(compiled_code->IsSmi()); | 362 DCHECK(compiled_code->IsSmi()); |
369 return true; | 363 return true; |
370 } | 364 } |
371 return CompileIrregexp(re, sample_subject, is_ascii); | 365 return CompileIrregexp(re, sample_subject, is_one_byte); |
372 } | 366 } |
373 | 367 |
374 | 368 |
375 static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, bool is_ascii, | 369 static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, |
376 Handle<String> error_message, | 370 Handle<String> error_message, |
377 Isolate* isolate) { | 371 Isolate* isolate) { |
378 Factory* factory = isolate->factory(); | 372 Factory* factory = isolate->factory(); |
379 Handle<FixedArray> elements = factory->NewFixedArray(2); | 373 Handle<FixedArray> elements = factory->NewFixedArray(2); |
380 elements->set(0, re->Pattern()); | 374 elements->set(0, re->Pattern()); |
381 elements->set(1, *error_message); | 375 elements->set(1, *error_message); |
382 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 376 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
383 Handle<Object> error; | 377 Handle<Object> error; |
384 MaybeHandle<Object> maybe_error = | 378 MaybeHandle<Object> maybe_error = |
385 factory->NewSyntaxError("malformed_regexp", array); | 379 factory->NewSyntaxError("malformed_regexp", array); |
386 if (maybe_error.ToHandle(&error)) isolate->Throw(*error); | 380 if (maybe_error.ToHandle(&error)) isolate->Throw(*error); |
387 } | 381 } |
388 | 382 |
389 | 383 |
390 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, | 384 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
391 Handle<String> sample_subject, | 385 Handle<String> sample_subject, |
392 bool is_ascii) { | 386 bool is_one_byte) { |
393 // Compile the RegExp. | 387 // Compile the RegExp. |
394 Isolate* isolate = re->GetIsolate(); | 388 Isolate* isolate = re->GetIsolate(); |
395 Zone zone(isolate); | 389 Zone zone(isolate); |
396 PostponeInterruptsScope postpone(isolate); | 390 PostponeInterruptsScope postpone(isolate); |
397 // If we had a compilation error the last time this is saved at the | 391 // If we had a compilation error the last time this is saved at the |
398 // saved code index. | 392 // saved code index. |
399 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); | 393 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); |
400 // When arriving here entry can only be a smi, either representing an | 394 // When arriving here entry can only be a smi, either representing an |
401 // uncompiled regexp, a previous compilation error, or code that has | 395 // uncompiled regexp, a previous compilation error, or code that has |
402 // been flushed. | 396 // been flushed. |
403 DCHECK(entry->IsSmi()); | 397 DCHECK(entry->IsSmi()); |
404 int entry_value = Smi::cast(entry)->value(); | 398 int entry_value = Smi::cast(entry)->value(); |
405 DCHECK(entry_value == JSRegExp::kUninitializedValue || | 399 DCHECK(entry_value == JSRegExp::kUninitializedValue || |
406 entry_value == JSRegExp::kCompilationErrorValue || | 400 entry_value == JSRegExp::kCompilationErrorValue || |
407 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); | 401 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); |
408 | 402 |
409 if (entry_value == JSRegExp::kCompilationErrorValue) { | 403 if (entry_value == JSRegExp::kCompilationErrorValue) { |
410 // A previous compilation failed and threw an error which we store in | 404 // A previous compilation failed and threw an error which we store in |
411 // the saved code index (we store the error message, not the actual | 405 // the saved code index (we store the error message, not the actual |
412 // error). Recreate the error object and throw it. | 406 // error). Recreate the error object and throw it. |
413 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii)); | 407 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); |
414 DCHECK(error_string->IsString()); | 408 DCHECK(error_string->IsString()); |
415 Handle<String> error_message(String::cast(error_string)); | 409 Handle<String> error_message(String::cast(error_string)); |
416 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); | 410 CreateRegExpErrorObjectAndThrow(re, error_message, isolate); |
417 return false; | 411 return false; |
418 } | 412 } |
419 | 413 |
420 JSRegExp::Flags flags = re->GetFlags(); | 414 JSRegExp::Flags flags = re->GetFlags(); |
421 | 415 |
422 Handle<String> pattern(re->Pattern()); | 416 Handle<String> pattern(re->Pattern()); |
423 pattern = String::Flatten(pattern); | 417 pattern = String::Flatten(pattern); |
424 RegExpCompileData compile_data; | 418 RegExpCompileData compile_data; |
425 FlatStringReader reader(isolate, pattern); | 419 FlatStringReader reader(isolate, pattern); |
426 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), | 420 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), |
427 &compile_data, | 421 &compile_data, |
428 &zone)) { | 422 &zone)) { |
429 // Throw an exception if we fail to parse the pattern. | 423 // Throw an exception if we fail to parse the pattern. |
430 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. | 424 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. |
431 USE(ThrowRegExpException(re, | 425 USE(ThrowRegExpException(re, |
432 pattern, | 426 pattern, |
433 compile_data.error, | 427 compile_data.error, |
434 "malformed_regexp")); | 428 "malformed_regexp")); |
435 return false; | 429 return false; |
436 } | 430 } |
437 RegExpEngine::CompilationResult result = | 431 RegExpEngine::CompilationResult result = RegExpEngine::Compile( |
438 RegExpEngine::Compile(&compile_data, | 432 &compile_data, flags.is_ignore_case(), flags.is_global(), |
439 flags.is_ignore_case(), | 433 flags.is_multiline(), pattern, sample_subject, is_one_byte, &zone); |
440 flags.is_global(), | |
441 flags.is_multiline(), | |
442 pattern, | |
443 sample_subject, | |
444 is_ascii, | |
445 &zone); | |
446 if (result.error_message != NULL) { | 434 if (result.error_message != NULL) { |
447 // Unable to compile regexp. | 435 // Unable to compile regexp. |
448 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( | 436 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( |
449 CStrVector(result.error_message)).ToHandleChecked(); | 437 CStrVector(result.error_message)).ToHandleChecked(); |
450 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); | 438 CreateRegExpErrorObjectAndThrow(re, error_message, isolate); |
451 return false; | 439 return false; |
452 } | 440 } |
453 | 441 |
454 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); | 442 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); |
455 data->set(JSRegExp::code_index(is_ascii), result.code); | 443 data->set(JSRegExp::code_index(is_one_byte), result.code); |
456 int register_max = IrregexpMaxRegisterCount(*data); | 444 int register_max = IrregexpMaxRegisterCount(*data); |
457 if (result.num_registers > register_max) { | 445 if (result.num_registers > register_max) { |
458 SetIrregexpMaxRegisterCount(*data, result.num_registers); | 446 SetIrregexpMaxRegisterCount(*data, result.num_registers); |
459 } | 447 } |
460 | 448 |
461 return true; | 449 return true; |
462 } | 450 } |
463 | 451 |
464 | 452 |
465 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { | 453 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { |
(...skipping 10 matching lines...) | |
476 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { | 464 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { |
477 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); | 465 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); |
478 } | 466 } |
479 | 467 |
480 | 468 |
481 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { | 469 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { |
482 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); | 470 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); |
483 } | 471 } |
484 | 472 |
485 | 473 |
486 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { | 474 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { |
487 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); | 475 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); |
488 } | 476 } |
489 | 477 |
490 | 478 |
491 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { | 479 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { |
492 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); | 480 return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); |
493 } | 481 } |
494 | 482 |
495 | 483 |
496 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, | 484 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, |
497 Handle<String> pattern, | 485 Handle<String> pattern, |
498 JSRegExp::Flags flags, | 486 JSRegExp::Flags flags, |
499 int capture_count) { | 487 int capture_count) { |
500 // Initialize compiled code entries to null. | 488 // Initialize compiled code entries to null. |
501 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, | 489 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, |
502 JSRegExp::IRREGEXP, | 490 JSRegExp::IRREGEXP, |
503 pattern, | 491 pattern, |
504 flags, | 492 flags, |
505 capture_count); | 493 capture_count); |
506 } | 494 } |
507 | 495 |
508 | 496 |
509 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, | 497 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, |
510 Handle<String> subject) { | 498 Handle<String> subject) { |
511 subject = String::Flatten(subject); | 499 subject = String::Flatten(subject); |
512 | 500 |
513 // Check the asciiness of the underlying storage. | 501 // Check representation of the underlying storage. |
514 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); | 502 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
515 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; | 503 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1; |
516 | 504 |
517 #ifdef V8_INTERPRETED_REGEXP | 505 #ifdef V8_INTERPRETED_REGEXP |
518 // Byte-code regexp needs space allocated for all its registers. | 506 // Byte-code regexp needs space allocated for all its registers. |
519 // The result captures are copied to the start of the registers array | 507 // The result captures are copied to the start of the registers array |
520 // if the match succeeds. This way those registers are not clobbered | 508 // if the match succeeds. This way those registers are not clobbered |
521 // when we set the last match info from last successful match. | 509 // when we set the last match info from last successful match. |
522 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + | 510 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + |
523 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; | 511 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; |
524 #else // V8_INTERPRETED_REGEXP | 512 #else // V8_INTERPRETED_REGEXP |
525 // Native regexp only needs room to output captures. Registers are handled | 513 // Native regexp only needs room to output captures. Registers are handled |
526 // internally. | 514 // internally. |
527 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; | 515 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; |
528 #endif // V8_INTERPRETED_REGEXP | 516 #endif // V8_INTERPRETED_REGEXP |
529 } | 517 } |
530 | 518 |
531 | 519 |
532 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, | 520 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, |
533 Handle<String> subject, | 521 Handle<String> subject, |
534 int index, | 522 int index, |
535 int32_t* output, | 523 int32_t* output, |
536 int output_size) { | 524 int output_size) { |
537 Isolate* isolate = regexp->GetIsolate(); | 525 Isolate* isolate = regexp->GetIsolate(); |
538 | 526 |
539 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); | 527 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); |
540 | 528 |
541 DCHECK(index >= 0); | 529 DCHECK(index >= 0); |
542 DCHECK(index <= subject->length()); | 530 DCHECK(index <= subject->length()); |
543 DCHECK(subject->IsFlat()); | 531 DCHECK(subject->IsFlat()); |
544 | 532 |
545 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); | 533 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
546 | 534 |
547 #ifndef V8_INTERPRETED_REGEXP | 535 #ifndef V8_INTERPRETED_REGEXP |
548 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); | 536 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
549 do { | 537 do { |
550 EnsureCompiledIrregexp(regexp, subject, is_ascii); | 538 EnsureCompiledIrregexp(regexp, subject, is_one_byte); |
551 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); | 539 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); |
552 // The stack is used to allocate registers for the compiled regexp code. | 540 // The stack is used to allocate registers for the compiled regexp code. |
553 // This means that in case of failure, the output registers array is left | 541 // This means that in case of failure, the output registers array is left |
554 // untouched and contains the capture results from the previous successful | 542 // untouched and contains the capture results from the previous successful |
555 // match. We can use that to set the last match info lazily. | 543 // match. We can use that to set the last match info lazily. |
556 NativeRegExpMacroAssembler::Result res = | 544 NativeRegExpMacroAssembler::Result res = |
557 NativeRegExpMacroAssembler::Match(code, | 545 NativeRegExpMacroAssembler::Match(code, |
558 subject, | 546 subject, |
559 output, | 547 output, |
560 output_size, | 548 output_size, |
561 index, | 549 index, |
562 isolate); | 550 isolate); |
563 if (res != NativeRegExpMacroAssembler::RETRY) { | 551 if (res != NativeRegExpMacroAssembler::RETRY) { |
564 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || | 552 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || |
565 isolate->has_pending_exception()); | 553 isolate->has_pending_exception()); |
566 STATIC_ASSERT( | 554 STATIC_ASSERT( |
567 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); | 555 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); |
568 STATIC_ASSERT( | 556 STATIC_ASSERT( |
569 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); | 557 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); |
570 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) | 558 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) |
571 == RE_EXCEPTION); | 559 == RE_EXCEPTION); |
572 return static_cast<IrregexpResult>(res); | 560 return static_cast<IrregexpResult>(res); |
573 } | 561 } |
574 // If result is RETRY, the string has changed representation, and we | 562 // If result is RETRY, the string has changed representation, and we |
575 // must restart from scratch. | 563 // must restart from scratch. |
576 // In this case, it means we must make sure we are prepared to handle | 564 // In this case, it means we must make sure we are prepared to handle |
577 // the, potentially, different subject (the string can switch between | 565 // the, potentially, different subject (the string can switch between |
578 // being internal and external, and even between being ASCII and UC16, | 566 // being internal and external, and even between being Latin1 and UC16, |
579 // but the characters are always the same). | 567 // but the characters are always the same). |
580 IrregexpPrepare(regexp, subject); | 568 IrregexpPrepare(regexp, subject); |
581 is_ascii = subject->IsOneByteRepresentationUnderneath(); | 569 is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
582 } while (true); | 570 } while (true); |
583 UNREACHABLE(); | 571 UNREACHABLE(); |
584 return RE_EXCEPTION; | 572 return RE_EXCEPTION; |
585 #else // V8_INTERPRETED_REGEXP | 573 #else // V8_INTERPRETED_REGEXP |
586 | 574 |
587 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); | 575 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); |
588 // We must have done EnsureCompiledIrregexp, so we can get the number of | 576 // We must have done EnsureCompiledIrregexp, so we can get the number of |
589 // registers. | 577 // registers. |
590 int number_of_capture_registers = | 578 int number_of_capture_registers = |
591 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; | 579 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; |
592 int32_t* raw_output = &output[number_of_capture_registers]; | 580 int32_t* raw_output = &output[number_of_capture_registers]; |
593 // We do not touch the actual capture result registers until we know there | 581 // We do not touch the actual capture result registers until we know there |
594 // has been a match so that we can use those capture results to set the | 582 // has been a match so that we can use those capture results to set the |
595 // last match info. | 583 // last match info. |
596 for (int i = number_of_capture_registers - 1; i >= 0; i--) { | 584 for (int i = number_of_capture_registers - 1; i >= 0; i--) { |
597 raw_output[i] = -1; | 585 raw_output[i] = -1; |
598 } | 586 } |
599 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); | 587 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), |
| 588 isolate); |
600 | 589 |
601 IrregexpResult result = IrregexpInterpreter::Match(isolate, | 590 IrregexpResult result = IrregexpInterpreter::Match(isolate, |
602 byte_codes, | 591 byte_codes, |
603 subject, | 592 subject, |
604 raw_output, | 593 raw_output, |
605 index); | 594 index); |
606 if (result == RE_SUCCESS) { | 595 if (result == RE_SUCCESS) { |
607 // Copy capture results to the start of the registers array. | 596 // Copy capture results to the start of the registers array. |
608 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); | 597 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); |
609 } | 598 } |
(...skipping 380 matching lines...) | |
990 | 979 |
991 | 980 |
992 private: | 981 private: |
993 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; | 982 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; |
994 int total_samples_; | 983 int total_samples_; |
995 }; | 984 }; |
996 | 985 |
997 | 986 |
998 class RegExpCompiler { | 987 class RegExpCompiler { |
999 public: | 988 public: |
1000 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii, | 989 RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte, |
1001 Zone* zone); | 990 Zone* zone); |
1002 | 991 |
1003 int AllocateRegister() { | 992 int AllocateRegister() { |
1004 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { | 993 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { |
1005 reg_exp_too_big_ = true; | 994 reg_exp_too_big_ = true; |
1006 return next_register_; | 995 return next_register_; |
1007 } | 996 } |
1008 return next_register_++; | 997 return next_register_++; |
1009 } | 998 } |
1010 | 999 |
(...skipping 12 matching lines...) | |
1023 EndNode* accept() { return accept_; } | 1012 EndNode* accept() { return accept_; } |
1024 | 1013 |
1025 static const int kMaxRecursion = 100; | 1014 static const int kMaxRecursion = 100; |
1026 inline int recursion_depth() { return recursion_depth_; } | 1015 inline int recursion_depth() { return recursion_depth_; } |
1027 inline void IncrementRecursionDepth() { recursion_depth_++; } | 1016 inline void IncrementRecursionDepth() { recursion_depth_++; } |
1028 inline void DecrementRecursionDepth() { recursion_depth_--; } | 1017 inline void DecrementRecursionDepth() { recursion_depth_--; } |
1029 | 1018 |
1030 void SetRegExpTooBig() { reg_exp_too_big_ = true; } | 1019 void SetRegExpTooBig() { reg_exp_too_big_ = true; } |
1031 | 1020 |
1032 inline bool ignore_case() { return ignore_case_; } | 1021 inline bool ignore_case() { return ignore_case_; } |
1033 inline bool ascii() { return ascii_; } | 1022 inline bool one_byte() { return one_byte_; } |
1034 FrequencyCollator* frequency_collator() { return &frequency_collator_; } | 1023 FrequencyCollator* frequency_collator() { return &frequency_collator_; } |
1035 | 1024 |
1036 int current_expansion_factor() { return current_expansion_factor_; } | 1025 int current_expansion_factor() { return current_expansion_factor_; } |
1037 void set_current_expansion_factor(int value) { | 1026 void set_current_expansion_factor(int value) { |
1038 current_expansion_factor_ = value; | 1027 current_expansion_factor_ = value; |
1039 } | 1028 } |
1040 | 1029 |
1041 Zone* zone() const { return zone_; } | 1030 Zone* zone() const { return zone_; } |
1042 | 1031 |
1043 static const int kNoRegister = -1; | 1032 static const int kNoRegister = -1; |
1044 | 1033 |
1045 private: | 1034 private: |
1046 EndNode* accept_; | 1035 EndNode* accept_; |
1047 int next_register_; | 1036 int next_register_; |
1048 List<RegExpNode*>* work_list_; | 1037 List<RegExpNode*>* work_list_; |
1049 int recursion_depth_; | 1038 int recursion_depth_; |
1050 RegExpMacroAssembler* macro_assembler_; | 1039 RegExpMacroAssembler* macro_assembler_; |
1051 bool ignore_case_; | 1040 bool ignore_case_; |
1052 bool ascii_; | 1041 bool one_byte_; |
1053 bool reg_exp_too_big_; | 1042 bool reg_exp_too_big_; |
1054 int current_expansion_factor_; | 1043 int current_expansion_factor_; |
1055 FrequencyCollator frequency_collator_; | 1044 FrequencyCollator frequency_collator_; |
1056 Zone* zone_; | 1045 Zone* zone_; |
1057 }; | 1046 }; |
1058 | 1047 |
1059 | 1048 |
1060 class RecursionCheck { | 1049 class RecursionCheck { |
1061 public: | 1050 public: |
1062 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { | 1051 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { |
1063 compiler->IncrementRecursionDepth(); | 1052 compiler->IncrementRecursionDepth(); |
1064 } | 1053 } |
1065 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } | 1054 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } |
1066 private: | 1055 private: |
1067 RegExpCompiler* compiler_; | 1056 RegExpCompiler* compiler_; |
1068 }; | 1057 }; |
1069 | 1058 |
1070 | 1059 |
1071 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { | 1060 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { |
1072 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); | 1061 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); |
1073 } | 1062 } |
1074 | 1063 |
1075 | 1064 |
1076 // Attempts to compile the regexp using an Irregexp code generator. Returns | 1065 // Attempts to compile the regexp using an Irregexp code generator. Returns |
1077 // a fixed array or a null handle depending on whether it succeeded. | 1066 // a fixed array or a null handle depending on whether it succeeded. |
1078 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii, | 1067 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, |
1079 Zone* zone) | 1068 bool one_byte, Zone* zone) |
1080 : next_register_(2 * (capture_count + 1)), | 1069 : next_register_(2 * (capture_count + 1)), |
1081 work_list_(NULL), | 1070 work_list_(NULL), |
1082 recursion_depth_(0), | 1071 recursion_depth_(0), |
1083 ignore_case_(ignore_case), | 1072 ignore_case_(ignore_case), |
1084 ascii_(ascii), | 1073 one_byte_(one_byte), |
1085 reg_exp_too_big_(false), | 1074 reg_exp_too_big_(false), |
1086 current_expansion_factor_(1), | 1075 current_expansion_factor_(1), |
1087 frequency_collator_(), | 1076 frequency_collator_(), |
1088 zone_(zone) { | 1077 zone_(zone) { |
1089 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); | 1078 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); |
1090 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); | 1079 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); |
1091 } | 1080 } |
1092 | 1081 |
1093 | 1082 |
1094 RegExpEngine::CompilationResult RegExpCompiler::Assemble( | 1083 RegExpEngine::CompilationResult RegExpCompiler::Assemble( |
(...skipping 490 matching lines...) | |
1585 macro_assembler->IfRegisterLT(guard->reg(), | 1574 macro_assembler->IfRegisterLT(guard->reg(), |
1586 guard->value(), | 1575 guard->value(), |
1587 trace->backtrack()); | 1576 trace->backtrack()); |
1588 break; | 1577 break; |
1589 } | 1578 } |
1590 } | 1579 } |
1591 | 1580 |
1592 | 1581 |
1593 // Returns the number of characters in the equivalence class, omitting those | 1582 // Returns the number of characters in the equivalence class, omitting those |
1594 // that cannot occur in the source string because it is ASCII. | 1583 // that cannot occur in the source string because it is ASCII. |
1595 static int GetCaseIndependentLetters(Isolate* isolate, | 1584 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, |
1596 uc16 character, | 1585 bool one_byte_subject, |
1597 bool ascii_subject, | |
1598 unibrow::uchar* letters) { | 1586 unibrow::uchar* letters) { |
1599 int length = | 1587 int length = |
1600 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); | 1588 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); |
1601 // Unibrow returns 0 or 1 for characters where case independence is | 1589 // Unibrow returns 0 or 1 for characters where case independence is |
1602 // trivial. | 1590 // trivial. |
1603 if (length == 0) { | 1591 if (length == 0) { |
1604 letters[0] = character; | 1592 letters[0] = character; |
1605 length = 1; | 1593 length = 1; |
1606 } | 1594 } |
1607 if (!ascii_subject || character <= String::kMaxOneByteCharCode) { | 1595 if (!one_byte_subject || character <= String::kMaxOneByteCharCode) { |
1608 return length; | 1596 return length; |
1609 } | 1597 } |
1598 | |
1610 // The standard requires that non-ASCII characters cannot have ASCII | 1599 // The standard requires that non-ASCII characters cannot have ASCII |
1611 // character codes in their equivalence class. | 1600 // character codes in their equivalence class. |
1601 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore, | |
1602 // is it? For example, \u00C5 is equivalent to \u212B. | |
Yang
2014/09/10 08:26:36
This is one of the TODOs I mentioned.
dcarney
2014/09/10 09:35:12
I checked other browsers I think originally, and w
| |
1612 return 0; | 1603 return 0; |
1613 } | 1604 } |
1614 | 1605 |
1615 | 1606 |
1616 static inline bool EmitSimpleCharacter(Isolate* isolate, | 1607 static inline bool EmitSimpleCharacter(Isolate* isolate, |
1617 RegExpCompiler* compiler, | 1608 RegExpCompiler* compiler, |
1618 uc16 c, | 1609 uc16 c, |
1619 Label* on_failure, | 1610 Label* on_failure, |
1620 int cp_offset, | 1611 int cp_offset, |
1621 bool check, | 1612 bool check, |
(...skipping 15 matching lines...) Expand all Loading... | |
1637 // Only emits non-letters (things that don't have case). Only used for case | 1628 // Only emits non-letters (things that don't have case). Only used for case |
1638 // independent matches. | 1629 // independent matches. |
1639 static inline bool EmitAtomNonLetter(Isolate* isolate, | 1630 static inline bool EmitAtomNonLetter(Isolate* isolate, |
1640 RegExpCompiler* compiler, | 1631 RegExpCompiler* compiler, |
1641 uc16 c, | 1632 uc16 c, |
1642 Label* on_failure, | 1633 Label* on_failure, |
1643 int cp_offset, | 1634 int cp_offset, |
1644 bool check, | 1635 bool check, |
1645 bool preloaded) { | 1636 bool preloaded) { |
1646 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | 1637 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
1647 bool ascii = compiler->ascii(); | 1638 bool one_byte = compiler->one_byte(); |
1648 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1639 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
1649 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); | 1640 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); |
1650 if (length < 1) { | 1641 if (length < 1) { |
1651 // This can't match. Must be an ASCII subject and a non-ASCII character. | 1642 // This can't match. Must be a one-byte subject and a non-one-byte |
1652 // We do not need to do anything since the ASCII pass already handled this. | 1643 // character. We do not need to do anything since the one-byte pass |
1644 // already handled this. | |
1653 return false; // Bounds not checked. | 1645 return false; // Bounds not checked. |
1654 } | 1646 } |
1655 bool checked = false; | 1647 bool checked = false; |
1656 // We handle the length > 1 case in a later pass. | 1648 // We handle the length > 1 case in a later pass. |
1657 if (length == 1) { | 1649 if (length == 1) { |
1658 if (ascii && c > String::kMaxOneByteCharCodeU) { | 1650 if (one_byte && c > String::kMaxOneByteCharCodeU) { |
1659 // Can't match - see above. | 1651 // Can't match - see above. |
1660 return false; // Bounds not checked. | 1652 return false; // Bounds not checked. |
1661 } | 1653 } |
1662 if (!preloaded) { | 1654 if (!preloaded) { |
1663 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1655 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
1664 checked = check; | 1656 checked = check; |
1665 } | 1657 } |
1666 macro_assembler->CheckNotCharacter(c, on_failure); | 1658 macro_assembler->CheckNotCharacter(c, on_failure); |
1667 } | 1659 } |
1668 return checked; | 1660 return checked; |
1669 } | 1661 } |
1670 | 1662 |
1671 | 1663 |
1672 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, | 1664 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, |
1673 bool ascii, | 1665 bool one_byte, uc16 c1, uc16 c2, |
1674 uc16 c1, | |
1675 uc16 c2, | |
1676 Label* on_failure) { | 1666 Label* on_failure) { |
1677 uc16 char_mask; | 1667 uc16 char_mask; |
1678 if (ascii) { | 1668 if (one_byte) { |
1679 char_mask = String::kMaxOneByteCharCode; | 1669 char_mask = String::kMaxOneByteCharCode; |
1680 } else { | 1670 } else { |
1681 char_mask = String::kMaxUtf16CodeUnit; | 1671 char_mask = String::kMaxUtf16CodeUnit; |
1682 } | 1672 } |
1683 uc16 exor = c1 ^ c2; | 1673 uc16 exor = c1 ^ c2; |
1684 // Check whether exor has only one bit set. | 1674 // Check whether exor has only one bit set. |
1685 if (((exor - 1) & exor) == 0) { | 1675 if (((exor - 1) & exor) == 0) { |
1686 // If c1 and c2 differ only by one bit. | 1676 // If c1 and c2 differ only by one bit. |
1687 // Ecma262UnCanonicalize always gives the highest number last. | 1677 // Ecma262UnCanonicalize always gives the highest number last. |
1688 DCHECK(c2 > c1); | 1678 DCHECK(c2 > c1); |
(...skipping 30 matching lines...) Expand all Loading... | |
1719 // Only emits letters (things that have case). Only used for case independent | 1709 // Only emits letters (things that have case). Only used for case independent |
1720 // matches. | 1710 // matches. |
1721 static inline bool EmitAtomLetter(Isolate* isolate, | 1711 static inline bool EmitAtomLetter(Isolate* isolate, |
1722 RegExpCompiler* compiler, | 1712 RegExpCompiler* compiler, |
1723 uc16 c, | 1713 uc16 c, |
1724 Label* on_failure, | 1714 Label* on_failure, |
1725 int cp_offset, | 1715 int cp_offset, |
1726 bool check, | 1716 bool check, |
1727 bool preloaded) { | 1717 bool preloaded) { |
1728 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | 1718 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
1729 bool ascii = compiler->ascii(); | 1719 bool one_byte = compiler->one_byte(); |
1730 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1720 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
1731 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); | 1721 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); |
1732 if (length <= 1) return false; | 1722 if (length <= 1) return false; |
1733 // We may not need to check against the end of the input string | 1723 // We may not need to check against the end of the input string |
1734 // if this character lies before a character that matched. | 1724 // if this character lies before a character that matched. |
1735 if (!preloaded) { | 1725 if (!preloaded) { |
1736 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1726 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
1737 } | 1727 } |
1738 Label ok; | 1728 Label ok; |
1739 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); | 1729 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
1740 switch (length) { | 1730 switch (length) { |
1741 case 2: { | 1731 case 2: { |
1742 if (ShortCutEmitCharacterPair(macro_assembler, | 1732 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], |
1743 ascii, | 1733 chars[1], on_failure)) { |
1744 chars[0], | |
1745 chars[1], | |
1746 on_failure)) { | |
1747 } else { | 1734 } else { |
1748 macro_assembler->CheckCharacter(chars[0], &ok); | 1735 macro_assembler->CheckCharacter(chars[0], &ok); |
1749 macro_assembler->CheckNotCharacter(chars[1], on_failure); | 1736 macro_assembler->CheckNotCharacter(chars[1], on_failure); |
1750 macro_assembler->Bind(&ok); | 1737 macro_assembler->Bind(&ok); |
1751 } | 1738 } |
1752 break; | 1739 break; |
1753 } | 1740 } |
1754 case 4: | 1741 case 4: |
1755 macro_assembler->CheckCharacter(chars[3], &ok); | 1742 macro_assembler->CheckCharacter(chars[3], &ok); |
1756 // Fall through! | 1743 // Fall through! |
(...skipping 154 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
1911 | 1898 |
1912 *new_start_index = start_index; | 1899 *new_start_index = start_index; |
1913 *border = (ranges->at(start_index) & ~kMask) + kSize; | 1900 *border = (ranges->at(start_index) & ~kMask) + kSize; |
1914 while (*new_start_index < end_index) { | 1901 while (*new_start_index < end_index) { |
1915 if (ranges->at(*new_start_index) > *border) break; | 1902 if (ranges->at(*new_start_index) > *border) break; |
1916 (*new_start_index)++; | 1903 (*new_start_index)++; |
1917 } | 1904 } |
1918 // new_start_index is the index of the first edge that is beyond the | 1905 // new_start_index is the index of the first edge that is beyond the |
1919 // current kSize space. | 1906 // current kSize space. |
1920 | 1907 |
1921 // For very large search spaces we do a binary chop search of the non-ASCII | 1908 // For very large search spaces we do a binary chop search of the non-Latin1 |
1922 // space instead of just going to the end of the current kSize space. The | 1909 // space instead of just going to the end of the current kSize space. The |
1923 // heuristics are complicated a little by the fact that any 128-character | 1910 // heuristics are complicated a little by the fact that any 128-character |
1924 // encoding space can be quickly tested with a table lookup, so we don't | 1911 // encoding space can be quickly tested with a table lookup, so we don't |
1925 // wish to do binary chop search at a smaller granularity than that. A | 1912 // wish to do binary chop search at a smaller granularity than that. A |
1926 // 128-character space can take up a lot of space in the ranges array if, | 1913 // 128-character space can take up a lot of space in the ranges array if, |
1927 // for example, we only want to match every second character (eg. the lower | 1914 // for example, we only want to match every second character (eg. the lower |
1928 // case characters on some Unicode pages). | 1915 // case characters on some Unicode pages). |
1929 int binary_chop_index = (end_index + start_index) / 2; | 1916 int binary_chop_index = (end_index + start_index) / 2; |
1930 // The first test ensures that we get to the code that handles the ASCII | 1917 // The first test ensures that we get to the code that handles the Latin1 |
1931 // range with a single not-taken branch, speeding up this important | 1918 // range with a single not-taken branch, speeding up this important |
1932 // character range (even non-ASCII charset-based text has spaces and | 1919 // character range (even non-Latin1 charset-based text has spaces and |
1933 // punctuation). | 1920 // punctuation). |
1934 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case. | 1921 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. |
1935 end_index - start_index > (*new_start_index - start_index) * 2 && | 1922 end_index - start_index > (*new_start_index - start_index) * 2 && |
1936 last - first > kSize * 2 && | 1923 last - first > kSize * 2 && binary_chop_index > *new_start_index && |
1937 binary_chop_index > *new_start_index && | |
1938 ranges->at(binary_chop_index) >= first + 2 * kSize) { | 1924 ranges->at(binary_chop_index) >= first + 2 * kSize) { |
1939 int scan_forward_for_section_border = binary_chop_index;; | 1925 int scan_forward_for_section_border = binary_chop_index;; |
1940 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; | 1926 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; |
1941 | 1927 |
1942 while (scan_forward_for_section_border < end_index) { | 1928 while (scan_forward_for_section_border < end_index) { |
1943 if (ranges->at(scan_forward_for_section_border) > new_border) { | 1929 if (ranges->at(scan_forward_for_section_border) > new_border) { |
1944 *new_start_index = scan_forward_for_section_border; | 1930 *new_start_index = scan_forward_for_section_border; |
1945 *border = new_border; | 1931 *border = new_border; |
1946 break; | 1932 break; |
1947 } | 1933 } |
(...skipping 166 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2114 border, | 2100 border, |
2115 max_char, | 2101 max_char, |
2116 &dummy, | 2102 &dummy, |
2117 flip ? odd_label : even_label, | 2103 flip ? odd_label : even_label, |
2118 flip ? even_label : odd_label); | 2104 flip ? even_label : odd_label); |
2119 } | 2105 } |
2120 } | 2106 } |
2121 | 2107 |
2122 | 2108 |
2123 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, | 2109 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
2124 RegExpCharacterClass* cc, | 2110 RegExpCharacterClass* cc, bool one_byte, |
2125 bool ascii, | 2111 Label* on_failure, int cp_offset, bool check_offset, |
2126 Label* on_failure, | 2112 bool preloaded, Zone* zone) { |
2127 int cp_offset, | |
2128 bool check_offset, | |
2129 bool preloaded, | |
2130 Zone* zone) { | |
2131 ZoneList<CharacterRange>* ranges = cc->ranges(zone); | 2113 ZoneList<CharacterRange>* ranges = cc->ranges(zone); |
2132 if (!CharacterRange::IsCanonical(ranges)) { | 2114 if (!CharacterRange::IsCanonical(ranges)) { |
2133 CharacterRange::Canonicalize(ranges); | 2115 CharacterRange::Canonicalize(ranges); |
2134 } | 2116 } |
2135 | 2117 |
2136 int max_char; | 2118 int max_char; |
2137 if (ascii) { | 2119 if (one_byte) { |
2138 max_char = String::kMaxOneByteCharCode; | 2120 max_char = String::kMaxOneByteCharCode; |
2139 } else { | 2121 } else { |
2140 max_char = String::kMaxUtf16CodeUnit; | 2122 max_char = String::kMaxUtf16CodeUnit; |
2141 } | 2123 } |
2142 | 2124 |
2143 int range_count = ranges->length(); | 2125 int range_count = ranges->length(); |
2144 | 2126 |
2145 int last_valid_range = range_count - 1; | 2127 int last_valid_range = range_count - 1; |
2146 while (last_valid_range >= 0) { | 2128 while (last_valid_range >= 0) { |
2147 CharacterRange& range = ranges->at(last_valid_range); | 2129 CharacterRange& range = ranges->at(last_valid_range); |
(...skipping 309 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2457 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, | 2439 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, |
2458 Trace* trace, | 2440 Trace* trace, |
2459 bool preload_has_checked_bounds, | 2441 bool preload_has_checked_bounds, |
2460 Label* on_possible_success, | 2442 Label* on_possible_success, |
2461 QuickCheckDetails* details, | 2443 QuickCheckDetails* details, |
2462 bool fall_through_on_failure) { | 2444 bool fall_through_on_failure) { |
2463 if (details->characters() == 0) return false; | 2445 if (details->characters() == 0) return false; |
2464 GetQuickCheckDetails( | 2446 GetQuickCheckDetails( |
2465 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); | 2447 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); |
2466 if (details->cannot_match()) return false; | 2448 if (details->cannot_match()) return false; |
2467 if (!details->Rationalize(compiler->ascii())) return false; | 2449 if (!details->Rationalize(compiler->one_byte())) return false; |
2468 DCHECK(details->characters() == 1 || | 2450 DCHECK(details->characters() == 1 || |
2469 compiler->macro_assembler()->CanReadUnaligned()); | 2451 compiler->macro_assembler()->CanReadUnaligned()); |
2470 uint32_t mask = details->mask(); | 2452 uint32_t mask = details->mask(); |
2471 uint32_t value = details->value(); | 2453 uint32_t value = details->value(); |
2472 | 2454 |
2473 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2455 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
2474 | 2456 |
2475 if (trace->characters_preloaded() != details->characters()) { | 2457 if (trace->characters_preloaded() != details->characters()) { |
2476 assembler->LoadCurrentCharacter(trace->cp_offset(), | 2458 assembler->LoadCurrentCharacter(trace->cp_offset(), |
2477 trace->backtrack(), | 2459 trace->backtrack(), |
2478 !preload_has_checked_bounds, | 2460 !preload_has_checked_bounds, |
2479 details->characters()); | 2461 details->characters()); |
2480 } | 2462 } |
2481 | 2463 |
2482 | 2464 |
2483 bool need_mask = true; | 2465 bool need_mask = true; |
2484 | 2466 |
2485 if (details->characters() == 1) { | 2467 if (details->characters() == 1) { |
2486 // If number of characters preloaded is 1 then we used a byte or 16 bit | 2468 // If number of characters preloaded is 1 then we used a byte or 16 bit |
2487 // load so the value is already masked down. | 2469 // load so the value is already masked down. |
2488 uint32_t char_mask; | 2470 uint32_t char_mask; |
2489 if (compiler->ascii()) { | 2471 if (compiler->one_byte()) { |
2490 char_mask = String::kMaxOneByteCharCode; | 2472 char_mask = String::kMaxOneByteCharCode; |
2491 } else { | 2473 } else { |
2492 char_mask = String::kMaxUtf16CodeUnit; | 2474 char_mask = String::kMaxUtf16CodeUnit; |
2493 } | 2475 } |
2494 if ((mask & char_mask) == char_mask) need_mask = false; | 2476 if ((mask & char_mask) == char_mask) need_mask = false; |
2495 mask &= char_mask; | 2477 mask &= char_mask; |
2496 } else { | 2478 } else { |
2497 // For 2-character preloads in ASCII mode or 1-character preloads in | 2479 // For 2-character preloads in one-byte mode or 1-character preloads in |
2498 // TWO_BYTE mode we also use a 16 bit load with zero extend. | 2480 // two-byte mode we also use a 16 bit load with zero extend. |
2499 if (details->characters() == 2 && compiler->ascii()) { | 2481 if (details->characters() == 2 && compiler->one_byte()) { |
2500 if ((mask & 0xffff) == 0xffff) need_mask = false; | 2482 if ((mask & 0xffff) == 0xffff) need_mask = false; |
2501 } else if (details->characters() == 1 && !compiler->ascii()) { | 2483 } else if (details->characters() == 1 && !compiler->one_byte()) { |
2502 if ((mask & 0xffff) == 0xffff) need_mask = false; | 2484 if ((mask & 0xffff) == 0xffff) need_mask = false; |
2503 } else { | 2485 } else { |
2504 if (mask == 0xffffffff) need_mask = false; | 2486 if (mask == 0xffffffff) need_mask = false; |
2505 } | 2487 } |
2506 } | 2488 } |
2507 | 2489 |
2508 if (fall_through_on_failure) { | 2490 if (fall_through_on_failure) { |
2509 if (need_mask) { | 2491 if (need_mask) { |
2510 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); | 2492 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); |
2511 } else { | 2493 } else { |
(...skipping 19 matching lines...) Expand all Loading... | |
2531 // machine word for the current character width in order to be used in | 2513 // machine word for the current character width in order to be used in |
2532 // generating a quick check. | 2514 // generating a quick check. |
2533 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2515 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, |
2534 RegExpCompiler* compiler, | 2516 RegExpCompiler* compiler, |
2535 int characters_filled_in, | 2517 int characters_filled_in, |
2536 bool not_at_start) { | 2518 bool not_at_start) { |
2537 Isolate* isolate = compiler->macro_assembler()->zone()->isolate(); | 2519 Isolate* isolate = compiler->macro_assembler()->zone()->isolate(); |
2538 DCHECK(characters_filled_in < details->characters()); | 2520 DCHECK(characters_filled_in < details->characters()); |
2539 int characters = details->characters(); | 2521 int characters = details->characters(); |
2540 int char_mask; | 2522 int char_mask; |
2541 if (compiler->ascii()) { | 2523 if (compiler->one_byte()) { |
2542 char_mask = String::kMaxOneByteCharCode; | 2524 char_mask = String::kMaxOneByteCharCode; |
2543 } else { | 2525 } else { |
2544 char_mask = String::kMaxUtf16CodeUnit; | 2526 char_mask = String::kMaxUtf16CodeUnit; |
2545 } | 2527 } |
2546 for (int k = 0; k < elms_->length(); k++) { | 2528 for (int k = 0; k < elms_->length(); k++) { |
2547 TextElement elm = elms_->at(k); | 2529 TextElement elm = elms_->at(k); |
2548 if (elm.text_type() == TextElement::ATOM) { | 2530 if (elm.text_type() == TextElement::ATOM) { |
2549 Vector<const uc16> quarks = elm.atom()->data(); | 2531 Vector<const uc16> quarks = elm.atom()->data(); |
2550 for (int i = 0; i < characters && i < quarks.length(); i++) { | 2532 for (int i = 0; i < characters && i < quarks.length(); i++) { |
2551 QuickCheckDetails::Position* pos = | 2533 QuickCheckDetails::Position* pos = |
2552 details->positions(characters_filled_in); | 2534 details->positions(characters_filled_in); |
2553 uc16 c = quarks[i]; | 2535 uc16 c = quarks[i]; |
2554 if (c > char_mask) { | 2536 if (c > char_mask) { |
2555 // If we expect a non-ASCII character from an ASCII string, | 2537 // If we expect a non-Latin1 character from a one-byte string, |
2556 // there is no way we can match. Not even case independent | 2538 // there is no way we can match. Not even case-independent |
2557 // matching can turn an ASCII character into non-ASCII or | 2539 // matching can turn a Latin1 character into non-Latin1 or |
2558 // vice versa. | 2540 // vice versa. |
2541 // TODO(dcarney): issue 3550. Verify that this works as expected. | |
2542 // For example, \u0178 is uppercase of \u00ff (y-umlaut). | |
Yang
2014/09/10 08:26:36
This is the other.
| |
2559 details->set_cannot_match(); | 2543 details->set_cannot_match(); |
2560 pos->determines_perfectly = false; | 2544 pos->determines_perfectly = false; |
2561 return; | 2545 return; |
2562 } | 2546 } |
2563 if (compiler->ignore_case()) { | 2547 if (compiler->ignore_case()) { |
2564 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 2548 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
2565 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), | 2549 int length = GetCaseIndependentLetters(isolate, c, |
2566 chars); | 2550 compiler->one_byte(), chars); |
2567 DCHECK(length != 0); // Can only happen if c > char_mask (see above). | 2551 DCHECK(length != 0); // Can only happen if c > char_mask (see above). |
2568 if (length == 1) { | 2552 if (length == 1) { |
2569 // This letter has no case equivalents, so it's nice and simple | 2553 // This letter has no case equivalents, so it's nice and simple |
2570 // and the mask-compare will determine definitely whether we have | 2554 // and the mask-compare will determine definitely whether we have |
2571 // a match at this character position. | 2555 // a match at this character position. |
2572 pos->mask = char_mask; | 2556 pos->mask = char_mask; |
2573 pos->value = c; | 2557 pos->value = c; |
2574 pos->determines_perfectly = true; | 2558 pos->determines_perfectly = true; |
2575 } else { | 2559 } else { |
2576 uint32_t common_bits = char_mask; | 2560 uint32_t common_bits = char_mask; |
(...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2685 void QuickCheckDetails::Clear() { | 2669 void QuickCheckDetails::Clear() { |
2686 for (int i = 0; i < characters_; i++) { | 2670 for (int i = 0; i < characters_; i++) { |
2687 positions_[i].mask = 0; | 2671 positions_[i].mask = 0; |
2688 positions_[i].value = 0; | 2672 positions_[i].value = 0; |
2689 positions_[i].determines_perfectly = false; | 2673 positions_[i].determines_perfectly = false; |
2690 } | 2674 } |
2691 characters_ = 0; | 2675 characters_ = 0; |
2692 } | 2676 } |
2693 | 2677 |
2694 | 2678 |
2695 void QuickCheckDetails::Advance(int by, bool ascii) { | 2679 void QuickCheckDetails::Advance(int by, bool one_byte) { |
2696 DCHECK(by >= 0); | 2680 DCHECK(by >= 0); |
2697 if (by >= characters_) { | 2681 if (by >= characters_) { |
2698 Clear(); | 2682 Clear(); |
2699 return; | 2683 return; |
2700 } | 2684 } |
2701 for (int i = 0; i < characters_ - by; i++) { | 2685 for (int i = 0; i < characters_ - by; i++) { |
2702 positions_[i] = positions_[by + i]; | 2686 positions_[i] = positions_[by + i]; |
2703 } | 2687 } |
2704 for (int i = characters_ - by; i < characters_; i++) { | 2688 for (int i = characters_ - by; i < characters_; i++) { |
2705 positions_[i].mask = 0; | 2689 positions_[i].mask = 0; |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2749 info->visited = true; | 2733 info->visited = true; |
2750 } | 2734 } |
2751 ~VisitMarker() { | 2735 ~VisitMarker() { |
2752 info_->visited = false; | 2736 info_->visited = false; |
2753 } | 2737 } |
2754 private: | 2738 private: |
2755 NodeInfo* info_; | 2739 NodeInfo* info_; |
2756 }; | 2740 }; |
2757 | 2741 |
2758 | 2742 |
2759 RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) { | 2743 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { |
2760 if (info()->replacement_calculated) return replacement(); | 2744 if (info()->replacement_calculated) return replacement(); |
2761 if (depth < 0) return this; | 2745 if (depth < 0) return this; |
2762 DCHECK(!info()->visited); | 2746 DCHECK(!info()->visited); |
2763 VisitMarker marker(info()); | 2747 VisitMarker marker(info()); |
2764 return FilterSuccessor(depth - 1, ignore_case); | 2748 return FilterSuccessor(depth - 1, ignore_case); |
2765 } | 2749 } |
2766 | 2750 |
2767 | 2751 |
2768 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { | 2752 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { |
2769 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); | 2753 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); |
2770 if (next == NULL) return set_replacement(NULL); | 2754 if (next == NULL) return set_replacement(NULL); |
2771 on_success_ = next; | 2755 on_success_ = next; |
2772 return set_replacement(this); | 2756 return set_replacement(this); |
2773 } | 2757 } |
2774 | 2758 |
2775 | 2759 |
2776 // We need to check for the following characters: 0x39c 0x3bc 0x178. | 2760 // We need to check for the following characters: 0x39c 0x3bc 0x178. |
2777 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { | 2761 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { |
2778 // TODO(dcarney): this could be a lot more efficient. | 2762 // TODO(dcarney): this could be a lot more efficient. |
2779 return range.Contains(0x39c) || | 2763 return range.Contains(0x39c) || |
2780 range.Contains(0x3bc) || range.Contains(0x178); | 2764 range.Contains(0x3bc) || range.Contains(0x178); |
2781 } | 2765 } |
2782 | 2766 |
2783 | 2767 |
2784 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { | 2768 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { |
2785 for (int i = 0; i < ranges->length(); i++) { | 2769 for (int i = 0; i < ranges->length(); i++) { |
2786 // TODO(dcarney): this could be a lot more efficient. | 2770 // TODO(dcarney): this could be a lot more efficient. |
2787 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; | 2771 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; |
2788 } | 2772 } |
2789 return false; | 2773 return false; |
2790 } | 2774 } |
2791 | 2775 |
2792 | 2776 |
2793 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { | 2777 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { |
2794 if (info()->replacement_calculated) return replacement(); | 2778 if (info()->replacement_calculated) return replacement(); |
2795 if (depth < 0) return this; | 2779 if (depth < 0) return this; |
2796 DCHECK(!info()->visited); | 2780 DCHECK(!info()->visited); |
2797 VisitMarker marker(info()); | 2781 VisitMarker marker(info()); |
2798 int element_count = elms_->length(); | 2782 int element_count = elms_->length(); |
2799 for (int i = 0; i < element_count; i++) { | 2783 for (int i = 0; i < element_count; i++) { |
2800 TextElement elm = elms_->at(i); | 2784 TextElement elm = elms_->at(i); |
2801 if (elm.text_type() == TextElement::ATOM) { | 2785 if (elm.text_type() == TextElement::ATOM) { |
2802 Vector<const uc16> quarks = elm.atom()->data(); | 2786 Vector<const uc16> quarks = elm.atom()->data(); |
2803 for (int j = 0; j < quarks.length(); j++) { | 2787 for (int j = 0; j < quarks.length(); j++) { |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2837 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; | 2821 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; |
2838 return set_replacement(NULL); | 2822 return set_replacement(NULL); |
2839 } | 2823 } |
2840 } | 2824 } |
2841 } | 2825 } |
2842 } | 2826 } |
2843 return FilterSuccessor(depth - 1, ignore_case); | 2827 return FilterSuccessor(depth - 1, ignore_case); |
2844 } | 2828 } |
2845 | 2829 |
2846 | 2830 |
2847 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { | 2831 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { |
2848 if (info()->replacement_calculated) return replacement(); | 2832 if (info()->replacement_calculated) return replacement(); |
2849 if (depth < 0) return this; | 2833 if (depth < 0) return this; |
2850 if (info()->visited) return this; | 2834 if (info()->visited) return this; |
2851 { | 2835 { |
2852 VisitMarker marker(info()); | 2836 VisitMarker marker(info()); |
2853 | 2837 |
2854 RegExpNode* continue_replacement = | 2838 RegExpNode* continue_replacement = |
2855 continue_node_->FilterASCII(depth - 1, ignore_case); | 2839 continue_node_->FilterOneByte(depth - 1, ignore_case); |
2856 // If we can't continue after the loop then there is no sense in doing the | 2840 // If we can't continue after the loop then there is no sense in doing the |
2857 // loop. | 2841 // loop. |
2858 if (continue_replacement == NULL) return set_replacement(NULL); | 2842 if (continue_replacement == NULL) return set_replacement(NULL); |
2859 } | 2843 } |
2860 | 2844 |
2861 return ChoiceNode::FilterASCII(depth - 1, ignore_case); | 2845 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); |
2862 } | 2846 } |
2863 | 2847 |
2864 | 2848 |
2865 RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { | 2849 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { |
2866 if (info()->replacement_calculated) return replacement(); | 2850 if (info()->replacement_calculated) return replacement(); |
2867 if (depth < 0) return this; | 2851 if (depth < 0) return this; |
2868 if (info()->visited) return this; | 2852 if (info()->visited) return this; |
2869 VisitMarker marker(info()); | 2853 VisitMarker marker(info()); |
2870 int choice_count = alternatives_->length(); | 2854 int choice_count = alternatives_->length(); |
2871 | 2855 |
2872 for (int i = 0; i < choice_count; i++) { | 2856 for (int i = 0; i < choice_count; i++) { |
2873 GuardedAlternative alternative = alternatives_->at(i); | 2857 GuardedAlternative alternative = alternatives_->at(i); |
2874 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { | 2858 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { |
2875 set_replacement(this); | 2859 set_replacement(this); |
2876 return this; | 2860 return this; |
2877 } | 2861 } |
2878 } | 2862 } |
2879 | 2863 |
2880 int surviving = 0; | 2864 int surviving = 0; |
2881 RegExpNode* survivor = NULL; | 2865 RegExpNode* survivor = NULL; |
2882 for (int i = 0; i < choice_count; i++) { | 2866 for (int i = 0; i < choice_count; i++) { |
2883 GuardedAlternative alternative = alternatives_->at(i); | 2867 GuardedAlternative alternative = alternatives_->at(i); |
2884 RegExpNode* replacement = | 2868 RegExpNode* replacement = |
2885 alternative.node()->FilterASCII(depth - 1, ignore_case); | 2869 alternative.node()->FilterOneByte(depth - 1, ignore_case); |
2886 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. | 2870 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. |
2887 if (replacement != NULL) { | 2871 if (replacement != NULL) { |
2888 alternatives_->at(i).set_node(replacement); | 2872 alternatives_->at(i).set_node(replacement); |
2889 surviving++; | 2873 surviving++; |
2890 survivor = replacement; | 2874 survivor = replacement; |
2891 } | 2875 } |
2892 } | 2876 } |
2893 if (surviving < 2) return set_replacement(survivor); | 2877 if (surviving < 2) return set_replacement(survivor); |
2894 | 2878 |
2895 set_replacement(this); | 2879 set_replacement(this); |
2896 if (surviving == choice_count) { | 2880 if (surviving == choice_count) { |
2897 return this; | 2881 return this; |
2898 } | 2882 } |
2899 // Only some of the nodes survived the filtering. We need to rebuild the | 2883 // Only some of the nodes survived the filtering. We need to rebuild the |
2900 // alternatives list. | 2884 // alternatives list. |
2901 ZoneList<GuardedAlternative>* new_alternatives = | 2885 ZoneList<GuardedAlternative>* new_alternatives = |
2902 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); | 2886 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); |
2903 for (int i = 0; i < choice_count; i++) { | 2887 for (int i = 0; i < choice_count; i++) { |
2904 RegExpNode* replacement = | 2888 RegExpNode* replacement = |
2905 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case); | 2889 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); |
2906 if (replacement != NULL) { | 2890 if (replacement != NULL) { |
2907 alternatives_->at(i).set_node(replacement); | 2891 alternatives_->at(i).set_node(replacement); |
2908 new_alternatives->Add(alternatives_->at(i), zone()); | 2892 new_alternatives->Add(alternatives_->at(i), zone()); |
2909 } | 2893 } |
2910 } | 2894 } |
2911 alternatives_ = new_alternatives; | 2895 alternatives_ = new_alternatives; |
2912 return this; | 2896 return this; |
2913 } | 2897 } |
2914 | 2898 |
2915 | 2899 |
2916 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth, | 2900 RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth, |
2917 bool ignore_case) { | 2901 bool ignore_case) { |
2918 if (info()->replacement_calculated) return replacement(); | 2902 if (info()->replacement_calculated) return replacement(); |
2919 if (depth < 0) return this; | 2903 if (depth < 0) return this; |
2920 if (info()->visited) return this; | 2904 if (info()->visited) return this; |
2921 VisitMarker marker(info()); | 2905 VisitMarker marker(info()); |
2922 // Alternative 0 is the negative lookahead, alternative 1 is what comes | 2906 // Alternative 0 is the negative lookahead, alternative 1 is what comes |
2923 // afterwards. | 2907 // afterwards. |
2924 RegExpNode* node = alternatives_->at(1).node(); | 2908 RegExpNode* node = alternatives_->at(1).node(); |
2925 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case); | 2909 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); |
2926 if (replacement == NULL) return set_replacement(NULL); | 2910 if (replacement == NULL) return set_replacement(NULL); |
2927 alternatives_->at(1).set_node(replacement); | 2911 alternatives_->at(1).set_node(replacement); |
2928 | 2912 |
2929 RegExpNode* neg_node = alternatives_->at(0).node(); | 2913 RegExpNode* neg_node = alternatives_->at(0).node(); |
2930 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case); | 2914 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); |
2931 // If the negative lookahead is always going to fail then | 2915 // If the negative lookahead is always going to fail then |
2932 // we don't need to check it. | 2916 // we don't need to check it. |
2933 if (neg_replacement == NULL) return set_replacement(replacement); | 2917 if (neg_replacement == NULL) return set_replacement(replacement); |
2934 alternatives_->at(0).set_node(neg_replacement); | 2918 alternatives_->at(0).set_node(neg_replacement); |
2935 return set_replacement(this); | 2919 return set_replacement(this); |
2936 } | 2920 } |
2937 | 2921 |
2938 | 2922 |
2939 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2923 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
2940 RegExpCompiler* compiler, | 2924 RegExpCompiler* compiler, |
(...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3029 assembler->CheckAtStart(&ok); | 3013 assembler->CheckAtStart(&ok); |
3030 } | 3014 } |
3031 // We already checked that we are not at the start of input so it must be | 3015 // We already checked that we are not at the start of input so it must be |
3032 // OK to load the previous character. | 3016 // OK to load the previous character. |
3033 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, | 3017 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, |
3034 new_trace.backtrack(), | 3018 new_trace.backtrack(), |
3035 false); | 3019 false); |
3036 if (!assembler->CheckSpecialCharacterClass('n', | 3020 if (!assembler->CheckSpecialCharacterClass('n', |
3037 new_trace.backtrack())) { | 3021 new_trace.backtrack())) { |
3038 // Newline means \n, \r, 0x2028 or 0x2029. | 3022 // Newline means \n, \r, 0x2028 or 0x2029. |
3039 if (!compiler->ascii()) { | 3023 if (!compiler->one_byte()) { |
3040 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); | 3024 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
3041 } | 3025 } |
3042 assembler->CheckCharacter('\n', &ok); | 3026 assembler->CheckCharacter('\n', &ok); |
3043 assembler->CheckNotCharacter('\r', new_trace.backtrack()); | 3027 assembler->CheckNotCharacter('\r', new_trace.backtrack()); |
3044 } | 3028 } |
3045 assembler->Bind(&ok); | 3029 assembler->Bind(&ok); |
3046 on_success->Emit(compiler, &new_trace); | 3030 on_success->Emit(compiler, &new_trace); |
3047 } | 3031 } |
3048 | 3032 |
3049 | 3033 |
(...skipping 177 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3227 // check can have involved a mask and compare operation which may simplify | 3211 // check can have involved a mask and compare operation which may simplify |
3228 // or obviate the need for further checks at some character positions. | 3212 // or obviate the need for further checks at some character positions. |
3229 void TextNode::TextEmitPass(RegExpCompiler* compiler, | 3213 void TextNode::TextEmitPass(RegExpCompiler* compiler, |
3230 TextEmitPassType pass, | 3214 TextEmitPassType pass, |
3231 bool preloaded, | 3215 bool preloaded, |
3232 Trace* trace, | 3216 Trace* trace, |
3233 bool first_element_checked, | 3217 bool first_element_checked, |
3234 int* checked_up_to) { | 3218 int* checked_up_to) { |
3235 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 3219 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
3236 Isolate* isolate = assembler->zone()->isolate(); | 3220 Isolate* isolate = assembler->zone()->isolate(); |
3237 bool ascii = compiler->ascii(); | 3221 bool one_byte = compiler->one_byte(); |
3238 Label* backtrack = trace->backtrack(); | 3222 Label* backtrack = trace->backtrack(); |
3239 QuickCheckDetails* quick_check = trace->quick_check_performed(); | 3223 QuickCheckDetails* quick_check = trace->quick_check_performed(); |
3240 int element_count = elms_->length(); | 3224 int element_count = elms_->length(); |
3241 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { | 3225 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { |
3242 TextElement elm = elms_->at(i); | 3226 TextElement elm = elms_->at(i); |
3243 int cp_offset = trace->cp_offset() + elm.cp_offset(); | 3227 int cp_offset = trace->cp_offset() + elm.cp_offset(); |
3244 if (elm.text_type() == TextElement::ATOM) { | 3228 if (elm.text_type() == TextElement::ATOM) { |
3245 Vector<const uc16> quarks = elm.atom()->data(); | 3229 Vector<const uc16> quarks = elm.atom()->data(); |
3246 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { | 3230 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { |
3247 if (first_element_checked && i == 0 && j == 0) continue; | 3231 if (first_element_checked && i == 0 && j == 0) continue; |
3248 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; | 3232 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; |
3249 EmitCharacterFunction* emit_function = NULL; | 3233 EmitCharacterFunction* emit_function = NULL; |
3250 switch (pass) { | 3234 switch (pass) { |
3251 case NON_ASCII_MATCH: | 3235 case NON_LATIN1_MATCH: |
3252 DCHECK(ascii); | 3236 DCHECK(one_byte); |
3253 if (quarks[j] > String::kMaxOneByteCharCode) { | 3237 if (quarks[j] > String::kMaxOneByteCharCode) { |
3254 assembler->GoTo(backtrack); | 3238 assembler->GoTo(backtrack); |
3255 return; | 3239 return; |
3256 } | 3240 } |
3257 break; | 3241 break; |
3258 case NON_LETTER_CHARACTER_MATCH: | 3242 case NON_LETTER_CHARACTER_MATCH: |
3259 emit_function = &EmitAtomNonLetter; | 3243 emit_function = &EmitAtomNonLetter; |
3260 break; | 3244 break; |
3261 case SIMPLE_CHARACTER_MATCH: | 3245 case SIMPLE_CHARACTER_MATCH: |
3262 emit_function = &EmitSimpleCharacter; | 3246 emit_function = &EmitSimpleCharacter; |
(...skipping 14 matching lines...) Expand all Loading... | |
3277 preloaded); | 3261 preloaded); |
3278 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); | 3262 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); |
3279 } | 3263 } |
3280 } | 3264 } |
3281 } else { | 3265 } else { |
3282 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); | 3266 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); |
3283 if (pass == CHARACTER_CLASS_MATCH) { | 3267 if (pass == CHARACTER_CLASS_MATCH) { |
3284 if (first_element_checked && i == 0) continue; | 3268 if (first_element_checked && i == 0) continue; |
3285 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; | 3269 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; |
3286 RegExpCharacterClass* cc = elm.char_class(); | 3270 RegExpCharacterClass* cc = elm.char_class(); |
3287 EmitCharClass(assembler, | 3271 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, |
3288 cc, | 3272 *checked_up_to < cp_offset, preloaded, zone()); |
3289 ascii, | |
3290 backtrack, | |
3291 cp_offset, | |
3292 *checked_up_to < cp_offset, | |
3293 preloaded, | |
3294 zone()); | |
3295 UpdateBoundsCheck(cp_offset, checked_up_to); | 3273 UpdateBoundsCheck(cp_offset, checked_up_to); |
3296 } | 3274 } |
3297 } | 3275 } |
3298 } | 3276 } |
3299 } | 3277 } |
3300 | 3278 |
3301 | 3279 |
3302 int TextNode::Length() { | 3280 int TextNode::Length() { |
3303 TextElement elm = elms_->last(); | 3281 TextElement elm = elms_->last(); |
3304 DCHECK(elm.cp_offset() >= 0); | 3282 DCHECK(elm.cp_offset() >= 0); |
(...skipping 20 matching lines...) Expand all Loading... | |
3325 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { | 3303 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
3326 LimitResult limit_result = LimitVersions(compiler, trace); | 3304 LimitResult limit_result = LimitVersions(compiler, trace); |
3327 if (limit_result == DONE) return; | 3305 if (limit_result == DONE) return; |
3328 DCHECK(limit_result == CONTINUE); | 3306 DCHECK(limit_result == CONTINUE); |
3329 | 3307 |
3330 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { | 3308 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { |
3331 compiler->SetRegExpTooBig(); | 3309 compiler->SetRegExpTooBig(); |
3332 return; | 3310 return; |
3333 } | 3311 } |
3334 | 3312 |
3335 if (compiler->ascii()) { | 3313 if (compiler->one_byte()) { |
3336 int dummy = 0; | 3314 int dummy = 0; |
3337 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); | 3315 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); |
3338 } | 3316 } |
3339 | 3317 |
3340 bool first_elt_done = false; | 3318 bool first_elt_done = false; |
3341 int bound_checked_to = trace->cp_offset() - 1; | 3319 int bound_checked_to = trace->cp_offset() - 1; |
3342 bound_checked_to += trace->bound_checked_up_to(); | 3320 bound_checked_to += trace->bound_checked_up_to(); |
3343 | 3321 |
3344 // If a character is preloaded into the current character register then | 3322 // If a character is preloaded into the current character register then |
3345 // check that now. | 3323 // check that now. |
3346 if (trace->characters_preloaded() == 1) { | 3324 if (trace->characters_preloaded() == 1) { |
3347 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { | 3325 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3383 | 3361 |
3384 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { | 3362 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { |
3385 DCHECK(by > 0); | 3363 DCHECK(by > 0); |
3386 // We don't have an instruction for shifting the current character register | 3364 // We don't have an instruction for shifting the current character register |
3387 // down or for using a shifted value for anything so lets just forget that | 3365 // down or for using a shifted value for anything so lets just forget that |
3388 // we preloaded any characters into it. | 3366 // we preloaded any characters into it. |
3389 characters_preloaded_ = 0; | 3367 characters_preloaded_ = 0; |
3390 // Adjust the offsets of the quick check performed information. This | 3368 // Adjust the offsets of the quick check performed information. This |
3391 // information is used to find out what we already determined about the | 3369 // information is used to find out what we already determined about the |
3392 // characters by means of mask and compare. | 3370 // characters by means of mask and compare. |
3393 quick_check_performed_.Advance(by, compiler->ascii()); | 3371 quick_check_performed_.Advance(by, compiler->one_byte()); |
3394 cp_offset_ += by; | 3372 cp_offset_ += by; |
3395 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { | 3373 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { |
3396 compiler->SetRegExpTooBig(); | 3374 compiler->SetRegExpTooBig(); |
3397 cp_offset_ = 0; | 3375 cp_offset_ = 0; |
3398 } | 3376 } |
3399 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); | 3377 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); |
3400 } | 3378 } |
3401 | 3379 |
3402 | 3380 |
3403 void TextNode::MakeCaseIndependent(bool is_ascii) { | 3381 void TextNode::MakeCaseIndependent(bool is_one_byte) { |
3404 int element_count = elms_->length(); | 3382 int element_count = elms_->length(); |
3405 for (int i = 0; i < element_count; i++) { | 3383 for (int i = 0; i < element_count; i++) { |
3406 TextElement elm = elms_->at(i); | 3384 TextElement elm = elms_->at(i); |
3407 if (elm.text_type() == TextElement::CHAR_CLASS) { | 3385 if (elm.text_type() == TextElement::CHAR_CLASS) { |
3408 RegExpCharacterClass* cc = elm.char_class(); | 3386 RegExpCharacterClass* cc = elm.char_class(); |
3409 // None of the standard character classes is different in the case | 3387 // None of the standard character classes is different in the case |
3410 // independent case and it slows us down if we don't know that. | 3388 // independent case and it slows us down if we don't know that. |
3411 if (cc->is_standard(zone())) continue; | 3389 if (cc->is_standard(zone())) continue; |
3412 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); | 3390 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
3413 int range_count = ranges->length(); | 3391 int range_count = ranges->length(); |
3414 for (int j = 0; j < range_count; j++) { | 3392 for (int j = 0; j < range_count; j++) { |
3415 ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone()); | 3393 ranges->at(j).AddCaseEquivalents(ranges, is_one_byte, zone()); |
3416 } | 3394 } |
3417 } | 3395 } |
3418 } | 3396 } |
3419 } | 3397 } |
3420 | 3398 |
3421 | 3399 |
3422 int TextNode::GreedyLoopTextLength() { | 3400 int TextNode::GreedyLoopTextLength() { |
3423 TextElement elm = elms_->at(elms_->length() - 1); | 3401 TextElement elm = elms_->at(elms_->length() - 1); |
3424 return elm.cp_offset() + elm.length(); | 3402 return elm.cp_offset() + elm.length(); |
3425 } | 3403 } |
3426 | 3404 |
3427 | 3405 |
3428 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( | 3406 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( |
3429 RegExpCompiler* compiler) { | 3407 RegExpCompiler* compiler) { |
3430 if (elms_->length() != 1) return NULL; | 3408 if (elms_->length() != 1) return NULL; |
3431 TextElement elm = elms_->at(0); | 3409 TextElement elm = elms_->at(0); |
3432 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; | 3410 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; |
3433 RegExpCharacterClass* node = elm.char_class(); | 3411 RegExpCharacterClass* node = elm.char_class(); |
3434 ZoneList<CharacterRange>* ranges = node->ranges(zone()); | 3412 ZoneList<CharacterRange>* ranges = node->ranges(zone()); |
3435 if (!CharacterRange::IsCanonical(ranges)) { | 3413 if (!CharacterRange::IsCanonical(ranges)) { |
3436 CharacterRange::Canonicalize(ranges); | 3414 CharacterRange::Canonicalize(ranges); |
3437 } | 3415 } |
3438 if (node->is_negated()) { | 3416 if (node->is_negated()) { |
3439 return ranges->length() == 0 ? on_success() : NULL; | 3417 return ranges->length() == 0 ? on_success() : NULL; |
3440 } | 3418 } |
3441 if (ranges->length() != 1) return NULL; | 3419 if (ranges->length() != 1) return NULL; |
3442 uint32_t max_char; | 3420 uint32_t max_char; |
3443 if (compiler->ascii()) { | 3421 if (compiler->one_byte()) { |
3444 max_char = String::kMaxOneByteCharCode; | 3422 max_char = String::kMaxOneByteCharCode; |
3445 } else { | 3423 } else { |
3446 max_char = String::kMaxUtf16CodeUnit; | 3424 max_char = String::kMaxUtf16CodeUnit; |
3447 } | 3425 } |
3448 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; | 3426 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; |
3449 } | 3427 } |
3450 | 3428 |
3451 | 3429 |
3452 // Finds the fixed match length of a sequence of nodes that goes from | 3430 // Finds the fixed match length of a sequence of nodes that goes from |
3453 // this alternative and back to this choice node. If there are variable | 3431 // this alternative and back to this choice node. If there are variable |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3510 return; | 3488 return; |
3511 } | 3489 } |
3512 ChoiceNode::Emit(compiler, trace); | 3490 ChoiceNode::Emit(compiler, trace); |
3513 } | 3491 } |
3514 | 3492 |
3515 | 3493 |
3516 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, | 3494 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
3517 int eats_at_least) { | 3495 int eats_at_least) { |
3518 int preload_characters = Min(4, eats_at_least); | 3496 int preload_characters = Min(4, eats_at_least); |
3519 if (compiler->macro_assembler()->CanReadUnaligned()) { | 3497 if (compiler->macro_assembler()->CanReadUnaligned()) { |
3520 bool ascii = compiler->ascii(); | 3498 bool one_byte = compiler->one_byte(); |
3521 if (ascii) { | 3499 if (one_byte) { |
3522 if (preload_characters > 4) preload_characters = 4; | 3500 if (preload_characters > 4) preload_characters = 4; |
3523 // We can't preload 3 characters because there is no machine instruction | 3501 // We can't preload 3 characters because there is no machine instruction |
3524 // to do that. We can't just load 4 because we could be reading | 3502 // to do that. We can't just load 4 because we could be reading |
3525 // beyond the end of the string, which could cause a memory fault. | 3503 // beyond the end of the string, which could cause a memory fault. |
3526 if (preload_characters == 3) preload_characters = 2; | 3504 if (preload_characters == 3) preload_characters = 2; |
3527 } else { | 3505 } else { |
3528 if (preload_characters > 2) preload_characters = 2; | 3506 if (preload_characters > 2) preload_characters = 2; |
3529 } | 3507 } |
3530 } else { | 3508 } else { |
3531 if (preload_characters > 1) preload_characters = 1; | 3509 if (preload_characters > 1) preload_characters = 1; |
(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3637 map_count_ = kMapSize; | 3615 map_count_ = kMapSize; |
3638 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; | 3616 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; |
3639 } | 3617 } |
3640 } | 3618 } |
3641 | 3619 |
3642 | 3620 |
3643 BoyerMooreLookahead::BoyerMooreLookahead( | 3621 BoyerMooreLookahead::BoyerMooreLookahead( |
3644 int length, RegExpCompiler* compiler, Zone* zone) | 3622 int length, RegExpCompiler* compiler, Zone* zone) |
3645 : length_(length), | 3623 : length_(length), |
3646 compiler_(compiler) { | 3624 compiler_(compiler) { |
3647 if (compiler->ascii()) { | 3625 if (compiler->one_byte()) { |
3648 max_char_ = String::kMaxOneByteCharCode; | 3626 max_char_ = String::kMaxOneByteCharCode; |
3649 } else { | 3627 } else { |
3650 max_char_ = String::kMaxUtf16CodeUnit; | 3628 max_char_ = String::kMaxUtf16CodeUnit; |
3651 } | 3629 } |
3652 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); | 3630 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); |
3653 for (int i = 0; i < length; i++) { | 3631 for (int i = 0; i < length; i++) { |
3654 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); | 3632 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); |
3655 } | 3633 } |
3656 } | 3634 } |
3657 | 3635 |
(...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3705 // can theoretically be up to 2*kSize though we treat it mostly as | 3683 // can theoretically be up to 2*kSize though we treat it mostly as |
3706 // a fraction of kSize. | 3684 // a fraction of kSize. |
3707 frequency += compiler_->frequency_collator()->Frequency(j) + 1; | 3685 frequency += compiler_->frequency_collator()->Frequency(j) + 1; |
3708 } | 3686 } |
3709 } | 3687 } |
3710 // We use the probability of skipping times the distance we are skipping to | 3688 // We use the probability of skipping times the distance we are skipping to |
3711 // judge the effectiveness of this. Actually we have a cut-off: By | 3689 // judge the effectiveness of this. Actually we have a cut-off: By |
3712 // dividing by 2 we switch off the skipping if the probability of skipping | 3690 // dividing by 2 we switch off the skipping if the probability of skipping |
3713 // is less than 50%. This is because the multibyte mask-and-compare | 3691 // is less than 50%. This is because the multibyte mask-and-compare |
3714 // skipping in quickcheck is more likely to do well on this case. | 3692 // skipping in quickcheck is more likely to do well on this case. |
3715 bool in_quickcheck_range = ((i - remembered_from < 4) || | 3693 bool in_quickcheck_range = |
3716 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); | 3694 ((i - remembered_from < 4) || |
3695 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); | |
3717 // Called 'probability' but it is only a rough estimate and can actually | 3696 // Called 'probability' but it is only a rough estimate and can actually |
3718 // be outside the 0-kSize range. | 3697 // be outside the 0-kSize range. |
3719 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; | 3698 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; |
3720 int points = (i - remembered_from) * probability; | 3699 int points = (i - remembered_from) * probability; |
3721 if (points > biggest_points) { | 3700 if (points > biggest_points) { |
3722 *from = remembered_from; | 3701 *from = remembered_from; |
3723 *to = i - 1; | 3702 *to = i - 1; |
3724 biggest_points = points; | 3703 biggest_points = points; |
3725 } | 3704 } |
3726 } | 3705 } |
(...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3924 #endif | 3903 #endif |
3925 } | 3904 } |
3926 | 3905 |
3927 | 3906 |
3928 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, | 3907 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, |
3929 Trace* current_trace, | 3908 Trace* current_trace, |
3930 PreloadState* state) { | 3909 PreloadState* state) { |
3931 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { | 3910 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { |
3932 // Save some time by looking at most one machine word ahead. | 3911 // Save some time by looking at most one machine word ahead. |
3933 state->eats_at_least_ = | 3912 state->eats_at_least_ = |
3934 EatsAtLeast(compiler->ascii() ? 4 : 2, | 3913 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, |
3935 kRecursionBudget, | |
3936 current_trace->at_start() == Trace::FALSE_VALUE); | 3914 current_trace->at_start() == Trace::FALSE_VALUE); |
3937 } | 3915 } |
3938 state->preload_characters_ = | 3916 state->preload_characters_ = |
3939 CalculatePreloadCharacters(compiler, state->eats_at_least_); | 3917 CalculatePreloadCharacters(compiler, state->eats_at_least_); |
3940 | 3918 |
3941 state->preload_is_current_ = | 3919 state->preload_is_current_ = |
3942 (current_trace->characters_preloaded() == state->preload_characters_); | 3920 (current_trace->characters_preloaded() == state->preload_characters_); |
3943 state->preload_has_checked_bounds_ = state->preload_is_current_; | 3921 state->preload_has_checked_bounds_ = state->preload_is_current_; |
3944 } | 3922 } |
3945 | 3923 |
(...skipping 1394 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5340 for (int i = 0; i < overlay.length(); i += 2) { | 5318 for (int i = 0; i < overlay.length(); i += 2) { |
5341 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), | 5319 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), |
5342 CharacterRangeSplitter::kInOverlay, zone); | 5320 CharacterRangeSplitter::kInOverlay, zone); |
5343 } | 5321 } |
5344 CharacterRangeSplitter callback(included, excluded, zone); | 5322 CharacterRangeSplitter callback(included, excluded, zone); |
5345 table.ForEach(&callback); | 5323 table.ForEach(&callback); |
5346 } | 5324 } |
5347 | 5325 |
5348 | 5326 |
5349 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, | 5327 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
5350 bool is_ascii, | 5328 bool is_one_byte, Zone* zone) { |
5351 Zone* zone) { | |
5352 Isolate* isolate = zone->isolate(); | 5329 Isolate* isolate = zone->isolate(); |
5353 uc16 bottom = from(); | 5330 uc16 bottom = from(); |
5354 uc16 top = to(); | 5331 uc16 top = to(); |
5355 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { | 5332 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { |
5356 if (bottom > String::kMaxOneByteCharCode) return; | 5333 if (bottom > String::kMaxOneByteCharCode) return; |
5357 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; | 5334 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; |
5358 } | 5335 } |
5359 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 5336 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
5360 if (top == bottom) { | 5337 if (top == bottom) { |
5361 // If this is a singleton we just expand the one character. | 5338 // If this is a singleton we just expand the one character. |
5362 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); | 5339 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); |
5363 for (int i = 0; i < length; i++) { | 5340 for (int i = 0; i < length; i++) { |
5364 uc32 chr = chars[i]; | 5341 uc32 chr = chars[i]; |
5365 if (chr != bottom) { | 5342 if (chr != bottom) { |
(...skipping 398 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5764 for (int i = 0; i < element_count; i++) { | 5741 for (int i = 0; i < element_count; i++) { |
5765 TextElement& elm = elements()->at(i); | 5742 TextElement& elm = elements()->at(i); |
5766 elm.set_cp_offset(cp_offset); | 5743 elm.set_cp_offset(cp_offset); |
5767 cp_offset += elm.length(); | 5744 cp_offset += elm.length(); |
5768 } | 5745 } |
5769 } | 5746 } |
5770 | 5747 |
5771 | 5748 |
5772 void Analysis::VisitText(TextNode* that) { | 5749 void Analysis::VisitText(TextNode* that) { |
5773 if (ignore_case_) { | 5750 if (ignore_case_) { |
5774 that->MakeCaseIndependent(is_ascii_); | 5751 that->MakeCaseIndependent(is_one_byte_); |
5775 } | 5752 } |
5776 EnsureAnalyzed(that->on_success()); | 5753 EnsureAnalyzed(that->on_success()); |
5777 if (!has_failed()) { | 5754 if (!has_failed()) { |
5778 that->CalculateOffsets(); | 5755 that->CalculateOffsets(); |
5779 } | 5756 } |
5780 } | 5757 } |
5781 | 5758 |
5782 | 5759 |
5783 void Analysis::VisitAction(ActionNode* that) { | 5760 void Analysis::VisitAction(ActionNode* that) { |
5784 RegExpNode* target = that->on_success(); | 5761 RegExpNode* target = that->on_success(); |
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
6040 } | 6017 } |
6041 | 6018 |
6042 | 6019 |
6043 void DispatchTableConstructor::VisitAction(ActionNode* that) { | 6020 void DispatchTableConstructor::VisitAction(ActionNode* that) { |
6044 RegExpNode* target = that->on_success(); | 6021 RegExpNode* target = that->on_success(); |
6045 target->Accept(this); | 6022 target->Accept(this); |
6046 } | 6023 } |
6047 | 6024 |
6048 | 6025 |
6049 RegExpEngine::CompilationResult RegExpEngine::Compile( | 6026 RegExpEngine::CompilationResult RegExpEngine::Compile( |
6050 RegExpCompileData* data, | 6027 RegExpCompileData* data, bool ignore_case, bool is_global, |
6051 bool ignore_case, | 6028 bool is_multiline, Handle<String> pattern, Handle<String> sample_subject, |
6052 bool is_global, | 6029 bool is_one_byte, Zone* zone) { |
6053 bool is_multiline, | |
6054 Handle<String> pattern, | |
6055 Handle<String> sample_subject, | |
6056 bool is_ascii, | |
6057 Zone* zone) { | |
6058 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { | 6030 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
6059 return IrregexpRegExpTooBig(zone->isolate()); | 6031 return IrregexpRegExpTooBig(zone->isolate()); |
6060 } | 6032 } |
6061 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone); | 6033 RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte, zone); |
6062 | 6034 |
6063 // Sample some characters from the middle of the string. | 6035 // Sample some characters from the middle of the string. |
6064 static const int kSampleSize = 128; | 6036 static const int kSampleSize = 128; |
6065 | 6037 |
6066 sample_subject = String::Flatten(sample_subject); | 6038 sample_subject = String::Flatten(sample_subject); |
6067 int chars_sampled = 0; | 6039 int chars_sampled = 0; |
6068 int half_way = (sample_subject->length() - kSampleSize) / 2; | 6040 int half_way = (sample_subject->length() - kSampleSize) / 2; |
6069 for (int i = Max(0, half_way); | 6041 for (int i = Max(0, half_way); |
6070 i < sample_subject->length() && chars_sampled < kSampleSize; | 6042 i < sample_subject->length() && chars_sampled < kSampleSize; |
6071 i++, chars_sampled++) { | 6043 i++, chars_sampled++) { |
(...skipping 26 matching lines...) Expand all Loading... | |
6098 // at the start of input. | 6070 // at the start of input. |
6099 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); | 6071 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); |
6100 first_step_node->AddAlternative(GuardedAlternative(captured_body)); | 6072 first_step_node->AddAlternative(GuardedAlternative(captured_body)); |
6101 first_step_node->AddAlternative(GuardedAlternative( | 6073 first_step_node->AddAlternative(GuardedAlternative( |
6102 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); | 6074 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); |
6103 node = first_step_node; | 6075 node = first_step_node; |
6104 } else { | 6076 } else { |
6105 node = loop_node; | 6077 node = loop_node; |
6106 } | 6078 } |
6107 } | 6079 } |
6108 if (is_ascii) { | 6080 if (is_one_byte) { |
6109 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); | 6081 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); |
6110 // Do it again to propagate the new nodes to places where they were not | 6082 // Do it again to propagate the new nodes to places where they were not |
6111 // put because they had not been calculated yet. | 6083 // put because they had not been calculated yet. |
6112 if (node != NULL) { | 6084 if (node != NULL) { |
6113 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); | 6085 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); |
6114 } | 6086 } |
6115 } | 6087 } |
6116 | 6088 |
6117 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); | 6089 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); |
6118 data->node = node; | 6090 data->node = node; |
6119 Analysis analysis(ignore_case, is_ascii); | 6091 Analysis analysis(ignore_case, is_one_byte); |
6120 analysis.EnsureAnalyzed(node); | 6092 analysis.EnsureAnalyzed(node); |
6121 if (analysis.has_failed()) { | 6093 if (analysis.has_failed()) { |
6122 const char* error_message = analysis.error_message(); | 6094 const char* error_message = analysis.error_message(); |
6123 return CompilationResult(zone->isolate(), error_message); | 6095 return CompilationResult(zone->isolate(), error_message); |
6124 } | 6096 } |
6125 | 6097 |
6126 // Create the correct assembler for the architecture. | 6098 // Create the correct assembler for the architecture. |
6127 #ifndef V8_INTERPRETED_REGEXP | 6099 #ifndef V8_INTERPRETED_REGEXP |
6128 // Native regexp implementation. | 6100 // Native regexp implementation. |
6129 | 6101 |
6130 NativeRegExpMacroAssembler::Mode mode = | 6102 NativeRegExpMacroAssembler::Mode mode = |
6131 is_ascii ? NativeRegExpMacroAssembler::ASCII | 6103 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 |
6132 : NativeRegExpMacroAssembler::UC16; | 6104 : NativeRegExpMacroAssembler::UC16; |
6133 | 6105 |
6134 #if V8_TARGET_ARCH_IA32 | 6106 #if V8_TARGET_ARCH_IA32 |
6135 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, | 6107 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, |
6136 zone); | 6108 zone); |
6137 #elif V8_TARGET_ARCH_X64 | 6109 #elif V8_TARGET_ARCH_X64 |
6138 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2, | 6110 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2, |
6139 zone); | 6111 zone); |
6140 #elif V8_TARGET_ARCH_ARM | 6112 #elif V8_TARGET_ARCH_ARM |
6141 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2, | 6113 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2, |
6142 zone); | 6114 zone); |
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
6179 } | 6151 } |
6180 | 6152 |
6181 return compiler.Assemble(&macro_assembler, | 6153 return compiler.Assemble(&macro_assembler, |
6182 node, | 6154 node, |
6183 data->capture_count, | 6155 data->capture_count, |
6184 pattern); | 6156 pattern); |
6185 } | 6157 } |
6186 | 6158 |
6187 | 6159 |
6188 }} // namespace v8::internal | 6160 }} // namespace v8::internal |
OLD | NEW |