Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/v8.h" | 5 #include "src/v8.h" |
| 6 | 6 |
| 7 #include "src/ast.h" | 7 #include "src/ast.h" |
| 8 #include "src/base/platform/platform.h" | 8 #include "src/base/platform/platform.h" |
| 9 #include "src/compilation-cache.h" | 9 #include "src/compilation-cache.h" |
| 10 #include "src/compiler.h" | 10 #include "src/compiler.h" |
| (...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 283 if (index + needle_len > subject->length()) { | 283 if (index + needle_len > subject->length()) { |
| 284 return RegExpImpl::RE_FAILURE; | 284 return RegExpImpl::RE_FAILURE; |
| 285 } | 285 } |
| 286 | 286 |
| 287 for (int i = 0; i < output_size; i += 2) { | 287 for (int i = 0; i < output_size; i += 2) { |
| 288 String::FlatContent needle_content = needle->GetFlatContent(); | 288 String::FlatContent needle_content = needle->GetFlatContent(); |
| 289 String::FlatContent subject_content = subject->GetFlatContent(); | 289 String::FlatContent subject_content = subject->GetFlatContent(); |
| 290 DCHECK(needle_content.IsFlat()); | 290 DCHECK(needle_content.IsFlat()); |
| 291 DCHECK(subject_content.IsFlat()); | 291 DCHECK(subject_content.IsFlat()); |
| 292 // dispatch on type of strings | 292 // dispatch on type of strings |
| 293 index = (needle_content.IsAscii() | 293 index = |
| 294 ? (subject_content.IsAscii() | 294 (needle_content.IsOneByte() |
| 295 ? SearchString(isolate, | 295 ? (subject_content.IsOneByte() |
| 296 subject_content.ToOneByteVector(), | 296 ? SearchString(isolate, subject_content.ToOneByteVector(), |
| 297 needle_content.ToOneByteVector(), | 297 needle_content.ToOneByteVector(), index) |
| 298 index) | 298 : SearchString(isolate, subject_content.ToUC16Vector(), |
| 299 : SearchString(isolate, | 299 needle_content.ToOneByteVector(), index)) |
| 300 subject_content.ToUC16Vector(), | 300 : (subject_content.IsOneByte() |
| 301 needle_content.ToOneByteVector(), | 301 ? SearchString(isolate, subject_content.ToOneByteVector(), |
| 302 index)) | 302 needle_content.ToUC16Vector(), index) |
| 303 : (subject_content.IsAscii() | 303 : SearchString(isolate, subject_content.ToUC16Vector(), |
| 304 ? SearchString(isolate, | 304 needle_content.ToUC16Vector(), index))); |
| 305 subject_content.ToOneByteVector(), | |
| 306 needle_content.ToUC16Vector(), | |
| 307 index) | |
| 308 : SearchString(isolate, | |
| 309 subject_content.ToUC16Vector(), | |
| 310 needle_content.ToUC16Vector(), | |
| 311 index))); | |
| 312 if (index == -1) { | 305 if (index == -1) { |
| 313 return i / 2; // Return number of matches. | 306 return i / 2; // Return number of matches. |
| 314 } else { | 307 } else { |
| 315 output[i] = index; | 308 output[i] = index; |
| 316 output[i+1] = index + needle_len; | 309 output[i+1] = index + needle_len; |
| 317 index += needle_len; | 310 index += needle_len; |
| 318 } | 311 } |
| 319 } | 312 } |
| 320 return output_size / 2; | 313 return output_size / 2; |
| 321 } | 314 } |
| (...skipping 17 matching lines...) Expand all Loading... | |
| 339 SealHandleScope shs(isolate); | 332 SealHandleScope shs(isolate); |
| 340 FixedArray* array = FixedArray::cast(last_match_info->elements()); | 333 FixedArray* array = FixedArray::cast(last_match_info->elements()); |
| 341 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); | 334 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]); |
| 342 return last_match_info; | 335 return last_match_info; |
| 343 } | 336 } |
| 344 | 337 |
| 345 | 338 |
| 346 // Irregexp implementation. | 339 // Irregexp implementation. |
| 347 | 340 |
| 348 // Ensures that the regexp object contains a compiled version of the | 341 // Ensures that the regexp object contains a compiled version of the |
| 349 // source for either ASCII or non-ASCII strings. | 342 // source for either one-byte or two-byte subject strings. |
| 350 // If the compiled version doesn't already exist, it is compiled | 343 // If the compiled version doesn't already exist, it is compiled |
| 351 // from the source pattern. | 344 // from the source pattern. |
| 352 // If compilation fails, an exception is thrown and this function | 345 // If compilation fails, an exception is thrown and this function |
| 353 // returns false. | 346 // returns false. |
| 354 bool RegExpImpl::EnsureCompiledIrregexp( | 347 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, |
| 355 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { | 348 Handle<String> sample_subject, |
| 356 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); | 349 bool is_one_byte) { |
| | 350 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte)); |
| 357 #ifdef V8_INTERPRETED_REGEXP | 351 #ifdef V8_INTERPRETED_REGEXP |
| 358 if (compiled_code->IsByteArray()) return true; | 352 if (compiled_code->IsByteArray()) return true; |
| 359 #else // V8_INTERPRETED_REGEXP (RegExp native code) | 353 #else // V8_INTERPRETED_REGEXP (RegExp native code) |
| 360 if (compiled_code->IsCode()) return true; | 354 if (compiled_code->IsCode()) return true; |
| 361 #endif | 355 #endif |
| 362 // We could potentially have marked this as flushable, but have kept | 356 // We could potentially have marked this as flushable, but have kept |
| 363 // a saved version if we did not flush it yet. | 357 // a saved version if we did not flush it yet. |
| 364 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); | 358 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); |
| 365 if (saved_code->IsCode()) { | 359 if (saved_code->IsCode()) { |
| 366 // Reinstate the code in the original place. | 360 // Reinstate the code in the original place. |
| 367 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); | 361 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code); |
| 368 DCHECK(compiled_code->IsSmi()); | 362 DCHECK(compiled_code->IsSmi()); |
| 369 return true; | 363 return true; |
| 370 } | 364 } |
| 371 return CompileIrregexp(re, sample_subject, is_ascii); | 365 return CompileIrregexp(re, sample_subject, is_one_byte); |
| 372 } | 366 } |
| 373 | 367 |
| 374 | 368 |
| 375 static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, bool is_ascii, | 369 static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, |
| 376 Handle<String> error_message, | 370 Handle<String> error_message, |
| 377 Isolate* isolate) { | 371 Isolate* isolate) { |
| 378 Factory* factory = isolate->factory(); | 372 Factory* factory = isolate->factory(); |
| 379 Handle<FixedArray> elements = factory->NewFixedArray(2); | 373 Handle<FixedArray> elements = factory->NewFixedArray(2); |
| 380 elements->set(0, re->Pattern()); | 374 elements->set(0, re->Pattern()); |
| 381 elements->set(1, *error_message); | 375 elements->set(1, *error_message); |
| 382 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 376 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
| 383 Handle<Object> error; | 377 Handle<Object> error; |
| 384 MaybeHandle<Object> maybe_error = | 378 MaybeHandle<Object> maybe_error = |
| 385 factory->NewSyntaxError("malformed_regexp", array); | 379 factory->NewSyntaxError("malformed_regexp", array); |
| 386 if (maybe_error.ToHandle(&error)) isolate->Throw(*error); | 380 if (maybe_error.ToHandle(&error)) isolate->Throw(*error); |
| 387 } | 381 } |
| 388 | 382 |
| 389 | 383 |
| 390 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, | 384 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
| 391 Handle<String> sample_subject, | 385 Handle<String> sample_subject, |
| 392 bool is_ascii) { | 386 bool is_one_byte) { |
| 393 // Compile the RegExp. | 387 // Compile the RegExp. |
| 394 Isolate* isolate = re->GetIsolate(); | 388 Isolate* isolate = re->GetIsolate(); |
| 395 Zone zone(isolate); | 389 Zone zone(isolate); |
| 396 PostponeInterruptsScope postpone(isolate); | 390 PostponeInterruptsScope postpone(isolate); |
| 397 // If we had a compilation error the last time this is saved at the | 391 // If we had a compilation error the last time this is saved at the |
| 398 // saved code index. | 392 // saved code index. |
| 399 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); | 393 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte)); |
| 400 // When arriving here entry can only be a smi, either representing an | 394 // When arriving here entry can only be a smi, either representing an |
| 401 // uncompiled regexp, a previous compilation error, or code that has | 395 // uncompiled regexp, a previous compilation error, or code that has |
| 402 // been flushed. | 396 // been flushed. |
| 403 DCHECK(entry->IsSmi()); | 397 DCHECK(entry->IsSmi()); |
| 404 int entry_value = Smi::cast(entry)->value(); | 398 int entry_value = Smi::cast(entry)->value(); |
| 405 DCHECK(entry_value == JSRegExp::kUninitializedValue || | 399 DCHECK(entry_value == JSRegExp::kUninitializedValue || |
| 406 entry_value == JSRegExp::kCompilationErrorValue || | 400 entry_value == JSRegExp::kCompilationErrorValue || |
| 407 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); | 401 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0)); |
| 408 | 402 |
| 409 if (entry_value == JSRegExp::kCompilationErrorValue) { | 403 if (entry_value == JSRegExp::kCompilationErrorValue) { |
| 410 // A previous compilation failed and threw an error which we store in | 404 // A previous compilation failed and threw an error which we store in |
| 411 // the saved code index (we store the error message, not the actual | 405 // the saved code index (we store the error message, not the actual |
| 412 // error). Recreate the error object and throw it. | 406 // error). Recreate the error object and throw it. |
| 413 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii)); | 407 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte)); |
| 414 DCHECK(error_string->IsString()); | 408 DCHECK(error_string->IsString()); |
| 415 Handle<String> error_message(String::cast(error_string)); | 409 Handle<String> error_message(String::cast(error_string)); |
| 416 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); | 410 CreateRegExpErrorObjectAndThrow(re, error_message, isolate); |
| 417 return false; | 411 return false; |
| 418 } | 412 } |
| 419 | 413 |
| 420 JSRegExp::Flags flags = re->GetFlags(); | 414 JSRegExp::Flags flags = re->GetFlags(); |
| 421 | 415 |
| 422 Handle<String> pattern(re->Pattern()); | 416 Handle<String> pattern(re->Pattern()); |
| 423 pattern = String::Flatten(pattern); | 417 pattern = String::Flatten(pattern); |
| 424 RegExpCompileData compile_data; | 418 RegExpCompileData compile_data; |
| 425 FlatStringReader reader(isolate, pattern); | 419 FlatStringReader reader(isolate, pattern); |
| 426 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), | 420 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(), |
| 427 &compile_data, | 421 &compile_data, |
| 428 &zone)) { | 422 &zone)) { |
| 429 // Throw an exception if we fail to parse the pattern. | 423 // Throw an exception if we fail to parse the pattern. |
| 430 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. | 424 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once. |
| 431 USE(ThrowRegExpException(re, | 425 USE(ThrowRegExpException(re, |
| 432 pattern, | 426 pattern, |
| 433 compile_data.error, | 427 compile_data.error, |
| 434 "malformed_regexp")); | 428 "malformed_regexp")); |
| 435 return false; | 429 return false; |
| 436 } | 430 } |
| 437 RegExpEngine::CompilationResult result = | 431 RegExpEngine::CompilationResult result = RegExpEngine::Compile( |
| 438 RegExpEngine::Compile(&compile_data, | 432 &compile_data, flags.is_ignore_case(), flags.is_global(), |
| 439 flags.is_ignore_case(), | 433 flags.is_multiline(), pattern, sample_subject, is_one_byte, &zone); |
| 440 flags.is_global(), | |
| 441 flags.is_multiline(), | |
| 442 pattern, | |
| 443 sample_subject, | |
| 444 is_ascii, | |
| 445 &zone); | |
| 446 if (result.error_message != NULL) { | 434 if (result.error_message != NULL) { |
| 447 // Unable to compile regexp. | 435 // Unable to compile regexp. |
| 448 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( | 436 Handle<String> error_message = isolate->factory()->NewStringFromUtf8( |
| 449 CStrVector(result.error_message)).ToHandleChecked(); | 437 CStrVector(result.error_message)).ToHandleChecked(); |
| 450 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); | 438 CreateRegExpErrorObjectAndThrow(re, error_message, isolate); |
| 451 return false; | 439 return false; |
| 452 } | 440 } |
| 453 | 441 |
| 454 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); | 442 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); |
| 455 data->set(JSRegExp::code_index(is_ascii), result.code); | 443 data->set(JSRegExp::code_index(is_one_byte), result.code); |
| 456 int register_max = IrregexpMaxRegisterCount(*data); | 444 int register_max = IrregexpMaxRegisterCount(*data); |
| 457 if (result.num_registers > register_max) { | 445 if (result.num_registers > register_max) { |
| 458 SetIrregexpMaxRegisterCount(*data, result.num_registers); | 446 SetIrregexpMaxRegisterCount(*data, result.num_registers); |
| 459 } | 447 } |
| 460 | 448 |
| 461 return true; | 449 return true; |
| 462 } | 450 } |
| 463 | 451 |
| 464 | 452 |
| 465 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { | 453 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) { |
| (...skipping 10 matching lines...) Expand all Loading... | |
| 476 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { | 464 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) { |
| 477 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); | 465 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value(); |
| 478 } | 466 } |
| 479 | 467 |
| 480 | 468 |
| 481 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { | 469 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) { |
| 482 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); | 470 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value(); |
| 483 } | 471 } |
| 484 | 472 |
| 485 | 473 |
| 486 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) { | 474 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) { |
| 487 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii))); | 475 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte))); |
| 488 } | 476 } |
| 489 | 477 |
| 490 | 478 |
| 491 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) { | 479 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) { |
| 492 return Code::cast(re->get(JSRegExp::code_index(is_ascii))); | 480 return Code::cast(re->get(JSRegExp::code_index(is_one_byte))); |
| 493 } | 481 } |
| 494 | 482 |
| 495 | 483 |
| 496 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, | 484 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re, |
| 497 Handle<String> pattern, | 485 Handle<String> pattern, |
| 498 JSRegExp::Flags flags, | 486 JSRegExp::Flags flags, |
| 499 int capture_count) { | 487 int capture_count) { |
| 500 // Initialize compiled code entries to null. | 488 // Initialize compiled code entries to null. |
| 501 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, | 489 re->GetIsolate()->factory()->SetRegExpIrregexpData(re, |
| 502 JSRegExp::IRREGEXP, | 490 JSRegExp::IRREGEXP, |
| 503 pattern, | 491 pattern, |
| 504 flags, | 492 flags, |
| 505 capture_count); | 493 capture_count); |
| 506 } | 494 } |
| 507 | 495 |
| 508 | 496 |
| 509 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, | 497 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, |
| 510 Handle<String> subject) { | 498 Handle<String> subject) { |
| 511 subject = String::Flatten(subject); | 499 subject = String::Flatten(subject); |
| 512 | 500 |
| 513 // Check the asciiness of the underlying storage. | 501 // Check representation of the underlying storage. |
| 514 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); | 502 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
| 515 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; | 503 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1; |
| 516 | 504 |
| 517 #ifdef V8_INTERPRETED_REGEXP | 505 #ifdef V8_INTERPRETED_REGEXP |
| 518 // Byte-code regexp needs space allocated for all its registers. | 506 // Byte-code regexp needs space allocated for all its registers. |
| 519 // The result captures are copied to the start of the registers array | 507 // The result captures are copied to the start of the registers array |
| 520 // if the match succeeds. This way those registers are not clobbered | 508 // if the match succeeds. This way those registers are not clobbered |
| 521 // when we set the last match info from last successful match. | 509 // when we set the last match info from last successful match. |
| 522 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + | 510 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) + |
| 523 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; | 511 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; |
| 524 #else // V8_INTERPRETED_REGEXP | 512 #else // V8_INTERPRETED_REGEXP |
| 525 // Native regexp only needs room to output captures. Registers are handled | 513 // Native regexp only needs room to output captures. Registers are handled |
| 526 // internally. | 514 // internally. |
| 527 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; | 515 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; |
| 528 #endif // V8_INTERPRETED_REGEXP | 516 #endif // V8_INTERPRETED_REGEXP |
| 529 } | 517 } |
| 530 | 518 |
| 531 | 519 |
| 532 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, | 520 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp, |
| 533 Handle<String> subject, | 521 Handle<String> subject, |
| 534 int index, | 522 int index, |
| 535 int32_t* output, | 523 int32_t* output, |
| 536 int output_size) { | 524 int output_size) { |
| 537 Isolate* isolate = regexp->GetIsolate(); | 525 Isolate* isolate = regexp->GetIsolate(); |
| 538 | 526 |
| 539 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); | 527 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate); |
| 540 | 528 |
| 541 DCHECK(index >= 0); | 529 DCHECK(index >= 0); |
| 542 DCHECK(index <= subject->length()); | 530 DCHECK(index <= subject->length()); |
| 543 DCHECK(subject->IsFlat()); | 531 DCHECK(subject->IsFlat()); |
| 544 | 532 |
| 545 bool is_ascii = subject->IsOneByteRepresentationUnderneath(); | 533 bool is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
| 546 | 534 |
| 547 #ifndef V8_INTERPRETED_REGEXP | 535 #ifndef V8_INTERPRETED_REGEXP |
| 548 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); | 536 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
| 549 do { | 537 do { |
| 550 EnsureCompiledIrregexp(regexp, subject, is_ascii); | 538 EnsureCompiledIrregexp(regexp, subject, is_one_byte); |
| 551 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); | 539 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate); |
| 552 // The stack is used to allocate registers for the compiled regexp code. | 540 // The stack is used to allocate registers for the compiled regexp code. |
| 553 // This means that in case of failure, the output registers array is left | 541 // This means that in case of failure, the output registers array is left |
| 554 // untouched and contains the capture results from the previous successful | 542 // untouched and contains the capture results from the previous successful |
| 555 // match. We can use that to set the last match info lazily. | 543 // match. We can use that to set the last match info lazily. |
| 556 NativeRegExpMacroAssembler::Result res = | 544 NativeRegExpMacroAssembler::Result res = |
| 557 NativeRegExpMacroAssembler::Match(code, | 545 NativeRegExpMacroAssembler::Match(code, |
| 558 subject, | 546 subject, |
| 559 output, | 547 output, |
| 560 output_size, | 548 output_size, |
| 561 index, | 549 index, |
| 562 isolate); | 550 isolate); |
| 563 if (res != NativeRegExpMacroAssembler::RETRY) { | 551 if (res != NativeRegExpMacroAssembler::RETRY) { |
| 564 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || | 552 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION || |
| 565 isolate->has_pending_exception()); | 553 isolate->has_pending_exception()); |
| 566 STATIC_ASSERT( | 554 STATIC_ASSERT( |
| 567 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); | 555 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS); |
| 568 STATIC_ASSERT( | 556 STATIC_ASSERT( |
| 569 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); | 557 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE); |
| 570 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) | 558 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) |
| 571 == RE_EXCEPTION); | 559 == RE_EXCEPTION); |
| 572 return static_cast<IrregexpResult>(res); | 560 return static_cast<IrregexpResult>(res); |
| 573 } | 561 } |
| 574 // If result is RETRY, the string has changed representation, and we | 562 // If result is RETRY, the string has changed representation, and we |
| 575 // must restart from scratch. | 563 // must restart from scratch. |
| 576 // In this case, it means we must make sure we are prepared to handle | 564 // In this case, it means we must make sure we are prepared to handle |
| 577 // the, potentially, different subject (the string can switch between | 565 // the, potentially, different subject (the string can switch between |
| 578 // being internal and external, and even between being ASCII and UC16, | 566 // being internal and external, and even between being Latin1 and UC16, |
| 579 // but the characters are always the same). | 567 // but the characters are always the same). |
| 580 IrregexpPrepare(regexp, subject); | 568 IrregexpPrepare(regexp, subject); |
| 581 is_ascii = subject->IsOneByteRepresentationUnderneath(); | 569 is_one_byte = subject->IsOneByteRepresentationUnderneath(); |
| 582 } while (true); | 570 } while (true); |
| 583 UNREACHABLE(); | 571 UNREACHABLE(); |
| 584 return RE_EXCEPTION; | 572 return RE_EXCEPTION; |
| 585 #else // V8_INTERPRETED_REGEXP | 573 #else // V8_INTERPRETED_REGEXP |
| 586 | 574 |
| 587 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); | 575 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp)); |
| 588 // We must have done EnsureCompiledIrregexp, so we can get the number of | 576 // We must have done EnsureCompiledIrregexp, so we can get the number of |
| 589 // registers. | 577 // registers. |
| 590 int number_of_capture_registers = | 578 int number_of_capture_registers = |
| 591 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; | 579 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2; |
| 592 int32_t* raw_output = &output[number_of_capture_registers]; | 580 int32_t* raw_output = &output[number_of_capture_registers]; |
| 593 // We do not touch the actual capture result registers until we know there | 581 // We do not touch the actual capture result registers until we know there |
| 594 // has been a match so that we can use those capture results to set the | 582 // has been a match so that we can use those capture results to set the |
| 595 // last match info. | 583 // last match info. |
| 596 for (int i = number_of_capture_registers - 1; i >= 0; i--) { | 584 for (int i = number_of_capture_registers - 1; i >= 0; i--) { |
| 597 raw_output[i] = -1; | 585 raw_output[i] = -1; |
| 598 } | 586 } |
| 599 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate); | 587 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte), |
| | 588 isolate); |
| 600 | 589 |
| 601 IrregexpResult result = IrregexpInterpreter::Match(isolate, | 590 IrregexpResult result = IrregexpInterpreter::Match(isolate, |
| 602 byte_codes, | 591 byte_codes, |
| 603 subject, | 592 subject, |
| 604 raw_output, | 593 raw_output, |
| 605 index); | 594 index); |
| 606 if (result == RE_SUCCESS) { | 595 if (result == RE_SUCCESS) { |
| 607 // Copy capture results to the start of the registers array. | 596 // Copy capture results to the start of the registers array. |
| 608 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); | 597 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t)); |
| 609 } | 598 } |
| (...skipping 380 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 990 | 979 |
| 991 | 980 |
| 992 private: | 981 private: |
| 993 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; | 982 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; |
| 994 int total_samples_; | 983 int total_samples_; |
| 995 }; | 984 }; |
| 996 | 985 |
| 997 | 986 |
| 998 class RegExpCompiler { | 987 class RegExpCompiler { |
| 999 public: | 988 public: |
| 1000 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii, | 989 RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte, |
| 1001 Zone* zone); | 990 Zone* zone); |
| 1002 | 991 |
| 1003 int AllocateRegister() { | 992 int AllocateRegister() { |
| 1004 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { | 993 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { |
| 1005 reg_exp_too_big_ = true; | 994 reg_exp_too_big_ = true; |
| 1006 return next_register_; | 995 return next_register_; |
| 1007 } | 996 } |
| 1008 return next_register_++; | 997 return next_register_++; |
| 1009 } | 998 } |
| 1010 | 999 |
| (...skipping 12 matching lines...) Expand all Loading... | |
| 1023 EndNode* accept() { return accept_; } | 1012 EndNode* accept() { return accept_; } |
| 1024 | 1013 |
| 1025 static const int kMaxRecursion = 100; | 1014 static const int kMaxRecursion = 100; |
| 1026 inline int recursion_depth() { return recursion_depth_; } | 1015 inline int recursion_depth() { return recursion_depth_; } |
| 1027 inline void IncrementRecursionDepth() { recursion_depth_++; } | 1016 inline void IncrementRecursionDepth() { recursion_depth_++; } |
| 1028 inline void DecrementRecursionDepth() { recursion_depth_--; } | 1017 inline void DecrementRecursionDepth() { recursion_depth_--; } |
| 1029 | 1018 |
| 1030 void SetRegExpTooBig() { reg_exp_too_big_ = true; } | 1019 void SetRegExpTooBig() { reg_exp_too_big_ = true; } |
| 1031 | 1020 |
| 1032 inline bool ignore_case() { return ignore_case_; } | 1021 inline bool ignore_case() { return ignore_case_; } |
| 1033 inline bool ascii() { return ascii_; } | 1022 inline bool one_byte() { return one_byte_; } |
| 1034 FrequencyCollator* frequency_collator() { return &frequency_collator_; } | 1023 FrequencyCollator* frequency_collator() { return &frequency_collator_; } |
| 1035 | 1024 |
| 1036 int current_expansion_factor() { return current_expansion_factor_; } | 1025 int current_expansion_factor() { return current_expansion_factor_; } |
| 1037 void set_current_expansion_factor(int value) { | 1026 void set_current_expansion_factor(int value) { |
| 1038 current_expansion_factor_ = value; | 1027 current_expansion_factor_ = value; |
| 1039 } | 1028 } |
| 1040 | 1029 |
| 1041 Zone* zone() const { return zone_; } | 1030 Zone* zone() const { return zone_; } |
| 1042 | 1031 |
| 1043 static const int kNoRegister = -1; | 1032 static const int kNoRegister = -1; |
| 1044 | 1033 |
| 1045 private: | 1034 private: |
| 1046 EndNode* accept_; | 1035 EndNode* accept_; |
| 1047 int next_register_; | 1036 int next_register_; |
| 1048 List<RegExpNode*>* work_list_; | 1037 List<RegExpNode*>* work_list_; |
| 1049 int recursion_depth_; | 1038 int recursion_depth_; |
| 1050 RegExpMacroAssembler* macro_assembler_; | 1039 RegExpMacroAssembler* macro_assembler_; |
| 1051 bool ignore_case_; | 1040 bool ignore_case_; |
| 1052 bool ascii_; | 1041 bool one_byte_; |
| 1053 bool reg_exp_too_big_; | 1042 bool reg_exp_too_big_; |
| 1054 int current_expansion_factor_; | 1043 int current_expansion_factor_; |
| 1055 FrequencyCollator frequency_collator_; | 1044 FrequencyCollator frequency_collator_; |
| 1056 Zone* zone_; | 1045 Zone* zone_; |
| 1057 }; | 1046 }; |
| 1058 | 1047 |
| 1059 | 1048 |
| 1060 class RecursionCheck { | 1049 class RecursionCheck { |
| 1061 public: | 1050 public: |
| 1062 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { | 1051 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { |
| 1063 compiler->IncrementRecursionDepth(); | 1052 compiler->IncrementRecursionDepth(); |
| 1064 } | 1053 } |
| 1065 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } | 1054 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } |
| 1066 private: | 1055 private: |
| 1067 RegExpCompiler* compiler_; | 1056 RegExpCompiler* compiler_; |
| 1068 }; | 1057 }; |
| 1069 | 1058 |
| 1070 | 1059 |
| 1071 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { | 1060 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) { |
| 1072 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); | 1061 return RegExpEngine::CompilationResult(isolate, "RegExp too big"); |
| 1073 } | 1062 } |
| 1074 | 1063 |
| 1075 | 1064 |
| 1076 // Attempts to compile the regexp using an Irregexp code generator. Returns | 1065 // Attempts to compile the regexp using an Irregexp code generator. Returns |
| 1077 // a fixed array or a null handle depending on whether it succeeded. | 1066 // a fixed array or a null handle depending on whether it succeeded. |
| 1078 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii, | 1067 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, |
| 1079 Zone* zone) | 1068 bool one_byte, Zone* zone) |
| 1080 : next_register_(2 * (capture_count + 1)), | 1069 : next_register_(2 * (capture_count + 1)), |
| 1081 work_list_(NULL), | 1070 work_list_(NULL), |
| 1082 recursion_depth_(0), | 1071 recursion_depth_(0), |
| 1083 ignore_case_(ignore_case), | 1072 ignore_case_(ignore_case), |
| 1084 ascii_(ascii), | 1073 one_byte_(one_byte), |
| 1085 reg_exp_too_big_(false), | 1074 reg_exp_too_big_(false), |
| 1086 current_expansion_factor_(1), | 1075 current_expansion_factor_(1), |
| 1087 frequency_collator_(), | 1076 frequency_collator_(), |
| 1088 zone_(zone) { | 1077 zone_(zone) { |
| 1089 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); | 1078 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone); |
| 1090 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); | 1079 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); |
| 1091 } | 1080 } |
| 1092 | 1081 |
| 1093 | 1082 |
| 1094 RegExpEngine::CompilationResult RegExpCompiler::Assemble( | 1083 RegExpEngine::CompilationResult RegExpCompiler::Assemble( |
| (...skipping 490 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1585 macro_assembler->IfRegisterLT(guard->reg(), | 1574 macro_assembler->IfRegisterLT(guard->reg(), |
| 1586 guard->value(), | 1575 guard->value(), |
| 1587 trace->backtrack()); | 1576 trace->backtrack()); |
| 1588 break; | 1577 break; |
| 1589 } | 1578 } |
| 1590 } | 1579 } |
| 1591 | 1580 |
| 1592 | 1581 |
| 1593 // Returns the number of characters in the equivalence class, omitting those | 1582 // Returns the number of characters in the equivalence class, omitting those |
| 1594 // that cannot occur in the source string because it is ASCII. | 1583 // that cannot occur in the source string because it is ASCII. |
| 1595 static int GetCaseIndependentLetters(Isolate* isolate, | 1584 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, |
| 1596 uc16 character, | 1585 bool one_byte_subject, |
| 1597 bool ascii_subject, | |
| 1598 unibrow::uchar* letters) { | 1586 unibrow::uchar* letters) { |
| 1599 int length = | 1587 int length = |
| 1600 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); | 1588 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters); |
| 1601 // Unibrow returns 0 or 1 for characters where case independence is | 1589 // Unibrow returns 0 or 1 for characters where case independence is |
| 1602 // trivial. | 1590 // trivial. |
| 1603 if (length == 0) { | 1591 if (length == 0) { |
| 1604 letters[0] = character; | 1592 letters[0] = character; |
| 1605 length = 1; | 1593 length = 1; |
| 1606 } | 1594 } |
| 1607 if (!ascii_subject || character <= String::kMaxOneByteCharCode) { | 1595 if (!one_byte_subject || character <= String::kMaxOneByteCharCode) { |
| 1608 return length; | 1596 return length; |
| 1609 } | 1597 } |
| 1598 | |
| 1610 // The standard requires that non-ASCII characters cannot have ASCII | 1599 // The standard requires that non-ASCII characters cannot have ASCII |
| 1611 // character codes in their equivalence class. | 1600 // character codes in their equivalence class. |
| 1601 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore, | |
| 1602 // is it? For example, \u00C5 is equivalent to \u212B. | |
|
Yang
2014/09/10 08:26:36
This is one of the TODOs I mentioned.
dcarney
2014/09/10 09:35:12
I checked other browsers I think originally, and w
| |
| 1612 return 0; | 1603 return 0; |
| 1613 } | 1604 } |
| 1614 | 1605 |
| 1615 | 1606 |
| 1616 static inline bool EmitSimpleCharacter(Isolate* isolate, | 1607 static inline bool EmitSimpleCharacter(Isolate* isolate, |
| 1617 RegExpCompiler* compiler, | 1608 RegExpCompiler* compiler, |
| 1618 uc16 c, | 1609 uc16 c, |
| 1619 Label* on_failure, | 1610 Label* on_failure, |
| 1620 int cp_offset, | 1611 int cp_offset, |
| 1621 bool check, | 1612 bool check, |
| (...skipping 15 matching lines...) Expand all Loading... | |
| 1637 // Only emits non-letters (things that don't have case). Only used for case | 1628 // Only emits non-letters (things that don't have case). Only used for case |
| 1638 // independent matches. | 1629 // independent matches. |
| 1639 static inline bool EmitAtomNonLetter(Isolate* isolate, | 1630 static inline bool EmitAtomNonLetter(Isolate* isolate, |
| 1640 RegExpCompiler* compiler, | 1631 RegExpCompiler* compiler, |
| 1641 uc16 c, | 1632 uc16 c, |
| 1642 Label* on_failure, | 1633 Label* on_failure, |
| 1643 int cp_offset, | 1634 int cp_offset, |
| 1644 bool check, | 1635 bool check, |
| 1645 bool preloaded) { | 1636 bool preloaded) { |
| 1646 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | 1637 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| 1647 bool ascii = compiler->ascii(); | 1638 bool one_byte = compiler->one_byte(); |
| 1648 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1639 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1649 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); | 1640 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); |
| 1650 if (length < 1) { | 1641 if (length < 1) { |
| 1651 // This can't match. Must be an ASCII subject and a non-ASCII character. | 1642 // This can't match. Must be a one-byte subject and a non-one-byte |
| 1652 // We do not need to do anything since the ASCII pass already handled this. | 1643 // character. We do not need to do anything since the one-byte pass |
| 1644 // already handled this. | |
| 1653 return false; // Bounds not checked. | 1645 return false; // Bounds not checked. |
| 1654 } | 1646 } |
| 1655 bool checked = false; | 1647 bool checked = false; |
| 1656 // We handle the length > 1 case in a later pass. | 1648 // We handle the length > 1 case in a later pass. |
| 1657 if (length == 1) { | 1649 if (length == 1) { |
| 1658 if (ascii && c > String::kMaxOneByteCharCodeU) { | 1650 if (one_byte && c > String::kMaxOneByteCharCodeU) { |
| 1659 // Can't match - see above. | 1651 // Can't match - see above. |
| 1660 return false; // Bounds not checked. | 1652 return false; // Bounds not checked. |
| 1661 } | 1653 } |
| 1662 if (!preloaded) { | 1654 if (!preloaded) { |
| 1663 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1655 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1664 checked = check; | 1656 checked = check; |
| 1665 } | 1657 } |
| 1666 macro_assembler->CheckNotCharacter(c, on_failure); | 1658 macro_assembler->CheckNotCharacter(c, on_failure); |
| 1667 } | 1659 } |
| 1668 return checked; | 1660 return checked; |
| 1669 } | 1661 } |
| 1670 | 1662 |
| 1671 | 1663 |
| 1672 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, | 1664 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, |
| 1673 bool ascii, | 1665 bool one_byte, uc16 c1, uc16 c2, |
| 1674 uc16 c1, | |
| 1675 uc16 c2, | |
| 1676 Label* on_failure) { | 1666 Label* on_failure) { |
| 1677 uc16 char_mask; | 1667 uc16 char_mask; |
| 1678 if (ascii) { | 1668 if (one_byte) { |
| 1679 char_mask = String::kMaxOneByteCharCode; | 1669 char_mask = String::kMaxOneByteCharCode; |
| 1680 } else { | 1670 } else { |
| 1681 char_mask = String::kMaxUtf16CodeUnit; | 1671 char_mask = String::kMaxUtf16CodeUnit; |
| 1682 } | 1672 } |
| 1683 uc16 exor = c1 ^ c2; | 1673 uc16 exor = c1 ^ c2; |
| 1684 // Check whether exor has only one bit set. | 1674 // Check whether exor has only one bit set. |
| 1685 if (((exor - 1) & exor) == 0) { | 1675 if (((exor - 1) & exor) == 0) { |
| 1686 // If c1 and c2 differ only by one bit. | 1676 // If c1 and c2 differ only by one bit. |
| 1687 // Ecma262UnCanonicalize always gives the highest number last. | 1677 // Ecma262UnCanonicalize always gives the highest number last. |
| 1688 DCHECK(c2 > c1); | 1678 DCHECK(c2 > c1); |
| (...skipping 30 matching lines...) Expand all Loading... | |
| 1719 // Only emits letters (things that have case). Only used for case independent | 1709 // Only emits letters (things that have case). Only used for case independent |
| 1720 // matches. | 1710 // matches. |
| 1721 static inline bool EmitAtomLetter(Isolate* isolate, | 1711 static inline bool EmitAtomLetter(Isolate* isolate, |
| 1722 RegExpCompiler* compiler, | 1712 RegExpCompiler* compiler, |
| 1723 uc16 c, | 1713 uc16 c, |
| 1724 Label* on_failure, | 1714 Label* on_failure, |
| 1725 int cp_offset, | 1715 int cp_offset, |
| 1726 bool check, | 1716 bool check, |
| 1727 bool preloaded) { | 1717 bool preloaded) { |
| 1728 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); | 1718 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); |
| 1729 bool ascii = compiler->ascii(); | 1719 bool one_byte = compiler->one_byte(); |
| 1730 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 1720 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 1731 int length = GetCaseIndependentLetters(isolate, c, ascii, chars); | 1721 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars); |
| 1732 if (length <= 1) return false; | 1722 if (length <= 1) return false; |
| 1733 // We may not need to check against the end of the input string | 1723 // We may not need to check against the end of the input string |
| 1734 // if this character lies before a character that matched. | 1724 // if this character lies before a character that matched. |
| 1735 if (!preloaded) { | 1725 if (!preloaded) { |
| 1736 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); | 1726 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check); |
| 1737 } | 1727 } |
| 1738 Label ok; | 1728 Label ok; |
| 1739 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); | 1729 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4); |
| 1740 switch (length) { | 1730 switch (length) { |
| 1741 case 2: { | 1731 case 2: { |
| 1742 if (ShortCutEmitCharacterPair(macro_assembler, | 1732 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0], |
| 1743 ascii, | 1733 chars[1], on_failure)) { |
| 1744 chars[0], | |
| 1745 chars[1], | |
| 1746 on_failure)) { | |
| 1747 } else { | 1734 } else { |
| 1748 macro_assembler->CheckCharacter(chars[0], &ok); | 1735 macro_assembler->CheckCharacter(chars[0], &ok); |
| 1749 macro_assembler->CheckNotCharacter(chars[1], on_failure); | 1736 macro_assembler->CheckNotCharacter(chars[1], on_failure); |
| 1750 macro_assembler->Bind(&ok); | 1737 macro_assembler->Bind(&ok); |
| 1751 } | 1738 } |
| 1752 break; | 1739 break; |
| 1753 } | 1740 } |
| 1754 case 4: | 1741 case 4: |
| 1755 macro_assembler->CheckCharacter(chars[3], &ok); | 1742 macro_assembler->CheckCharacter(chars[3], &ok); |
| 1756 // Fall through! | 1743 // Fall through! |
| (...skipping 154 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1911 | 1898 |
| 1912 *new_start_index = start_index; | 1899 *new_start_index = start_index; |
| 1913 *border = (ranges->at(start_index) & ~kMask) + kSize; | 1900 *border = (ranges->at(start_index) & ~kMask) + kSize; |
| 1914 while (*new_start_index < end_index) { | 1901 while (*new_start_index < end_index) { |
| 1915 if (ranges->at(*new_start_index) > *border) break; | 1902 if (ranges->at(*new_start_index) > *border) break; |
| 1916 (*new_start_index)++; | 1903 (*new_start_index)++; |
| 1917 } | 1904 } |
| 1918 // new_start_index is the index of the first edge that is beyond the | 1905 // new_start_index is the index of the first edge that is beyond the |
| 1919 // current kSize space. | 1906 // current kSize space. |
| 1920 | 1907 |
| 1921 // For very large search spaces we do a binary chop search of the non-ASCII | 1908 // For very large search spaces we do a binary chop search of the non-Latin1 |
| 1922 // space instead of just going to the end of the current kSize space. The | 1909 // space instead of just going to the end of the current kSize space. The |
| 1923 // heuristics are complicated a little by the fact that any 128-character | 1910 // heuristics are complicated a little by the fact that any 128-character |
| 1924 // encoding space can be quickly tested with a table lookup, so we don't | 1911 // encoding space can be quickly tested with a table lookup, so we don't |
| 1925 // wish to do binary chop search at a smaller granularity than that. A | 1912 // wish to do binary chop search at a smaller granularity than that. A |
| 1926 // 128-character space can take up a lot of space in the ranges array if, | 1913 // 128-character space can take up a lot of space in the ranges array if, |
| 1927 // for example, we only want to match every second character (eg. the lower | 1914 // for example, we only want to match every second character (eg. the lower |
| 1928 // case characters on some Unicode pages). | 1915 // case characters on some Unicode pages). |
| 1929 int binary_chop_index = (end_index + start_index) / 2; | 1916 int binary_chop_index = (end_index + start_index) / 2; |
| 1930 // The first test ensures that we get to the code that handles the ASCII | 1917 // The first test ensures that we get to the code that handles the Latin1 |
| 1931 // range with a single not-taken branch, speeding up this important | 1918 // range with a single not-taken branch, speeding up this important |
| 1932 // character range (even non-ASCII charset-based text has spaces and | 1919 // character range (even non-Latin1 charset-based text has spaces and |
| 1933 // punctuation). | 1920 // punctuation). |
| 1934 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case. | 1921 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case. |
| 1935 end_index - start_index > (*new_start_index - start_index) * 2 && | 1922 end_index - start_index > (*new_start_index - start_index) * 2 && |
| 1936 last - first > kSize * 2 && | 1923 last - first > kSize * 2 && binary_chop_index > *new_start_index && |
| 1937 binary_chop_index > *new_start_index && | |
| 1938 ranges->at(binary_chop_index) >= first + 2 * kSize) { | 1924 ranges->at(binary_chop_index) >= first + 2 * kSize) { |
| 1939 int scan_forward_for_section_border = binary_chop_index;; | 1925 int scan_forward_for_section_border = binary_chop_index;; |
| 1940 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; | 1926 int new_border = (ranges->at(binary_chop_index) | kMask) + 1; |
| 1941 | 1927 |
| 1942 while (scan_forward_for_section_border < end_index) { | 1928 while (scan_forward_for_section_border < end_index) { |
| 1943 if (ranges->at(scan_forward_for_section_border) > new_border) { | 1929 if (ranges->at(scan_forward_for_section_border) > new_border) { |
| 1944 *new_start_index = scan_forward_for_section_border; | 1930 *new_start_index = scan_forward_for_section_border; |
| 1945 *border = new_border; | 1931 *border = new_border; |
| 1946 break; | 1932 break; |
| 1947 } | 1933 } |
| (...skipping 166 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2114 border, | 2100 border, |
| 2115 max_char, | 2101 max_char, |
| 2116 &dummy, | 2102 &dummy, |
| 2117 flip ? odd_label : even_label, | 2103 flip ? odd_label : even_label, |
| 2118 flip ? even_label : odd_label); | 2104 flip ? even_label : odd_label); |
| 2119 } | 2105 } |
| 2120 } | 2106 } |
| 2121 | 2107 |
| 2122 | 2108 |
| 2123 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, | 2109 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, |
| 2124 RegExpCharacterClass* cc, | 2110 RegExpCharacterClass* cc, bool one_byte, |
| 2125 bool ascii, | 2111 Label* on_failure, int cp_offset, bool check_offset, |
| 2126 Label* on_failure, | 2112 bool preloaded, Zone* zone) { |
| 2127 int cp_offset, | |
| 2128 bool check_offset, | |
| 2129 bool preloaded, | |
| 2130 Zone* zone) { | |
| 2131 ZoneList<CharacterRange>* ranges = cc->ranges(zone); | 2113 ZoneList<CharacterRange>* ranges = cc->ranges(zone); |
| 2132 if (!CharacterRange::IsCanonical(ranges)) { | 2114 if (!CharacterRange::IsCanonical(ranges)) { |
| 2133 CharacterRange::Canonicalize(ranges); | 2115 CharacterRange::Canonicalize(ranges); |
| 2134 } | 2116 } |
| 2135 | 2117 |
| 2136 int max_char; | 2118 int max_char; |
| 2137 if (ascii) { | 2119 if (one_byte) { |
| 2138 max_char = String::kMaxOneByteCharCode; | 2120 max_char = String::kMaxOneByteCharCode; |
| 2139 } else { | 2121 } else { |
| 2140 max_char = String::kMaxUtf16CodeUnit; | 2122 max_char = String::kMaxUtf16CodeUnit; |
| 2141 } | 2123 } |
| 2142 | 2124 |
| 2143 int range_count = ranges->length(); | 2125 int range_count = ranges->length(); |
| 2144 | 2126 |
| 2145 int last_valid_range = range_count - 1; | 2127 int last_valid_range = range_count - 1; |
| 2146 while (last_valid_range >= 0) { | 2128 while (last_valid_range >= 0) { |
| 2147 CharacterRange& range = ranges->at(last_valid_range); | 2129 CharacterRange& range = ranges->at(last_valid_range); |
| (...skipping 309 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2457 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, | 2439 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler, |
| 2458 Trace* trace, | 2440 Trace* trace, |
| 2459 bool preload_has_checked_bounds, | 2441 bool preload_has_checked_bounds, |
| 2460 Label* on_possible_success, | 2442 Label* on_possible_success, |
| 2461 QuickCheckDetails* details, | 2443 QuickCheckDetails* details, |
| 2462 bool fall_through_on_failure) { | 2444 bool fall_through_on_failure) { |
| 2463 if (details->characters() == 0) return false; | 2445 if (details->characters() == 0) return false; |
| 2464 GetQuickCheckDetails( | 2446 GetQuickCheckDetails( |
| 2465 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); | 2447 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE); |
| 2466 if (details->cannot_match()) return false; | 2448 if (details->cannot_match()) return false; |
| 2467 if (!details->Rationalize(compiler->ascii())) return false; | 2449 if (!details->Rationalize(compiler->one_byte())) return false; |
| 2468 DCHECK(details->characters() == 1 || | 2450 DCHECK(details->characters() == 1 || |
| 2469 compiler->macro_assembler()->CanReadUnaligned()); | 2451 compiler->macro_assembler()->CanReadUnaligned()); |
| 2470 uint32_t mask = details->mask(); | 2452 uint32_t mask = details->mask(); |
| 2471 uint32_t value = details->value(); | 2453 uint32_t value = details->value(); |
| 2472 | 2454 |
| 2473 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 2455 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 2474 | 2456 |
| 2475 if (trace->characters_preloaded() != details->characters()) { | 2457 if (trace->characters_preloaded() != details->characters()) { |
| 2476 assembler->LoadCurrentCharacter(trace->cp_offset(), | 2458 assembler->LoadCurrentCharacter(trace->cp_offset(), |
| 2477 trace->backtrack(), | 2459 trace->backtrack(), |
| 2478 !preload_has_checked_bounds, | 2460 !preload_has_checked_bounds, |
| 2479 details->characters()); | 2461 details->characters()); |
| 2480 } | 2462 } |
| 2481 | 2463 |
| 2482 | 2464 |
| 2483 bool need_mask = true; | 2465 bool need_mask = true; |
| 2484 | 2466 |
| 2485 if (details->characters() == 1) { | 2467 if (details->characters() == 1) { |
| 2486 // If number of characters preloaded is 1 then we used a byte or 16 bit | 2468 // If number of characters preloaded is 1 then we used a byte or 16 bit |
| 2487 // load so the value is already masked down. | 2469 // load so the value is already masked down. |
| 2488 uint32_t char_mask; | 2470 uint32_t char_mask; |
| 2489 if (compiler->ascii()) { | 2471 if (compiler->one_byte()) { |
| 2490 char_mask = String::kMaxOneByteCharCode; | 2472 char_mask = String::kMaxOneByteCharCode; |
| 2491 } else { | 2473 } else { |
| 2492 char_mask = String::kMaxUtf16CodeUnit; | 2474 char_mask = String::kMaxUtf16CodeUnit; |
| 2493 } | 2475 } |
| 2494 if ((mask & char_mask) == char_mask) need_mask = false; | 2476 if ((mask & char_mask) == char_mask) need_mask = false; |
| 2495 mask &= char_mask; | 2477 mask &= char_mask; |
| 2496 } else { | 2478 } else { |
| 2497 // For 2-character preloads in ASCII mode or 1-character preloads in | 2479 // For 2-character preloads in one-byte mode or 1-character preloads in |
| 2498 // TWO_BYTE mode we also use a 16 bit load with zero extend. | 2480 // two-byte mode we also use a 16 bit load with zero extend. |
| 2499 if (details->characters() == 2 && compiler->ascii()) { | 2481 if (details->characters() == 2 && compiler->one_byte()) { |
| 2500 if ((mask & 0xffff) == 0xffff) need_mask = false; | 2482 if ((mask & 0xffff) == 0xffff) need_mask = false; |
| 2501 } else if (details->characters() == 1 && !compiler->ascii()) { | 2483 } else if (details->characters() == 1 && !compiler->one_byte()) { |
| 2502 if ((mask & 0xffff) == 0xffff) need_mask = false; | 2484 if ((mask & 0xffff) == 0xffff) need_mask = false; |
| 2503 } else { | 2485 } else { |
| 2504 if (mask == 0xffffffff) need_mask = false; | 2486 if (mask == 0xffffffff) need_mask = false; |
| 2505 } | 2487 } |
| 2506 } | 2488 } |
| 2507 | 2489 |
| 2508 if (fall_through_on_failure) { | 2490 if (fall_through_on_failure) { |
| 2509 if (need_mask) { | 2491 if (need_mask) { |
| 2510 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); | 2492 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success); |
| 2511 } else { | 2493 } else { |
| (...skipping 19 matching lines...) Expand all Loading... | |
| 2531 // machine word for the current character width in order to be used in | 2513 // machine word for the current character width in order to be used in |
| 2532 // generating a quick check. | 2514 // generating a quick check. |
| 2533 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2515 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| 2534 RegExpCompiler* compiler, | 2516 RegExpCompiler* compiler, |
| 2535 int characters_filled_in, | 2517 int characters_filled_in, |
| 2536 bool not_at_start) { | 2518 bool not_at_start) { |
| 2537 Isolate* isolate = compiler->macro_assembler()->zone()->isolate(); | 2519 Isolate* isolate = compiler->macro_assembler()->zone()->isolate(); |
| 2538 DCHECK(characters_filled_in < details->characters()); | 2520 DCHECK(characters_filled_in < details->characters()); |
| 2539 int characters = details->characters(); | 2521 int characters = details->characters(); |
| 2540 int char_mask; | 2522 int char_mask; |
| 2541 if (compiler->ascii()) { | 2523 if (compiler->one_byte()) { |
| 2542 char_mask = String::kMaxOneByteCharCode; | 2524 char_mask = String::kMaxOneByteCharCode; |
| 2543 } else { | 2525 } else { |
| 2544 char_mask = String::kMaxUtf16CodeUnit; | 2526 char_mask = String::kMaxUtf16CodeUnit; |
| 2545 } | 2527 } |
| 2546 for (int k = 0; k < elms_->length(); k++) { | 2528 for (int k = 0; k < elms_->length(); k++) { |
| 2547 TextElement elm = elms_->at(k); | 2529 TextElement elm = elms_->at(k); |
| 2548 if (elm.text_type() == TextElement::ATOM) { | 2530 if (elm.text_type() == TextElement::ATOM) { |
| 2549 Vector<const uc16> quarks = elm.atom()->data(); | 2531 Vector<const uc16> quarks = elm.atom()->data(); |
| 2550 for (int i = 0; i < characters && i < quarks.length(); i++) { | 2532 for (int i = 0; i < characters && i < quarks.length(); i++) { |
| 2551 QuickCheckDetails::Position* pos = | 2533 QuickCheckDetails::Position* pos = |
| 2552 details->positions(characters_filled_in); | 2534 details->positions(characters_filled_in); |
| 2553 uc16 c = quarks[i]; | 2535 uc16 c = quarks[i]; |
| 2554 if (c > char_mask) { | 2536 if (c > char_mask) { |
| 2555 // If we expect a non-ASCII character from an ASCII string, | 2537 // If we expect a non-Latin1 character from a one-byte string, |
| 2556 // there is no way we can match. Not even case independent | 2538 // there is no way we can match. Not even case-independent |
| 2557 // matching can turn an ASCII character into non-ASCII or | 2539 // matching can turn a Latin1 character into non-Latin1 or |
| 2558 // vice versa. | 2540 // vice versa. |
| 2541 // TODO(dcarney): issue 3550. Verify that this works as expected. | |
| 2542 // For example, \u0178 is uppercase of \u00ff (y-umlaut). | |
|
Yang
2014/09/10 08:26:36
This is the other.
| |
| 2559 details->set_cannot_match(); | 2543 details->set_cannot_match(); |
| 2560 pos->determines_perfectly = false; | 2544 pos->determines_perfectly = false; |
| 2561 return; | 2545 return; |
| 2562 } | 2546 } |
| 2563 if (compiler->ignore_case()) { | 2547 if (compiler->ignore_case()) { |
| 2564 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 2548 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 2565 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(), | 2549 int length = GetCaseIndependentLetters(isolate, c, |
| 2566 chars); | 2550 compiler->one_byte(), chars); |
| 2567 DCHECK(length != 0); // Can only happen if c > char_mask (see above). | 2551 DCHECK(length != 0); // Can only happen if c > char_mask (see above). |
| 2568 if (length == 1) { | 2552 if (length == 1) { |
| 2569 // This letter has no case equivalents, so it's nice and simple | 2553 // This letter has no case equivalents, so it's nice and simple |
| 2570 // and the mask-compare will determine definitely whether we have | 2554 // and the mask-compare will determine definitely whether we have |
| 2571 // a match at this character position. | 2555 // a match at this character position. |
| 2572 pos->mask = char_mask; | 2556 pos->mask = char_mask; |
| 2573 pos->value = c; | 2557 pos->value = c; |
| 2574 pos->determines_perfectly = true; | 2558 pos->determines_perfectly = true; |
| 2575 } else { | 2559 } else { |
| 2576 uint32_t common_bits = char_mask; | 2560 uint32_t common_bits = char_mask; |
| (...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2685 void QuickCheckDetails::Clear() { | 2669 void QuickCheckDetails::Clear() { |
| 2686 for (int i = 0; i < characters_; i++) { | 2670 for (int i = 0; i < characters_; i++) { |
| 2687 positions_[i].mask = 0; | 2671 positions_[i].mask = 0; |
| 2688 positions_[i].value = 0; | 2672 positions_[i].value = 0; |
| 2689 positions_[i].determines_perfectly = false; | 2673 positions_[i].determines_perfectly = false; |
| 2690 } | 2674 } |
| 2691 characters_ = 0; | 2675 characters_ = 0; |
| 2692 } | 2676 } |
| 2693 | 2677 |
| 2694 | 2678 |
| 2695 void QuickCheckDetails::Advance(int by, bool ascii) { | 2679 void QuickCheckDetails::Advance(int by, bool one_byte) { |
| 2696 DCHECK(by >= 0); | 2680 DCHECK(by >= 0); |
| 2697 if (by >= characters_) { | 2681 if (by >= characters_) { |
| 2698 Clear(); | 2682 Clear(); |
| 2699 return; | 2683 return; |
| 2700 } | 2684 } |
| 2701 for (int i = 0; i < characters_ - by; i++) { | 2685 for (int i = 0; i < characters_ - by; i++) { |
| 2702 positions_[i] = positions_[by + i]; | 2686 positions_[i] = positions_[by + i]; |
| 2703 } | 2687 } |
| 2704 for (int i = characters_ - by; i < characters_; i++) { | 2688 for (int i = characters_ - by; i < characters_; i++) { |
| 2705 positions_[i].mask = 0; | 2689 positions_[i].mask = 0; |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2749 info->visited = true; | 2733 info->visited = true; |
| 2750 } | 2734 } |
| 2751 ~VisitMarker() { | 2735 ~VisitMarker() { |
| 2752 info_->visited = false; | 2736 info_->visited = false; |
| 2753 } | 2737 } |
| 2754 private: | 2738 private: |
| 2755 NodeInfo* info_; | 2739 NodeInfo* info_; |
| 2756 }; | 2740 }; |
| 2757 | 2741 |
| 2758 | 2742 |
| 2759 RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) { | 2743 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) { |
| 2760 if (info()->replacement_calculated) return replacement(); | 2744 if (info()->replacement_calculated) return replacement(); |
| 2761 if (depth < 0) return this; | 2745 if (depth < 0) return this; |
| 2762 DCHECK(!info()->visited); | 2746 DCHECK(!info()->visited); |
| 2763 VisitMarker marker(info()); | 2747 VisitMarker marker(info()); |
| 2764 return FilterSuccessor(depth - 1, ignore_case); | 2748 return FilterSuccessor(depth - 1, ignore_case); |
| 2765 } | 2749 } |
| 2766 | 2750 |
| 2767 | 2751 |
| 2768 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { | 2752 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) { |
| 2769 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case); | 2753 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); |
| 2770 if (next == NULL) return set_replacement(NULL); | 2754 if (next == NULL) return set_replacement(NULL); |
| 2771 on_success_ = next; | 2755 on_success_ = next; |
| 2772 return set_replacement(this); | 2756 return set_replacement(this); |
| 2773 } | 2757 } |
| 2774 | 2758 |
| 2775 | 2759 |
| 2776 // We need to check for the following characters: 0x39c 0x3bc 0x178. | 2760 // We need to check for the following characters: 0x39c 0x3bc 0x178. |
| 2777 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { | 2761 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) { |
| 2778 // TODO(dcarney): this could be a lot more efficient. | 2762 // TODO(dcarney): this could be a lot more efficient. |
| 2779 return range.Contains(0x39c) || | 2763 return range.Contains(0x39c) || |
| 2780 range.Contains(0x3bc) || range.Contains(0x178); | 2764 range.Contains(0x3bc) || range.Contains(0x178); |
| 2781 } | 2765 } |
| 2782 | 2766 |
| 2783 | 2767 |
| 2784 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { | 2768 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) { |
| 2785 for (int i = 0; i < ranges->length(); i++) { | 2769 for (int i = 0; i < ranges->length(); i++) { |
| 2786 // TODO(dcarney): this could be a lot more efficient. | 2770 // TODO(dcarney): this could be a lot more efficient. |
| 2787 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; | 2771 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; |
| 2788 } | 2772 } |
| 2789 return false; | 2773 return false; |
| 2790 } | 2774 } |
| 2791 | 2775 |
| 2792 | 2776 |
| 2793 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) { | 2777 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) { |
| 2794 if (info()->replacement_calculated) return replacement(); | 2778 if (info()->replacement_calculated) return replacement(); |
| 2795 if (depth < 0) return this; | 2779 if (depth < 0) return this; |
| 2796 DCHECK(!info()->visited); | 2780 DCHECK(!info()->visited); |
| 2797 VisitMarker marker(info()); | 2781 VisitMarker marker(info()); |
| 2798 int element_count = elms_->length(); | 2782 int element_count = elms_->length(); |
| 2799 for (int i = 0; i < element_count; i++) { | 2783 for (int i = 0; i < element_count; i++) { |
| 2800 TextElement elm = elms_->at(i); | 2784 TextElement elm = elms_->at(i); |
| 2801 if (elm.text_type() == TextElement::ATOM) { | 2785 if (elm.text_type() == TextElement::ATOM) { |
| 2802 Vector<const uc16> quarks = elm.atom()->data(); | 2786 Vector<const uc16> quarks = elm.atom()->data(); |
| 2803 for (int j = 0; j < quarks.length(); j++) { | 2787 for (int j = 0; j < quarks.length(); j++) { |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2837 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; | 2821 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; |
| 2838 return set_replacement(NULL); | 2822 return set_replacement(NULL); |
| 2839 } | 2823 } |
| 2840 } | 2824 } |
| 2841 } | 2825 } |
| 2842 } | 2826 } |
| 2843 return FilterSuccessor(depth - 1, ignore_case); | 2827 return FilterSuccessor(depth - 1, ignore_case); |
| 2844 } | 2828 } |
| 2845 | 2829 |
| 2846 | 2830 |
| 2847 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) { | 2831 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) { |
| 2848 if (info()->replacement_calculated) return replacement(); | 2832 if (info()->replacement_calculated) return replacement(); |
| 2849 if (depth < 0) return this; | 2833 if (depth < 0) return this; |
| 2850 if (info()->visited) return this; | 2834 if (info()->visited) return this; |
| 2851 { | 2835 { |
| 2852 VisitMarker marker(info()); | 2836 VisitMarker marker(info()); |
| 2853 | 2837 |
| 2854 RegExpNode* continue_replacement = | 2838 RegExpNode* continue_replacement = |
| 2855 continue_node_->FilterASCII(depth - 1, ignore_case); | 2839 continue_node_->FilterOneByte(depth - 1, ignore_case); |
| 2856 // If we can't continue after the loop then there is no sense in doing the | 2840 // If we can't continue after the loop then there is no sense in doing the |
| 2857 // loop. | 2841 // loop. |
| 2858 if (continue_replacement == NULL) return set_replacement(NULL); | 2842 if (continue_replacement == NULL) return set_replacement(NULL); |
| 2859 } | 2843 } |
| 2860 | 2844 |
| 2861 return ChoiceNode::FilterASCII(depth - 1, ignore_case); | 2845 return ChoiceNode::FilterOneByte(depth - 1, ignore_case); |
| 2862 } | 2846 } |
| 2863 | 2847 |
| 2864 | 2848 |
| 2865 RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) { | 2849 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) { |
| 2866 if (info()->replacement_calculated) return replacement(); | 2850 if (info()->replacement_calculated) return replacement(); |
| 2867 if (depth < 0) return this; | 2851 if (depth < 0) return this; |
| 2868 if (info()->visited) return this; | 2852 if (info()->visited) return this; |
| 2869 VisitMarker marker(info()); | 2853 VisitMarker marker(info()); |
| 2870 int choice_count = alternatives_->length(); | 2854 int choice_count = alternatives_->length(); |
| 2871 | 2855 |
| 2872 for (int i = 0; i < choice_count; i++) { | 2856 for (int i = 0; i < choice_count; i++) { |
| 2873 GuardedAlternative alternative = alternatives_->at(i); | 2857 GuardedAlternative alternative = alternatives_->at(i); |
| 2874 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { | 2858 if (alternative.guards() != NULL && alternative.guards()->length() != 0) { |
| 2875 set_replacement(this); | 2859 set_replacement(this); |
| 2876 return this; | 2860 return this; |
| 2877 } | 2861 } |
| 2878 } | 2862 } |
| 2879 | 2863 |
| 2880 int surviving = 0; | 2864 int surviving = 0; |
| 2881 RegExpNode* survivor = NULL; | 2865 RegExpNode* survivor = NULL; |
| 2882 for (int i = 0; i < choice_count; i++) { | 2866 for (int i = 0; i < choice_count; i++) { |
| 2883 GuardedAlternative alternative = alternatives_->at(i); | 2867 GuardedAlternative alternative = alternatives_->at(i); |
| 2884 RegExpNode* replacement = | 2868 RegExpNode* replacement = |
| 2885 alternative.node()->FilterASCII(depth - 1, ignore_case); | 2869 alternative.node()->FilterOneByte(depth - 1, ignore_case); |
| 2886 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. | 2870 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. |
| 2887 if (replacement != NULL) { | 2871 if (replacement != NULL) { |
| 2888 alternatives_->at(i).set_node(replacement); | 2872 alternatives_->at(i).set_node(replacement); |
| 2889 surviving++; | 2873 surviving++; |
| 2890 survivor = replacement; | 2874 survivor = replacement; |
| 2891 } | 2875 } |
| 2892 } | 2876 } |
| 2893 if (surviving < 2) return set_replacement(survivor); | 2877 if (surviving < 2) return set_replacement(survivor); |
| 2894 | 2878 |
| 2895 set_replacement(this); | 2879 set_replacement(this); |
| 2896 if (surviving == choice_count) { | 2880 if (surviving == choice_count) { |
| 2897 return this; | 2881 return this; |
| 2898 } | 2882 } |
| 2899 // Only some of the nodes survived the filtering. We need to rebuild the | 2883 // Only some of the nodes survived the filtering. We need to rebuild the |
| 2900 // alternatives list. | 2884 // alternatives list. |
| 2901 ZoneList<GuardedAlternative>* new_alternatives = | 2885 ZoneList<GuardedAlternative>* new_alternatives = |
| 2902 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); | 2886 new(zone()) ZoneList<GuardedAlternative>(surviving, zone()); |
| 2903 for (int i = 0; i < choice_count; i++) { | 2887 for (int i = 0; i < choice_count; i++) { |
| 2904 RegExpNode* replacement = | 2888 RegExpNode* replacement = |
| 2905 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case); | 2889 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case); |
| 2906 if (replacement != NULL) { | 2890 if (replacement != NULL) { |
| 2907 alternatives_->at(i).set_node(replacement); | 2891 alternatives_->at(i).set_node(replacement); |
| 2908 new_alternatives->Add(alternatives_->at(i), zone()); | 2892 new_alternatives->Add(alternatives_->at(i), zone()); |
| 2909 } | 2893 } |
| 2910 } | 2894 } |
| 2911 alternatives_ = new_alternatives; | 2895 alternatives_ = new_alternatives; |
| 2912 return this; | 2896 return this; |
| 2913 } | 2897 } |
| 2914 | 2898 |
| 2915 | 2899 |
| 2916 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth, | 2900 RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth, |
| 2917 bool ignore_case) { | 2901 bool ignore_case) { |
| 2918 if (info()->replacement_calculated) return replacement(); | 2902 if (info()->replacement_calculated) return replacement(); |
| 2919 if (depth < 0) return this; | 2903 if (depth < 0) return this; |
| 2920 if (info()->visited) return this; | 2904 if (info()->visited) return this; |
| 2921 VisitMarker marker(info()); | 2905 VisitMarker marker(info()); |
| 2922 // Alternative 0 is the negative lookahead, alternative 1 is what comes | 2906 // Alternative 0 is the negative lookahead, alternative 1 is what comes |
| 2923 // afterwards. | 2907 // afterwards. |
| 2924 RegExpNode* node = alternatives_->at(1).node(); | 2908 RegExpNode* node = alternatives_->at(1).node(); |
| 2925 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case); | 2909 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); |
| 2926 if (replacement == NULL) return set_replacement(NULL); | 2910 if (replacement == NULL) return set_replacement(NULL); |
| 2927 alternatives_->at(1).set_node(replacement); | 2911 alternatives_->at(1).set_node(replacement); |
| 2928 | 2912 |
| 2929 RegExpNode* neg_node = alternatives_->at(0).node(); | 2913 RegExpNode* neg_node = alternatives_->at(0).node(); |
| 2930 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case); | 2914 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); |
| 2931 // If the negative lookahead is always going to fail then | 2915 // If the negative lookahead is always going to fail then |
| 2932 // we don't need to check it. | 2916 // we don't need to check it. |
| 2933 if (neg_replacement == NULL) return set_replacement(replacement); | 2917 if (neg_replacement == NULL) return set_replacement(replacement); |
| 2934 alternatives_->at(0).set_node(neg_replacement); | 2918 alternatives_->at(0).set_node(neg_replacement); |
| 2935 return set_replacement(this); | 2919 return set_replacement(this); |
| 2936 } | 2920 } |
| 2937 | 2921 |
| 2938 | 2922 |
| 2939 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2923 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| 2940 RegExpCompiler* compiler, | 2924 RegExpCompiler* compiler, |
| (...skipping 88 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3029 assembler->CheckAtStart(&ok); | 3013 assembler->CheckAtStart(&ok); |
| 3030 } | 3014 } |
| 3031 // We already checked that we are not at the start of input so it must be | 3015 // We already checked that we are not at the start of input so it must be |
| 3032 // OK to load the previous character. | 3016 // OK to load the previous character. |
| 3033 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, | 3017 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1, |
| 3034 new_trace.backtrack(), | 3018 new_trace.backtrack(), |
| 3035 false); | 3019 false); |
| 3036 if (!assembler->CheckSpecialCharacterClass('n', | 3020 if (!assembler->CheckSpecialCharacterClass('n', |
| 3037 new_trace.backtrack())) { | 3021 new_trace.backtrack())) { |
| 3038 // Newline means \n, \r, 0x2028 or 0x2029. | 3022 // Newline means \n, \r, 0x2028 or 0x2029. |
| 3039 if (!compiler->ascii()) { | 3023 if (!compiler->one_byte()) { |
| 3040 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); | 3024 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok); |
| 3041 } | 3025 } |
| 3042 assembler->CheckCharacter('\n', &ok); | 3026 assembler->CheckCharacter('\n', &ok); |
| 3043 assembler->CheckNotCharacter('\r', new_trace.backtrack()); | 3027 assembler->CheckNotCharacter('\r', new_trace.backtrack()); |
| 3044 } | 3028 } |
| 3045 assembler->Bind(&ok); | 3029 assembler->Bind(&ok); |
| 3046 on_success->Emit(compiler, &new_trace); | 3030 on_success->Emit(compiler, &new_trace); |
| 3047 } | 3031 } |
| 3048 | 3032 |
| 3049 | 3033 |
| (...skipping 177 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3227 // check can have involved a mask and compare operation which may simplify | 3211 // check can have involved a mask and compare operation which may simplify |
| 3228 // or obviate the need for further checks at some character positions. | 3212 // or obviate the need for further checks at some character positions. |
| 3229 void TextNode::TextEmitPass(RegExpCompiler* compiler, | 3213 void TextNode::TextEmitPass(RegExpCompiler* compiler, |
| 3230 TextEmitPassType pass, | 3214 TextEmitPassType pass, |
| 3231 bool preloaded, | 3215 bool preloaded, |
| 3232 Trace* trace, | 3216 Trace* trace, |
| 3233 bool first_element_checked, | 3217 bool first_element_checked, |
| 3234 int* checked_up_to) { | 3218 int* checked_up_to) { |
| 3235 RegExpMacroAssembler* assembler = compiler->macro_assembler(); | 3219 RegExpMacroAssembler* assembler = compiler->macro_assembler(); |
| 3236 Isolate* isolate = assembler->zone()->isolate(); | 3220 Isolate* isolate = assembler->zone()->isolate(); |
| 3237 bool ascii = compiler->ascii(); | 3221 bool one_byte = compiler->one_byte(); |
| 3238 Label* backtrack = trace->backtrack(); | 3222 Label* backtrack = trace->backtrack(); |
| 3239 QuickCheckDetails* quick_check = trace->quick_check_performed(); | 3223 QuickCheckDetails* quick_check = trace->quick_check_performed(); |
| 3240 int element_count = elms_->length(); | 3224 int element_count = elms_->length(); |
| 3241 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { | 3225 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) { |
| 3242 TextElement elm = elms_->at(i); | 3226 TextElement elm = elms_->at(i); |
| 3243 int cp_offset = trace->cp_offset() + elm.cp_offset(); | 3227 int cp_offset = trace->cp_offset() + elm.cp_offset(); |
| 3244 if (elm.text_type() == TextElement::ATOM) { | 3228 if (elm.text_type() == TextElement::ATOM) { |
| 3245 Vector<const uc16> quarks = elm.atom()->data(); | 3229 Vector<const uc16> quarks = elm.atom()->data(); |
| 3246 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { | 3230 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { |
| 3247 if (first_element_checked && i == 0 && j == 0) continue; | 3231 if (first_element_checked && i == 0 && j == 0) continue; |
| 3248 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; | 3232 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; |
| 3249 EmitCharacterFunction* emit_function = NULL; | 3233 EmitCharacterFunction* emit_function = NULL; |
| 3250 switch (pass) { | 3234 switch (pass) { |
| 3251 case NON_ASCII_MATCH: | 3235 case NON_LATIN1_MATCH: |
| 3252 DCHECK(ascii); | 3236 DCHECK(one_byte); |
| 3253 if (quarks[j] > String::kMaxOneByteCharCode) { | 3237 if (quarks[j] > String::kMaxOneByteCharCode) { |
| 3254 assembler->GoTo(backtrack); | 3238 assembler->GoTo(backtrack); |
| 3255 return; | 3239 return; |
| 3256 } | 3240 } |
| 3257 break; | 3241 break; |
| 3258 case NON_LETTER_CHARACTER_MATCH: | 3242 case NON_LETTER_CHARACTER_MATCH: |
| 3259 emit_function = &EmitAtomNonLetter; | 3243 emit_function = &EmitAtomNonLetter; |
| 3260 break; | 3244 break; |
| 3261 case SIMPLE_CHARACTER_MATCH: | 3245 case SIMPLE_CHARACTER_MATCH: |
| 3262 emit_function = &EmitSimpleCharacter; | 3246 emit_function = &EmitSimpleCharacter; |
| (...skipping 14 matching lines...) Expand all Loading... | |
| 3277 preloaded); | 3261 preloaded); |
| 3278 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); | 3262 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to); |
| 3279 } | 3263 } |
| 3280 } | 3264 } |
| 3281 } else { | 3265 } else { |
| 3282 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); | 3266 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type()); |
| 3283 if (pass == CHARACTER_CLASS_MATCH) { | 3267 if (pass == CHARACTER_CLASS_MATCH) { |
| 3284 if (first_element_checked && i == 0) continue; | 3268 if (first_element_checked && i == 0) continue; |
| 3285 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; | 3269 if (DeterminedAlready(quick_check, elm.cp_offset())) continue; |
| 3286 RegExpCharacterClass* cc = elm.char_class(); | 3270 RegExpCharacterClass* cc = elm.char_class(); |
| 3287 EmitCharClass(assembler, | 3271 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset, |
| 3288 cc, | 3272 *checked_up_to < cp_offset, preloaded, zone()); |
| 3289 ascii, | |
| 3290 backtrack, | |
| 3291 cp_offset, | |
| 3292 *checked_up_to < cp_offset, | |
| 3293 preloaded, | |
| 3294 zone()); | |
| 3295 UpdateBoundsCheck(cp_offset, checked_up_to); | 3273 UpdateBoundsCheck(cp_offset, checked_up_to); |
| 3296 } | 3274 } |
| 3297 } | 3275 } |
| 3298 } | 3276 } |
| 3299 } | 3277 } |
| 3300 | 3278 |
| 3301 | 3279 |
| 3302 int TextNode::Length() { | 3280 int TextNode::Length() { |
| 3303 TextElement elm = elms_->last(); | 3281 TextElement elm = elms_->last(); |
| 3304 DCHECK(elm.cp_offset() >= 0); | 3282 DCHECK(elm.cp_offset() >= 0); |
| (...skipping 20 matching lines...) Expand all Loading... | |
| 3325 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { | 3303 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
| 3326 LimitResult limit_result = LimitVersions(compiler, trace); | 3304 LimitResult limit_result = LimitVersions(compiler, trace); |
| 3327 if (limit_result == DONE) return; | 3305 if (limit_result == DONE) return; |
| 3328 DCHECK(limit_result == CONTINUE); | 3306 DCHECK(limit_result == CONTINUE); |
| 3329 | 3307 |
| 3330 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { | 3308 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) { |
| 3331 compiler->SetRegExpTooBig(); | 3309 compiler->SetRegExpTooBig(); |
| 3332 return; | 3310 return; |
| 3333 } | 3311 } |
| 3334 | 3312 |
| 3335 if (compiler->ascii()) { | 3313 if (compiler->one_byte()) { |
| 3336 int dummy = 0; | 3314 int dummy = 0; |
| 3337 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy); | 3315 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy); |
| 3338 } | 3316 } |
| 3339 | 3317 |
| 3340 bool first_elt_done = false; | 3318 bool first_elt_done = false; |
| 3341 int bound_checked_to = trace->cp_offset() - 1; | 3319 int bound_checked_to = trace->cp_offset() - 1; |
| 3342 bound_checked_to += trace->bound_checked_up_to(); | 3320 bound_checked_to += trace->bound_checked_up_to(); |
| 3343 | 3321 |
| 3344 // If a character is preloaded into the current character register then | 3322 // If a character is preloaded into the current character register then |
| 3345 // check that now. | 3323 // check that now. |
| 3346 if (trace->characters_preloaded() == 1) { | 3324 if (trace->characters_preloaded() == 1) { |
| 3347 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { | 3325 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) { |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3383 | 3361 |
| 3384 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { | 3362 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { |
| 3385 DCHECK(by > 0); | 3363 DCHECK(by > 0); |
| 3386 // We don't have an instruction for shifting the current character register | 3364 // We don't have an instruction for shifting the current character register |
| 3387 // down or for using a shifted value for anything so lets just forget that | 3365 // down or for using a shifted value for anything so lets just forget that |
| 3388 // we preloaded any characters into it. | 3366 // we preloaded any characters into it. |
| 3389 characters_preloaded_ = 0; | 3367 characters_preloaded_ = 0; |
| 3390 // Adjust the offsets of the quick check performed information. This | 3368 // Adjust the offsets of the quick check performed information. This |
| 3391 // information is used to find out what we already determined about the | 3369 // information is used to find out what we already determined about the |
| 3392 // characters by means of mask and compare. | 3370 // characters by means of mask and compare. |
| 3393 quick_check_performed_.Advance(by, compiler->ascii()); | 3371 quick_check_performed_.Advance(by, compiler->one_byte()); |
| 3394 cp_offset_ += by; | 3372 cp_offset_ += by; |
| 3395 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { | 3373 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { |
| 3396 compiler->SetRegExpTooBig(); | 3374 compiler->SetRegExpTooBig(); |
| 3397 cp_offset_ = 0; | 3375 cp_offset_ = 0; |
| 3398 } | 3376 } |
| 3399 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); | 3377 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); |
| 3400 } | 3378 } |
| 3401 | 3379 |
| 3402 | 3380 |
| 3403 void TextNode::MakeCaseIndependent(bool is_ascii) { | 3381 void TextNode::MakeCaseIndependent(bool is_one_byte) { |
| 3404 int element_count = elms_->length(); | 3382 int element_count = elms_->length(); |
| 3405 for (int i = 0; i < element_count; i++) { | 3383 for (int i = 0; i < element_count; i++) { |
| 3406 TextElement elm = elms_->at(i); | 3384 TextElement elm = elms_->at(i); |
| 3407 if (elm.text_type() == TextElement::CHAR_CLASS) { | 3385 if (elm.text_type() == TextElement::CHAR_CLASS) { |
| 3408 RegExpCharacterClass* cc = elm.char_class(); | 3386 RegExpCharacterClass* cc = elm.char_class(); |
| 3409 // None of the standard character classes is different in the case | 3387 // None of the standard character classes is different in the case |
| 3410 // independent case and it slows us down if we don't know that. | 3388 // independent case and it slows us down if we don't know that. |
| 3411 if (cc->is_standard(zone())) continue; | 3389 if (cc->is_standard(zone())) continue; |
| 3412 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); | 3390 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
| 3413 int range_count = ranges->length(); | 3391 int range_count = ranges->length(); |
| 3414 for (int j = 0; j < range_count; j++) { | 3392 for (int j = 0; j < range_count; j++) { |
| 3415 ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone()); | 3393 ranges->at(j).AddCaseEquivalents(ranges, is_one_byte, zone()); |
| 3416 } | 3394 } |
| 3417 } | 3395 } |
| 3418 } | 3396 } |
| 3419 } | 3397 } |
| 3420 | 3398 |
| 3421 | 3399 |
| 3422 int TextNode::GreedyLoopTextLength() { | 3400 int TextNode::GreedyLoopTextLength() { |
| 3423 TextElement elm = elms_->at(elms_->length() - 1); | 3401 TextElement elm = elms_->at(elms_->length() - 1); |
| 3424 return elm.cp_offset() + elm.length(); | 3402 return elm.cp_offset() + elm.length(); |
| 3425 } | 3403 } |
| 3426 | 3404 |
| 3427 | 3405 |
| 3428 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( | 3406 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( |
| 3429 RegExpCompiler* compiler) { | 3407 RegExpCompiler* compiler) { |
| 3430 if (elms_->length() != 1) return NULL; | 3408 if (elms_->length() != 1) return NULL; |
| 3431 TextElement elm = elms_->at(0); | 3409 TextElement elm = elms_->at(0); |
| 3432 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; | 3410 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL; |
| 3433 RegExpCharacterClass* node = elm.char_class(); | 3411 RegExpCharacterClass* node = elm.char_class(); |
| 3434 ZoneList<CharacterRange>* ranges = node->ranges(zone()); | 3412 ZoneList<CharacterRange>* ranges = node->ranges(zone()); |
| 3435 if (!CharacterRange::IsCanonical(ranges)) { | 3413 if (!CharacterRange::IsCanonical(ranges)) { |
| 3436 CharacterRange::Canonicalize(ranges); | 3414 CharacterRange::Canonicalize(ranges); |
| 3437 } | 3415 } |
| 3438 if (node->is_negated()) { | 3416 if (node->is_negated()) { |
| 3439 return ranges->length() == 0 ? on_success() : NULL; | 3417 return ranges->length() == 0 ? on_success() : NULL; |
| 3440 } | 3418 } |
| 3441 if (ranges->length() != 1) return NULL; | 3419 if (ranges->length() != 1) return NULL; |
| 3442 uint32_t max_char; | 3420 uint32_t max_char; |
| 3443 if (compiler->ascii()) { | 3421 if (compiler->one_byte()) { |
| 3444 max_char = String::kMaxOneByteCharCode; | 3422 max_char = String::kMaxOneByteCharCode; |
| 3445 } else { | 3423 } else { |
| 3446 max_char = String::kMaxUtf16CodeUnit; | 3424 max_char = String::kMaxUtf16CodeUnit; |
| 3447 } | 3425 } |
| 3448 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; | 3426 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; |
| 3449 } | 3427 } |
| 3450 | 3428 |
| 3451 | 3429 |
| 3452 // Finds the fixed match length of a sequence of nodes that goes from | 3430 // Finds the fixed match length of a sequence of nodes that goes from |
| 3453 // this alternative and back to this choice node. If there are variable | 3431 // this alternative and back to this choice node. If there are variable |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3510 return; | 3488 return; |
| 3511 } | 3489 } |
| 3512 ChoiceNode::Emit(compiler, trace); | 3490 ChoiceNode::Emit(compiler, trace); |
| 3513 } | 3491 } |
| 3514 | 3492 |
| 3515 | 3493 |
| 3516 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, | 3494 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
| 3517 int eats_at_least) { | 3495 int eats_at_least) { |
| 3518 int preload_characters = Min(4, eats_at_least); | 3496 int preload_characters = Min(4, eats_at_least); |
| 3519 if (compiler->macro_assembler()->CanReadUnaligned()) { | 3497 if (compiler->macro_assembler()->CanReadUnaligned()) { |
| 3520 bool ascii = compiler->ascii(); | 3498 bool one_byte = compiler->one_byte(); |
| 3521 if (ascii) { | 3499 if (one_byte) { |
| 3522 if (preload_characters > 4) preload_characters = 4; | 3500 if (preload_characters > 4) preload_characters = 4; |
| 3523 // We can't preload 3 characters because there is no machine instruction | 3501 // We can't preload 3 characters because there is no machine instruction |
| 3524 // to do that. We can't just load 4 because we could be reading | 3502 // to do that. We can't just load 4 because we could be reading |
| 3525 // beyond the end of the string, which could cause a memory fault. | 3503 // beyond the end of the string, which could cause a memory fault. |
| 3526 if (preload_characters == 3) preload_characters = 2; | 3504 if (preload_characters == 3) preload_characters = 2; |
| 3527 } else { | 3505 } else { |
| 3528 if (preload_characters > 2) preload_characters = 2; | 3506 if (preload_characters > 2) preload_characters = 2; |
| 3529 } | 3507 } |
| 3530 } else { | 3508 } else { |
| 3531 if (preload_characters > 1) preload_characters = 1; | 3509 if (preload_characters > 1) preload_characters = 1; |
| (...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3637 map_count_ = kMapSize; | 3615 map_count_ = kMapSize; |
| 3638 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; | 3616 for (int i = 0; i < kMapSize; i++) map_->at(i) = true; |
| 3639 } | 3617 } |
| 3640 } | 3618 } |
| 3641 | 3619 |
| 3642 | 3620 |
| 3643 BoyerMooreLookahead::BoyerMooreLookahead( | 3621 BoyerMooreLookahead::BoyerMooreLookahead( |
| 3644 int length, RegExpCompiler* compiler, Zone* zone) | 3622 int length, RegExpCompiler* compiler, Zone* zone) |
| 3645 : length_(length), | 3623 : length_(length), |
| 3646 compiler_(compiler) { | 3624 compiler_(compiler) { |
| 3647 if (compiler->ascii()) { | 3625 if (compiler->one_byte()) { |
| 3648 max_char_ = String::kMaxOneByteCharCode; | 3626 max_char_ = String::kMaxOneByteCharCode; |
| 3649 } else { | 3627 } else { |
| 3650 max_char_ = String::kMaxUtf16CodeUnit; | 3628 max_char_ = String::kMaxUtf16CodeUnit; |
| 3651 } | 3629 } |
| 3652 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); | 3630 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone); |
| 3653 for (int i = 0; i < length; i++) { | 3631 for (int i = 0; i < length; i++) { |
| 3654 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); | 3632 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone); |
| 3655 } | 3633 } |
| 3656 } | 3634 } |
| 3657 | 3635 |
| (...skipping 47 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3705 // can theoretically be up to 2*kSize though we treat it mostly as | 3683 // can theoretically be up to 2*kSize though we treat it mostly as |
| 3706 // a fraction of kSize. | 3684 // a fraction of kSize. |
| 3707 frequency += compiler_->frequency_collator()->Frequency(j) + 1; | 3685 frequency += compiler_->frequency_collator()->Frequency(j) + 1; |
| 3708 } | 3686 } |
| 3709 } | 3687 } |
| 3710 // We use the probability of skipping times the distance we are skipping to | 3688 // We use the probability of skipping times the distance we are skipping to |
| 3711 // judge the effectiveness of this. Actually we have a cut-off: By | 3689 // judge the effectiveness of this. Actually we have a cut-off: By |
| 3712 // dividing by 2 we switch off the skipping if the probability of skipping | 3690 // dividing by 2 we switch off the skipping if the probability of skipping |
| 3713 // is less than 50%. This is because the multibyte mask-and-compare | 3691 // is less than 50%. This is because the multibyte mask-and-compare |
| 3714 // skipping in quickcheck is more likely to do well on this case. | 3692 // skipping in quickcheck is more likely to do well on this case. |
| 3715 bool in_quickcheck_range = ((i - remembered_from < 4) || | 3693 bool in_quickcheck_range = |
| 3716 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); | 3694 ((i - remembered_from < 4) || |
| 3695 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2)); | |
| 3717 // Called 'probability' but it is only a rough estimate and can actually | 3696 // Called 'probability' but it is only a rough estimate and can actually |
| 3718 // be outside the 0-kSize range. | 3697 // be outside the 0-kSize range. |
| 3719 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; | 3698 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; |
| 3720 int points = (i - remembered_from) * probability; | 3699 int points = (i - remembered_from) * probability; |
| 3721 if (points > biggest_points) { | 3700 if (points > biggest_points) { |
| 3722 *from = remembered_from; | 3701 *from = remembered_from; |
| 3723 *to = i - 1; | 3702 *to = i - 1; |
| 3724 biggest_points = points; | 3703 biggest_points = points; |
| 3725 } | 3704 } |
| 3726 } | 3705 } |
| (...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3924 #endif | 3903 #endif |
| 3925 } | 3904 } |
| 3926 | 3905 |
| 3927 | 3906 |
| 3928 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, | 3907 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, |
| 3929 Trace* current_trace, | 3908 Trace* current_trace, |
| 3930 PreloadState* state) { | 3909 PreloadState* state) { |
| 3931 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { | 3910 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) { |
| 3932 // Save some time by looking at most one machine word ahead. | 3911 // Save some time by looking at most one machine word ahead. |
| 3933 state->eats_at_least_ = | 3912 state->eats_at_least_ = |
| 3934 EatsAtLeast(compiler->ascii() ? 4 : 2, | 3913 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget, |
| 3935 kRecursionBudget, | |
| 3936 current_trace->at_start() == Trace::FALSE_VALUE); | 3914 current_trace->at_start() == Trace::FALSE_VALUE); |
| 3937 } | 3915 } |
| 3938 state->preload_characters_ = | 3916 state->preload_characters_ = |
| 3939 CalculatePreloadCharacters(compiler, state->eats_at_least_); | 3917 CalculatePreloadCharacters(compiler, state->eats_at_least_); |
| 3940 | 3918 |
| 3941 state->preload_is_current_ = | 3919 state->preload_is_current_ = |
| 3942 (current_trace->characters_preloaded() == state->preload_characters_); | 3920 (current_trace->characters_preloaded() == state->preload_characters_); |
| 3943 state->preload_has_checked_bounds_ = state->preload_is_current_; | 3921 state->preload_has_checked_bounds_ = state->preload_is_current_; |
| 3944 } | 3922 } |
| 3945 | 3923 |
| (...skipping 1394 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5340 for (int i = 0; i < overlay.length(); i += 2) { | 5318 for (int i = 0; i < overlay.length(); i += 2) { |
| 5341 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), | 5319 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1), |
| 5342 CharacterRangeSplitter::kInOverlay, zone); | 5320 CharacterRangeSplitter::kInOverlay, zone); |
| 5343 } | 5321 } |
| 5344 CharacterRangeSplitter callback(included, excluded, zone); | 5322 CharacterRangeSplitter callback(included, excluded, zone); |
| 5345 table.ForEach(&callback); | 5323 table.ForEach(&callback); |
| 5346 } | 5324 } |
| 5347 | 5325 |
| 5348 | 5326 |
| 5349 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, | 5327 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
| 5350 bool is_ascii, | 5328 bool is_one_byte, Zone* zone) { |
| 5351 Zone* zone) { | |
| 5352 Isolate* isolate = zone->isolate(); | 5329 Isolate* isolate = zone->isolate(); |
| 5353 uc16 bottom = from(); | 5330 uc16 bottom = from(); |
| 5354 uc16 top = to(); | 5331 uc16 top = to(); |
| 5355 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) { | 5332 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { |
| 5356 if (bottom > String::kMaxOneByteCharCode) return; | 5333 if (bottom > String::kMaxOneByteCharCode) return; |
| 5357 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; | 5334 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; |
| 5358 } | 5335 } |
| 5359 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 5336 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 5360 if (top == bottom) { | 5337 if (top == bottom) { |
| 5361 // If this is a singleton we just expand the one character. | 5338 // If this is a singleton we just expand the one character. |
| 5362 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); | 5339 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); |
| 5363 for (int i = 0; i < length; i++) { | 5340 for (int i = 0; i < length; i++) { |
| 5364 uc32 chr = chars[i]; | 5341 uc32 chr = chars[i]; |
| 5365 if (chr != bottom) { | 5342 if (chr != bottom) { |
| (...skipping 398 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5764 for (int i = 0; i < element_count; i++) { | 5741 for (int i = 0; i < element_count; i++) { |
| 5765 TextElement& elm = elements()->at(i); | 5742 TextElement& elm = elements()->at(i); |
| 5766 elm.set_cp_offset(cp_offset); | 5743 elm.set_cp_offset(cp_offset); |
| 5767 cp_offset += elm.length(); | 5744 cp_offset += elm.length(); |
| 5768 } | 5745 } |
| 5769 } | 5746 } |
| 5770 | 5747 |
| 5771 | 5748 |
| 5772 void Analysis::VisitText(TextNode* that) { | 5749 void Analysis::VisitText(TextNode* that) { |
| 5773 if (ignore_case_) { | 5750 if (ignore_case_) { |
| 5774 that->MakeCaseIndependent(is_ascii_); | 5751 that->MakeCaseIndependent(is_one_byte_); |
| 5775 } | 5752 } |
| 5776 EnsureAnalyzed(that->on_success()); | 5753 EnsureAnalyzed(that->on_success()); |
| 5777 if (!has_failed()) { | 5754 if (!has_failed()) { |
| 5778 that->CalculateOffsets(); | 5755 that->CalculateOffsets(); |
| 5779 } | 5756 } |
| 5780 } | 5757 } |
| 5781 | 5758 |
| 5782 | 5759 |
| 5783 void Analysis::VisitAction(ActionNode* that) { | 5760 void Analysis::VisitAction(ActionNode* that) { |
| 5784 RegExpNode* target = that->on_success(); | 5761 RegExpNode* target = that->on_success(); |
| (...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 6040 } | 6017 } |
| 6041 | 6018 |
| 6042 | 6019 |
| 6043 void DispatchTableConstructor::VisitAction(ActionNode* that) { | 6020 void DispatchTableConstructor::VisitAction(ActionNode* that) { |
| 6044 RegExpNode* target = that->on_success(); | 6021 RegExpNode* target = that->on_success(); |
| 6045 target->Accept(this); | 6022 target->Accept(this); |
| 6046 } | 6023 } |
| 6047 | 6024 |
| 6048 | 6025 |
| 6049 RegExpEngine::CompilationResult RegExpEngine::Compile( | 6026 RegExpEngine::CompilationResult RegExpEngine::Compile( |
| 6050 RegExpCompileData* data, | 6027 RegExpCompileData* data, bool ignore_case, bool is_global, |
| 6051 bool ignore_case, | 6028 bool is_multiline, Handle<String> pattern, Handle<String> sample_subject, |
| 6052 bool is_global, | 6029 bool is_one_byte, Zone* zone) { |
| 6053 bool is_multiline, | |
| 6054 Handle<String> pattern, | |
| 6055 Handle<String> sample_subject, | |
| 6056 bool is_ascii, | |
| 6057 Zone* zone) { | |
| 6058 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { | 6030 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
| 6059 return IrregexpRegExpTooBig(zone->isolate()); | 6031 return IrregexpRegExpTooBig(zone->isolate()); |
| 6060 } | 6032 } |
| 6061 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone); | 6033 RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte, zone); |
| 6062 | 6034 |
| 6063 // Sample some characters from the middle of the string. | 6035 // Sample some characters from the middle of the string. |
| 6064 static const int kSampleSize = 128; | 6036 static const int kSampleSize = 128; |
| 6065 | 6037 |
| 6066 sample_subject = String::Flatten(sample_subject); | 6038 sample_subject = String::Flatten(sample_subject); |
| 6067 int chars_sampled = 0; | 6039 int chars_sampled = 0; |
| 6068 int half_way = (sample_subject->length() - kSampleSize) / 2; | 6040 int half_way = (sample_subject->length() - kSampleSize) / 2; |
| 6069 for (int i = Max(0, half_way); | 6041 for (int i = Max(0, half_way); |
| 6070 i < sample_subject->length() && chars_sampled < kSampleSize; | 6042 i < sample_subject->length() && chars_sampled < kSampleSize; |
| 6071 i++, chars_sampled++) { | 6043 i++, chars_sampled++) { |
| (...skipping 26 matching lines...) Expand all Loading... | |
| 6098 // at the start of input. | 6070 // at the start of input. |
| 6099 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); | 6071 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone); |
| 6100 first_step_node->AddAlternative(GuardedAlternative(captured_body)); | 6072 first_step_node->AddAlternative(GuardedAlternative(captured_body)); |
| 6101 first_step_node->AddAlternative(GuardedAlternative( | 6073 first_step_node->AddAlternative(GuardedAlternative( |
| 6102 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); | 6074 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node))); |
| 6103 node = first_step_node; | 6075 node = first_step_node; |
| 6104 } else { | 6076 } else { |
| 6105 node = loop_node; | 6077 node = loop_node; |
| 6106 } | 6078 } |
| 6107 } | 6079 } |
| 6108 if (is_ascii) { | 6080 if (is_one_byte) { |
| 6109 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); | 6081 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); |
| 6110 // Do it again to propagate the new nodes to places where they were not | 6082 // Do it again to propagate the new nodes to places where they were not |
| 6111 // put because they had not been calculated yet. | 6083 // put because they had not been calculated yet. |
| 6112 if (node != NULL) { | 6084 if (node != NULL) { |
| 6113 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case); | 6085 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); |
| 6114 } | 6086 } |
| 6115 } | 6087 } |
| 6116 | 6088 |
| 6117 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); | 6089 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); |
| 6118 data->node = node; | 6090 data->node = node; |
| 6119 Analysis analysis(ignore_case, is_ascii); | 6091 Analysis analysis(ignore_case, is_one_byte); |
| 6120 analysis.EnsureAnalyzed(node); | 6092 analysis.EnsureAnalyzed(node); |
| 6121 if (analysis.has_failed()) { | 6093 if (analysis.has_failed()) { |
| 6122 const char* error_message = analysis.error_message(); | 6094 const char* error_message = analysis.error_message(); |
| 6123 return CompilationResult(zone->isolate(), error_message); | 6095 return CompilationResult(zone->isolate(), error_message); |
| 6124 } | 6096 } |
| 6125 | 6097 |
| 6126 // Create the correct assembler for the architecture. | 6098 // Create the correct assembler for the architecture. |
| 6127 #ifndef V8_INTERPRETED_REGEXP | 6099 #ifndef V8_INTERPRETED_REGEXP |
| 6128 // Native regexp implementation. | 6100 // Native regexp implementation. |
| 6129 | 6101 |
| 6130 NativeRegExpMacroAssembler::Mode mode = | 6102 NativeRegExpMacroAssembler::Mode mode = |
| 6131 is_ascii ? NativeRegExpMacroAssembler::ASCII | 6103 is_one_byte ? NativeRegExpMacroAssembler::LATIN1 |
| 6132 : NativeRegExpMacroAssembler::UC16; | 6104 : NativeRegExpMacroAssembler::UC16; |
| 6133 | 6105 |
| 6134 #if V8_TARGET_ARCH_IA32 | 6106 #if V8_TARGET_ARCH_IA32 |
| 6135 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, | 6107 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2, |
| 6136 zone); | 6108 zone); |
| 6137 #elif V8_TARGET_ARCH_X64 | 6109 #elif V8_TARGET_ARCH_X64 |
| 6138 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2, | 6110 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2, |
| 6139 zone); | 6111 zone); |
| 6140 #elif V8_TARGET_ARCH_ARM | 6112 #elif V8_TARGET_ARCH_ARM |
| 6141 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2, | 6113 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2, |
| 6142 zone); | 6114 zone); |
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 6179 } | 6151 } |
| 6180 | 6152 |
| 6181 return compiler.Assemble(&macro_assembler, | 6153 return compiler.Assemble(&macro_assembler, |
| 6182 node, | 6154 node, |
| 6183 data->capture_count, | 6155 data->capture_count, |
| 6184 pattern); | 6156 pattern); |
| 6185 } | 6157 } |
| 6186 | 6158 |
| 6187 | 6159 |
| 6188 }} // namespace v8::internal | 6160 }} // namespace v8::internal |
| OLD | NEW |