src/jsregexp.cc - Issue 1285163003: Move regexp implementation into its own folder.

Side by Side Diff: src/jsregexp.cc

Issue 1285163003: Move regexp implementation into its own folder. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: addressed comment Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "src/v8.h"

6

7 #include "src/ast.h"

8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"

11 #include "src/execution.h"

12 #include "src/factory.h"

13 #include "src/jsregexp-inl.h"

14 #include "src/jsregexp.h"

15 #include "src/messages.h"

16 #include "src/ostreams.h"

17 #include "src/parser.h"

18 #include "src/regexp-macro-assembler.h"

19 #include "src/regexp-macro-assembler-irregexp.h"

20 #include "src/regexp-macro-assembler-tracer.h"

21 #include "src/regexp-stack.h"

22 #include "src/runtime/runtime.h"

23 #include "src/splay-tree-inl.h"

24 #include "src/string-search.h"

25 #include "src/unicode-decoder.h"

26

27 #ifndef V8_INTERPRETED_REGEXP

28 #if V8_TARGET_ARCH_IA32

29 #include "src/ia32/regexp-macro-assembler-ia32.h" // NOLINT

30 #elif V8_TARGET_ARCH_X64

31 #include "src/x64/regexp-macro-assembler-x64.h" // NOLINT

32 #elif V8_TARGET_ARCH_ARM64

33 #include "src/arm64/regexp-macro-assembler-arm64.h" // NOLINT

34 #elif V8_TARGET_ARCH_ARM

35 #include "src/arm/regexp-macro-assembler-arm.h" // NOLINT

36 #elif V8_TARGET_ARCH_PPC

37 #include "src/ppc/regexp-macro-assembler-ppc.h" // NOLINT

38 #elif V8_TARGET_ARCH_MIPS

39 #include "src/mips/regexp-macro-assembler-mips.h" // NOLINT

40 #elif V8_TARGET_ARCH_MIPS64

41 #include "src/mips64/regexp-macro-assembler-mips64.h" // NOLINT

42 #elif V8_TARGET_ARCH_X87

43 #include "src/x87/regexp-macro-assembler-x87.h" // NOLINT

44 #else

45 #error Unsupported target architecture.

46 #endif

47 #endif

48

49 #include "src/interpreter-irregexp.h"

50

51

52 namespace v8 {

53 namespace internal {

54

55 MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral(

56 Handle<JSFunction> constructor,

57 Handle<String> pattern,

58 Handle<String> flags) {

59 // Call the construct code with 2 arguments.

60 Handle<Object> argv[] = { pattern, flags };

61 return Execution::New(constructor, arraysize(argv), argv);

62 }

63

64

65 MUST_USE_RESULT

66 static inline MaybeHandle<Object> ThrowRegExpException(

67 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {

68 Isolate* isolate = re->GetIsolate();

69 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,

70 pattern, error_text),

71 Object);

72 }

73

74

75 inline void ThrowRegExpException(Handle<JSRegExp> re,

76 Handle<String> error_text) {

77 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));

78 }

79

80

81 ContainedInLattice AddRange(ContainedInLattice containment,

82 const int* ranges,

83 int ranges_length,

84 Interval new_range) {

85 DCHECK((ranges_length & 1) == 1);

86 DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);

87 if (containment == kLatticeUnknown) return containment;

88 bool inside = false;

89 int last = 0;

90 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {

91 // Consider the range from last to ranges[i].

92 // We haven't got to the new range yet.

93 if (ranges[i] <= new_range.from()) continue;

94 // New range is wholly inside last-ranges[i]. Note that new_range.to() is

95 // inclusive, but the values in ranges are not.

96 if (last <= new_range.from() && new_range.to() < ranges[i]) {

97 return Combine(containment, inside ? kLatticeIn : kLatticeOut);

98 }

99 return kLatticeUnknown;

100 }

101 return containment;

102 }

103

104

105 // More makes code generation slower, less makes V8 benchmark score lower.

106 const int kMaxLookaheadForBoyerMoore = 8;

107 // In a 3-character pattern you can maximally step forwards 3 characters

108 // at a time, which is not always enough to pay for the extra logic.

109 const int kPatternTooShortForBoyerMoore = 2;

110

111

112 // Identifies the sort of regexps where the regexp engine is faster

113 // than the code used for atom matches.

114 static bool HasFewDifferentCharacters(Handle<String> pattern) {

115 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());

116 if (length <= kPatternTooShortForBoyerMoore) return false;

117 const int kMod = 128;

118 bool character_found[kMod];

119 int different = 0;

120 memset(&character_found[0], 0, sizeof(character_found));

121 for (int i = 0; i < length; i++) {

122 int ch = (pattern->Get(i) & (kMod - 1));

123 if (!character_found[ch]) {

124 character_found[ch] = true;

125 different++;

126 // We declare a regexp low-alphabet if it has at least 3 times as many

127 // characters as it has different characters.

128 if (different * 3 > length) return false;

129 }

130 }

131 return true;

132 }

133

134

135 // Generic RegExp methods. Dispatches to implementation specific methods.

136

137

138 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,

139 Handle<String> pattern,

140 JSRegExp::Flags flags) {

141 Isolate* isolate = re->GetIsolate();

142 Zone zone;

143 CompilationCache* compilation_cache = isolate->compilation_cache();

144 MaybeHandle<FixedArray> maybe_cached =

145 compilation_cache->LookupRegExp(pattern, flags);

146 Handle<FixedArray> cached;

147 bool in_cache = maybe_cached.ToHandle(&cached);

148 LOG(isolate, RegExpCompileEvent(re, in_cache));

149

150 Handle<Object> result;

151 if (in_cache) {

152 re->set_data(*cached);

153 return re;

154 }

155 pattern = String::Flatten(pattern);

156 PostponeInterruptsScope postpone(isolate);

157 RegExpCompileData parse_result;

158 FlatStringReader reader(isolate, pattern);

159 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader,

160 flags.is_multiline(), flags.is_unicode(),

161 &parse_result)) {

162 // Throw an exception if we fail to parse the pattern.

163 return ThrowRegExpException(re, pattern, parse_result.error);

164 }

165

166 bool has_been_compiled = false;

167

168 if (parse_result.simple &&

169 !flags.is_ignore_case() &&

170 !flags.is_sticky() &&

171 !HasFewDifferentCharacters(pattern)) {

172 // Parse-tree is a single atom that is equal to the pattern.

173 AtomCompile(re, pattern, flags, pattern);

174 has_been_compiled = true;

175 } else if (parse_result.tree->IsAtom() &&

176 !flags.is_ignore_case() &&

177 !flags.is_sticky() &&

178 parse_result.capture_count == 0) {

179 RegExpAtom* atom = parse_result.tree->AsAtom();

180 Vector<const uc16> atom_pattern = atom->data();

181 Handle<String> atom_string;

182 ASSIGN_RETURN_ON_EXCEPTION(

183 isolate, atom_string,

184 isolate->factory()->NewStringFromTwoByte(atom_pattern),

185 Object);

186 if (!HasFewDifferentCharacters(atom_string)) {

187 AtomCompile(re, pattern, flags, atom_string);

188 has_been_compiled = true;

189 }

190 }

191 if (!has_been_compiled) {

192 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);

193 }

194 DCHECK(re->data()->IsFixedArray());

195 // Compilation succeeded so the data is set on the regexp

196 // and we can store it in the cache.

197 Handle<FixedArray> data(FixedArray::cast(re->data()));

198 compilation_cache->PutRegExp(pattern, flags, data);

199

200 return re;

201 }

202

203

204 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,

205 Handle<String> subject,

206 int index,

207 Handle<JSArray> last_match_info) {

208 switch (regexp->TypeTag()) {

209 case JSRegExp::ATOM:

210 return AtomExec(regexp, subject, index, last_match_info);

211 case JSRegExp::IRREGEXP: {

212 return IrregexpExec(regexp, subject, index, last_match_info);

213 }

214 default:

215 UNREACHABLE();

216 return MaybeHandle<Object>();

217 }

218 }

219

220

221 // RegExp Atom implementation: Simple string search using indexOf.

222

223

224 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,

225 Handle<String> pattern,

226 JSRegExp::Flags flags,

227 Handle<String> match_pattern) {

228 re->GetIsolate()->factory()->SetRegExpAtomData(re,

229 JSRegExp::ATOM,

230 pattern,

231 flags,

232 match_pattern);

233 }

234

235

236 static void SetAtomLastCapture(FixedArray* array,

237 String* subject,

238 int from,

239 int to) {

240 SealHandleScope shs(array->GetIsolate());

241 RegExpImpl::SetLastCaptureCount(array, 2);

242 RegExpImpl::SetLastSubject(array, subject);

243 RegExpImpl::SetLastInput(array, subject);

244 RegExpImpl::SetCapture(array, 0, from);

245 RegExpImpl::SetCapture(array, 1, to);

246 }

247

248

249 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,

250 Handle<String> subject,

251 int index,

252 int32_t* output,

253 int output_size) {

254 Isolate* isolate = regexp->GetIsolate();

255

256 DCHECK(0 <= index);

257 DCHECK(index <= subject->length());

258

259 subject = String::Flatten(subject);

260 DisallowHeapAllocation no_gc; // ensure vectors stay valid

261

262 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));

263 int needle_len = needle->length();

264 DCHECK(needle->IsFlat());

265 DCHECK_LT(0, needle_len);

266

267 if (index + needle_len > subject->length()) {

268 return RegExpImpl::RE_FAILURE;

269 }

270

271 for (int i = 0; i < output_size; i += 2) {

272 String::FlatContent needle_content = needle->GetFlatContent();

273 String::FlatContent subject_content = subject->GetFlatContent();

274 DCHECK(needle_content.IsFlat());

275 DCHECK(subject_content.IsFlat());

276 // dispatch on type of strings

277 index =

278 (needle_content.IsOneByte()

279 ? (subject_content.IsOneByte()

280 ? SearchString(isolate, subject_content.ToOneByteVector(),

281 needle_content.ToOneByteVector(), index)

282 : SearchString(isolate, subject_content.ToUC16Vector(),

283 needle_content.ToOneByteVector(), index))

284 : (subject_content.IsOneByte()

285 ? SearchString(isolate, subject_content.ToOneByteVector(),

286 needle_content.ToUC16Vector(), index)

287 : SearchString(isolate, subject_content.ToUC16Vector(),

288 needle_content.ToUC16Vector(), index)));

289 if (index == -1) {

290 return i / 2; // Return number of matches.

291 } else {

292 output[i] = index;

293 output[i+1] = index + needle_len;

294 index += needle_len;

295 }

296 }

297 return output_size / 2;

298 }

299

300

301 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,

302 Handle<String> subject,

303 int index,

304 Handle<JSArray> last_match_info) {

305 Isolate* isolate = re->GetIsolate();

306

307 static const int kNumRegisters = 2;

308 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);

309 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();

310

311 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);

312

313 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();

314

315 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);

316 SealHandleScope shs(isolate);

317 FixedArray* array = FixedArray::cast(last_match_info->elements());

318 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);

319 return last_match_info;

320 }

321

322

323 // Irregexp implementation.

324

325 // Ensures that the regexp object contains a compiled version of the

326 // source for either one-byte or two-byte subject strings.

327 // If the compiled version doesn't already exist, it is compiled

328 // from the source pattern.

329 // If compilation fails, an exception is thrown and this function

330 // returns false.

331 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,

332 Handle<String> sample_subject,

333 bool is_one_byte) {

334 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));

335 #ifdef V8_INTERPRETED_REGEXP

336 if (compiled_code->IsByteArray()) return true;

337 #else // V8_INTERPRETED_REGEXP (RegExp native code)

338 if (compiled_code->IsCode()) return true;

339 #endif

340 // We could potentially have marked this as flushable, but have kept

341 // a saved version if we did not flush it yet.

342 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));

343 if (saved_code->IsCode()) {

344 // Reinstate the code in the original place.

345 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);

346 DCHECK(compiled_code->IsSmi());

347 return true;

348 }

349 return CompileIrregexp(re, sample_subject, is_one_byte);

350 }

351

352

353 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,

354 Handle<String> sample_subject,

355 bool is_one_byte) {

356 // Compile the RegExp.

357 Isolate* isolate = re->GetIsolate();

358 Zone zone;

359 PostponeInterruptsScope postpone(isolate);

360 // If we had a compilation error the last time this is saved at the

361 // saved code index.

362 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));

363 // When arriving here entry can only be a smi, either representing an

364 // uncompiled regexp, a previous compilation error, or code that has

365 // been flushed.

366 DCHECK(entry->IsSmi());

367 int entry_value = Smi::cast(entry)->value();

368 DCHECK(entry_value == JSRegExp::kUninitializedValue \|\|

369 entry_value == JSRegExp::kCompilationErrorValue \|\|

370 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));

371

372 if (entry_value == JSRegExp::kCompilationErrorValue) {

373 // A previous compilation failed and threw an error which we store in

374 // the saved code index (we store the error message, not the actual

375 // error). Recreate the error object and throw it.

376 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));

377 DCHECK(error_string->IsString());

378 Handle<String> error_message(String::cast(error_string));

379 ThrowRegExpException(re, error_message);

380 return false;

381 }

382

383 JSRegExp::Flags flags = re->GetFlags();

384

385 Handle<String> pattern(re->Pattern());

386 pattern = String::Flatten(pattern);

387 RegExpCompileData compile_data;

388 FlatStringReader reader(isolate, pattern);

389 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags.is_multiline(),

390 flags.is_unicode(), &compile_data)) {

391 // Throw an exception if we fail to parse the pattern.

392 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.

393 USE(ThrowRegExpException(re, pattern, compile_data.error));

394 return false;

395 }

396 RegExpEngine::CompilationResult result = RegExpEngine::Compile(

397 isolate, &zone, &compile_data, flags.is_ignore_case(), flags.is_global(),

398 flags.is_multiline(), flags.is_sticky(), pattern, sample_subject,

399 is_one_byte);

400 if (result.error_message != NULL) {

401 // Unable to compile regexp.

402 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(

403 CStrVector(result.error_message)).ToHandleChecked();

404 ThrowRegExpException(re, error_message);

405 return false;

406 }

407

408 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));

409 data->set(JSRegExp::code_index(is_one_byte), result.code);

410 int register_max = IrregexpMaxRegisterCount(*data);

411 if (result.num_registers > register_max) {

412 SetIrregexpMaxRegisterCount(*data, result.num_registers);

413 }

414

415 return true;

416 }

417

418

419 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {

420 return Smi::cast(

421 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();

422 }

423

424

425 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {

426 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));

427 }

428

429

430 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {

431 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();

432 }

433

434

435 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {

436 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();

437 }

438

439

440 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {

441 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));

442 }

443

444

445 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {

446 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));

447 }

448

449

450 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,

451 Handle<String> pattern,

452 JSRegExp::Flags flags,

453 int capture_count) {

454 // Initialize compiled code entries to null.

455 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,

456 JSRegExp::IRREGEXP,

457 pattern,

458 flags,

459 capture_count);

460 }

461

462

463 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,

464 Handle<String> subject) {

465 subject = String::Flatten(subject);

466

467 // Check representation of the underlying storage.

468 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();

469 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;

470

471 #ifdef V8_INTERPRETED_REGEXP

472 // Byte-code regexp needs space allocated for all its registers.

473 // The result captures are copied to the start of the registers array

474 // if the match succeeds. This way those registers are not clobbered

475 // when we set the last match info from last successful match.

476 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +

477 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;

478 #else // V8_INTERPRETED_REGEXP

479 // Native regexp only needs room to output captures. Registers are handled

480 // internally.

481 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;

482 #endif // V8_INTERPRETED_REGEXP

483 }

484

485

486 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,

487 Handle<String> subject,

488 int index,

489 int32_t* output,

490 int output_size) {

491 Isolate* isolate = regexp->GetIsolate();

492

493 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);

494

495 DCHECK(index >= 0);

496 DCHECK(index <= subject->length());

497 DCHECK(subject->IsFlat());

498

499 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();

500

501 #ifndef V8_INTERPRETED_REGEXP

502 DCHECK(output_size >= (IrregexpNumberOfCaptures(irregexp) + 1) 2);

503 do {

504 EnsureCompiledIrregexp(regexp, subject, is_one_byte);

505 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);

506 // The stack is used to allocate registers for the compiled regexp code.

507 // This means that in case of failure, the output registers array is left

508 // untouched and contains the capture results from the previous successful

509 // match. We can use that to set the last match info lazily.

510 NativeRegExpMacroAssembler::Result res =

511 NativeRegExpMacroAssembler::Match(code,

512 subject,

513 output,

514 output_size,

515 index,

516 isolate);

517 if (res != NativeRegExpMacroAssembler::RETRY) {

518 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION \|\|

519 isolate->has_pending_exception());

520 STATIC_ASSERT(

521 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);

522 STATIC_ASSERT(

523 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);

524 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)

525 == RE_EXCEPTION);

526 return static_cast<IrregexpResult>(res);

527 }

528 // If result is RETRY, the string has changed representation, and we

529 // must restart from scratch.

530 // In this case, it means we must make sure we are prepared to handle

531 // the, potentially, different subject (the string can switch between

532 // being internal and external, and even between being Latin1 and UC16,

533 // but the characters are always the same).

534 IrregexpPrepare(regexp, subject);

535 is_one_byte = subject->IsOneByteRepresentationUnderneath();

536 } while (true);

537 UNREACHABLE();

538 return RE_EXCEPTION;

539 #else // V8_INTERPRETED_REGEXP

540

541 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));

542 // We must have done EnsureCompiledIrregexp, so we can get the number of

543 // registers.

544 int number_of_capture_registers =

545 (IrregexpNumberOfCaptures(irregexp) + 1) 2;

546 int32_t* raw_output = &output[number_of_capture_registers];

547 // We do not touch the actual capture result registers until we know there

548 // has been a match so that we can use those capture results to set the

549 // last match info.

550 for (int i = number_of_capture_registers - 1; i >= 0; i--) {

551 raw_output[i] = -1;

552 }

553 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),

554 isolate);

555

556 IrregexpResult result = IrregexpInterpreter::Match(isolate,

557 byte_codes,

558 subject,

559 raw_output,

560 index);

561 if (result == RE_SUCCESS) {

562 // Copy capture results to the start of the registers array.

563 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));

564 }

565 if (result == RE_EXCEPTION) {

566 DCHECK(!isolate->has_pending_exception());

567 isolate->StackOverflow();

568 }

569 return result;

570 #endif // V8_INTERPRETED_REGEXP

571 }

572

573

574 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,

575 Handle<String> subject,

576 int previous_index,

577 Handle<JSArray> last_match_info) {

578 Isolate* isolate = regexp->GetIsolate();

579 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);

580

581 // Prepare space for the return values.

582 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)

583 if (FLAG_trace_regexp_bytecodes) {

584 String* pattern = regexp->Pattern();

585 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());

586 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());

587 }

588 #endif

589 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);

590 if (required_registers < 0) {

591 // Compiling failed with an exception.

592 DCHECK(isolate->has_pending_exception());

593 return MaybeHandle<Object>();

594 }

595

596 int32_t* output_registers = NULL;

597 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {

598 output_registers = NewArray<int32_t>(required_registers);

599 }

600 base::SmartArrayPointer<int32_t> auto_release(output_registers);

601 if (output_registers == NULL) {

602 output_registers = isolate->jsregexp_static_offsets_vector();

603 }

604

605 int res = RegExpImpl::IrregexpExecRaw(

606 regexp, subject, previous_index, output_registers, required_registers);

607 if (res == RE_SUCCESS) {

608 int capture_count =

609 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));

610 return SetLastMatchInfo(

611 last_match_info, subject, capture_count, output_registers);

612 }

613 if (res == RE_EXCEPTION) {

614 DCHECK(isolate->has_pending_exception());

615 return MaybeHandle<Object>();

616 }

617 DCHECK(res == RE_FAILURE);

618 return isolate->factory()->null_value();

619 }

620

621

622 static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {

623 if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {

624 JSArray::SetLength(array, minimum_size);

625 }

626 }

627

628

629 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,

630 Handle<String> subject,

631 int capture_count,

632 int32_t* match) {

633 DCHECK(last_match_info->HasFastObjectElements());

634 int capture_register_count = (capture_count + 1) * 2;

635 EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);

636 DisallowHeapAllocation no_allocation;

637 FixedArray* array = FixedArray::cast(last_match_info->elements());

638 if (match != NULL) {

639 for (int i = 0; i < capture_register_count; i += 2) {

640 SetCapture(array, i, match[i]);

641 SetCapture(array, i + 1, match[i + 1]);

642 }

643 }

644 SetLastCaptureCount(array, capture_register_count);

645 SetLastSubject(array, *subject);

646 SetLastInput(array, *subject);

647 return last_match_info;

648 }

649

650

651 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,

652 Handle<String> subject,

653 bool is_global,

654 Isolate* isolate)

655 : register_array_(NULL),

656 register_array_size_(0),

657 regexp_(regexp),

658 subject_(subject) {

659 #ifdef V8_INTERPRETED_REGEXP

660 bool interpreted = true;

661 #else

662 bool interpreted = false;

663 #endif // V8_INTERPRETED_REGEXP

664

665 if (regexp_->TypeTag() == JSRegExp::ATOM) {

666 static const int kAtomRegistersPerMatch = 2;

667 registers_per_match_ = kAtomRegistersPerMatch;

668 // There is no distinction between interpreted and native for atom regexps.

669 interpreted = false;

670 } else {

671 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);

672 if (registers_per_match_ < 0) {

673 num_matches_ = -1; // Signal exception.

674 return;

675 }

676 }

677

678 if (is_global && !interpreted) {

679 register_array_size_ =

680 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);

681 max_matches_ = register_array_size_ / registers_per_match_;

682 } else {

683 // Global loop in interpreted regexp is not implemented. We choose

684 // the size of the offsets vector so that it can only store one match.

685 register_array_size_ = registers_per_match_;

686 max_matches_ = 1;

687 }

688

689 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {

690 register_array_ = NewArray<int32_t>(register_array_size_);

691 } else {

692 register_array_ = isolate->jsregexp_static_offsets_vector();

693 }

694

695 // Set state so that fetching the results the first time triggers a call

696 // to the compiled regexp.

697 current_match_index_ = max_matches_ - 1;

698 num_matches_ = max_matches_;

699 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.

700 DCHECK_GE(register_array_size_, registers_per_match_);

701 int32_t* last_match =

702 &register_array_[current_match_index_ * registers_per_match_];

703 last_match[0] = -1;

704 last_match[1] = 0;

705 }

706

707

708 // -------------------------------------------------------------------

709 // Implementation of the Irregexp regular expression engine.

710 //

711 // The Irregexp regular expression engine is intended to be a complete

712 // implementation of ECMAScript regular expressions. It generates either

713 // bytecodes or native code.

714

715 // The Irregexp regexp engine is structured in three steps.

716 // 1) The parser generates an abstract syntax tree. See ast.cc.

717 // 2) From the AST a node network is created. The nodes are all

718 // subclasses of RegExpNode. The nodes represent states when

719 // executing a regular expression. Several optimizations are

720 // performed on the node network.

721 // 3) From the nodes we generate either byte codes or native code

722 // that can actually execute the regular expression (perform

723 // the search). The code generation step is described in more

724 // detail below.

725

726 // Code generation.

727 //

728 // The nodes are divided into four main categories.

729 // * Choice nodes

730 // These represent places where the regular expression can

731 // match in more than one way. For example on entry to an

732 // alternation (foo\|bar) or a repetition (*, +, ? or {}).

733 // * Action nodes

734 // These represent places where some action should be

735 // performed. Examples include recording the current position

736 // in the input string to a register (in order to implement

737 // captures) or other actions on register for example in order

738 // to implement the counters needed for {} repetitions.

739 // * Matching nodes

740 // These attempt to match some element part of the input string.

741 // Examples of elements include character classes, plain strings

742 // or back references.

743 // * End nodes

744 // These are used to implement the actions required on finding

745 // a successful match or failing to find a match.

746 //

747 // The code generated (whether as byte codes or native code) maintains

748 // some state as it runs. This consists of the following elements:

749 //

750 // * The capture registers. Used for string captures.

751 // * Other registers. Used for counters etc.

752 // * The current position.

753 // * The stack of backtracking information. Used when a matching node

754 // fails to find a match and needs to try an alternative.

755 //

756 // Conceptual regular expression execution model:

757 //

758 // There is a simple conceptual model of regular expression execution

759 // which will be presented first. The actual code generated is a more

760 // efficient simulation of the simple conceptual model:

761 //

762 // * Choice nodes are implemented as follows:

763 // For each choice except the last {

764 // push current position

765 // push backtrack code location

766 // <generate code to test for choice>

767 // backtrack code location:

768 // pop current position

769 // }

770 // <generate code to test for last choice>

771 //

772 // * Actions nodes are generated as follows

773 // <push affected registers on backtrack stack>

774 // <generate code to perform action>

775 // push backtrack code location

776 // <generate code to test for following nodes>

777 // backtrack code location:

778 // <pop affected registers to restore their state>

779 // <pop backtrack location from stack and go to it>

780 //

781 // * Matching nodes are generated as follows:

782 // if input string matches at current position

783 // update current position

784 // <generate code to test for following nodes>

785 // else

786 // <pop backtrack location from stack and go to it>

787 //

788 // Thus it can be seen that the current position is saved and restored

789 // by the choice nodes, whereas the registers are saved and restored by

790 // by the action nodes that manipulate them.

791 //

792 // The other interesting aspect of this model is that nodes are generated

793 // at the point where they are needed by a recursive call to Emit(). If

794 // the node has already been code generated then the Emit() call will

795 // generate a jump to the previously generated code instead. In order to

796 // limit recursion it is possible for the Emit() function to put the node

797 // on a work list for later generation and instead generate a jump. The

798 // destination of the jump is resolved later when the code is generated.

799 //

800 // Actual regular expression code generation.

801 //

802 // Code generation is actually more complicated than the above. In order

803 // to improve the efficiency of the generated code some optimizations are

804 // performed

805 //

806 // * Choice nodes have 1-character lookahead.

807 // A choice node looks at the following character and eliminates some of

808 // the choices immediately based on that character. This is not yet

809 // implemented.

810 // * Simple greedy loops store reduced backtracking information.

811 // A quantifier like /.*foo/m will greedily match the whole input. It will

812 // then need to backtrack to a point where it can match "foo". The naive

813 // implementation of this would push each character position onto the

814 // backtracking stack, then pop them off one by one. This would use space

815 // proportional to the length of the input string. However since the "."

816 // can only match in one way and always has a constant length (in this case

817 // of 1) it suffices to store the current position on the top of the stack

818 // once. Matching now becomes merely incrementing the current position and

819 // backtracking becomes decrementing the current position and checking the

820 // result against the stored current position. This is faster and saves

821 // space.

822 // * The current state is virtualized.

823 // This is used to defer expensive operations until it is clear that they

824 // are needed and to generate code for a node more than once, allowing

825 // specialized an efficient versions of the code to be created. This is

826 // explained in the section below.

827 //

828 // Execution state virtualization.

829 //

830 // Instead of emitting code, nodes that manipulate the state can record their

831 // manipulation in an object called the Trace. The Trace object can record a

832 // current position offset, an optional backtrack code location on the top of

833 // the virtualized backtrack stack and some register changes. When a node is

834 // to be emitted it can flush the Trace or update it. Flushing the Trace

835 // will emit code to bring the actual state into line with the virtual state.

836 // Avoiding flushing the state can postpone some work (e.g. updates of capture

837 // registers). Postponing work can save time when executing the regular

838 // expression since it may be found that the work never has to be done as a

839 // failure to match can occur. In addition it is much faster to jump to a

840 // known backtrack code location than it is to pop an unknown backtrack

841 // location from the stack and jump there.

842 //

843 // The virtual state found in the Trace affects code generation. For example

844 // the virtual state contains the difference between the actual current

845 // position and the virtual current position, and matching code needs to use

846 // this offset to attempt a match in the correct location of the input

847 // string. Therefore code generated for a non-trivial trace is specialized

848 // to that trace. The code generator therefore has the ability to generate

849 // code for each node several times. In order to limit the size of the

850 // generated code there is an arbitrary limit on how many specialized sets of

851 // code may be generated for a given node. If the limit is reached, the

852 // trace is flushed and a generic version of the code for a node is emitted.

853 // This is subsequently used for that node. The code emitted for non-generic

854 // trace is not recorded in the node and so it cannot currently be reused in

855 // the event that code generation is requested for an identical trace.

856

857

858 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {

859 UNREACHABLE();

860 }

861

862

863 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {

864 text->AddElement(TextElement::Atom(this), zone);

865 }

866

867

868 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {

869 text->AddElement(TextElement::CharClass(this), zone);

870 }

871

872

873 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {

874 for (int i = 0; i < elements()->length(); i++)

875 text->AddElement(elements()->at(i), zone);

876 }

877

878

879 TextElement TextElement::Atom(RegExpAtom* atom) {

880 return TextElement(ATOM, atom);

881 }

882

883

884 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {

885 return TextElement(CHAR_CLASS, char_class);

886 }

887

888

889 int TextElement::length() const {

890 switch (text_type()) {

891 case ATOM:

892 return atom()->length();

893

894 case CHAR_CLASS:

895 return 1;

896 }

897 UNREACHABLE();

898 return 0;

899 }

900

901

902 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {

903 if (table_ == NULL) {

904 table_ = new(zone()) DispatchTable(zone());

905 DispatchTableConstructor cons(table_, ignore_case, zone());

906 cons.BuildTable(this);

907 }

908 return table_;

909 }

910

911

912 class FrequencyCollator {

913 public:

914 FrequencyCollator() : total_samples_(0) {

915 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {

916 frequencies_[i] = CharacterFrequency(i);

917 }

918 }

919

920 void CountCharacter(int character) {

921 int index = (character & RegExpMacroAssembler::kTableMask);

922 frequencies_[index].Increment();

923 total_samples_++;

924 }

925

926 // Does not measure in percent, but rather per-128 (the table size from the

927 // regexp macro assembler).

928 int Frequency(int in_character) {

929 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);

930 if (total_samples_ < 1) return 1; // Division by zero.

931 int freq_in_per128 =

932 (frequencies_[in_character].counter() * 128) / total_samples_;

933 return freq_in_per128;

934 }

935

936 private:

937 class CharacterFrequency {

938 public:

939 CharacterFrequency() : counter_(0), character_(-1) { }

940 explicit CharacterFrequency(int character)

941 : counter_(0), character_(character) { }

942

943 void Increment() { counter_++; }

944 int counter() { return counter_; }

945 int character() { return character_; }

946

947 private:

948 int counter_;

949 int character_;

950 };

951

952

953 private:

954 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];

955 int total_samples_;

956 };

957

958

959 class RegExpCompiler {

960 public:

961 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,

962 bool ignore_case, bool is_one_byte);

963

964 int AllocateRegister() {

965 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {

966 reg_exp_too_big_ = true;

967 return next_register_;

968 }

969 return next_register_++;

970 }

971

972 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,

973 RegExpNode* start,

974 int capture_count,

975 Handle<String> pattern);

976

977 inline void AddWork(RegExpNode* node) {

978 if (!node->on_work_list() && !node->label()->is_bound()) {

979 node->set_on_work_list(true);

980 work_list_->Add(node);

981 }

982 }

983

984 static const int kImplementationOffset = 0;

985 static const int kNumberOfRegistersOffset = 0;

986 static const int kCodeOffset = 1;

987

988 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }

989 EndNode* accept() { return accept_; }

990

991 static const int kMaxRecursion = 100;

992 inline int recursion_depth() { return recursion_depth_; }

993 inline void IncrementRecursionDepth() { recursion_depth_++; }

994 inline void DecrementRecursionDepth() { recursion_depth_--; }

995

996 void SetRegExpTooBig() { reg_exp_too_big_ = true; }

997

998 inline bool ignore_case() { return ignore_case_; }

999 inline bool one_byte() { return one_byte_; }

1000 inline bool optimize() { return optimize_; }

1001 inline void set_optimize(bool value) { optimize_ = value; }

1002 inline bool limiting_recursion() { return limiting_recursion_; }

1003 inline void set_limiting_recursion(bool value) {

1004 limiting_recursion_ = value;

1005 }

1006 FrequencyCollator* frequency_collator() { return &frequency_collator_; }

1007

1008 int current_expansion_factor() { return current_expansion_factor_; }

1009 void set_current_expansion_factor(int value) {

1010 current_expansion_factor_ = value;

1011 }

1012

1013 Isolate* isolate() const { return isolate_; }

1014 Zone* zone() const { return zone_; }

1015

1016 static const int kNoRegister = -1;

1017

1018 private:

1019 EndNode* accept_;

1020 int next_register_;

1021 List<RegExpNode> work_list_;

1022 int recursion_depth_;

1023 RegExpMacroAssembler* macro_assembler_;

1024 bool ignore_case_;

1025 bool one_byte_;

1026 bool reg_exp_too_big_;

1027 bool limiting_recursion_;

1028 bool optimize_;

1029 int current_expansion_factor_;

1030 FrequencyCollator frequency_collator_;

1031 Isolate* isolate_;

1032 Zone* zone_;

1033 };

1034

1035

1036 class RecursionCheck {

1037 public:

1038 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {

1039 compiler->IncrementRecursionDepth();

1040 }

1041 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }

1042 private:

1043 RegExpCompiler* compiler_;

1044 };

1045

1046

1047 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {

1048 return RegExpEngine::CompilationResult(isolate, "RegExp too big");

1049 }

1050

1051

1052 // Attempts to compile the regexp using an Irregexp code generator. Returns

1053 // a fixed array or a null handle depending on whether it succeeded.

1054 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,

1055 bool ignore_case, bool one_byte)

1056 : next_register_(2 * (capture_count + 1)),

1057 work_list_(NULL),

1058 recursion_depth_(0),

1059 ignore_case_(ignore_case),

1060 one_byte_(one_byte),

1061 reg_exp_too_big_(false),

1062 limiting_recursion_(false),

1063 optimize_(FLAG_regexp_optimization),

1064 current_expansion_factor_(1),

1065 frequency_collator_(),

1066 isolate_(isolate),

1067 zone_(zone) {

1068 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);

1069 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);

1070 }

1071

1072

1073 RegExpEngine::CompilationResult RegExpCompiler::Assemble(

1074 RegExpMacroAssembler* macro_assembler,

1075 RegExpNode* start,

1076 int capture_count,

1077 Handle<String> pattern) {

1078 Heap* heap = pattern->GetHeap();

1079

1080 #ifdef DEBUG

1081 if (FLAG_trace_regexp_assembler)

1082 macro_assembler_ =

1083 new RegExpMacroAssemblerTracer(isolate(), macro_assembler);

1084 else

1085 #endif

1086 macro_assembler_ = macro_assembler;

1087

1088 List <RegExpNode*> work_list(0);

1089 work_list_ = &work_list;

1090 Label fail;

1091 macro_assembler_->PushBacktrack(&fail);

1092 Trace new_trace;

1093 start->Emit(this, &new_trace);

1094 macro_assembler_->Bind(&fail);

1095 macro_assembler_->Fail();

1096 while (!work_list.is_empty()) {

1097 RegExpNode* node = work_list.RemoveLast();

1098 node->set_on_work_list(false);

1099 if (!node->label()->is_bound()) node->Emit(this, &new_trace);

1100 }

1101 if (reg_exp_too_big_) {

1102 macro_assembler_->AbortedCodeGeneration();

1103 return IrregexpRegExpTooBig(isolate_);

1104 }

1105

1106 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);

1107 heap->IncreaseTotalRegexpCodeGenerated(code->Size());

1108 work_list_ = NULL;

1109 #ifdef ENABLE_DISASSEMBLER

1110 if (FLAG_print_code) {

1111 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());

1112 OFStream os(trace_scope.file());

1113 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);

1114 }

1115 #endif

1116 #ifdef DEBUG

1117 if (FLAG_trace_regexp_assembler) {

1118 delete macro_assembler_;

1119 }

1120 #endif

1121 return RegExpEngine::CompilationResult(*code, next_register_);

1122 }

1123

1124

1125 bool Trace::DeferredAction::Mentions(int that) {

1126 if (action_type() == ActionNode::CLEAR_CAPTURES) {

1127 Interval range = static_cast<DeferredClearCaptures*>(this)->range();

1128 return range.Contains(that);

1129 } else {

1130 return reg() == that;

1131 }

1132 }

1133

1134

1135 bool Trace::mentions_reg(int reg) {

1136 for (DeferredAction* action = actions_;

1137 action != NULL;

1138 action = action->next()) {

1139 if (action->Mentions(reg))

1140 return true;

1141 }

1142 return false;

1143 }

1144

1145

1146 bool Trace::GetStoredPosition(int reg, int* cp_offset) {

1147 DCHECK_EQ(0, *cp_offset);

1148 for (DeferredAction* action = actions_;

1149 action != NULL;

1150 action = action->next()) {

1151 if (action->Mentions(reg)) {

1152 if (action->action_type() == ActionNode::STORE_POSITION) {

1153 cp_offset = static_cast<DeferredCapture>(action)->cp_offset();

1154 return true;

1155 } else {

1156 return false;

1157 }

1158 }

1159 }

1160 return false;

1161 }

1162

1163

1164 int Trace::FindAffectedRegisters(OutSet* affected_registers,

1165 Zone* zone) {

1166 int max_register = RegExpCompiler::kNoRegister;

1167 for (DeferredAction* action = actions_;

1168 action != NULL;

1169 action = action->next()) {

1170 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {

1171 Interval range = static_cast<DeferredClearCaptures*>(action)->range();

1172 for (int i = range.from(); i <= range.to(); i++)

1173 affected_registers->Set(i, zone);

1174 if (range.to() > max_register) max_register = range.to();

1175 } else {

1176 affected_registers->Set(action->reg(), zone);

1177 if (action->reg() > max_register) max_register = action->reg();

1178 }

1179 }

1180 return max_register;

1181 }

1182

1183

1184 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,

1185 int max_register,

1186 const OutSet& registers_to_pop,

1187 const OutSet& registers_to_clear) {

1188 for (int reg = max_register; reg >= 0; reg--) {

1189 if (registers_to_pop.Get(reg)) {

1190 assembler->PopRegister(reg);

1191 } else if (registers_to_clear.Get(reg)) {

1192 int clear_to = reg;

1193 while (reg > 0 && registers_to_clear.Get(reg - 1)) {

1194 reg--;

1195 }

1196 assembler->ClearRegisters(reg, clear_to);

1197 }

1198 }

1199 }

1200

1201

1202 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,

1203 int max_register,

1204 const OutSet& affected_registers,

1205 OutSet* registers_to_pop,

1206 OutSet* registers_to_clear,

1207 Zone* zone) {

1208 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.

1209 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;

1210

1211 // Count pushes performed to force a stack limit check occasionally.

1212 int pushes = 0;

1213

1214 for (int reg = 0; reg <= max_register; reg++) {

1215 if (!affected_registers.Get(reg)) {

1216 continue;

1217 }

1218

1219 // The chronologically first deferred action in the trace

1220 // is used to infer the action needed to restore a register

1221 // to its previous state (or not, if it's safe to ignore it).

1222 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };

1223 DeferredActionUndoType undo_action = IGNORE;

1224

1225 int value = 0;

1226 bool absolute = false;

1227 bool clear = false;

1228 int store_position = -1;

1229 // This is a little tricky because we are scanning the actions in reverse

1230 // historical order (newest first).

1231 for (DeferredAction* action = actions_;

1232 action != NULL;

1233 action = action->next()) {

1234 if (action->Mentions(reg)) {

1235 switch (action->action_type()) {

1236 case ActionNode::SET_REGISTER: {

1237 Trace::DeferredSetRegister* psr =

1238 static_cast<Trace::DeferredSetRegister*>(action);

1239 if (!absolute) {

1240 value += psr->value();

1241 absolute = true;

1242 }

1243 // SET_REGISTER is currently only used for newly introduced loop

1244 // counters. They can have a significant previous value if they

1245 // occour in a loop. TODO(lrn): Propagate this information, so

1246 // we can set undo_action to IGNORE if we know there is no value to

1247 // restore.

1248 undo_action = RESTORE;

1249 DCHECK_EQ(store_position, -1);

1250 DCHECK(!clear);

1251 break;

1252 }

1253 case ActionNode::INCREMENT_REGISTER:

1254 if (!absolute) {

1255 value++;

1256 }

1257 DCHECK_EQ(store_position, -1);

1258 DCHECK(!clear);

1259 undo_action = RESTORE;

1260 break;

1261 case ActionNode::STORE_POSITION: {

1262 Trace::DeferredCapture* pc =

1263 static_cast<Trace::DeferredCapture*>(action);

1264 if (!clear && store_position == -1) {

1265 store_position = pc->cp_offset();

1266 }

1267

1268 // For captures we know that stores and clears alternate.

1269 // Other register, are never cleared, and if the occur

1270 // inside a loop, they might be assigned more than once.

1271 if (reg <= 1) {

1272 // Registers zero and one, aka "capture zero", is

1273 // always set correctly if we succeed. There is no

1274 // need to undo a setting on backtrack, because we

1275 // will set it again or fail.

1276 undo_action = IGNORE;

1277 } else {

1278 undo_action = pc->is_capture() ? CLEAR : RESTORE;

1279 }

1280 DCHECK(!absolute);

1281 DCHECK_EQ(value, 0);

1282 break;

1283 }

1284 case ActionNode::CLEAR_CAPTURES: {

1285 // Since we're scanning in reverse order, if we've already

1286 // set the position we have to ignore historically earlier

1287 // clearing operations.

1288 if (store_position == -1) {

1289 clear = true;

1290 }

1291 undo_action = RESTORE;

1292 DCHECK(!absolute);

1293 DCHECK_EQ(value, 0);

1294 break;

1295 }

1296 default:

1297 UNREACHABLE();

1298 break;

1299 }

1300 }

1301 }

1302 // Prepare for the undo-action (e.g., push if it's going to be popped).

1303 if (undo_action == RESTORE) {

1304 pushes++;

1305 RegExpMacroAssembler::StackCheckFlag stack_check =

1306 RegExpMacroAssembler::kNoStackLimitCheck;

1307 if (pushes == push_limit) {

1308 stack_check = RegExpMacroAssembler::kCheckStackLimit;

1309 pushes = 0;

1310 }

1311

1312 assembler->PushRegister(reg, stack_check);

1313 registers_to_pop->Set(reg, zone);

1314 } else if (undo_action == CLEAR) {

1315 registers_to_clear->Set(reg, zone);

1316 }

1317 // Perform the chronologically last action (or accumulated increment)

1318 // for the register.

1319 if (store_position != -1) {

1320 assembler->WriteCurrentPositionToRegister(reg, store_position);

1321 } else if (clear) {

1322 assembler->ClearRegisters(reg, reg);

1323 } else if (absolute) {

1324 assembler->SetRegister(reg, value);

1325 } else if (value != 0) {

1326 assembler->AdvanceRegister(reg, value);

1327 }

1328 }

1329 }

1330

1331

1332 // This is called as we come into a loop choice node and some other tricky

1333 // nodes. It normalizes the state of the code generator to ensure we can

1334 // generate generic code.

1335 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {

1336 RegExpMacroAssembler* assembler = compiler->macro_assembler();

1337

1338 DCHECK(!is_trivial());

1339

1340 if (actions_ == NULL && backtrack() == NULL) {

1341 // Here we just have some deferred cp advances to fix and we are back to

1342 // a normal situation. We may also have to forget some information gained

1343 // through a quick check that was already performed.

1344 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);

1345 // Create a new trivial state and generate the node with that.

1346 Trace new_state;

1347 successor->Emit(compiler, &new_state);

1348 return;

1349 }

1350

1351 // Generate deferred actions here along with code to undo them again.

1352 OutSet affected_registers;

1353

1354 if (backtrack() != NULL) {

1355 // Here we have a concrete backtrack location. These are set up by choice

1356 // nodes and so they indicate that we have a deferred save of the current

1357 // position which we may need to emit here.

1358 assembler->PushCurrentPosition();

1359 }

1360

1361 int max_register = FindAffectedRegisters(&affected_registers,

1362 compiler->zone());

1363 OutSet registers_to_pop;

1364 OutSet registers_to_clear;

1365 PerformDeferredActions(assembler,

1366 max_register,

1367 affected_registers,

1368 &registers_to_pop,

1369 &registers_to_clear,

1370 compiler->zone());

1371 if (cp_offset_ != 0) {

1372 assembler->AdvanceCurrentPosition(cp_offset_);

1373 }

1374

1375 // Create a new trivial state and generate the node with that.

1376 Label undo;

1377 assembler->PushBacktrack(&undo);

1378 if (successor->KeepRecursing(compiler)) {

1379 Trace new_state;

1380 successor->Emit(compiler, &new_state);

1381 } else {

1382 compiler->AddWork(successor);

1383 assembler->GoTo(successor->label());

1384 }

1385

1386 // On backtrack we need to restore state.

1387 assembler->Bind(&undo);

1388 RestoreAffectedRegisters(assembler,

1389 max_register,

1390 registers_to_pop,

1391 registers_to_clear);

1392 if (backtrack() == NULL) {

1393 assembler->Backtrack();

1394 } else {

1395 assembler->PopCurrentPosition();

1396 assembler->GoTo(backtrack());

1397 }

1398 }

1399

1400

1401 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {

1402 RegExpMacroAssembler* assembler = compiler->macro_assembler();

1403

1404 // Omit flushing the trace. We discard the entire stack frame anyway.

1405

1406 if (!label()->is_bound()) {

1407 // We are completely independent of the trace, since we ignore it,

1408 // so this code can be used as the generic version.

1409 assembler->Bind(label());

1410 }

1411

1412 // Throw away everything on the backtrack stack since the start

1413 // of the negative submatch and restore the character position.

1414 assembler->ReadCurrentPositionFromRegister(current_position_register_);

1415 assembler->ReadStackPointerFromRegister(stack_pointer_register_);

1416 if (clear_capture_count_ > 0) {

1417 // Clear any captures that might have been performed during the success

1418 // of the body of the negative look-ahead.

1419 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;

1420 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);

1421 }

1422 // Now that we have unwound the stack we find at the top of the stack the

1423 // backtrack that the BeginSubmatch node got.

1424 assembler->Backtrack();

1425 }

1426

1427

1428 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {

1429 if (!trace->is_trivial()) {

1430 trace->Flush(compiler, this);

1431 return;

1432 }

1433 RegExpMacroAssembler* assembler = compiler->macro_assembler();

1434 if (!label()->is_bound()) {

1435 assembler->Bind(label());

1436 }

1437 switch (action_) {

1438 case ACCEPT:

1439 assembler->Succeed();

1440 return;

1441 case BACKTRACK:

1442 assembler->GoTo(trace->backtrack());

1443 return;

1444 case NEGATIVE_SUBMATCH_SUCCESS:

1445 // This case is handled in a different virtual method.

1446 UNREACHABLE();

1447 }

1448 UNIMPLEMENTED();

1449 }

1450

1451

1452 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {

1453 if (guards_ == NULL)

1454 guards_ = new(zone) ZoneList<Guard*>(1, zone);

1455 guards_->Add(guard, zone);

1456 }

1457

1458

1459 ActionNode* ActionNode::SetRegister(int reg,

1460 int val,

1461 RegExpNode* on_success) {

1462 ActionNode* result =

1463 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);

1464 result->data_.u_store_register.reg = reg;

1465 result->data_.u_store_register.value = val;

1466 return result;

1467 }

1468

1469

1470 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {

1471 ActionNode* result =

1472 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);

1473 result->data_.u_increment_register.reg = reg;

1474 return result;

1475 }

1476

1477

1478 ActionNode* ActionNode::StorePosition(int reg,

1479 bool is_capture,

1480 RegExpNode* on_success) {

1481 ActionNode* result =

1482 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);

1483 result->data_.u_position_register.reg = reg;

1484 result->data_.u_position_register.is_capture = is_capture;

1485 return result;

1486 }

1487

1488

1489 ActionNode* ActionNode::ClearCaptures(Interval range,

1490 RegExpNode* on_success) {

1491 ActionNode* result =

1492 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);

1493 result->data_.u_clear_captures.range_from = range.from();

1494 result->data_.u_clear_captures.range_to = range.to();

1495 return result;

1496 }

1497

1498

1499 ActionNode* ActionNode::BeginSubmatch(int stack_reg,

1500 int position_reg,

1501 RegExpNode* on_success) {

1502 ActionNode* result =

1503 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);

1504 result->data_.u_submatch.stack_pointer_register = stack_reg;

1505 result->data_.u_submatch.current_position_register = position_reg;

1506 return result;

1507 }

1508

1509

1510 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,

1511 int position_reg,

1512 int clear_register_count,

1513 int clear_register_from,

1514 RegExpNode* on_success) {

1515 ActionNode* result =

1516 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);

1517 result->data_.u_submatch.stack_pointer_register = stack_reg;

1518 result->data_.u_submatch.current_position_register = position_reg;

1519 result->data_.u_submatch.clear_register_count = clear_register_count;

1520 result->data_.u_submatch.clear_register_from = clear_register_from;

1521 return result;

1522 }

1523

1524

1525 ActionNode* ActionNode::EmptyMatchCheck(int start_register,

1526 int repetition_register,

1527 int repetition_limit,

1528 RegExpNode* on_success) {

1529 ActionNode* result =

1530 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);

1531 result->data_.u_empty_match_check.start_register = start_register;

1532 result->data_.u_empty_match_check.repetition_register = repetition_register;

1533 result->data_.u_empty_match_check.repetition_limit = repetition_limit;

1534 return result;

1535 }

1536

1537

1538 #define DEFINE_ACCEPT(Type) \

1539 void Type##Node::Accept(NodeVisitor* visitor) { \

1540 visitor->Visit##Type(this); \

1541 }

1542 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)

1543 #undef DEFINE_ACCEPT

1544

1545

1546 void LoopChoiceNode::Accept(NodeVisitor* visitor) {

1547 visitor->VisitLoopChoice(this);

1548 }

1549

1550

1551 // -------------------------------------------------------------------

1552 // Emit code.

1553

1554

1555 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,

1556 Guard* guard,

1557 Trace* trace) {

1558 switch (guard->op()) {

1559 case Guard::LT:

1560 DCHECK(!trace->mentions_reg(guard->reg()));

1561 macro_assembler->IfRegisterGE(guard->reg(),

1562 guard->value(),

1563 trace->backtrack());

1564 break;

1565 case Guard::GEQ:

1566 DCHECK(!trace->mentions_reg(guard->reg()));

1567 macro_assembler->IfRegisterLT(guard->reg(),

1568 guard->value(),

1569 trace->backtrack());

1570 break;

1571 }

1572 }

1573

1574

1575 // Returns the number of characters in the equivalence class, omitting those

1576 // that cannot occur in the source string because it is Latin1.

1577 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,

1578 bool one_byte_subject,

1579 unibrow::uchar* letters) {

1580 int length =

1581 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);

1582 // Unibrow returns 0 or 1 for characters where case independence is

1583 // trivial.

1584 if (length == 0) {

1585 letters[0] = character;

1586 length = 1;

1587 }

1588

1589 if (one_byte_subject) {

1590 int new_length = 0;

1591 for (int i = 0; i < length; i++) {

1592 if (letters[i] <= String::kMaxOneByteCharCode) {

1593 letters[new_length++] = letters[i];

1594 }

1595 }

1596 length = new_length;

1597 }

1598

1599 return length;

1600 }

1601

1602

1603 static inline bool EmitSimpleCharacter(Isolate* isolate,

1604 RegExpCompiler* compiler,

1605 uc16 c,

1606 Label* on_failure,

1607 int cp_offset,

1608 bool check,

1609 bool preloaded) {

1610 RegExpMacroAssembler* assembler = compiler->macro_assembler();

1611 bool bound_checked = false;

1612 if (!preloaded) {

1613 assembler->LoadCurrentCharacter(

1614 cp_offset,

1615 on_failure,

1616 check);

1617 bound_checked = true;

1618 }

1619 assembler->CheckNotCharacter(c, on_failure);

1620 return bound_checked;

1621 }

1622

1623

1624 // Only emits non-letters (things that don't have case). Only used for case

1625 // independent matches.

1626 static inline bool EmitAtomNonLetter(Isolate* isolate,

1627 RegExpCompiler* compiler,

1628 uc16 c,

1629 Label* on_failure,

1630 int cp_offset,

1631 bool check,

1632 bool preloaded) {

1633 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

1634 bool one_byte = compiler->one_byte();

1635 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1636 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);

1637 if (length < 1) {

1638 // This can't match. Must be an one-byte subject and a non-one-byte

1639 // character. We do not need to do anything since the one-byte pass

1640 // already handled this.

1641 return false; // Bounds not checked.

1642 }

1643 bool checked = false;

1644 // We handle the length > 1 case in a later pass.

1645 if (length == 1) {

1646 if (one_byte && c > String::kMaxOneByteCharCodeU) {

1647 // Can't match - see above.

1648 return false; // Bounds not checked.

1649 }

1650 if (!preloaded) {

1651 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1652 checked = check;

1653 }

1654 macro_assembler->CheckNotCharacter(c, on_failure);

1655 }

1656 return checked;

1657 }

1658

1659

1660 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,

1661 bool one_byte, uc16 c1, uc16 c2,

1662 Label* on_failure) {

1663 uc16 char_mask;

1664 if (one_byte) {

1665 char_mask = String::kMaxOneByteCharCode;

1666 } else {

1667 char_mask = String::kMaxUtf16CodeUnit;

1668 }

1669 uc16 exor = c1 ^ c2;

1670 // Check whether exor has only one bit set.

1671 if (((exor - 1) & exor) == 0) {

1672 // If c1 and c2 differ only by one bit.

1673 // Ecma262UnCanonicalize always gives the highest number last.

1674 DCHECK(c2 > c1);

1675 uc16 mask = char_mask ^ exor;

1676 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);

1677 return true;

1678 }

1679 DCHECK(c2 > c1);

1680 uc16 diff = c2 - c1;

1681 if (((diff - 1) & diff) == 0 && c1 >= diff) {

1682 // If the characters differ by 2^n but don't differ by one bit then

1683 // subtract the difference from the found character, then do the or

1684 // trick. We avoid the theoretical case where negative numbers are

1685 // involved in order to simplify code generation.

1686 uc16 mask = char_mask ^ diff;

1687 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,

1688 diff,

1689 mask,

1690 on_failure);

1691 return true;

1692 }

1693 return false;

1694 }

1695

1696

1697 typedef bool EmitCharacterFunction(Isolate* isolate,

1698 RegExpCompiler* compiler,

1699 uc16 c,

1700 Label* on_failure,

1701 int cp_offset,

1702 bool check,

1703 bool preloaded);

1704

1705 // Only emits letters (things that have case). Only used for case independent

1706 // matches.

1707 static inline bool EmitAtomLetter(Isolate* isolate,

1708 RegExpCompiler* compiler,

1709 uc16 c,

1710 Label* on_failure,

1711 int cp_offset,

1712 bool check,

1713 bool preloaded) {

1714 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

1715 bool one_byte = compiler->one_byte();

1716 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

1717 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);

1718 if (length <= 1) return false;

1719 // We may not need to check against the end of the input string

1720 // if this character lies before a character that matched.

1721 if (!preloaded) {

1722 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);

1723 }

1724 Label ok;

1725 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);

1726 switch (length) {

1727 case 2: {

1728 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],

1729 chars[1], on_failure)) {

1730 } else {

1731 macro_assembler->CheckCharacter(chars[0], &ok);

1732 macro_assembler->CheckNotCharacter(chars[1], on_failure);

1733 macro_assembler->Bind(&ok);

1734 }

1735 break;

1736 }

1737 case 4:

1738 macro_assembler->CheckCharacter(chars[3], &ok);

1739 // Fall through!

1740 case 3:

1741 macro_assembler->CheckCharacter(chars[0], &ok);

1742 macro_assembler->CheckCharacter(chars[1], &ok);

1743 macro_assembler->CheckNotCharacter(chars[2], on_failure);

1744 macro_assembler->Bind(&ok);

1745 break;

1746 default:

1747 UNREACHABLE();

1748 break;

1749 }

1750 return true;

1751 }

1752

1753

1754 static void EmitBoundaryTest(RegExpMacroAssembler* masm,

1755 int border,

1756 Label* fall_through,

1757 Label* above_or_equal,

1758 Label* below) {

1759 if (below != fall_through) {

1760 masm->CheckCharacterLT(border, below);

1761 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);

1762 } else {

1763 masm->CheckCharacterGT(border - 1, above_or_equal);

1764 }

1765 }

1766

1767

1768 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,

1769 int first,

1770 int last,

1771 Label* fall_through,

1772 Label* in_range,

1773 Label* out_of_range) {

1774 if (in_range == fall_through) {

1775 if (first == last) {

1776 masm->CheckNotCharacter(first, out_of_range);

1777 } else {

1778 masm->CheckCharacterNotInRange(first, last, out_of_range);

1779 }

1780 } else {

1781 if (first == last) {

1782 masm->CheckCharacter(first, in_range);

1783 } else {

1784 masm->CheckCharacterInRange(first, last, in_range);

1785 }

1786 if (out_of_range != fall_through) masm->GoTo(out_of_range);

1787 }

1788 }

1789

1790

1791 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.

1792 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.

1793 static void EmitUseLookupTable(

1794 RegExpMacroAssembler* masm,

1795 ZoneList<int>* ranges,

1796 int start_index,

1797 int end_index,

1798 int min_char,

1799 Label* fall_through,

1800 Label* even_label,

1801 Label* odd_label) {

1802 static const int kSize = RegExpMacroAssembler::kTableSize;

1803 static const int kMask = RegExpMacroAssembler::kTableMask;

1804

1805 int base = (min_char & ~kMask);

1806 USE(base);

1807

1808 // Assert that everything is on one kTableSize page.

1809 for (int i = start_index; i <= end_index; i++) {

1810 DCHECK_EQ(ranges->at(i) & ~kMask, base);

1811 }

1812 DCHECK(start_index == 0 \|\| (ranges->at(start_index - 1) & ~kMask) <= base);

1813

1814 char templ[kSize];

1815 Label* on_bit_set;

1816 Label* on_bit_clear;

1817 int bit;

1818 if (even_label == fall_through) {

1819 on_bit_set = odd_label;

1820 on_bit_clear = even_label;

1821 bit = 1;

1822 } else {

1823 on_bit_set = even_label;

1824 on_bit_clear = odd_label;

1825 bit = 0;

1826 }

1827 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {

1828 templ[i] = bit;

1829 }

1830 int j = 0;

1831 bit ^= 1;

1832 for (int i = start_index; i < end_index; i++) {

1833 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {

1834 templ[j] = bit;

1835 }

1836 bit ^= 1;

1837 }

1838 for (int i = j; i < kSize; i++) {

1839 templ[i] = bit;

1840 }

1841 Factory* factory = masm->isolate()->factory();

1842 // TODO(erikcorry): Cache these.

1843 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);

1844 for (int i = 0; i < kSize; i++) {

1845 ba->set(i, templ[i]);

1846 }

1847 masm->CheckBitInTable(ba, on_bit_set);

1848 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);

1849 }

1850

1851

1852 static void CutOutRange(RegExpMacroAssembler* masm,

1853 ZoneList<int>* ranges,

1854 int start_index,

1855 int end_index,

1856 int cut_index,

1857 Label* even_label,

1858 Label* odd_label) {

1859 bool odd = (((cut_index - start_index) & 1) == 1);

1860 Label* in_range_label = odd ? odd_label : even_label;

1861 Label dummy;

1862 EmitDoubleBoundaryTest(masm,

1863 ranges->at(cut_index),

1864 ranges->at(cut_index + 1) - 1,

1865 &dummy,

1866 in_range_label,

1867 &dummy);

1868 DCHECK(!dummy.is_linked());

1869 // Cut out the single range by rewriting the array. This creates a new

1870 // range that is a merger of the two ranges on either side of the one we

1871 // are cutting out. The oddity of the labels is preserved.

1872 for (int j = cut_index; j > start_index; j--) {

1873 ranges->at(j) = ranges->at(j - 1);

1874 }

1875 for (int j = cut_index + 1; j < end_index; j++) {

1876 ranges->at(j) = ranges->at(j + 1);

1877 }

1878 }

1879

1880

1881 // Unicode case. Split the search space into kSize spaces that are handled

1882 // with recursion.

1883 static void SplitSearchSpace(ZoneList<int>* ranges,

1884 int start_index,

1885 int end_index,

1886 int* new_start_index,

1887 int* new_end_index,

1888 int* border) {

1889 static const int kSize = RegExpMacroAssembler::kTableSize;

1890 static const int kMask = RegExpMacroAssembler::kTableMask;

1891

1892 int first = ranges->at(start_index);

1893 int last = ranges->at(end_index) - 1;

1894

1895 *new_start_index = start_index;

1896 *border = (ranges->at(start_index) & ~kMask) + kSize;

1897 while (*new_start_index < end_index) {

1898 if (ranges->at(new_start_index) > border) break;

1899 (*new_start_index)++;

1900 }

1901 // new_start_index is the index of the first edge that is beyond the

1902 // current kSize space.

1903

1904 // For very large search spaces we do a binary chop search of the non-Latin1

1905 // space instead of just going to the end of the current kSize space. The

1906 // heuristics are complicated a little by the fact that any 128-character

1907 // encoding space can be quickly tested with a table lookup, so we don't

1908 // wish to do binary chop search at a smaller granularity than that. A

1909 // 128-character space can take up a lot of space in the ranges array if,

1910 // for example, we only want to match every second character (eg. the lower

1911 // case characters on some Unicode pages).

1912 int binary_chop_index = (end_index + start_index) / 2;

1913 // The first test ensures that we get to the code that handles the Latin1

1914 // range with a single not-taken branch, speeding up this important

1915 // character range (even non-Latin1 charset-based text has spaces and

1916 // punctuation).

1917 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.

1918 end_index - start_index > (new_start_index - start_index) 2 &&

1919 last - first > kSize * 2 && binary_chop_index > *new_start_index &&

1920 ranges->at(binary_chop_index) >= first + 2 * kSize) {

1921 int scan_forward_for_section_border = binary_chop_index;;

1922 int new_border = (ranges->at(binary_chop_index) \| kMask) + 1;

1923

1924 while (scan_forward_for_section_border < end_index) {

1925 if (ranges->at(scan_forward_for_section_border) > new_border) {

1926 *new_start_index = scan_forward_for_section_border;

1927 *border = new_border;

1928 break;

1929 }

1930 scan_forward_for_section_border++;

1931 }

1932 }

1933

1934 DCHECK(*new_start_index > start_index);

1935 new_end_index = new_start_index - 1;

1936 if (ranges->at(new_end_index) == border) {

1937 (*new_end_index)--;

1938 }

1939 if (*border >= ranges->at(end_index)) {

1940 *border = ranges->at(end_index);

1941 *new_start_index = end_index; // Won't be used.

1942 *new_end_index = end_index - 1;

1943 }

1944 }

1945

1946

1947 // Gets a series of segment boundaries representing a character class. If the

1948 // character is in the range between an even and an odd boundary (counting from

1949 // start_index) then go to even_label, otherwise go to odd_label. We already

1950 // know that the character is in the range of min_char to max_char inclusive.

1951 // Either label can be NULL indicating backtracking. Either label can also be

1952 // equal to the fall_through label.

1953 static void GenerateBranches(RegExpMacroAssembler* masm,

1954 ZoneList<int>* ranges,

1955 int start_index,

1956 int end_index,

1957 uc16 min_char,

1958 uc16 max_char,

1959 Label* fall_through,

1960 Label* even_label,

1961 Label* odd_label) {

1962 int first = ranges->at(start_index);

1963 int last = ranges->at(end_index) - 1;

1964

1965 DCHECK_LT(min_char, first);

1966

1967 // Just need to test if the character is before or on-or-after

1968 // a particular character.

1969 if (start_index == end_index) {

1970 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);

1971 return;

1972 }

1973

1974 // Another almost trivial case: There is one interval in the middle that is

1975 // different from the end intervals.

1976 if (start_index + 1 == end_index) {

1977 EmitDoubleBoundaryTest(

1978 masm, first, last, fall_through, even_label, odd_label);

1979 return;

1980 }

1981

1982 // It's not worth using table lookup if there are very few intervals in the

1983 // character class.

1984 if (end_index - start_index <= 6) {

1985 // It is faster to test for individual characters, so we look for those

1986 // first, then try arbitrary ranges in the second round.

1987 static int kNoCutIndex = -1;

1988 int cut = kNoCutIndex;

1989 for (int i = start_index; i < end_index; i++) {

1990 if (ranges->at(i) == ranges->at(i + 1) - 1) {

1991 cut = i;

1992 break;

1993 }

1994 }

1995 if (cut == kNoCutIndex) cut = start_index;

1996 CutOutRange(

1997 masm, ranges, start_index, end_index, cut, even_label, odd_label);

1998 DCHECK_GE(end_index - start_index, 2);

1999 GenerateBranches(masm,

2000 ranges,

2001 start_index + 1,

2002 end_index - 1,

2003 min_char,

2004 max_char,

2005 fall_through,

2006 even_label,

2007 odd_label);

2008 return;

2009 }

2010

2011 // If there are a lot of intervals in the regexp, then we will use tables to

2012 // determine whether the character is inside or outside the character class.

2013 static const int kBits = RegExpMacroAssembler::kTableSizeBits;

2014

2015 if ((max_char >> kBits) == (min_char >> kBits)) {

2016 EmitUseLookupTable(masm,

2017 ranges,

2018 start_index,

2019 end_index,

2020 min_char,

2021 fall_through,

2022 even_label,

2023 odd_label);

2024 return;

2025 }

2026

2027 if ((min_char >> kBits) != (first >> kBits)) {

2028 masm->CheckCharacterLT(first, odd_label);

2029 GenerateBranches(masm,

2030 ranges,

2031 start_index + 1,

2032 end_index,

2033 first,

2034 max_char,

2035 fall_through,

2036 odd_label,

2037 even_label);

2038 return;

2039 }

2040

2041 int new_start_index = 0;

2042 int new_end_index = 0;

2043 int border = 0;

2044

2045 SplitSearchSpace(ranges,

2046 start_index,

2047 end_index,

2048 &new_start_index,

2049 &new_end_index,

2050 &border);

2051

2052 Label handle_rest;

2053 Label* above = &handle_rest;

2054 if (border == last + 1) {

2055 // We didn't find any section that started after the limit, so everything

2056 // above the border is one of the terminal labels.

2057 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;

2058 DCHECK(new_end_index == end_index - 1);

2059 }

2060

2061 DCHECK_LE(start_index, new_end_index);

2062 DCHECK_LE(new_start_index, end_index);

2063 DCHECK_LT(start_index, new_start_index);

2064 DCHECK_LT(new_end_index, end_index);

2065 DCHECK(new_end_index + 1 == new_start_index \|\|

2066 (new_end_index + 2 == new_start_index &&

2067 border == ranges->at(new_end_index + 1)));

2068 DCHECK_LT(min_char, border - 1);

2069 DCHECK_LT(border, max_char);

2070 DCHECK_LT(ranges->at(new_end_index), border);

2071 DCHECK(border < ranges->at(new_start_index) \|\|

2072 (border == ranges->at(new_start_index) &&

2073 new_start_index == end_index &&

2074 new_end_index == end_index - 1 &&

2075 border == last + 1));

2076 DCHECK(new_start_index == 0 \|\| border >= ranges->at(new_start_index - 1));

2077

2078 masm->CheckCharacterGT(border - 1, above);

2079 Label dummy;

2080 GenerateBranches(masm,

2081 ranges,

2082 start_index,

2083 new_end_index,

2084 min_char,

2085 border - 1,

2086 &dummy,

2087 even_label,

2088 odd_label);

2089 if (handle_rest.is_linked()) {

2090 masm->Bind(&handle_rest);

2091 bool flip = (new_start_index & 1) != (start_index & 1);

2092 GenerateBranches(masm,

2093 ranges,

2094 new_start_index,

2095 end_index,

2096 border,

2097 max_char,

2098 &dummy,

2099 flip ? odd_label : even_label,

2100 flip ? even_label : odd_label);

2101 }

2102 }

2103

2104

2105 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,

2106 RegExpCharacterClass* cc, bool one_byte,

2107 Label* on_failure, int cp_offset, bool check_offset,

2108 bool preloaded, Zone* zone) {

2109 ZoneList<CharacterRange>* ranges = cc->ranges(zone);

2110 if (!CharacterRange::IsCanonical(ranges)) {

2111 CharacterRange::Canonicalize(ranges);

2112 }

2113

2114 int max_char;

2115 if (one_byte) {

2116 max_char = String::kMaxOneByteCharCode;

2117 } else {

2118 max_char = String::kMaxUtf16CodeUnit;

2119 }

2120

2121 int range_count = ranges->length();

2122

2123 int last_valid_range = range_count - 1;

2124 while (last_valid_range >= 0) {

2125 CharacterRange& range = ranges->at(last_valid_range);

2126 if (range.from() <= max_char) {

2127 break;

2128 }

2129 last_valid_range--;

2130 }

2131

2132 if (last_valid_range < 0) {

2133 if (!cc->is_negated()) {

2134 macro_assembler->GoTo(on_failure);

2135 }

2136 if (check_offset) {

2137 macro_assembler->CheckPosition(cp_offset, on_failure);

2138 }

2139 return;

2140 }

2141

2142 if (last_valid_range == 0 &&

2143 ranges->at(0).IsEverything(max_char)) {

2144 if (cc->is_negated()) {

2145 macro_assembler->GoTo(on_failure);

2146 } else {

2147 // This is a common case hit by non-anchored expressions.

2148 if (check_offset) {

2149 macro_assembler->CheckPosition(cp_offset, on_failure);

2150 }

2151 }

2152 return;

2153 }

2154 if (last_valid_range == 0 &&

2155 !cc->is_negated() &&

2156 ranges->at(0).IsEverything(max_char)) {

2157 // This is a common case hit by non-anchored expressions.

2158 if (check_offset) {

2159 macro_assembler->CheckPosition(cp_offset, on_failure);

2160 }

2161 return;

2162 }

2163

2164 if (!preloaded) {

2165 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);

2166 }

2167

2168 if (cc->is_standard(zone) &&

2169 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),

2170 on_failure)) {

2171 return;

2172 }

2173

2174

2175 // A new list with ascending entries. Each entry is a code unit

2176 // where there is a boundary between code units that are part of

2177 // the class and code units that are not. Normally we insert an

2178 // entry at zero which goes to the failure label, but if there

2179 // was already one there we fall through for success on that entry.

2180 // Subsequent entries have alternating meaning (success/failure).

2181 ZoneList<int>* range_boundaries =

2182 new(zone) ZoneList<int>(last_valid_range, zone);

2183

2184 bool zeroth_entry_is_failure = !cc->is_negated();

2185

2186 for (int i = 0; i <= last_valid_range; i++) {

2187 CharacterRange& range = ranges->at(i);

2188 if (range.from() == 0) {

2189 DCHECK_EQ(i, 0);

2190 zeroth_entry_is_failure = !zeroth_entry_is_failure;

2191 } else {

2192 range_boundaries->Add(range.from(), zone);

2193 }

2194 range_boundaries->Add(range.to() + 1, zone);

2195 }

2196 int end_index = range_boundaries->length() - 1;

2197 if (range_boundaries->at(end_index) > max_char) {

2198 end_index--;

2199 }

2200

2201 Label fall_through;

2202 GenerateBranches(macro_assembler,

2203 range_boundaries,

2204 0, // start_index.

2205 end_index,

2206 0, // min_char.

2207 max_char,

2208 &fall_through,

2209 zeroth_entry_is_failure ? &fall_through : on_failure,

2210 zeroth_entry_is_failure ? on_failure : &fall_through);

2211 macro_assembler->Bind(&fall_through);

2212 }

2213

2214

2215 RegExpNode::~RegExpNode() {

2216 }

2217

2218

2219 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,

2220 Trace* trace) {

2221 // If we are generating a greedy loop then don't stop and don't reuse code.

2222 if (trace->stop_node() != NULL) {

2223 return CONTINUE;

2224 }

2225

2226 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

2227 if (trace->is_trivial()) {

2228 if (label_.is_bound() \|\| on_work_list() \|\| !KeepRecursing(compiler)) {

2229 // If a generic version is already scheduled to be generated or we have

2230 // recursed too deeply then just generate a jump to that code.

2231 macro_assembler->GoTo(&label_);

2232 // This will queue it up for generation of a generic version if it hasn't

2233 // already been queued.

2234 compiler->AddWork(this);

2235 return DONE;

2236 }

2237 // Generate generic version of the node and bind the label for later use.

2238 macro_assembler->Bind(&label_);

2239 return CONTINUE;

2240 }

2241

2242 // We are being asked to make a non-generic version. Keep track of how many

2243 // non-generic versions we generate so as not to overdo it.

2244 trace_count_++;

2245 if (KeepRecursing(compiler) && compiler->optimize() &&

2246 trace_count_ < kMaxCopiesCodeGenerated) {

2247 return CONTINUE;

2248 }

2249

2250 // If we get here code has been generated for this node too many times or

2251 // recursion is too deep. Time to switch to a generic version. The code for

2252 // generic versions above can handle deep recursion properly.

2253 bool was_limiting = compiler->limiting_recursion();

2254 compiler->set_limiting_recursion(true);

2255 trace->Flush(compiler, this);

2256 compiler->set_limiting_recursion(was_limiting);

2257 return DONE;

2258 }

2259

2260

2261 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {

2262 return !compiler->limiting_recursion() &&

2263 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;

2264 }

2265

2266

2267 int ActionNode::EatsAtLeast(int still_to_find,

2268 int budget,

2269 bool not_at_start) {

2270 if (budget <= 0) return 0;

2271 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!

2272 return on_success()->EatsAtLeast(still_to_find,

2273 budget - 1,

2274 not_at_start);

2275 }

2276

2277

2278 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

2279 BoyerMooreLookahead* bm, bool not_at_start) {

2280 if (action_type_ == BEGIN_SUBMATCH) {

2281 bm->SetRest(offset);

2282 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {

2283 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);

2284 }

2285 SaveBMInfo(bm, not_at_start, offset);

2286 }

2287

2288

2289 int AssertionNode::EatsAtLeast(int still_to_find,

2290 int budget,

2291 bool not_at_start) {

2292 if (budget <= 0) return 0;

2293 // If we know we are not at the start and we are asked "how many characters

2294 // will you match if you succeed?" then we can answer anything since false

2295 // implies false. So lets just return the max answer (still_to_find) since

2296 // that won't prevent us from preloading a lot of characters for the other

2297 // branches in the node graph.

2298 if (assertion_type() == AT_START && not_at_start) return still_to_find;

2299 return on_success()->EatsAtLeast(still_to_find,

2300 budget - 1,

2301 not_at_start);

2302 }

2303

2304

2305 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

2306 BoyerMooreLookahead* bm, bool not_at_start) {

2307 // Match the behaviour of EatsAtLeast on this node.

2308 if (assertion_type() == AT_START && not_at_start) return;

2309 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);

2310 SaveBMInfo(bm, not_at_start, offset);

2311 }

2312

2313

2314 int BackReferenceNode::EatsAtLeast(int still_to_find,

2315 int budget,

2316 bool not_at_start) {

2317 if (budget <= 0) return 0;

2318 return on_success()->EatsAtLeast(still_to_find,

2319 budget - 1,

2320 not_at_start);

2321 }

2322

2323

2324 int TextNode::EatsAtLeast(int still_to_find,

2325 int budget,

2326 bool not_at_start) {

2327 int answer = Length();

2328 if (answer >= still_to_find) return answer;

2329 if (budget <= 0) return answer;

2330 // We are not at start after this node so we set the last argument to 'true'.

2331 return answer + on_success()->EatsAtLeast(still_to_find - answer,

2332 budget - 1,

2333 true);

2334 }

2335

2336

2337 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,

2338 int budget,

2339 bool not_at_start) {

2340 if (budget <= 0) return 0;

2341 // Alternative 0 is the negative lookahead, alternative 1 is what comes

2342 // afterwards.

2343 RegExpNode* node = alternatives_->at(1).node();

2344 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);

2345 }

2346

2347

2348 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(

2349 QuickCheckDetails* details,

2350 RegExpCompiler* compiler,

2351 int filled_in,

2352 bool not_at_start) {

2353 // Alternative 0 is the negative lookahead, alternative 1 is what comes

2354 // afterwards.

2355 RegExpNode* node = alternatives_->at(1).node();

2356 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);

2357 }

2358

2359

2360 int ChoiceNode::EatsAtLeastHelper(int still_to_find,

2361 int budget,

2362 RegExpNode* ignore_this_node,

2363 bool not_at_start) {

2364 if (budget <= 0) return 0;

2365 int min = 100;

2366 int choice_count = alternatives_->length();

2367 budget = (budget - 1) / choice_count;

2368 for (int i = 0; i < choice_count; i++) {

2369 RegExpNode* node = alternatives_->at(i).node();

2370 if (node == ignore_this_node) continue;

2371 int node_eats_at_least =

2372 node->EatsAtLeast(still_to_find, budget, not_at_start);

2373 if (node_eats_at_least < min) min = node_eats_at_least;

2374 if (min == 0) return 0;

2375 }

2376 return min;

2377 }

2378

2379

2380 int LoopChoiceNode::EatsAtLeast(int still_to_find,

2381 int budget,

2382 bool not_at_start) {

2383 return EatsAtLeastHelper(still_to_find,

2384 budget - 1,

2385 loop_node_,

2386 not_at_start);

2387 }

2388

2389

2390 int ChoiceNode::EatsAtLeast(int still_to_find,

2391 int budget,

2392 bool not_at_start) {

2393 return EatsAtLeastHelper(still_to_find,

2394 budget,

2395 NULL,

2396 not_at_start);

2397 }

2398

2399

2400 // Takes the left-most 1-bit and smears it out, setting all bits to its right.

2401 static inline uint32_t SmearBitsRight(uint32_t v) {

2402 v \|= v >> 1;

2403 v \|= v >> 2;

2404 v \|= v >> 4;

2405 v \|= v >> 8;

2406 v \|= v >> 16;

2407 return v;

2408 }

2409

2410

2411 bool QuickCheckDetails::Rationalize(bool asc) {

2412 bool found_useful_op = false;

2413 uint32_t char_mask;

2414 if (asc) {

2415 char_mask = String::kMaxOneByteCharCode;

2416 } else {

2417 char_mask = String::kMaxUtf16CodeUnit;

2418 }

2419 mask_ = 0;

2420 value_ = 0;

2421 int char_shift = 0;

2422 for (int i = 0; i < characters_; i++) {

2423 Position* pos = &positions_[i];

2424 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {

2425 found_useful_op = true;

2426 }

2427 mask_ \|= (pos->mask & char_mask) << char_shift;

2428 value_ \|= (pos->value & char_mask) << char_shift;

2429 char_shift += asc ? 8 : 16;

2430 }

2431 return found_useful_op;

2432 }

2433

2434

2435 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,

2436 Trace* bounds_check_trace,

2437 Trace* trace,

2438 bool preload_has_checked_bounds,

2439 Label* on_possible_success,

2440 QuickCheckDetails* details,

2441 bool fall_through_on_failure) {

2442 if (details->characters() == 0) return false;

2443 GetQuickCheckDetails(

2444 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);

2445 if (details->cannot_match()) return false;

2446 if (!details->Rationalize(compiler->one_byte())) return false;

2447 DCHECK(details->characters() == 1 \|\|

2448 compiler->macro_assembler()->CanReadUnaligned());

2449 uint32_t mask = details->mask();

2450 uint32_t value = details->value();

2451

2452 RegExpMacroAssembler* assembler = compiler->macro_assembler();

2453

2454 if (trace->characters_preloaded() != details->characters()) {

2455 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());

2456 // We are attempting to preload the minimum number of characters

2457 // any choice would eat, so if the bounds check fails, then none of the

2458 // choices can succeed, so we can just immediately backtrack, rather

2459 // than go to the next choice.

2460 assembler->LoadCurrentCharacter(trace->cp_offset(),

2461 bounds_check_trace->backtrack(),

2462 !preload_has_checked_bounds,

2463 details->characters());

2464 }

2465

2466

2467 bool need_mask = true;

2468

2469 if (details->characters() == 1) {

2470 // If number of characters preloaded is 1 then we used a byte or 16 bit

2471 // load so the value is already masked down.

2472 uint32_t char_mask;

2473 if (compiler->one_byte()) {

2474 char_mask = String::kMaxOneByteCharCode;

2475 } else {

2476 char_mask = String::kMaxUtf16CodeUnit;

2477 }

2478 if ((mask & char_mask) == char_mask) need_mask = false;

2479 mask &= char_mask;

2480 } else {

2481 // For 2-character preloads in one-byte mode or 1-character preloads in

2482 // two-byte mode we also use a 16 bit load with zero extend.

2483 if (details->characters() == 2 && compiler->one_byte()) {

2484 if ((mask & 0xffff) == 0xffff) need_mask = false;

2485 } else if (details->characters() == 1 && !compiler->one_byte()) {

2486 if ((mask & 0xffff) == 0xffff) need_mask = false;

2487 } else {

2488 if (mask == 0xffffffff) need_mask = false;

2489 }

2490 }

2491

2492 if (fall_through_on_failure) {

2493 if (need_mask) {

2494 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);

2495 } else {

2496 assembler->CheckCharacter(value, on_possible_success);

2497 }

2498 } else {

2499 if (need_mask) {

2500 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());

2501 } else {

2502 assembler->CheckNotCharacter(value, trace->backtrack());

2503 }

2504 }

2505 return true;

2506 }

2507

2508

2509 // Here is the meat of GetQuickCheckDetails (see also the comment on the

2510 // super-class in the .h file).

2511 //

2512 // We iterate along the text object, building up for each character a

2513 // mask and value that can be used to test for a quick failure to match.

2514 // The masks and values for the positions will be combined into a single

2515 // machine word for the current character width in order to be used in

2516 // generating a quick check.

2517 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,

2518 RegExpCompiler* compiler,

2519 int characters_filled_in,

2520 bool not_at_start) {

2521 Isolate* isolate = compiler->macro_assembler()->isolate();

2522 DCHECK(characters_filled_in < details->characters());

2523 int characters = details->characters();

2524 int char_mask;

2525 if (compiler->one_byte()) {

2526 char_mask = String::kMaxOneByteCharCode;

2527 } else {

2528 char_mask = String::kMaxUtf16CodeUnit;

2529 }

2530 for (int k = 0; k < elms_->length(); k++) {

2531 TextElement elm = elms_->at(k);

2532 if (elm.text_type() == TextElement::ATOM) {

2533 Vector<const uc16> quarks = elm.atom()->data();

2534 for (int i = 0; i < characters && i < quarks.length(); i++) {

2535 QuickCheckDetails::Position* pos =

2536 details->positions(characters_filled_in);

2537 uc16 c = quarks[i];

2538 if (compiler->ignore_case()) {

2539 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

2540 int length = GetCaseIndependentLetters(isolate, c,

2541 compiler->one_byte(), chars);

2542 if (length == 0) {

2543 // This can happen because all case variants are non-Latin1, but we

2544 // know the input is Latin1.

2545 details->set_cannot_match();

2546 pos->determines_perfectly = false;

2547 return;

2548 }

2549 if (length == 1) {

2550 // This letter has no case equivalents, so it's nice and simple

2551 // and the mask-compare will determine definitely whether we have

2552 // a match at this character position.

2553 pos->mask = char_mask;

2554 pos->value = c;

2555 pos->determines_perfectly = true;

2556 } else {

2557 uint32_t common_bits = char_mask;

2558 uint32_t bits = chars[0];

2559 for (int j = 1; j < length; j++) {

2560 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);

2561 common_bits ^= differing_bits;

2562 bits &= common_bits;

2563 }

2564 // If length is 2 and common bits has only one zero in it then

2565 // our mask and compare instruction will determine definitely

2566 // whether we have a match at this character position. Otherwise

2567 // it can only be an approximate check.

2568 uint32_t one_zero = (common_bits \| ~char_mask);

2569 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {

2570 pos->determines_perfectly = true;

2571 }

2572 pos->mask = common_bits;

2573 pos->value = bits;

2574 }

2575 } else {

2576 // Don't ignore case. Nice simple case where the mask-compare will

2577 // determine definitely whether we have a match at this character

2578 // position.

2579 if (c > char_mask) {

2580 details->set_cannot_match();

2581 pos->determines_perfectly = false;

2582 return;

2583 }

2584 pos->mask = char_mask;

2585 pos->value = c;

2586 pos->determines_perfectly = true;

2587 }

2588 characters_filled_in++;

2589 DCHECK(characters_filled_in <= details->characters());

2590 if (characters_filled_in == details->characters()) {

2591 return;

2592 }

2593 }

2594 } else {

2595 QuickCheckDetails::Position* pos =

2596 details->positions(characters_filled_in);

2597 RegExpCharacterClass* tree = elm.char_class();

2598 ZoneList<CharacterRange>* ranges = tree->ranges(zone());

2599 if (tree->is_negated()) {

2600 // A quick check uses multi-character mask and compare. There is no

2601 // useful way to incorporate a negative char class into this scheme

2602 // so we just conservatively create a mask and value that will always

2603 // succeed.

2604 pos->mask = 0;

2605 pos->value = 0;

2606 } else {

2607 int first_range = 0;

2608 while (ranges->at(first_range).from() > char_mask) {

2609 first_range++;

2610 if (first_range == ranges->length()) {

2611 details->set_cannot_match();

2612 pos->determines_perfectly = false;

2613 return;

2614 }

2615 }

2616 CharacterRange range = ranges->at(first_range);

2617 uc16 from = range.from();

2618 uc16 to = range.to();

2619 if (to > char_mask) {

2620 to = char_mask;

2621 }

2622 uint32_t differing_bits = (from ^ to);

2623 // A mask and compare is only perfect if the differing bits form a

2624 // number like 00011111 with one single block of trailing 1s.

2625 if ((differing_bits & (differing_bits + 1)) == 0 &&

2626 from + differing_bits == to) {

2627 pos->determines_perfectly = true;

2628 }

2629 uint32_t common_bits = ~SmearBitsRight(differing_bits);

2630 uint32_t bits = (from & common_bits);

2631 for (int i = first_range + 1; i < ranges->length(); i++) {

2632 CharacterRange range = ranges->at(i);

2633 uc16 from = range.from();

2634 uc16 to = range.to();

2635 if (from > char_mask) continue;

2636 if (to > char_mask) to = char_mask;

2637 // Here we are combining more ranges into the mask and compare

2638 // value. With each new range the mask becomes more sparse and

2639 // so the chances of a false positive rise. A character class

2640 // with multiple ranges is assumed never to be equivalent to a

2641 // mask and compare operation.

2642 pos->determines_perfectly = false;

2643 uint32_t new_common_bits = (from ^ to);

2644 new_common_bits = ~SmearBitsRight(new_common_bits);

2645 common_bits &= new_common_bits;

2646 bits &= new_common_bits;

2647 uint32_t differing_bits = (from & common_bits) ^ bits;

2648 common_bits ^= differing_bits;

2649 bits &= common_bits;

2650 }

2651 pos->mask = common_bits;

2652 pos->value = bits;

2653 }

2654 characters_filled_in++;

2655 DCHECK(characters_filled_in <= details->characters());

2656 if (characters_filled_in == details->characters()) {

2657 return;

2658 }

2659 }

2660 }

2661 DCHECK(characters_filled_in != details->characters());

2662 if (!details->cannot_match()) {

2663 on_success()-> GetQuickCheckDetails(details,

2664 compiler,

2665 characters_filled_in,

2666 true);

2667 }

2668 }

2669

2670

2671 void QuickCheckDetails::Clear() {

2672 for (int i = 0; i < characters_; i++) {

2673 positions_[i].mask = 0;

2674 positions_[i].value = 0;

2675 positions_[i].determines_perfectly = false;

2676 }

2677 characters_ = 0;

2678 }

2679

2680

2681 void QuickCheckDetails::Advance(int by, bool one_byte) {

2682 DCHECK(by >= 0);

2683 if (by >= characters_) {

2684 Clear();

2685 return;

2686 }

2687 for (int i = 0; i < characters_ - by; i++) {

2688 positions_[i] = positions_[by + i];

2689 }

2690 for (int i = characters_ - by; i < characters_; i++) {

2691 positions_[i].mask = 0;

2692 positions_[i].value = 0;

2693 positions_[i].determines_perfectly = false;

2694 }

2695 characters_ -= by;

2696 // We could change mask_ and value_ here but we would never advance unless

2697 // they had already been used in a check and they won't be used again because

2698 // it would gain us nothing. So there's no point.

2699 }

2700

2701

2702 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {

2703 DCHECK(characters_ == other->characters_);

2704 if (other->cannot_match_) {

2705 return;

2706 }

2707 if (cannot_match_) {

2708 this = other;

2709 return;

2710 }

2711 for (int i = from_index; i < characters_; i++) {

2712 QuickCheckDetails::Position* pos = positions(i);

2713 QuickCheckDetails::Position* other_pos = other->positions(i);

2714 if (pos->mask != other_pos->mask \|\|

2715 pos->value != other_pos->value \|\|

2716 !other_pos->determines_perfectly) {

2717 // Our mask-compare operation will be approximate unless we have the

2718 // exact same operation on both sides of the alternation.

2719 pos->determines_perfectly = false;

2720 }

2721 pos->mask &= other_pos->mask;

2722 pos->value &= pos->mask;

2723 other_pos->value &= pos->mask;

2724 uc16 differing_bits = (pos->value ^ other_pos->value);

2725 pos->mask &= ~differing_bits;

2726 pos->value &= pos->mask;

2727 }

2728 }

2729

2730

2731 class VisitMarker {

2732 public:

2733 explicit VisitMarker(NodeInfo* info) : info_(info) {

2734 DCHECK(!info->visited);

2735 info->visited = true;

2736 }

2737 ~VisitMarker() {

2738 info_->visited = false;

2739 }

2740 private:

2741 NodeInfo* info_;

2742 };

2743

2744

2745 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {

2746 if (info()->replacement_calculated) return replacement();

2747 if (depth < 0) return this;

2748 DCHECK(!info()->visited);

2749 VisitMarker marker(info());

2750 return FilterSuccessor(depth - 1, ignore_case);

2751 }

2752

2753

2754 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {

2755 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);

2756 if (next == NULL) return set_replacement(NULL);

2757 on_success_ = next;

2758 return set_replacement(this);

2759 }

2760

2761

2762 // We need to check for the following characters: 0x39c 0x3bc 0x178.

2763 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {

2764 // TODO(dcarney): this could be a lot more efficient.

2765 return range.Contains(0x39c) \|\|

2766 range.Contains(0x3bc) \|\| range.Contains(0x178);

2767 }

2768

2769

2770 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {

2771 for (int i = 0; i < ranges->length(); i++) {

2772 // TODO(dcarney): this could be a lot more efficient.

2773 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;

2774 }

2775 return false;

2776 }

2777

2778

2779 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {

2780 if (info()->replacement_calculated) return replacement();

2781 if (depth < 0) return this;

2782 DCHECK(!info()->visited);

2783 VisitMarker marker(info());

2784 int element_count = elms_->length();

2785 for (int i = 0; i < element_count; i++) {

2786 TextElement elm = elms_->at(i);

2787 if (elm.text_type() == TextElement::ATOM) {

2788 Vector<const uc16> quarks = elm.atom()->data();

2789 for (int j = 0; j < quarks.length(); j++) {

2790 uint16_t c = quarks[j];

2791 if (c <= String::kMaxOneByteCharCode) continue;

2792 if (!ignore_case) return set_replacement(NULL);

2793 // Here, we need to check for characters whose upper and lower cases

2794 // are outside the Latin-1 range.

2795 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);

2796 // Character is outside Latin-1 completely

2797 if (converted == 0) return set_replacement(NULL);

2798 // Convert quark to Latin-1 in place.

2799 uint16_t* copy = const_cast<uint16_t*>(quarks.start());

2800 copy[j] = converted;

2801 }

2802 } else {

2803 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);

2804 RegExpCharacterClass* cc = elm.char_class();

2805 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

2806 if (!CharacterRange::IsCanonical(ranges)) {

2807 CharacterRange::Canonicalize(ranges);

2808 }

2809 // Now they are in order so we only need to look at the first.

2810 int range_count = ranges->length();

2811 if (cc->is_negated()) {

2812 if (range_count != 0 &&

2813 ranges->at(0).from() == 0 &&

2814 ranges->at(0).to() >= String::kMaxOneByteCharCode) {

2815 // This will be handled in a later filter.

2816 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

2817 return set_replacement(NULL);

2818 }

2819 } else {

2820 if (range_count == 0 \|\|

2821 ranges->at(0).from() > String::kMaxOneByteCharCode) {

2822 // This will be handled in a later filter.

2823 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;

2824 return set_replacement(NULL);

2825 }

2826 }

2827 }

2828 }

2829 return FilterSuccessor(depth - 1, ignore_case);

2830 }

2831

2832

2833 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {

2834 if (info()->replacement_calculated) return replacement();

2835 if (depth < 0) return this;

2836 if (info()->visited) return this;

2837 {

2838 VisitMarker marker(info());

2839

2840 RegExpNode* continue_replacement =

2841 continue_node_->FilterOneByte(depth - 1, ignore_case);

2842 // If we can't continue after the loop then there is no sense in doing the

2843 // loop.

2844 if (continue_replacement == NULL) return set_replacement(NULL);

2845 }

2846

2847 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);

2848 }

2849

2850

2851 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {

2852 if (info()->replacement_calculated) return replacement();

2853 if (depth < 0) return this;

2854 if (info()->visited) return this;

2855 VisitMarker marker(info());

2856 int choice_count = alternatives_->length();

2857

2858 for (int i = 0; i < choice_count; i++) {

2859 GuardedAlternative alternative = alternatives_->at(i);

2860 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {

2861 set_replacement(this);

2862 return this;

2863 }

2864 }

2865

2866 int surviving = 0;

2867 RegExpNode* survivor = NULL;

2868 for (int i = 0; i < choice_count; i++) {

2869 GuardedAlternative alternative = alternatives_->at(i);

2870 RegExpNode* replacement =

2871 alternative.node()->FilterOneByte(depth - 1, ignore_case);

2872 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.

2873 if (replacement != NULL) {

2874 alternatives_->at(i).set_node(replacement);

2875 surviving++;

2876 survivor = replacement;

2877 }

2878 }

2879 if (surviving < 2) return set_replacement(survivor);

2880

2881 set_replacement(this);

2882 if (surviving == choice_count) {

2883 return this;

2884 }

2885 // Only some of the nodes survived the filtering. We need to rebuild the

2886 // alternatives list.

2887 ZoneList<GuardedAlternative>* new_alternatives =

2888 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());

2889 for (int i = 0; i < choice_count; i++) {

2890 RegExpNode* replacement =

2891 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);

2892 if (replacement != NULL) {

2893 alternatives_->at(i).set_node(replacement);

2894 new_alternatives->Add(alternatives_->at(i), zone());

2895 }

2896 }

2897 alternatives_ = new_alternatives;

2898 return this;

2899 }

2900

2901

2902 RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth,

2903 bool ignore_case) {

2904 if (info()->replacement_calculated) return replacement();

2905 if (depth < 0) return this;

2906 if (info()->visited) return this;

2907 VisitMarker marker(info());

2908 // Alternative 0 is the negative lookahead, alternative 1 is what comes

2909 // afterwards.

2910 RegExpNode* node = alternatives_->at(1).node();

2911 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);

2912 if (replacement == NULL) return set_replacement(NULL);

2913 alternatives_->at(1).set_node(replacement);

2914

2915 RegExpNode* neg_node = alternatives_->at(0).node();

2916 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);

2917 // If the negative lookahead is always going to fail then

2918 // we don't need to check it.

2919 if (neg_replacement == NULL) return set_replacement(replacement);

2920 alternatives_->at(0).set_node(neg_replacement);

2921 return set_replacement(this);

2922 }

2923

2924

2925 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2926 RegExpCompiler* compiler,

2927 int characters_filled_in,

2928 bool not_at_start) {

2929 if (body_can_be_zero_length_ \|\| info()->visited) return;

2930 VisitMarker marker(info());

2931 return ChoiceNode::GetQuickCheckDetails(details,

2932 compiler,

2933 characters_filled_in,

2934 not_at_start);

2935 }

2936

2937

2938 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

2939 BoyerMooreLookahead* bm, bool not_at_start) {

2940 if (body_can_be_zero_length_ \|\| budget <= 0) {

2941 bm->SetRest(offset);

2942 SaveBMInfo(bm, not_at_start, offset);

2943 return;

2944 }

2945 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);

2946 SaveBMInfo(bm, not_at_start, offset);

2947 }

2948

2949

2950 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,

2951 RegExpCompiler* compiler,

2952 int characters_filled_in,

2953 bool not_at_start) {

2954 not_at_start = (not_at_start \|\| not_at_start_);

2955 int choice_count = alternatives_->length();

2956 DCHECK(choice_count > 0);

2957 alternatives_->at(0).node()->GetQuickCheckDetails(details,

2958 compiler,

2959 characters_filled_in,

2960 not_at_start);

2961 for (int i = 1; i < choice_count; i++) {

2962 QuickCheckDetails new_details(details->characters());

2963 RegExpNode* node = alternatives_->at(i).node();

2964 node->GetQuickCheckDetails(&new_details, compiler,

2965 characters_filled_in,

2966 not_at_start);

2967 // Here we merge the quick match details of the two branches.

2968 details->Merge(&new_details, characters_filled_in);

2969 }

2970 }

2971

2972

2973 // Check for [0-9A-Z_a-z].

2974 static void EmitWordCheck(RegExpMacroAssembler* assembler,

2975 Label* word,

2976 Label* non_word,

2977 bool fall_through_on_word) {

2978 if (assembler->CheckSpecialCharacterClass(

2979 fall_through_on_word ? 'w' : 'W',

2980 fall_through_on_word ? non_word : word)) {

2981 // Optimized implementation available.

2982 return;

2983 }

2984 assembler->CheckCharacterGT('z', non_word);

2985 assembler->CheckCharacterLT('0', non_word);

2986 assembler->CheckCharacterGT('a' - 1, word);

2987 assembler->CheckCharacterLT('9' + 1, word);

2988 assembler->CheckCharacterLT('A', non_word);

2989 assembler->CheckCharacterLT('Z' + 1, word);

2990 if (fall_through_on_word) {

2991 assembler->CheckNotCharacter('_', non_word);

2992 } else {

2993 assembler->CheckCharacter('_', word);

2994 }

2995 }

2996

2997

2998 // Emit the code to check for a ^ in multiline mode (1-character lookbehind

2999 // that matches newline or the start of input).

3000 static void EmitHat(RegExpCompiler* compiler,

3001 RegExpNode* on_success,

3002 Trace* trace) {

3003 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3004 // We will be loading the previous character into the current character

3005 // register.

3006 Trace new_trace(*trace);

3007 new_trace.InvalidateCurrentCharacter();

3008

3009 Label ok;

3010 if (new_trace.cp_offset() == 0) {

3011 // The start of input counts as a newline in this context, so skip to

3012 // ok if we are at the start.

3013 assembler->CheckAtStart(&ok);

3014 }

3015 // We already checked that we are not at the start of input so it must be

3016 // OK to load the previous character.

3017 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,

3018 new_trace.backtrack(),

3019 false);

3020 if (!assembler->CheckSpecialCharacterClass('n',

3021 new_trace.backtrack())) {

3022 // Newline means \n, \r, 0x2028 or 0x2029.

3023 if (!compiler->one_byte()) {

3024 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);

3025 }

3026 assembler->CheckCharacter('\n', &ok);

3027 assembler->CheckNotCharacter('\r', new_trace.backtrack());

3028 }

3029 assembler->Bind(&ok);

3030 on_success->Emit(compiler, &new_trace);

3031 }

3032

3033

3034 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).

3035 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {

3036 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3037 Isolate* isolate = assembler->isolate();

3038 Trace::TriBool next_is_word_character = Trace::UNKNOWN;

3039 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);

3040 BoyerMooreLookahead* lookahead = bm_info(not_at_start);

3041 if (lookahead == NULL) {

3042 int eats_at_least =

3043 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,

3044 kRecursionBudget,

3045 not_at_start));

3046 if (eats_at_least >= 1) {

3047 BoyerMooreLookahead* bm =

3048 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());

3049 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);

3050 if (bm->at(0)->is_non_word())

3051 next_is_word_character = Trace::FALSE_VALUE;

3052 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;

3053 }

3054 } else {

3055 if (lookahead->at(0)->is_non_word())

3056 next_is_word_character = Trace::FALSE_VALUE;

3057 if (lookahead->at(0)->is_word())

3058 next_is_word_character = Trace::TRUE_VALUE;

3059 }

3060 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);

3061 if (next_is_word_character == Trace::UNKNOWN) {

3062 Label before_non_word;

3063 Label before_word;

3064 if (trace->characters_preloaded() != 1) {

3065 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);

3066 }

3067 // Fall through on non-word.

3068 EmitWordCheck(assembler, &before_word, &before_non_word, false);

3069 // Next character is not a word character.

3070 assembler->Bind(&before_non_word);

3071 Label ok;

3072 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);

3073 assembler->GoTo(&ok);

3074

3075 assembler->Bind(&before_word);

3076 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);

3077 assembler->Bind(&ok);

3078 } else if (next_is_word_character == Trace::TRUE_VALUE) {

3079 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);

3080 } else {

3081 DCHECK(next_is_word_character == Trace::FALSE_VALUE);

3082 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);

3083 }

3084 }

3085

3086

3087 void AssertionNode::BacktrackIfPrevious(

3088 RegExpCompiler* compiler,

3089 Trace* trace,

3090 AssertionNode::IfPrevious backtrack_if_previous) {

3091 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3092 Trace new_trace(*trace);

3093 new_trace.InvalidateCurrentCharacter();

3094

3095 Label fall_through, dummy;

3096

3097 Label* non_word = backtrack_if_previous == kIsNonWord ?

3098 new_trace.backtrack() :

3099 &fall_through;

3100 Label* word = backtrack_if_previous == kIsNonWord ?

3101 &fall_through :

3102 new_trace.backtrack();

3103

3104 if (new_trace.cp_offset() == 0) {

3105 // The start of input counts as a non-word character, so the question is

3106 // decided if we are at the start.

3107 assembler->CheckAtStart(non_word);

3108 }

3109 // We already checked that we are not at the start of input so it must be

3110 // OK to load the previous character.

3111 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);

3112 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);

3113

3114 assembler->Bind(&fall_through);

3115 on_success()->Emit(compiler, &new_trace);

3116 }

3117

3118

3119 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,

3120 RegExpCompiler* compiler,

3121 int filled_in,

3122 bool not_at_start) {

3123 if (assertion_type_ == AT_START && not_at_start) {

3124 details->set_cannot_match();

3125 return;

3126 }

3127 return on_success()->GetQuickCheckDetails(details,

3128 compiler,

3129 filled_in,

3130 not_at_start);

3131 }

3132

3133

3134 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {

3135 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3136 switch (assertion_type_) {

3137 case AT_END: {

3138 Label ok;

3139 assembler->CheckPosition(trace->cp_offset(), &ok);

3140 assembler->GoTo(trace->backtrack());

3141 assembler->Bind(&ok);

3142 break;

3143 }

3144 case AT_START: {

3145 if (trace->at_start() == Trace::FALSE_VALUE) {

3146 assembler->GoTo(trace->backtrack());

3147 return;

3148 }

3149 if (trace->at_start() == Trace::UNKNOWN) {

3150 assembler->CheckNotAtStart(trace->backtrack());

3151 Trace at_start_trace = *trace;

3152 at_start_trace.set_at_start(true);

3153 on_success()->Emit(compiler, &at_start_trace);

3154 return;

3155 }

3156 }

3157 break;

3158 case AFTER_NEWLINE:

3159 EmitHat(compiler, on_success(), trace);

3160 return;

3161 case AT_BOUNDARY:

3162 case AT_NON_BOUNDARY: {

3163 EmitBoundaryCheck(compiler, trace);

3164 return;

3165 }

3166 }

3167 on_success()->Emit(compiler, trace);

3168 }

3169

3170

3171 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {

3172 if (quick_check == NULL) return false;

3173 if (offset >= quick_check->characters()) return false;

3174 return quick_check->positions(offset)->determines_perfectly;

3175 }

3176

3177

3178 static void UpdateBoundsCheck(int index, int* checked_up_to) {

3179 if (index > *checked_up_to) {

3180 *checked_up_to = index;

3181 }

3182 }

3183

3184

3185 // We call this repeatedly to generate code for each pass over the text node.

3186 // The passes are in increasing order of difficulty because we hope one

3187 // of the first passes will fail in which case we are saved the work of the

3188 // later passes. for example for the case independent regexp /%[asdfghjkl]a/

3189 // we will check the '%' in the first pass, the case independent 'a' in the

3190 // second pass and the character class in the last pass.

3191 //

3192 // The passes are done from right to left, so for example to test for /bar/

3193 // we will first test for an 'r' with offset 2, then an 'a' with offset 1

3194 // and then a 'b' with offset 0. This means we can avoid the end-of-input

3195 // bounds check most of the time. In the example we only need to check for

3196 // end-of-input when loading the putative 'r'.

3197 //

3198 // A slight complication involves the fact that the first character may already

3199 // be fetched into a register by the previous node. In this case we want to

3200 // do the test for that character first. We do this in separate passes. The

3201 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a

3202 // pass has been performed then subsequent passes will have true in

3203 // first_element_checked to indicate that that character does not need to be

3204 // checked again.

3205 //

3206 // In addition to all this we are passed a Trace, which can

3207 // contain an AlternativeGeneration object. In this AlternativeGeneration

3208 // object we can see details of any quick check that was already passed in

3209 // order to get to the code we are now generating. The quick check can involve

3210 // loading characters, which means we do not need to recheck the bounds

3211 // up to the limit the quick check already checked. In addition the quick

3212 // check can have involved a mask and compare operation which may simplify

3213 // or obviate the need for further checks at some character positions.

3214 void TextNode::TextEmitPass(RegExpCompiler* compiler,

3215 TextEmitPassType pass,

3216 bool preloaded,

3217 Trace* trace,

3218 bool first_element_checked,

3219 int* checked_up_to) {

3220 RegExpMacroAssembler* assembler = compiler->macro_assembler();

3221 Isolate* isolate = assembler->isolate();

3222 bool one_byte = compiler->one_byte();

3223 Label* backtrack = trace->backtrack();

3224 QuickCheckDetails* quick_check = trace->quick_check_performed();

3225 int element_count = elms_->length();

3226 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {

3227 TextElement elm = elms_->at(i);

3228 int cp_offset = trace->cp_offset() + elm.cp_offset();

3229 if (elm.text_type() == TextElement::ATOM) {

3230 Vector<const uc16> quarks = elm.atom()->data();

3231 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {

3232 if (first_element_checked && i == 0 && j == 0) continue;

3233 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;

3234 EmitCharacterFunction* emit_function = NULL;

3235 switch (pass) {

3236 case NON_LATIN1_MATCH:

3237 DCHECK(one_byte);

3238 if (quarks[j] > String::kMaxOneByteCharCode) {

3239 assembler->GoTo(backtrack);

3240 return;

3241 }

3242 break;

3243 case NON_LETTER_CHARACTER_MATCH:

3244 emit_function = &EmitAtomNonLetter;

3245 break;

3246 case SIMPLE_CHARACTER_MATCH:

3247 emit_function = &EmitSimpleCharacter;

3248 break;

3249 case CASE_CHARACTER_MATCH:

3250 emit_function = &EmitAtomLetter;

3251 break;

3252 default:

3253 break;

3254 }

3255 if (emit_function != NULL) {

3256 bool bound_checked = emit_function(isolate,

3257 compiler,

3258 quarks[j],

3259 backtrack,

3260 cp_offset + j,

3261 *checked_up_to < cp_offset + j,

3262 preloaded);

3263 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);

3264 }

3265 }

3266 } else {

3267 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());

3268 if (pass == CHARACTER_CLASS_MATCH) {

3269 if (first_element_checked && i == 0) continue;

3270 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;

3271 RegExpCharacterClass* cc = elm.char_class();

3272 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,

3273 *checked_up_to < cp_offset, preloaded, zone());

3274 UpdateBoundsCheck(cp_offset, checked_up_to);

3275 }

3276 }

3277 }

3278 }

3279

3280

3281 int TextNode::Length() {

3282 TextElement elm = elms_->last();

3283 DCHECK(elm.cp_offset() >= 0);

3284 return elm.cp_offset() + elm.length();

3285 }

3286

3287

3288 bool TextNode::SkipPass(int int_pass, bool ignore_case) {

3289 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);

3290 if (ignore_case) {

3291 return pass == SIMPLE_CHARACTER_MATCH;

3292 } else {

3293 return pass == NON_LETTER_CHARACTER_MATCH \|\| pass == CASE_CHARACTER_MATCH;

3294 }

3295 }

3296

3297

3298 // This generates the code to match a text node. A text node can contain

3299 // straight character sequences (possibly to be matched in a case-independent

3300 // way) and character classes. For efficiency we do not do this in a single

3301 // pass from left to right. Instead we pass over the text node several times,

3302 // emitting code for some character positions every time. See the comment on

3303 // TextEmitPass for details.

3304 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {

3305 LimitResult limit_result = LimitVersions(compiler, trace);

3306 if (limit_result == DONE) return;

3307 DCHECK(limit_result == CONTINUE);

3308

3309 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {

3310 compiler->SetRegExpTooBig();

3311 return;

3312 }

3313

3314 if (compiler->one_byte()) {

3315 int dummy = 0;

3316 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);

3317 }

3318

3319 bool first_elt_done = false;

3320 int bound_checked_to = trace->cp_offset() - 1;

3321 bound_checked_to += trace->bound_checked_up_to();

3322

3323 // If a character is preloaded into the current character register then

3324 // check that now.

3325 if (trace->characters_preloaded() == 1) {

3326 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {

3327 if (!SkipPass(pass, compiler->ignore_case())) {

3328 TextEmitPass(compiler,

3329 static_cast<TextEmitPassType>(pass),

3330 true,

3331 trace,

3332 false,

3333 &bound_checked_to);

3334 }

3335 }

3336 first_elt_done = true;

3337 }

3338

3339 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {

3340 if (!SkipPass(pass, compiler->ignore_case())) {

3341 TextEmitPass(compiler,

3342 static_cast<TextEmitPassType>(pass),

3343 false,

3344 trace,

3345 first_elt_done,

3346 &bound_checked_to);

3347 }

3348 }

3349

3350 Trace successor_trace(*trace);

3351 successor_trace.set_at_start(false);

3352 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);

3353 RecursionCheck rc(compiler);

3354 on_success()->Emit(compiler, &successor_trace);

3355 }

3356

3357

3358 void Trace::InvalidateCurrentCharacter() {

3359 characters_preloaded_ = 0;

3360 }

3361

3362

3363 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {

3364 DCHECK(by > 0);

3365 // We don't have an instruction for shifting the current character register

3366 // down or for using a shifted value for anything so lets just forget that

3367 // we preloaded any characters into it.

3368 characters_preloaded_ = 0;

3369 // Adjust the offsets of the quick check performed information. This

3370 // information is used to find out what we already determined about the

3371 // characters by means of mask and compare.

3372 quick_check_performed_.Advance(by, compiler->one_byte());

3373 cp_offset_ += by;

3374 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {

3375 compiler->SetRegExpTooBig();

3376 cp_offset_ = 0;

3377 }

3378 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);

3379 }

3380

3381

3382 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {

3383 int element_count = elms_->length();

3384 for (int i = 0; i < element_count; i++) {

3385 TextElement elm = elms_->at(i);

3386 if (elm.text_type() == TextElement::CHAR_CLASS) {

3387 RegExpCharacterClass* cc = elm.char_class();

3388 // None of the standard character classes is different in the case

3389 // independent case and it slows us down if we don't know that.

3390 if (cc->is_standard(zone())) continue;

3391 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

3392 int range_count = ranges->length();

3393 for (int j = 0; j < range_count; j++) {

3394 ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);

3395 }

3396 }

3397 }

3398 }

3399

3400

3401 int TextNode::GreedyLoopTextLength() {

3402 TextElement elm = elms_->at(elms_->length() - 1);

3403 return elm.cp_offset() + elm.length();

3404 }

3405

3406

3407 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(

3408 RegExpCompiler* compiler) {

3409 if (elms_->length() != 1) return NULL;

3410 TextElement elm = elms_->at(0);

3411 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;

3412 RegExpCharacterClass* node = elm.char_class();

3413 ZoneList<CharacterRange>* ranges = node->ranges(zone());

3414 if (!CharacterRange::IsCanonical(ranges)) {

3415 CharacterRange::Canonicalize(ranges);

3416 }

3417 if (node->is_negated()) {

3418 return ranges->length() == 0 ? on_success() : NULL;

3419 }

3420 if (ranges->length() != 1) return NULL;

3421 uint32_t max_char;

3422 if (compiler->one_byte()) {

3423 max_char = String::kMaxOneByteCharCode;

3424 } else {

3425 max_char = String::kMaxUtf16CodeUnit;

3426 }

3427 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;

3428 }

3429

3430

3431 // Finds the fixed match length of a sequence of nodes that goes from

3432 // this alternative and back to this choice node. If there are variable

3433 // length nodes or other complications in the way then return a sentinel

3434 // value indicating that a greedy loop cannot be constructed.

3435 int ChoiceNode::GreedyLoopTextLengthForAlternative(

3436 GuardedAlternative* alternative) {

3437 int length = 0;

3438 RegExpNode* node = alternative->node();

3439 // Later we will generate code for all these text nodes using recursion

3440 // so we have to limit the max number.

3441 int recursion_depth = 0;

3442 while (node != this) {

3443 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {

3444 return kNodeIsTooComplexForGreedyLoops;

3445 }

3446 int node_length = node->GreedyLoopTextLength();

3447 if (node_length == kNodeIsTooComplexForGreedyLoops) {

3448 return kNodeIsTooComplexForGreedyLoops;

3449 }

3450 length += node_length;

3451 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);

3452 node = seq_node->on_success();

3453 }

3454 return length;

3455 }

3456

3457

3458 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {

3459 DCHECK_NULL(loop_node_);

3460 AddAlternative(alt);

3461 loop_node_ = alt.node();

3462 }

3463

3464

3465 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {

3466 DCHECK_NULL(continue_node_);

3467 AddAlternative(alt);

3468 continue_node_ = alt.node();

3469 }

3470

3471

3472 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

3473 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

3474 if (trace->stop_node() == this) {

3475 // Back edge of greedy optimized loop node graph.

3476 int text_length =

3477 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));

3478 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);

3479 // Update the counter-based backtracking info on the stack. This is an

3480 // optimization for greedy loops (see below).

3481 DCHECK(trace->cp_offset() == text_length);

3482 macro_assembler->AdvanceCurrentPosition(text_length);

3483 macro_assembler->GoTo(trace->loop_label());

3484 return;

3485 }

3486 DCHECK_NULL(trace->stop_node());

3487 if (!trace->is_trivial()) {

3488 trace->Flush(compiler, this);

3489 return;

3490 }

3491 ChoiceNode::Emit(compiler, trace);

3492 }

3493

3494

3495 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,

3496 int eats_at_least) {

3497 int preload_characters = Min(4, eats_at_least);

3498 if (compiler->macro_assembler()->CanReadUnaligned()) {

3499 bool one_byte = compiler->one_byte();

3500 if (one_byte) {

3501 if (preload_characters > 4) preload_characters = 4;

3502 // We can't preload 3 characters because there is no machine instruction

3503 // to do that. We can't just load 4 because we could be reading

3504 // beyond the end of the string, which could cause a memory fault.

3505 if (preload_characters == 3) preload_characters = 2;

3506 } else {

3507 if (preload_characters > 2) preload_characters = 2;

3508 }

3509 } else {

3510 if (preload_characters > 1) preload_characters = 1;

3511 }

3512 return preload_characters;

3513 }

3514

3515

3516 // This class is used when generating the alternatives in a choice node. It

3517 // records the way the alternative is being code generated.

3518 class AlternativeGeneration: public Malloced {

3519 public:

3520 AlternativeGeneration()

3521 : possible_success(),

3522 expects_preload(false),

3523 after(),

3524 quick_check_details() { }

3525 Label possible_success;

3526 bool expects_preload;

3527 Label after;

3528 QuickCheckDetails quick_check_details;

3529 };

3530

3531

3532 // Creates a list of AlternativeGenerations. If the list has a reasonable

3533 // size then it is on the stack, otherwise the excess is on the heap.

3534 class AlternativeGenerationList {

3535 public:

3536 AlternativeGenerationList(int count, Zone* zone)

3537 : alt_gens_(count, zone) {

3538 for (int i = 0; i < count && i < kAFew; i++) {

3539 alt_gens_.Add(a_few_alt_gens_ + i, zone);

3540 }

3541 for (int i = kAFew; i < count; i++) {

3542 alt_gens_.Add(new AlternativeGeneration(), zone);

3543 }

3544 }

3545 ~AlternativeGenerationList() {

3546 for (int i = kAFew; i < alt_gens_.length(); i++) {

3547 delete alt_gens_[i];

3548 alt_gens_[i] = NULL;

3549 }

3550 }

3551

3552 AlternativeGeneration* at(int i) {

3553 return alt_gens_[i];

3554 }

3555

3556 private:

3557 static const int kAFew = 10;

3558 ZoneList<AlternativeGeneration*> alt_gens_;

3559 AlternativeGeneration a_few_alt_gens_[kAFew];

3560 };

3561

3562

3563 // The '2' variant is has inclusive from and exclusive to.

3564 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,

3565 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.

3566 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,

3567 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,

3568 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,

3569 0xFEFF, 0xFF00, 0x10000 };

3570 static const int kSpaceRangeCount = arraysize(kSpaceRanges);

3571

3572 static const int kWordRanges[] = {

3573 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };

3574 static const int kWordRangeCount = arraysize(kWordRanges);

3575 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };

3576 static const int kDigitRangeCount = arraysize(kDigitRanges);

3577 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };

3578 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);

3579 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,

3580 0x2028, 0x202A, 0x10000 };

3581 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);

3582

3583

3584 void BoyerMoorePositionInfo::Set(int character) {

3585 SetInterval(Interval(character, character));

3586 }

3587

3588

3589 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {

3590 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);

3591 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);

3592 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);

3593 surrogate_ =

3594 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);

3595 if (interval.to() - interval.from() >= kMapSize - 1) {

3596 if (map_count_ != kMapSize) {

3597 map_count_ = kMapSize;

3598 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;

3599 }

3600 return;

3601 }

3602 for (int i = interval.from(); i <= interval.to(); i++) {

3603 int mod_character = (i & kMask);

3604 if (!map_->at(mod_character)) {

3605 map_count_++;

3606 map_->at(mod_character) = true;

3607 }

3608 if (map_count_ == kMapSize) return;

3609 }

3610 }

3611

3612

3613 void BoyerMoorePositionInfo::SetAll() {

3614 s_ = w_ = d_ = kLatticeUnknown;

3615 if (map_count_ != kMapSize) {

3616 map_count_ = kMapSize;

3617 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;

3618 }

3619 }

3620

3621

3622 BoyerMooreLookahead::BoyerMooreLookahead(

3623 int length, RegExpCompiler* compiler, Zone* zone)

3624 : length_(length),

3625 compiler_(compiler) {

3626 if (compiler->one_byte()) {

3627 max_char_ = String::kMaxOneByteCharCode;

3628 } else {

3629 max_char_ = String::kMaxUtf16CodeUnit;

3630 }

3631 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);

3632 for (int i = 0; i < length; i++) {

3633 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);

3634 }

3635 }

3636

3637

3638 // Find the longest range of lookahead that has the fewest number of different

3639 // characters that can occur at a given position. Since we are optimizing two

3640 // different parameters at once this is a tradeoff.

3641 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {

3642 int biggest_points = 0;

3643 // If more than 32 characters out of 128 can occur it is unlikely that we can

3644 // be lucky enough to step forwards much of the time.

3645 const int kMaxMax = 32;

3646 for (int max_number_of_chars = 4;

3647 max_number_of_chars < kMaxMax;

3648 max_number_of_chars *= 2) {

3649 biggest_points =

3650 FindBestInterval(max_number_of_chars, biggest_points, from, to);

3651 }

3652 if (biggest_points == 0) return false;

3653 return true;

3654 }

3655

3656

3657 // Find the highest-points range between 0 and length_ where the character

3658 // information is not too vague. 'Too vague' means that there are more than

3659 // max_number_of_chars that can occur at this position. Calculates the number

3660 // of points as the product of width-of-the-range and

3661 // probability-of-finding-one-of-the-characters, where the probability is

3662 // calculated using the frequency distribution of the sample subject string.

3663 int BoyerMooreLookahead::FindBestInterval(

3664 int max_number_of_chars, int old_biggest_points, int* from, int* to) {

3665 int biggest_points = old_biggest_points;

3666 static const int kSize = RegExpMacroAssembler::kTableSize;

3667 for (int i = 0; i < length_; ) {

3668 while (i < length_ && Count(i) > max_number_of_chars) i++;

3669 if (i == length_) break;

3670 int remembered_from = i;

3671 bool union_map[kSize];

3672 for (int j = 0; j < kSize; j++) union_map[j] = false;

3673 while (i < length_ && Count(i) <= max_number_of_chars) {

3674 BoyerMoorePositionInfo* map = bitmaps_->at(i);

3675 for (int j = 0; j < kSize; j++) union_map[j] \|= map->at(j);

3676 i++;

3677 }

3678 int frequency = 0;

3679 for (int j = 0; j < kSize; j++) {

3680 if (union_map[j]) {

3681 // Add 1 to the frequency to give a small per-character boost for

3682 // the cases where our sampling is not good enough and many

3683 // characters have a frequency of zero. This means the frequency

3684 // can theoretically be up to 2*kSize though we treat it mostly as

3685 // a fraction of kSize.

3686 frequency += compiler_->frequency_collator()->Frequency(j) + 1;

3687 }

3688 }

3689 // We use the probability of skipping times the distance we are skipping to

3690 // judge the effectiveness of this. Actually we have a cut-off: By

3691 // dividing by 2 we switch off the skipping if the probability of skipping

3692 // is less than 50%. This is because the multibyte mask-and-compare

3693 // skipping in quickcheck is more likely to do well on this case.

3694 bool in_quickcheck_range =

3695 ((i - remembered_from < 4) \|\|

3696 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));

3697 // Called 'probability' but it is only a rough estimate and can actually

3698 // be outside the 0-kSize range.

3699 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;

3700 int points = (i - remembered_from) * probability;

3701 if (points > biggest_points) {

3702 *from = remembered_from;

3703 *to = i - 1;

3704 biggest_points = points;

3705 }

3706 }

3707 return biggest_points;

3708 }

3709

3710

3711 // Take all the characters that will not prevent a successful match if they

3712 // occur in the subject string in the range between min_lookahead and

3713 // max_lookahead (inclusive) measured from the current position. If the

3714 // character at max_lookahead offset is not one of these characters, then we

3715 // can safely skip forwards by the number of characters in the range.

3716 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,

3717 int max_lookahead,

3718 Handle<ByteArray> boolean_skip_table) {

3719 const int kSize = RegExpMacroAssembler::kTableSize;

3720

3721 const int kSkipArrayEntry = 0;

3722 const int kDontSkipArrayEntry = 1;

3723

3724 for (int i = 0; i < kSize; i++) {

3725 boolean_skip_table->set(i, kSkipArrayEntry);

3726 }

3727 int skip = max_lookahead + 1 - min_lookahead;

3728

3729 for (int i = max_lookahead; i >= min_lookahead; i--) {

3730 BoyerMoorePositionInfo* map = bitmaps_->at(i);

3731 for (int j = 0; j < kSize; j++) {

3732 if (map->at(j)) {

3733 boolean_skip_table->set(j, kDontSkipArrayEntry);

3734 }

3735 }

3736 }

3737

3738 return skip;

3739 }

3740

3741

3742 // See comment above on the implementation of GetSkipTable.

3743 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {

3744 const int kSize = RegExpMacroAssembler::kTableSize;

3745

3746 int min_lookahead = 0;

3747 int max_lookahead = 0;

3748

3749 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;

3750

3751 bool found_single_character = false;

3752 int single_character = 0;

3753 for (int i = max_lookahead; i >= min_lookahead; i--) {

3754 BoyerMoorePositionInfo* map = bitmaps_->at(i);

3755 if (map->map_count() > 1 \|\|

3756 (found_single_character && map->map_count() != 0)) {

3757 found_single_character = false;

3758 break;

3759 }

3760 for (int j = 0; j < kSize; j++) {

3761 if (map->at(j)) {

3762 found_single_character = true;

3763 single_character = j;

3764 break;

3765 }

3766 }

3767 }

3768

3769 int lookahead_width = max_lookahead + 1 - min_lookahead;

3770

3771 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {

3772 // The mask-compare can probably handle this better.

3773 return;

3774 }

3775

3776 if (found_single_character) {

3777 Label cont, again;

3778 masm->Bind(&again);

3779 masm->LoadCurrentCharacter(max_lookahead, &cont, true);

3780 if (max_char_ > kSize) {

3781 masm->CheckCharacterAfterAnd(single_character,

3782 RegExpMacroAssembler::kTableMask,

3783 &cont);

3784 } else {

3785 masm->CheckCharacter(single_character, &cont);

3786 }

3787 masm->AdvanceCurrentPosition(lookahead_width);

3788 masm->GoTo(&again);

3789 masm->Bind(&cont);

3790 return;

3791 }

3792

3793 Factory* factory = masm->isolate()->factory();

3794 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);

3795 int skip_distance = GetSkipTable(

3796 min_lookahead, max_lookahead, boolean_skip_table);

3797 DCHECK(skip_distance != 0);

3798

3799 Label cont, again;

3800 masm->Bind(&again);

3801 masm->LoadCurrentCharacter(max_lookahead, &cont, true);

3802 masm->CheckBitInTable(boolean_skip_table, &cont);

3803 masm->AdvanceCurrentPosition(skip_distance);

3804 masm->GoTo(&again);

3805 masm->Bind(&cont);

3806 }

3807

3808

3809 /* Code generation for choice nodes.

3810 *

3811 * We generate quick checks that do a mask and compare to eliminate a

3812 * choice. If the quick check succeeds then it jumps to the continuation to

3813 * do slow checks and check subsequent nodes. If it fails (the common case)

3814 * it falls through to the next choice.

3815 *

3816 * Here is the desired flow graph. Nodes directly below each other imply

3817 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative

3818 * 3 doesn't have a quick check so we have to call the slow check.

3819 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire

3820 * regexp continuation is generated directly after the Sn node, up to the

3821 * next GoTo if we decide to reuse some already generated code. Some

3822 * nodes expect preload_characters to be preloaded into the current

3823 * character register. R nodes do this preloading. Vertices are marked

3824 * F for failures and S for success (possible success in the case of quick

3825 * nodes). L, V, < and > are used as arrow heads.

3826 *

3827 * ----------> R

3828 * \|

3829 * V

3830 * Q1 -----> S1

3831 * \| S /

3832 * F\| /

3833 * \| F/

3834 * \| /

3835 * \| R

3836 * \| /

3837 * V L

3838 * Q2 -----> S2

3839 * \| S /

3840 * F\| /

3841 * \| F/

3842 * \| /

3843 * \| R

3844 * \| /

3845 * V L

3846 * S3

3847 * \|

3848 * F\|

3849 * \|

3850 * R

3851 * \|

3852 * backtrack V

3853 * <----------Q4

3854 * \ F \|

3855 * \ \|S

3856 * \ F V

3857 * \-----S4

3858 *

3859 * For greedy loops we push the current position, then generate the code that

3860 * eats the input specially in EmitGreedyLoop. The other choice (the

3861 * continuation) is generated by the normal code in EmitChoices, and steps back

3862 * in the input to the starting position when it fails to match. The loop code

3863 * looks like this (U is the unwind code that steps back in the greedy loop).

3864 *

3865 * _____

3866 * / \

3867 * V \|

3868 * ----------> S1 \|

3869 * /\| \|

3870 * / \|S \|

3871 * F/ \_____/

3872 * /

3873 * \|<-----

3874 * \| \

3875 * V \|S

3876 * Q2 ---> U----->backtrack

3877 * \| F /

3878 * S\| /

3879 * V F /

3880 * S2--/

3881 */

3882

3883 GreedyLoopState::GreedyLoopState(bool not_at_start) {

3884 counter_backtrack_trace_.set_backtrack(&label_);

3885 if (not_at_start) counter_backtrack_trace_.set_at_start(false);

3886 }

3887

3888

3889 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {

3890 #ifdef DEBUG

3891 int choice_count = alternatives_->length();

3892 for (int i = 0; i < choice_count - 1; i++) {

3893 GuardedAlternative alternative = alternatives_->at(i);

3894 ZoneList<Guard> guards = alternative.guards();

3895 int guard_count = (guards == NULL) ? 0 : guards->length();

3896 for (int j = 0; j < guard_count; j++) {

3897 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));

3898 }

3899 }

3900 #endif

3901 }

3902

3903

3904 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,

3905 Trace* current_trace,

3906 PreloadState* state) {

3907 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {

3908 // Save some time by looking at most one machine word ahead.

3909 state->eats_at_least_ =

3910 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,

3911 current_trace->at_start() == Trace::FALSE_VALUE);

3912 }

3913 state->preload_characters_ =

3914 CalculatePreloadCharacters(compiler, state->eats_at_least_);

3915

3916 state->preload_is_current_ =

3917 (current_trace->characters_preloaded() == state->preload_characters_);

3918 state->preload_has_checked_bounds_ = state->preload_is_current_;

3919 }

3920

3921

3922 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

3923 int choice_count = alternatives_->length();

3924

3925 AssertGuardsMentionRegisters(trace);

3926

3927 LimitResult limit_result = LimitVersions(compiler, trace);

3928 if (limit_result == DONE) return;

3929 DCHECK(limit_result == CONTINUE);

3930

3931 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for

3932 // other choice nodes we only flush if we are out of code size budget.

3933 if (trace->flush_budget() == 0 && trace->actions() != NULL) {

3934 trace->Flush(compiler, this);

3935 return;

3936 }

3937

3938 RecursionCheck rc(compiler);

3939

3940 PreloadState preload;

3941 preload.init();

3942 GreedyLoopState greedy_loop_state(not_at_start());

3943

3944 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));

3945 AlternativeGenerationList alt_gens(choice_count, zone());

3946

3947 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {

3948 trace = EmitGreedyLoop(compiler,

3949 trace,

3950 &alt_gens,

3951 &preload,

3952 &greedy_loop_state,

3953 text_length);

3954 } else {

3955 // TODO(erikcorry): Delete this. We don't need this label, but it makes us

3956 // match the traces produced pre-cleanup.

3957 Label second_choice;

3958 compiler->macro_assembler()->Bind(&second_choice);

3959

3960 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);

3961

3962 EmitChoices(compiler,

3963 &alt_gens,

3964 0,

3965 trace,

3966 &preload);

3967 }

3968

3969 // At this point we need to generate slow checks for the alternatives where

3970 // the quick check was inlined. We can recognize these because the associated

3971 // label was bound.

3972 int new_flush_budget = trace->flush_budget() / choice_count;

3973 for (int i = 0; i < choice_count; i++) {

3974 AlternativeGeneration* alt_gen = alt_gens.at(i);

3975 Trace new_trace(*trace);

3976 // If there are actions to be flushed we have to limit how many times

3977 // they are flushed. Take the budget of the parent trace and distribute

3978 // it fairly amongst the children.

3979 if (new_trace.actions() != NULL) {

3980 new_trace.set_flush_budget(new_flush_budget);

3981 }

3982 bool next_expects_preload =

3983 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;

3984 EmitOutOfLineContinuation(compiler,

3985 &new_trace,

3986 alternatives_->at(i),

3987 alt_gen,

3988 preload.preload_characters_,

3989 next_expects_preload);

3990 }

3991 }

3992

3993

3994 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,

3995 Trace* trace,

3996 AlternativeGenerationList* alt_gens,

3997 PreloadState* preload,

3998 GreedyLoopState* greedy_loop_state,

3999 int text_length) {

4000 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

4001 // Here we have special handling for greedy loops containing only text nodes

4002 // and other simple nodes. These are handled by pushing the current

4003 // position on the stack and then incrementing the current position each

4004 // time around the switch. On backtrack we decrement the current position

4005 // and check it against the pushed value. This avoids pushing backtrack

4006 // information for each iteration of the loop, which could take up a lot of

4007 // space.

4008 DCHECK(trace->stop_node() == NULL);

4009 macro_assembler->PushCurrentPosition();

4010 Label greedy_match_failed;

4011 Trace greedy_match_trace;

4012 if (not_at_start()) greedy_match_trace.set_at_start(false);

4013 greedy_match_trace.set_backtrack(&greedy_match_failed);

4014 Label loop_label;

4015 macro_assembler->Bind(&loop_label);

4016 greedy_match_trace.set_stop_node(this);

4017 greedy_match_trace.set_loop_label(&loop_label);

4018 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);

4019 macro_assembler->Bind(&greedy_match_failed);

4020

4021 Label second_choice; // For use in greedy matches.

4022 macro_assembler->Bind(&second_choice);

4023

4024 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();

4025

4026 EmitChoices(compiler,

4027 alt_gens,

4028 1,

4029 new_trace,

4030 preload);

4031

4032 macro_assembler->Bind(greedy_loop_state->label());

4033 // If we have unwound to the bottom then backtrack.

4034 macro_assembler->CheckGreedyLoop(trace->backtrack());

4035 // Otherwise try the second priority at an earlier position.

4036 macro_assembler->AdvanceCurrentPosition(-text_length);

4037 macro_assembler->GoTo(&second_choice);

4038 return new_trace;

4039 }

4040

4041 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,

4042 Trace* trace) {

4043 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;

4044 if (alternatives_->length() != 2) return eats_at_least;

4045

4046 GuardedAlternative alt1 = alternatives_->at(1);

4047 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {

4048 return eats_at_least;

4049 }

4050 RegExpNode* eats_anything_node = alt1.node();

4051 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {

4052 return eats_at_least;

4053 }

4054

4055 // Really we should be creating a new trace when we execute this function,

4056 // but there is no need, because the code it generates cannot backtrack, and

4057 // we always arrive here with a trivial trace (since it's the entry to a

4058 // loop. That also implies that there are no preloaded characters, which is

4059 // good, because it means we won't be violating any assumptions by

4060 // overwriting those characters with new load instructions.

4061 DCHECK(trace->is_trivial());

4062

4063 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

4064 Isolate* isolate = macro_assembler->isolate();

4065 // At this point we know that we are at a non-greedy loop that will eat

4066 // any character one at a time. Any non-anchored regexp has such a

4067 // loop prepended to it in order to find where it starts. We look for

4068 // a pattern of the form ...abc... where we can look 6 characters ahead

4069 // and step forwards 3 if the character is not one of abc. Abc need

4070 // not be atoms, they can be any reasonably limited character class or

4071 // small alternation.

4072 BoyerMooreLookahead* bm = bm_info(false);

4073 if (bm == NULL) {

4074 eats_at_least = Min(kMaxLookaheadForBoyerMoore,

4075 EatsAtLeast(kMaxLookaheadForBoyerMoore,

4076 kRecursionBudget,

4077 false));

4078 if (eats_at_least >= 1) {

4079 bm = new(zone()) BoyerMooreLookahead(eats_at_least,

4080 compiler,

4081 zone());

4082 GuardedAlternative alt0 = alternatives_->at(0);

4083 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);

4084 }

4085 }

4086 if (bm != NULL) {

4087 bm->EmitSkipInstructions(macro_assembler);

4088 }

4089 return eats_at_least;

4090 }

4091

4092

4093 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,

4094 AlternativeGenerationList* alt_gens,

4095 int first_choice,

4096 Trace* trace,

4097 PreloadState* preload) {

4098 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

4099 SetUpPreLoad(compiler, trace, preload);

4100

4101 // For now we just call all choices one after the other. The idea ultimately

4102 // is to use the Dispatch table to try only the relevant ones.

4103 int choice_count = alternatives_->length();

4104

4105 int new_flush_budget = trace->flush_budget() / choice_count;

4106

4107 for (int i = first_choice; i < choice_count; i++) {

4108 bool is_last = i == choice_count - 1;

4109 bool fall_through_on_failure = !is_last;

4110 GuardedAlternative alternative = alternatives_->at(i);

4111 AlternativeGeneration* alt_gen = alt_gens->at(i);

4112 alt_gen->quick_check_details.set_characters(preload->preload_characters_);

4113 ZoneList<Guard> guards = alternative.guards();

4114 int guard_count = (guards == NULL) ? 0 : guards->length();

4115 Trace new_trace(*trace);

4116 new_trace.set_characters_preloaded(preload->preload_is_current_ ?

4117 preload->preload_characters_ :

4118 0);

4119 if (preload->preload_has_checked_bounds_) {

4120 new_trace.set_bound_checked_up_to(preload->preload_characters_);

4121 }

4122 new_trace.quick_check_performed()->Clear();

4123 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);

4124 if (!is_last) {

4125 new_trace.set_backtrack(&alt_gen->after);

4126 }

4127 alt_gen->expects_preload = preload->preload_is_current_;

4128 bool generate_full_check_inline = false;

4129 if (compiler->optimize() &&

4130 try_to_emit_quick_check_for_alternative(i == 0) &&

4131 alternative.node()->EmitQuickCheck(

4132 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,

4133 &alt_gen->possible_success, &alt_gen->quick_check_details,

4134 fall_through_on_failure)) {

4135 // Quick check was generated for this choice.

4136 preload->preload_is_current_ = true;

4137 preload->preload_has_checked_bounds_ = true;

4138 // If we generated the quick check to fall through on possible success,

4139 // we now need to generate the full check inline.

4140 if (!fall_through_on_failure) {

4141 macro_assembler->Bind(&alt_gen->possible_success);

4142 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);

4143 new_trace.set_characters_preloaded(preload->preload_characters_);

4144 new_trace.set_bound_checked_up_to(preload->preload_characters_);

4145 generate_full_check_inline = true;

4146 }

4147 } else if (alt_gen->quick_check_details.cannot_match()) {

4148 if (!fall_through_on_failure) {

4149 macro_assembler->GoTo(trace->backtrack());

4150 }

4151 continue;

4152 } else {

4153 // No quick check was generated. Put the full code here.

4154 // If this is not the first choice then there could be slow checks from

4155 // previous cases that go here when they fail. There's no reason to

4156 // insist that they preload characters since the slow check we are about

4157 // to generate probably can't use it.

4158 if (i != first_choice) {

4159 alt_gen->expects_preload = false;

4160 new_trace.InvalidateCurrentCharacter();

4161 }

4162 generate_full_check_inline = true;

4163 }

4164 if (generate_full_check_inline) {

4165 if (new_trace.actions() != NULL) {

4166 new_trace.set_flush_budget(new_flush_budget);

4167 }

4168 for (int j = 0; j < guard_count; j++) {

4169 GenerateGuard(macro_assembler, guards->at(j), &new_trace);

4170 }

4171 alternative.node()->Emit(compiler, &new_trace);

4172 preload->preload_is_current_ = false;

4173 }

4174 macro_assembler->Bind(&alt_gen->after);

4175 }

4176 }

4177

4178

4179 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,

4180 Trace* trace,

4181 GuardedAlternative alternative,

4182 AlternativeGeneration* alt_gen,

4183 int preload_characters,

4184 bool next_expects_preload) {

4185 if (!alt_gen->possible_success.is_linked()) return;

4186

4187 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();

4188 macro_assembler->Bind(&alt_gen->possible_success);

4189 Trace out_of_line_trace(*trace);

4190 out_of_line_trace.set_characters_preloaded(preload_characters);

4191 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);

4192 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);

4193 ZoneList<Guard> guards = alternative.guards();

4194 int guard_count = (guards == NULL) ? 0 : guards->length();

4195 if (next_expects_preload) {

4196 Label reload_current_char;

4197 out_of_line_trace.set_backtrack(&reload_current_char);

4198 for (int j = 0; j < guard_count; j++) {

4199 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);

4200 }

4201 alternative.node()->Emit(compiler, &out_of_line_trace);

4202 macro_assembler->Bind(&reload_current_char);

4203 // Reload the current character, since the next quick check expects that.

4204 // We don't need to check bounds here because we only get into this

4205 // code through a quick check which already did the checked load.

4206 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),

4207 NULL,

4208 false,

4209 preload_characters);

4210 macro_assembler->GoTo(&(alt_gen->after));

4211 } else {

4212 out_of_line_trace.set_backtrack(&(alt_gen->after));

4213 for (int j = 0; j < guard_count; j++) {

4214 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);

4215 }

4216 alternative.node()->Emit(compiler, &out_of_line_trace);

4217 }

4218 }

4219

4220

4221 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {

4222 RegExpMacroAssembler* assembler = compiler->macro_assembler();

4223 LimitResult limit_result = LimitVersions(compiler, trace);

4224 if (limit_result == DONE) return;

4225 DCHECK(limit_result == CONTINUE);

4226

4227 RecursionCheck rc(compiler);

4228

4229 switch (action_type_) {

4230 case STORE_POSITION: {

4231 Trace::DeferredCapture

4232 new_capture(data_.u_position_register.reg,

4233 data_.u_position_register.is_capture,

4234 trace);

4235 Trace new_trace = *trace;

4236 new_trace.add_action(&new_capture);

4237 on_success()->Emit(compiler, &new_trace);

4238 break;

4239 }

4240 case INCREMENT_REGISTER: {

4241 Trace::DeferredIncrementRegister

4242 new_increment(data_.u_increment_register.reg);

4243 Trace new_trace = *trace;

4244 new_trace.add_action(&new_increment);

4245 on_success()->Emit(compiler, &new_trace);

4246 break;

4247 }

4248 case SET_REGISTER: {

4249 Trace::DeferredSetRegister

4250 new_set(data_.u_store_register.reg, data_.u_store_register.value);

4251 Trace new_trace = *trace;

4252 new_trace.add_action(&new_set);

4253 on_success()->Emit(compiler, &new_trace);

4254 break;

4255 }

4256 case CLEAR_CAPTURES: {

4257 Trace::DeferredClearCaptures

4258 new_capture(Interval(data_.u_clear_captures.range_from,

4259 data_.u_clear_captures.range_to));

4260 Trace new_trace = *trace;

4261 new_trace.add_action(&new_capture);

4262 on_success()->Emit(compiler, &new_trace);

4263 break;

4264 }

4265 case BEGIN_SUBMATCH:

4266 if (!trace->is_trivial()) {

4267 trace->Flush(compiler, this);

4268 } else {

4269 assembler->WriteCurrentPositionToRegister(

4270 data_.u_submatch.current_position_register, 0);

4271 assembler->WriteStackPointerToRegister(

4272 data_.u_submatch.stack_pointer_register);

4273 on_success()->Emit(compiler, trace);

4274 }

4275 break;

4276 case EMPTY_MATCH_CHECK: {

4277 int start_pos_reg = data_.u_empty_match_check.start_register;

4278 int stored_pos = 0;

4279 int rep_reg = data_.u_empty_match_check.repetition_register;

4280 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);

4281 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);

4282 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {

4283 // If we know we haven't advanced and there is no minimum we

4284 // can just backtrack immediately.

4285 assembler->GoTo(trace->backtrack());

4286 } else if (know_dist && stored_pos < trace->cp_offset()) {

4287 // If we know we've advanced we can generate the continuation

4288 // immediately.

4289 on_success()->Emit(compiler, trace);

4290 } else if (!trace->is_trivial()) {

4291 trace->Flush(compiler, this);

4292 } else {

4293 Label skip_empty_check;

4294 // If we have a minimum number of repetitions we check the current

4295 // number first and skip the empty check if it's not enough.

4296 if (has_minimum) {

4297 int limit = data_.u_empty_match_check.repetition_limit;

4298 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);

4299 }

4300 // If the match is empty we bail out, otherwise we fall through

4301 // to the on-success continuation.

4302 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,

4303 trace->backtrack());

4304 assembler->Bind(&skip_empty_check);

4305 on_success()->Emit(compiler, trace);

4306 }

4307 break;

4308 }

4309 case POSITIVE_SUBMATCH_SUCCESS: {

4310 if (!trace->is_trivial()) {

4311 trace->Flush(compiler, this);

4312 return;

4313 }

4314 assembler->ReadCurrentPositionFromRegister(

4315 data_.u_submatch.current_position_register);

4316 assembler->ReadStackPointerFromRegister(

4317 data_.u_submatch.stack_pointer_register);

4318 int clear_register_count = data_.u_submatch.clear_register_count;

4319 if (clear_register_count == 0) {

4320 on_success()->Emit(compiler, trace);

4321 return;

4322 }

4323 int clear_registers_from = data_.u_submatch.clear_register_from;

4324 Label clear_registers_backtrack;

4325 Trace new_trace = *trace;

4326 new_trace.set_backtrack(&clear_registers_backtrack);

4327 on_success()->Emit(compiler, &new_trace);

4328

4329 assembler->Bind(&clear_registers_backtrack);

4330 int clear_registers_to = clear_registers_from + clear_register_count - 1;

4331 assembler->ClearRegisters(clear_registers_from, clear_registers_to);

4332

4333 DCHECK(trace->backtrack() == NULL);

4334 assembler->Backtrack();

4335 return;

4336 }

4337 default:

4338 UNREACHABLE();

4339 }

4340 }

4341

4342

4343 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

4344 RegExpMacroAssembler* assembler = compiler->macro_assembler();

4345 if (!trace->is_trivial()) {

4346 trace->Flush(compiler, this);

4347 return;

4348 }

4349

4350 LimitResult limit_result = LimitVersions(compiler, trace);

4351 if (limit_result == DONE) return;

4352 DCHECK(limit_result == CONTINUE);

4353

4354 RecursionCheck rc(compiler);

4355

4356 DCHECK_EQ(start_reg_ + 1, end_reg_);

4357 if (compiler->ignore_case()) {

4358 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,

4359 trace->backtrack());

4360 } else {

4361 assembler->CheckNotBackReference(start_reg_, trace->backtrack());

4362 }

4363 on_success()->Emit(compiler, trace);

4364 }

4365

4366

4367 // -------------------------------------------------------------------

4368 // Dot/dotty output

4369

4370

4371 #ifdef DEBUG

4372

4373

4374 class DotPrinter: public NodeVisitor {

4375 public:

4376 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT

4377 : os_(os),

4378 ignore_case_(ignore_case) {}

4379 void PrintNode(const char* label, RegExpNode* node);

4380 void Visit(RegExpNode* node);

4381 void PrintAttributes(RegExpNode* from);

4382 void PrintOnFailure(RegExpNode* from, RegExpNode* to);

4383 #define DECLARE_VISIT(Type) \

4384 virtual void Visit##Type(Type##Node* that);

4385 FOR_EACH_NODE_TYPE(DECLARE_VISIT)

4386 #undef DECLARE_VISIT

4387 private:

4388 std::ostream& os_;

4389 bool ignore_case_;

4390 };

4391

4392

4393 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {

4394 os_ << "digraph G {\n graph [label=\"";

4395 for (int i = 0; label[i]; i++) {

4396 switch (label[i]) {

4397 case '\\':

4398 os_ << "\\\\";

4399 break;

4400 case '"':

4401 os_ << "\"";

4402 break;

4403 default:

4404 os_ << label[i];

4405 break;

4406 }

4407 }

4408 os_ << "\"];\n";

4409 Visit(node);

4410 os_ << "}" << std::endl;

4411 }

4412

4413

4414 void DotPrinter::Visit(RegExpNode* node) {

4415 if (node->info()->visited) return;

4416 node->info()->visited = true;

4417 node->Accept(this);

4418 }

4419

4420

4421 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {

4422 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";

4423 Visit(on_failure);

4424 }

4425

4426

4427 class TableEntryBodyPrinter {

4428 public:

4429 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT

4430 : os_(os),

4431 choice_(choice) {}

4432 void Call(uc16 from, DispatchTable::Entry entry) {

4433 OutSet* out_set = entry.out_set();

4434 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

4435 if (out_set->Get(i)) {

4436 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"

4437 << choice()->alternatives()->at(i).node() << ";\n";

4438 }

4439 }

4440 }

4441 private:

4442 ChoiceNode* choice() { return choice_; }

4443 std::ostream& os_;

4444 ChoiceNode* choice_;

4445 };

4446

4447

4448 class TableEntryHeaderPrinter {

4449 public:

4450 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT

4451 : first_(true),

4452 os_(os) {}

4453 void Call(uc16 from, DispatchTable::Entry entry) {

4454 if (first_) {

4455 first_ = false;

4456 } else {

4457 os_ << "\|";

4458 }

4459 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "\|{";

4460 OutSet* out_set = entry.out_set();

4461 int priority = 0;

4462 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

4463 if (out_set->Get(i)) {

4464 if (priority > 0) os_ << "\|";

4465 os_ << "<s" << from << "o" << i << "> " << priority;

4466 priority++;

4467 }

4468 }

4469 os_ << "}}";

4470 }

4471

4472 private:

4473 bool first_;

4474 std::ostream& os_;

4475 };

4476

4477

4478 class AttributePrinter {

4479 public:

4480 explicit AttributePrinter(std::ostream& os) // NOLINT

4481 : os_(os),

4482 first_(true) {}

4483 void PrintSeparator() {

4484 if (first_) {

4485 first_ = false;

4486 } else {

4487 os_ << "\|";

4488 }

4489 }

4490 void PrintBit(const char* name, bool value) {

4491 if (!value) return;

4492 PrintSeparator();

4493 os_ << "{" << name << "}";

4494 }

4495 void PrintPositive(const char* name, int value) {

4496 if (value < 0) return;

4497 PrintSeparator();

4498 os_ << "{" << name << "\|" << value << "}";

4499 }

4500

4501 private:

4502 std::ostream& os_;

4503 bool first_;

4504 };

4505

4506

4507 void DotPrinter::PrintAttributes(RegExpNode* that) {

4508 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "

4509 << "margin=0.1, fontsize=10, label=\"{";

4510 AttributePrinter printer(os_);

4511 NodeInfo* info = that->info();

4512 printer.PrintBit("NI", info->follows_newline_interest);

4513 printer.PrintBit("WI", info->follows_word_interest);

4514 printer.PrintBit("SI", info->follows_start_interest);

4515 Label* label = that->label();

4516 if (label->is_bound())

4517 printer.PrintPositive("@", label->pos());

4518 os_ << "}\"];\n"

4519 << " a" << that << " -> n" << that

4520 << " [style=dashed, color=grey, arrowhead=none];\n";

4521 }

4522

4523

4524 static const bool kPrintDispatchTable = false;

4525 void DotPrinter::VisitChoice(ChoiceNode* that) {

4526 if (kPrintDispatchTable) {

4527 os_ << " n" << that << " [shape=Mrecord, label=\"";

4528 TableEntryHeaderPrinter header_printer(os_);

4529 that->GetTable(ignore_case_)->ForEach(&header_printer);

4530 os_ << "\"]\n";

4531 PrintAttributes(that);

4532 TableEntryBodyPrinter body_printer(os_, that);

4533 that->GetTable(ignore_case_)->ForEach(&body_printer);

4534 } else {

4535 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";

4536 for (int i = 0; i < that->alternatives()->length(); i++) {

4537 GuardedAlternative alt = that->alternatives()->at(i);

4538 os_ << " n" << that << " -> n" << alt.node();

4539 }

4540 }

4541 for (int i = 0; i < that->alternatives()->length(); i++) {

4542 GuardedAlternative alt = that->alternatives()->at(i);

4543 alt.node()->Accept(this);

4544 }

4545 }

4546

4547

4548 void DotPrinter::VisitText(TextNode* that) {

4549 Zone* zone = that->zone();

4550 os_ << " n" << that << " [label=\"";

4551 for (int i = 0; i < that->elements()->length(); i++) {

4552 if (i > 0) os_ << " ";

4553 TextElement elm = that->elements()->at(i);

4554 switch (elm.text_type()) {

4555 case TextElement::ATOM: {

4556 Vector<const uc16> data = elm.atom()->data();

4557 for (int i = 0; i < data.length(); i++) {

4558 os_ << static_cast<char>(data[i]);

4559 }

4560 break;

4561 }

4562 case TextElement::CHAR_CLASS: {

4563 RegExpCharacterClass* node = elm.char_class();

4564 os_ << "[";

4565 if (node->is_negated()) os_ << "^";

4566 for (int j = 0; j < node->ranges(zone)->length(); j++) {

4567 CharacterRange range = node->ranges(zone)->at(j);

4568 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());

4569 }

4570 os_ << "]";

4571 break;

4572 }

4573 default:

4574 UNREACHABLE();

4575 }

4576 }

4577 os_ << "\", shape=box, peripheries=2];\n";

4578 PrintAttributes(that);

4579 os_ << " n" << that << " -> n" << that->on_success() << ";\n";

4580 Visit(that->on_success());

4581 }

4582

4583

4584 void DotPrinter::VisitBackReference(BackReferenceNode* that) {

4585 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"

4586 << that->end_register() << "\", shape=doubleoctagon];\n";

4587 PrintAttributes(that);

4588 os_ << " n" << that << " -> n" << that->on_success() << ";\n";

4589 Visit(that->on_success());

4590 }

4591

4592

4593 void DotPrinter::VisitEnd(EndNode* that) {

4594 os_ << " n" << that << " [style=bold, shape=point];\n";

4595 PrintAttributes(that);

4596 }

4597

4598

4599 void DotPrinter::VisitAssertion(AssertionNode* that) {

4600 os_ << " n" << that << " [";

4601 switch (that->assertion_type()) {

4602 case AssertionNode::AT_END:

4603 os_ << "label=\"$\", shape=septagon";

4604 break;

4605 case AssertionNode::AT_START:

4606 os_ << "label=\"^\", shape=septagon";

4607 break;

4608 case AssertionNode::AT_BOUNDARY:

4609 os_ << "label=\"\\b\", shape=septagon";

4610 break;

4611 case AssertionNode::AT_NON_BOUNDARY:

4612 os_ << "label=\"\\B\", shape=septagon";

4613 break;

4614 case AssertionNode::AFTER_NEWLINE:

4615 os_ << "label=\"(?<=\\n)\", shape=septagon";

4616 break;

4617 }

4618 os_ << "];\n";

4619 PrintAttributes(that);

4620 RegExpNode* successor = that->on_success();

4621 os_ << " n" << that << " -> n" << successor << ";\n";

4622 Visit(successor);

4623 }

4624

4625

4626 void DotPrinter::VisitAction(ActionNode* that) {

4627 os_ << " n" << that << " [";

4628 switch (that->action_type_) {

4629 case ActionNode::SET_REGISTER:

4630 os_ << "label=\"$" << that->data_.u_store_register.reg

4631 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";

4632 break;

4633 case ActionNode::INCREMENT_REGISTER:

4634 os_ << "label=\"$" << that->data_.u_increment_register.reg

4635 << "++\", shape=octagon";

4636 break;

4637 case ActionNode::STORE_POSITION:

4638 os_ << "label=\"$" << that->data_.u_position_register.reg

4639 << ":=$pos\", shape=octagon";

4640 break;

4641 case ActionNode::BEGIN_SUBMATCH:

4642 os_ << "label=\"$" << that->data_.u_submatch.current_position_register

4643 << ":=$pos,begin\", shape=septagon";

4644 break;

4645 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:

4646 os_ << "label=\"escape\", shape=septagon";

4647 break;

4648 case ActionNode::EMPTY_MATCH_CHECK:

4649 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register

4650 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register

4651 << "<" << that->data_.u_empty_match_check.repetition_limit

4652 << "?\", shape=septagon";

4653 break;

4654 case ActionNode::CLEAR_CAPTURES: {

4655 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from

4656 << " to $" << that->data_.u_clear_captures.range_to

4657 << "\", shape=septagon";

4658 break;

4659 }

4660 }

4661 os_ << "];\n";

4662 PrintAttributes(that);

4663 RegExpNode* successor = that->on_success();

4664 os_ << " n" << that << " -> n" << successor << ";\n";

4665 Visit(successor);

4666 }

4667

4668

4669 class DispatchTableDumper {

4670 public:

4671 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}

4672 void Call(uc16 key, DispatchTable::Entry entry);

4673 private:

4674 std::ostream& os_;

4675 };

4676

4677

4678 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {

4679 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";

4680 OutSet* set = entry.out_set();

4681 bool first = true;

4682 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {

4683 if (set->Get(i)) {

4684 if (first) {

4685 first = false;

4686 } else {

4687 os_ << ", ";

4688 }

4689 os_ << i;

4690 }

4691 }

4692 os_ << "}\n";

4693 }

4694

4695

4696 void DispatchTable::Dump() {

4697 OFStream os(stderr);

4698 DispatchTableDumper dumper(os);

4699 tree()->ForEach(&dumper);

4700 }

4701

4702

4703 void RegExpEngine::DotPrint(const char* label,

4704 RegExpNode* node,

4705 bool ignore_case) {

4706 OFStream os(stdout);

4707 DotPrinter printer(os, ignore_case);

4708 printer.PrintNode(label, node);

4709 }

4710

4711

4712 #endif // DEBUG

4713

4714

4715 // -------------------------------------------------------------------

4716 // Tree to graph conversion

4717

4718 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,

4719 RegExpNode* on_success) {

4720 ZoneList<TextElement>* elms =

4721 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());

4722 elms->Add(TextElement::Atom(this), compiler->zone());

4723 return new(compiler->zone()) TextNode(elms, on_success);

4724 }

4725

4726

4727 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,

4728 RegExpNode* on_success) {

4729 return new(compiler->zone()) TextNode(elements(), on_success);

4730 }

4731

4732

4733 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,

4734 const int* special_class,

4735 int length) {

4736 length--; // Remove final 0x10000.

4737 DCHECK(special_class[length] == 0x10000);

4738 DCHECK(ranges->length() != 0);

4739 DCHECK(length != 0);

4740 DCHECK(special_class[0] != 0);

4741 if (ranges->length() != (length >> 1) + 1) {

4742 return false;

4743 }

4744 CharacterRange range = ranges->at(0);

4745 if (range.from() != 0) {

4746 return false;

4747 }

4748 for (int i = 0; i < length; i += 2) {

4749 if (special_class[i] != (range.to() + 1)) {

4750 return false;

4751 }

4752 range = ranges->at((i >> 1) + 1);

4753 if (special_class[i+1] != range.from()) {

4754 return false;

4755 }

4756 }

4757 if (range.to() != 0xffff) {

4758 return false;

4759 }

4760 return true;

4761 }

4762

4763

4764 static bool CompareRanges(ZoneList<CharacterRange>* ranges,

4765 const int* special_class,

4766 int length) {

4767 length--; // Remove final 0x10000.

4768 DCHECK(special_class[length] == 0x10000);

4769 if (ranges->length() * 2 != length) {

4770 return false;

4771 }

4772 for (int i = 0; i < length; i += 2) {

4773 CharacterRange range = ranges->at(i >> 1);

4774 if (range.from() != special_class[i] \|\|

4775 range.to() != special_class[i + 1] - 1) {

4776 return false;

4777 }

4778 }

4779 return true;

4780 }

4781

4782

4783 bool RegExpCharacterClass::is_standard(Zone* zone) {

4784 // TODO(lrn): Remove need for this function, by not throwing away information

4785 // along the way.

4786 if (is_negated_) {

4787 return false;

4788 }

4789 if (set_.is_standard()) {

4790 return true;

4791 }

4792 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {

4793 set_.set_standard_set_type('s');

4794 return true;

4795 }

4796 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {

4797 set_.set_standard_set_type('S');

4798 return true;

4799 }

4800 if (CompareInverseRanges(set_.ranges(zone),

4801 kLineTerminatorRanges,

4802 kLineTerminatorRangeCount)) {

4803 set_.set_standard_set_type('.');

4804 return true;

4805 }

4806 if (CompareRanges(set_.ranges(zone),

4807 kLineTerminatorRanges,

4808 kLineTerminatorRangeCount)) {

4809 set_.set_standard_set_type('n');

4810 return true;

4811 }

4812 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {

4813 set_.set_standard_set_type('w');

4814 return true;

4815 }

4816 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {

4817 set_.set_standard_set_type('W');

4818 return true;

4819 }

4820 return false;

4821 }

4822

4823

4824 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

4825 RegExpNode* on_success) {

4826 return new(compiler->zone()) TextNode(this, on_success);

4827 }

4828

4829

4830 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {

4831 RegExpAtom* atom1 = (*a)->AsAtom();

4832 RegExpAtom* atom2 = (*b)->AsAtom();

4833 uc16 character1 = atom1->data().at(0);

4834 uc16 character2 = atom2->data().at(0);

4835 if (character1 < character2) return -1;

4836 if (character1 > character2) return 1;

4837 return 0;

4838 }

4839

4840

4841 static unibrow::uchar Canonical(

4842 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

4843 unibrow::uchar c) {

4844 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];

4845 int length = canonicalize->get(c, '\0', chars);

4846 DCHECK_LE(length, 1);

4847 unibrow::uchar canonical = c;

4848 if (length == 1) canonical = chars[0];

4849 return canonical;

4850 }

4851

4852

4853 int CompareFirstCharCaseIndependent(

4854 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

4855 RegExpTree* const* a, RegExpTree* const* b) {

4856 RegExpAtom* atom1 = (*a)->AsAtom();

4857 RegExpAtom* atom2 = (*b)->AsAtom();

4858 unibrow::uchar character1 = atom1->data().at(0);

4859 unibrow::uchar character2 = atom2->data().at(0);

4860 if (character1 == character2) return 0;

4861 if (character1 >= 'a' \|\| character2 >= 'a') {

4862 character1 = Canonical(canonicalize, character1);

4863 character2 = Canonical(canonicalize, character2);

4864 }

4865 return static_cast<int>(character1) - static_cast<int>(character2);

4866 }

4867

4868

4869 // We can stable sort runs of atoms, since the order does not matter if they

4870 // start with different characters.

4871 // Returns true if any consecutive atoms were found.

4872 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {

4873 ZoneList<RegExpTree> alternatives = this->alternatives();

4874 int length = alternatives->length();

4875 bool found_consecutive_atoms = false;

4876 for (int i = 0; i < length; i++) {

4877 while (i < length) {

4878 RegExpTree* alternative = alternatives->at(i);

4879 if (alternative->IsAtom()) break;

4880 i++;

4881 }

4882 // i is length or it is the index of an atom.

4883 if (i == length) break;

4884 int first_atom = i;

4885 i++;

4886 while (i < length) {

4887 RegExpTree* alternative = alternatives->at(i);

4888 if (!alternative->IsAtom()) break;

4889 i++;

4890 }

4891 // Sort atoms to get ones with common prefixes together.

4892 // This step is more tricky if we are in a case-independent regexp,

4893 // because it would change /is\|I/ to /I\|is/, and order matters when

4894 // the regexp parts don't match only disjoint starting points. To fix

4895 // this we have a version of CompareFirstChar that uses case-

4896 // independent character classes for comparison.

4897 DCHECK_LT(first_atom, alternatives->length());

4898 DCHECK_LE(i, alternatives->length());

4899 DCHECK_LE(first_atom, i);

4900 if (compiler->ignore_case()) {

4901 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

4902 compiler->isolate()->regexp_macro_assembler_canonicalize();

4903 auto compare_closure =

4904 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {

4905 return CompareFirstCharCaseIndependent(canonicalize, a, b);

4906 };

4907 alternatives->StableSort(compare_closure, first_atom, i - first_atom);

4908 } else {

4909 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);

4910 }

4911 if (i - first_atom > 1) found_consecutive_atoms = true;

4912 }

4913 return found_consecutive_atoms;

4914 }

4915

4916

4917 // Optimizes ab\|ac\|az to a(?:b\|c\|d).

4918 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

4919 Zone* zone = compiler->zone();

4920 ZoneList<RegExpTree> alternatives = this->alternatives();

4921 int length = alternatives->length();

4922

4923 int write_posn = 0;

4924 int i = 0;

4925 while (i < length) {

4926 RegExpTree* alternative = alternatives->at(i);

4927 if (!alternative->IsAtom()) {

4928 alternatives->at(write_posn++) = alternatives->at(i);

4929 i++;

4930 continue;

4931 }

4932 RegExpAtom* atom = alternative->AsAtom();

4933 unibrow::uchar common_prefix = atom->data().at(0);

4934 int first_with_prefix = i;

4935 int prefix_length = atom->length();

4936 i++;

4937 while (i < length) {

4938 alternative = alternatives->at(i);

4939 if (!alternative->IsAtom()) break;

4940 atom = alternative->AsAtom();

4941 unibrow::uchar new_prefix = atom->data().at(0);

4942 if (new_prefix != common_prefix) {

4943 if (!compiler->ignore_case()) break;

4944 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

4945 compiler->isolate()->regexp_macro_assembler_canonicalize();

4946 new_prefix = Canonical(canonicalize, new_prefix);

4947 common_prefix = Canonical(canonicalize, common_prefix);

4948 if (new_prefix != common_prefix) break;

4949 }

4950 prefix_length = Min(prefix_length, atom->length());

4951 i++;

4952 }

4953 if (i > first_with_prefix + 2) {

4954 // Found worthwhile run of alternatives with common prefix of at least one

4955 // character. The sorting function above did not sort on more than one

4956 // character for reasons of correctness, but there may still be a longer

4957 // common prefix if the terms were similar or presorted in the input.

4958 // Find out how long the common prefix is.

4959 int run_length = i - first_with_prefix;

4960 atom = alternatives->at(first_with_prefix)->AsAtom();

4961 for (int j = 1; j < run_length && prefix_length > 1; j++) {

4962 RegExpAtom* old_atom =

4963 alternatives->at(j + first_with_prefix)->AsAtom();

4964 for (int k = 1; k < prefix_length; k++) {

4965 if (atom->data().at(k) != old_atom->data().at(k)) {

4966 prefix_length = k;

4967 break;

4968 }

4969 }

4970 }

4971 RegExpAtom* prefix =

4972 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));

4973 ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);

4974 pair->Add(prefix, zone);

4975 ZoneList<RegExpTree> suffixes =

4976 new (zone) ZoneList<RegExpTree*>(run_length, zone);

4977 for (int j = 0; j < run_length; j++) {

4978 RegExpAtom* old_atom =

4979 alternatives->at(j + first_with_prefix)->AsAtom();

4980 int len = old_atom->length();

4981 if (len == prefix_length) {

4982 suffixes->Add(new (zone) RegExpEmpty(), zone);

4983 } else {

4984 RegExpTree* suffix = new (zone) RegExpAtom(

4985 old_atom->data().SubVector(prefix_length, old_atom->length()));

4986 suffixes->Add(suffix, zone);

4987 }

4988 }

4989 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);

4990 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);

4991 } else {

4992 // Just copy any non-worthwhile alternatives.

4993 for (int j = first_with_prefix; j < i; j++) {

4994 alternatives->at(write_posn++) = alternatives->at(j);

4995 }

4996 }

4997 }

4998 alternatives->Rewind(write_posn); // Trim end of array.

4999 }

5000

5001

5002 // Optimizes b\|c\|z to [bcz].

5003 void RegExpDisjunction::FixSingleCharacterDisjunctions(

5004 RegExpCompiler* compiler) {

5005 Zone* zone = compiler->zone();

5006 ZoneList<RegExpTree> alternatives = this->alternatives();

5007 int length = alternatives->length();

5008

5009 int write_posn = 0;

5010 int i = 0;

5011 while (i < length) {

5012 RegExpTree* alternative = alternatives->at(i);

5013 if (!alternative->IsAtom()) {

5014 alternatives->at(write_posn++) = alternatives->at(i);

5015 i++;

5016 continue;

5017 }

5018 RegExpAtom* atom = alternative->AsAtom();

5019 if (atom->length() != 1) {

5020 alternatives->at(write_posn++) = alternatives->at(i);

5021 i++;

5022 continue;

5023 }

5024 int first_in_run = i;

5025 i++;

5026 while (i < length) {

5027 alternative = alternatives->at(i);

5028 if (!alternative->IsAtom()) break;

5029 atom = alternative->AsAtom();

5030 if (atom->length() != 1) break;

5031 i++;

5032 }

5033 if (i > first_in_run + 1) {

5034 // Found non-trivial run of single-character alternatives.

5035 int run_length = i - first_in_run;

5036 ZoneList<CharacterRange>* ranges =

5037 new (zone) ZoneList<CharacterRange>(2, zone);

5038 for (int j = 0; j < run_length; j++) {

5039 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();

5040 DCHECK_EQ(old_atom->length(), 1);

5041 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);

5042 }

5043 alternatives->at(write_posn++) =

5044 new (zone) RegExpCharacterClass(ranges, false);

5045 } else {

5046 // Just copy any trivial alternatives.

5047 for (int j = first_in_run; j < i; j++) {

5048 alternatives->at(write_posn++) = alternatives->at(j);

5049 }

5050 }

5051 }

5052 alternatives->Rewind(write_posn); // Trim end of array.

5053 }

5054

5055

5056 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,

5057 RegExpNode* on_success) {

5058 ZoneList<RegExpTree> alternatives = this->alternatives();

5059

5060 if (alternatives->length() > 2) {

5061 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);

5062 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);

5063 FixSingleCharacterDisjunctions(compiler);

5064 if (alternatives->length() == 1) {

5065 return alternatives->at(0)->ToNode(compiler, on_success);

5066 }

5067 }

5068

5069 int length = alternatives->length();

5070

5071 ChoiceNode* result =

5072 new(compiler->zone()) ChoiceNode(length, compiler->zone());

5073 for (int i = 0; i < length; i++) {

5074 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,

5075 on_success));

5076 result->AddAlternative(alternative);

5077 }

5078 return result;

5079 }

5080

5081

5082 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,

5083 RegExpNode* on_success) {

5084 return ToNode(min(),

5085 max(),

5086 is_greedy(),

5087 body(),

5088 compiler,

5089 on_success);

5090 }

5091

5092

5093 // Scoped object to keep track of how much we unroll quantifier loops in the

5094 // regexp graph generator.

5095 class RegExpExpansionLimiter {

5096 public:

5097 static const int kMaxExpansionFactor = 6;

5098 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)

5099 : compiler_(compiler),

5100 saved_expansion_factor_(compiler->current_expansion_factor()),

5101 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {

5102 DCHECK(factor > 0);

5103 if (ok_to_expand_) {

5104 if (factor > kMaxExpansionFactor) {

5105 // Avoid integer overflow of the current expansion factor.

5106 ok_to_expand_ = false;

5107 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);

5108 } else {

5109 int new_factor = saved_expansion_factor_ * factor;

5110 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);

5111 compiler->set_current_expansion_factor(new_factor);

5112 }

5113 }

5114 }

5115

5116 ~RegExpExpansionLimiter() {

5117 compiler_->set_current_expansion_factor(saved_expansion_factor_);

5118 }

5119

5120 bool ok_to_expand() { return ok_to_expand_; }

5121

5122 private:

5123 RegExpCompiler* compiler_;

5124 int saved_expansion_factor_;

5125 bool ok_to_expand_;

5126

5127 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);

5128 };

5129

5130

5131 RegExpNode* RegExpQuantifier::ToNode(int min,

5132 int max,

5133 bool is_greedy,

5134 RegExpTree* body,

5135 RegExpCompiler* compiler,

5136 RegExpNode* on_success,

5137 bool not_at_start) {

5138 // x{f, t} becomes this:

5139 //

5140 // (r++)<-.

5141 // \| `

5142 // \| (x)

5143 // v ^

5144 // (r=0)-->(?)---/ [if r < t]

5145 // \|

5146 // [if r >= f] \----> ...

5147 //

5148

5149 // 15.10.2.5 RepeatMatcher algorithm.

5150 // The parser has already eliminated the case where max is 0. In the case

5151 // where max_match is zero the parser has removed the quantifier if min was

5152 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.

5153

5154 // If we know that we cannot match zero length then things are a little

5155 // simpler since we don't need to make the special zero length match check

5156 // from step 2.1. If the min and max are small we can unroll a little in

5157 // this case.

5158 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}

5159 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}

5160 if (max == 0) return on_success; // This can happen due to recursion.

5161 bool body_can_be_empty = (body->min_match() == 0);

5162 int body_start_reg = RegExpCompiler::kNoRegister;

5163 Interval capture_registers = body->CaptureRegisters();

5164 bool needs_capture_clearing = !capture_registers.is_empty();

5165 Zone* zone = compiler->zone();

5166

5167 if (body_can_be_empty) {

5168 body_start_reg = compiler->AllocateRegister();

5169 } else if (compiler->optimize() && !needs_capture_clearing) {

5170 // Only unroll if there are no captures and the body can't be

5171 // empty.

5172 {

5173 RegExpExpansionLimiter limiter(

5174 compiler, min + ((max != min) ? 1 : 0));

5175 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {

5176 int new_max = (max == kInfinity) ? max : max - min;

5177 // Recurse once to get the loop or optional matches after the fixed

5178 // ones.

5179 RegExpNode* answer = ToNode(

5180 0, new_max, is_greedy, body, compiler, on_success, true);

5181 // Unroll the forced matches from 0 to min. This can cause chains of

5182 // TextNodes (which the parser does not generate). These should be

5183 // combined if it turns out they hinder good code generation.

5184 for (int i = 0; i < min; i++) {

5185 answer = body->ToNode(compiler, answer);

5186 }

5187 return answer;

5188 }

5189 }

5190 if (max <= kMaxUnrolledMaxMatches && min == 0) {

5191 DCHECK(max > 0); // Due to the 'if' above.

5192 RegExpExpansionLimiter limiter(compiler, max);

5193 if (limiter.ok_to_expand()) {

5194 // Unroll the optional matches up to max.

5195 RegExpNode* answer = on_success;

5196 for (int i = 0; i < max; i++) {

5197 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);

5198 if (is_greedy) {

5199 alternation->AddAlternative(

5200 GuardedAlternative(body->ToNode(compiler, answer)));

5201 alternation->AddAlternative(GuardedAlternative(on_success));

5202 } else {

5203 alternation->AddAlternative(GuardedAlternative(on_success));

5204 alternation->AddAlternative(

5205 GuardedAlternative(body->ToNode(compiler, answer)));

5206 }

5207 answer = alternation;

5208 if (not_at_start) alternation->set_not_at_start();

5209 }

5210 return answer;

5211 }

5212 }

5213 }

5214 bool has_min = min > 0;

5215 bool has_max = max < RegExpTree::kInfinity;

5216 bool needs_counter = has_min \|\| has_max;

5217 int reg_ctr = needs_counter

5218 ? compiler->AllocateRegister()

5219 : RegExpCompiler::kNoRegister;

5220 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,

5221 zone);

5222 if (not_at_start) center->set_not_at_start();

5223 RegExpNode* loop_return = needs_counter

5224 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))

5225 : static_cast<RegExpNode*>(center);

5226 if (body_can_be_empty) {

5227 // If the body can be empty we need to check if it was and then

5228 // backtrack.

5229 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,

5230 reg_ctr,

5231 min,

5232 loop_return);

5233 }

5234 RegExpNode* body_node = body->ToNode(compiler, loop_return);

5235 if (body_can_be_empty) {

5236 // If the body can be empty we need to store the start position

5237 // so we can bail out if it was empty.

5238 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);

5239 }

5240 if (needs_capture_clearing) {

5241 // Before entering the body of this loop we need to clear captures.

5242 body_node = ActionNode::ClearCaptures(capture_registers, body_node);

5243 }

5244 GuardedAlternative body_alt(body_node);

5245 if (has_max) {

5246 Guard* body_guard =

5247 new(zone) Guard(reg_ctr, Guard::LT, max);

5248 body_alt.AddGuard(body_guard, zone);

5249 }

5250 GuardedAlternative rest_alt(on_success);

5251 if (has_min) {

5252 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);

5253 rest_alt.AddGuard(rest_guard, zone);

5254 }

5255 if (is_greedy) {

5256 center->AddLoopAlternative(body_alt);

5257 center->AddContinueAlternative(rest_alt);

5258 } else {

5259 center->AddContinueAlternative(rest_alt);

5260 center->AddLoopAlternative(body_alt);

5261 }

5262 if (needs_counter) {

5263 return ActionNode::SetRegister(reg_ctr, 0, center);

5264 } else {

5265 return center;

5266 }

5267 }

5268

5269

5270 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,

5271 RegExpNode* on_success) {

5272 NodeInfo info;

5273 Zone* zone = compiler->zone();

5274

5275 switch (assertion_type()) {

5276 case START_OF_LINE:

5277 return AssertionNode::AfterNewline(on_success);

5278 case START_OF_INPUT:

5279 return AssertionNode::AtStart(on_success);

5280 case BOUNDARY:

5281 return AssertionNode::AtBoundary(on_success);

5282 case NON_BOUNDARY:

5283 return AssertionNode::AtNonBoundary(on_success);

5284 case END_OF_INPUT:

5285 return AssertionNode::AtEnd(on_success);

5286 case END_OF_LINE: {

5287 // Compile $ in multiline regexps as an alternation with a positive

5288 // lookahead in one side and an end-of-input on the other side.

5289 // We need two registers for the lookahead.

5290 int stack_pointer_register = compiler->AllocateRegister();

5291 int position_register = compiler->AllocateRegister();

5292 // The ChoiceNode to distinguish between a newline and end-of-input.

5293 ChoiceNode* result = new(zone) ChoiceNode(2, zone);

5294 // Create a newline atom.

5295 ZoneList<CharacterRange>* newline_ranges =

5296 new(zone) ZoneList<CharacterRange>(3, zone);

5297 CharacterRange::AddClassEscape('n', newline_ranges, zone);

5298 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');

5299 TextNode* newline_matcher = new(zone) TextNode(

5300 newline_atom,

5301 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,

5302 position_register,

5303 0, // No captures inside.

5304 -1, // Ignored if no captures.

5305 on_success));

5306 // Create an end-of-input matcher.

5307 RegExpNode* end_of_line = ActionNode::BeginSubmatch(

5308 stack_pointer_register,

5309 position_register,

5310 newline_matcher);

5311 // Add the two alternatives to the ChoiceNode.

5312 GuardedAlternative eol_alternative(end_of_line);

5313 result->AddAlternative(eol_alternative);

5314 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));

5315 result->AddAlternative(end_alternative);

5316 return result;

5317 }

5318 default:

5319 UNREACHABLE();

5320 }

5321 return on_success;

5322 }

5323

5324

5325 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,

5326 RegExpNode* on_success) {

5327 return new(compiler->zone())

5328 BackReferenceNode(RegExpCapture::StartRegister(index()),

5329 RegExpCapture::EndRegister(index()),

5330 on_success);

5331 }

5332

5333

5334 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,

5335 RegExpNode* on_success) {

5336 return on_success;

5337 }

5338

5339

5340 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,

5341 RegExpNode* on_success) {

5342 int stack_pointer_register = compiler->AllocateRegister();

5343 int position_register = compiler->AllocateRegister();

5344

5345 const int registers_per_capture = 2;

5346 const int register_of_first_capture = 2;

5347 int register_count = capture_count_ * registers_per_capture;

5348 int register_start =

5349 register_of_first_capture + capture_from_ * registers_per_capture;

5350

5351 RegExpNode* success;

5352 if (is_positive()) {

5353 RegExpNode* node = ActionNode::BeginSubmatch(

5354 stack_pointer_register,

5355 position_register,

5356 body()->ToNode(

5357 compiler,

5358 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,

5359 position_register,

5360 register_count,

5361 register_start,

5362 on_success)));

5363 return node;

5364 } else {

5365 // We use a ChoiceNode for a negative lookahead because it has most of

5366 // the characteristics we need. It has the body of the lookahead as its

5367 // first alternative and the expression after the lookahead of the second

5368 // alternative. If the first alternative succeeds then the

5369 // NegativeSubmatchSuccess will unwind the stack including everything the

5370 // choice node set up and backtrack. If the first alternative fails then

5371 // the second alternative is tried, which is exactly the desired result

5372 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special

5373 // ChoiceNode that knows to ignore the first exit when calculating quick

5374 // checks.

5375 Zone* zone = compiler->zone();

5376

5377 GuardedAlternative body_alt(

5378 body()->ToNode(

5379 compiler,

5380 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,

5381 position_register,

5382 register_count,

5383 register_start,

5384 zone)));

5385 ChoiceNode* choice_node =

5386 new(zone) NegativeLookaheadChoiceNode(body_alt,

5387 GuardedAlternative(on_success),

5388 zone);

5389 return ActionNode::BeginSubmatch(stack_pointer_register,

5390 position_register,

5391 choice_node);

5392 }

5393 }

5394

5395

5396 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,

5397 RegExpNode* on_success) {

5398 return ToNode(body(), index(), compiler, on_success);

5399 }

5400

5401

5402 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,

5403 int index,

5404 RegExpCompiler* compiler,

5405 RegExpNode* on_success) {

5406 int start_reg = RegExpCapture::StartRegister(index);

5407 int end_reg = RegExpCapture::EndRegister(index);

5408 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);

5409 RegExpNode* body_node = body->ToNode(compiler, store_end);

5410 return ActionNode::StorePosition(start_reg, true, body_node);

5411 }

5412

5413

5414 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,

5415 RegExpNode* on_success) {

5416 ZoneList<RegExpTree> children = nodes();

5417 RegExpNode* current = on_success;

5418 for (int i = children->length() - 1; i >= 0; i--) {

5419 current = children->at(i)->ToNode(compiler, current);

5420 }

5421 return current;

5422 }

5423

5424

5425 static void AddClass(const int* elmv,

5426 int elmc,

5427 ZoneList<CharacterRange>* ranges,

5428 Zone* zone) {

5429 elmc--;

5430 DCHECK(elmv[elmc] == 0x10000);

5431 for (int i = 0; i < elmc; i += 2) {

5432 DCHECK(elmv[i] < elmv[i + 1]);

5433 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);

5434 }

5435 }

5436

5437

5438 static void AddClassNegated(const int *elmv,

5439 int elmc,

5440 ZoneList<CharacterRange>* ranges,

5441 Zone* zone) {

5442 elmc--;

5443 DCHECK(elmv[elmc] == 0x10000);

5444 DCHECK(elmv[0] != 0x0000);

5445 DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);

5446 uc16 last = 0x0000;

5447 for (int i = 0; i < elmc; i += 2) {

5448 DCHECK(last <= elmv[i] - 1);

5449 DCHECK(elmv[i] < elmv[i + 1]);

5450 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);

5451 last = elmv[i + 1];

5452 }

5453 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);

5454 }

5455

5456

5457 void CharacterRange::AddClassEscape(uc16 type,

5458 ZoneList<CharacterRange>* ranges,

5459 Zone* zone) {

5460 switch (type) {

5461 case 's':

5462 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);

5463 break;

5464 case 'S':

5465 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);

5466 break;

5467 case 'w':

5468 AddClass(kWordRanges, kWordRangeCount, ranges, zone);

5469 break;

5470 case 'W':

5471 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);

5472 break;

5473 case 'd':

5474 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);

5475 break;

5476 case 'D':

5477 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);

5478 break;

5479 case '.':

5480 AddClassNegated(kLineTerminatorRanges,

5481 kLineTerminatorRangeCount,

5482 ranges,

5483 zone);

5484 break;

5485 // This is not a character range as defined by the spec but a

5486 // convenient shorthand for a character class that matches any

5487 // character.

5488 case '*':

5489 ranges->Add(CharacterRange::Everything(), zone);

5490 break;

5491 // This is the set of characters matched by the $ and ^ symbols

5492 // in multiline mode.

5493 case 'n':

5494 AddClass(kLineTerminatorRanges,

5495 kLineTerminatorRangeCount,

5496 ranges,

5497 zone);

5498 break;

5499 default:

5500 UNREACHABLE();

5501 }

5502 }

5503

5504

5505 Vector<const int> CharacterRange::GetWordBounds() {

5506 return Vector<const int>(kWordRanges, kWordRangeCount - 1);

5507 }

5508

5509

5510 class CharacterRangeSplitter {

5511 public:

5512 CharacterRangeSplitter(ZoneList<CharacterRange>** included,

5513 ZoneList<CharacterRange>** excluded,

5514 Zone* zone)

5515 : included_(included),

5516 excluded_(excluded),

5517 zone_(zone) { }

5518 void Call(uc16 from, DispatchTable::Entry entry);

5519

5520 static const int kInBase = 0;

5521 static const int kInOverlay = 1;

5522

5523 private:

5524 ZoneList<CharacterRange>** included_;

5525 ZoneList<CharacterRange>** excluded_;

5526 Zone* zone_;

5527 };

5528

5529

5530 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {

5531 if (!entry.out_set()->Get(kInBase)) return;

5532 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)

5533 ? included_

5534 : excluded_;

5535 if (target == NULL) target = new(zone_) ZoneList<CharacterRange>(2, zone_);

5536 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);

5537 }

5538

5539

5540 void CharacterRange::Split(ZoneList<CharacterRange>* base,

5541 Vector<const int> overlay,

5542 ZoneList<CharacterRange>** included,

5543 ZoneList<CharacterRange>** excluded,

5544 Zone* zone) {

5545 DCHECK_NULL(*included);

5546 DCHECK_NULL(*excluded);

5547 DispatchTable table(zone);

5548 for (int i = 0; i < base->length(); i++)

5549 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);

5550 for (int i = 0; i < overlay.length(); i += 2) {

5551 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),

5552 CharacterRangeSplitter::kInOverlay, zone);

5553 }

5554 CharacterRangeSplitter callback(included, excluded, zone);

5555 table.ForEach(&callback);

5556 }

5557

5558

5559 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,

5560 ZoneList<CharacterRange>* ranges,

5561 bool is_one_byte) {

5562 uc16 bottom = from();

5563 uc16 top = to();

5564 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {

5565 if (bottom > String::kMaxOneByteCharCode) return;

5566 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;

5567 }

5568 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

5569 if (top == bottom) {

5570 // If this is a singleton we just expand the one character.

5571 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);

5572 for (int i = 0; i < length; i++) {

5573 uc32 chr = chars[i];

5574 if (chr != bottom) {

5575 ranges->Add(CharacterRange::Singleton(chars[i]), zone);

5576 }

5577 }

5578 } else {

5579 // If this is a range we expand the characters block by block,

5580 // expanding contiguous subranges (blocks) one at a time.

5581 // The approach is as follows. For a given start character we

5582 // look up the remainder of the block that contains it (represented

5583 // by the end point), for instance we find 'z' if the character

5584 // is 'c'. A block is characterized by the property

5585 // that all characters uncanonicalize in the same way, except that

5586 // each entry in the result is incremented by the distance from the first

5587 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and

5588 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].

5589 // Once we've found the end point we look up its uncanonicalization

5590 // and produce a range for each element. For instance for [c-f]

5591 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only

5592 // add a range if it is not already contained in the input, so [c-f]

5593 // will be skipped but [C-F] will be added. If this range is not

5594 // completely contained in a block we do this for all the blocks

5595 // covered by the range (handling characters that is not in a block

5596 // as a "singleton block").

5597 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

5598 int pos = bottom;

5599 while (pos <= top) {

5600 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);

5601 uc16 block_end;

5602 if (length == 0) {

5603 block_end = pos;

5604 } else {

5605 DCHECK_EQ(1, length);

5606 block_end = range[0];

5607 }

5608 int end = (block_end > top) ? top : block_end;

5609 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);

5610 for (int i = 0; i < length; i++) {

5611 uc32 c = range[i];

5612 uc16 range_from = c - (block_end - pos);

5613 uc16 range_to = c - (block_end - end);

5614 if (!(bottom <= range_from && range_to <= top)) {

5615 ranges->Add(CharacterRange(range_from, range_to), zone);

5616 }

5617 }

5618 pos = end + 1;

5619 }

5620 }

5621 }

5622

5623

5624 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {

5625 DCHECK_NOT_NULL(ranges);

5626 int n = ranges->length();

5627 if (n <= 1) return true;

5628 int max = ranges->at(0).to();

5629 for (int i = 1; i < n; i++) {

5630 CharacterRange next_range = ranges->at(i);

5631 if (next_range.from() <= max + 1) return false;

5632 max = next_range.to();

5633 }

5634 return true;

5635 }

5636

5637

5638 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {

5639 if (ranges_ == NULL) {

5640 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);

5641 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);

5642 }

5643 return ranges_;

5644 }

5645

5646

5647 // Move a number of elements in a zonelist to another position

5648 // in the same list. Handles overlapping source and target areas.

5649 static void MoveRanges(ZoneList<CharacterRange>* list,

5650 int from,

5651 int to,

5652 int count) {

5653 // Ranges are potentially overlapping.

5654 if (from < to) {

5655 for (int i = count - 1; i >= 0; i--) {

5656 list->at(to + i) = list->at(from + i);

5657 }

5658 } else {

5659 for (int i = 0; i < count; i++) {

5660 list->at(to + i) = list->at(from + i);

5661 }

5662 }

5663 }

5664

5665

5666 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,

5667 int count,

5668 CharacterRange insert) {

5669 // Inserts a range into list[0..count[, which must be sorted

5670 // by from value and non-overlapping and non-adjacent, using at most

5671 // list[0..count] for the result. Returns the number of resulting

5672 // canonicalized ranges. Inserting a range may collapse existing ranges into

5673 // fewer ranges, so the return value can be anything in the range 1..count+1.

5674 uc16 from = insert.from();

5675 uc16 to = insert.to();

5676 int start_pos = 0;

5677 int end_pos = count;

5678 for (int i = count - 1; i >= 0; i--) {

5679 CharacterRange current = list->at(i);

5680 if (current.from() > to + 1) {

5681 end_pos = i;

5682 } else if (current.to() + 1 < from) {

5683 start_pos = i + 1;

5684 break;

5685 }

5686 }

5687

5688 // Inserted range overlaps, or is adjacent to, ranges at positions

5689 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are

5690 // not affected by the insertion.

5691 // If start_pos == end_pos, the range must be inserted before start_pos.

5692 // if start_pos < end_pos, the entire range from start_pos to end_pos

5693 // must be merged with the insert range.

5694

5695 if (start_pos == end_pos) {

5696 // Insert between existing ranges at position start_pos.

5697 if (start_pos < count) {

5698 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);

5699 }

5700 list->at(start_pos) = insert;

5701 return count + 1;

5702 }

5703 if (start_pos + 1 == end_pos) {

5704 // Replace single existing range at position start_pos.

5705 CharacterRange to_replace = list->at(start_pos);

5706 int new_from = Min(to_replace.from(), from);

5707 int new_to = Max(to_replace.to(), to);

5708 list->at(start_pos) = CharacterRange(new_from, new_to);

5709 return count;

5710 }

5711 // Replace a number of existing ranges from start_pos to end_pos - 1.

5712 // Move the remaining ranges down.

5713

5714 int new_from = Min(list->at(start_pos).from(), from);

5715 int new_to = Max(list->at(end_pos - 1).to(), to);

5716 if (end_pos < count) {

5717 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);

5718 }

5719 list->at(start_pos) = CharacterRange(new_from, new_to);

5720 return count - (end_pos - start_pos) + 1;

5721 }

5722

5723

5724 void CharacterSet::Canonicalize() {

5725 // Special/default classes are always considered canonical. The result

5726 // of calling ranges() will be sorted.

5727 if (ranges_ == NULL) return;

5728 CharacterRange::Canonicalize(ranges_);

5729 }

5730

5731

5732 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {

5733 if (character_ranges->length() <= 1) return;

5734 // Check whether ranges are already canonical (increasing, non-overlapping,

5735 // non-adjacent).

5736 int n = character_ranges->length();

5737 int max = character_ranges->at(0).to();

5738 int i = 1;

5739 while (i < n) {

5740 CharacterRange current = character_ranges->at(i);

5741 if (current.from() <= max + 1) {

5742 break;

5743 }

5744 max = current.to();

5745 i++;

5746 }

5747 // Canonical until the i'th range. If that's all of them, we are done.

5748 if (i == n) return;

5749

5750 // The ranges at index i and forward are not canonicalized. Make them so by

5751 // doing the equivalent of insertion sort (inserting each into the previous

5752 // list, in order).

5753 // Notice that inserting a range can reduce the number of ranges in the

5754 // result due to combining of adjacent and overlapping ranges.

5755 int read = i; // Range to insert.

5756 int num_canonical = i; // Length of canonicalized part of list.

5757 do {

5758 num_canonical = InsertRangeInCanonicalList(character_ranges,

5759 num_canonical,

5760 character_ranges->at(read));

5761 read++;

5762 } while (read < n);

5763 character_ranges->Rewind(num_canonical);

5764

5765 DCHECK(CharacterRange::IsCanonical(character_ranges));

5766 }

5767

5768

5769 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,

5770 ZoneList<CharacterRange>* negated_ranges,

5771 Zone* zone) {

5772 DCHECK(CharacterRange::IsCanonical(ranges));

5773 DCHECK_EQ(0, negated_ranges->length());

5774 int range_count = ranges->length();

5775 uc16 from = 0;

5776 int i = 0;

5777 if (range_count > 0 && ranges->at(0).from() == 0) {

5778 from = ranges->at(0).to();

5779 i = 1;

5780 }

5781 while (i < range_count) {

5782 CharacterRange range = ranges->at(i);

5783 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);

5784 from = range.to();

5785 i++;

5786 }

5787 if (from < String::kMaxUtf16CodeUnit) {

5788 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),

5789 zone);

5790 }

5791 }

5792

5793

5794 // -------------------------------------------------------------------

5795 // Splay tree

5796

5797

5798 OutSet* OutSet::Extend(unsigned value, Zone* zone) {

5799 if (Get(value))

5800 return this;

5801 if (successors(zone) != NULL) {

5802 for (int i = 0; i < successors(zone)->length(); i++) {

5803 OutSet* successor = successors(zone)->at(i);

5804 if (successor->Get(value))

5805 return successor;

5806 }

5807 } else {

5808 successors_ = new(zone) ZoneList<OutSet*>(2, zone);

5809 }

5810 OutSet* result = new(zone) OutSet(first_, remaining_);

5811 result->Set(value, zone);

5812 successors(zone)->Add(result, zone);

5813 return result;

5814 }

5815

5816

5817 void OutSet::Set(unsigned value, Zone *zone) {

5818 if (value < kFirstLimit) {

5819 first_ \|= (1 << value);

5820 } else {

5821 if (remaining_ == NULL)

5822 remaining_ = new(zone) ZoneList<unsigned>(1, zone);

5823 if (remaining_->is_empty() \|\| !remaining_->Contains(value))

5824 remaining_->Add(value, zone);

5825 }

5826 }

5827

5828

5829 bool OutSet::Get(unsigned value) const {

5830 if (value < kFirstLimit) {

5831 return (first_ & (1 << value)) != 0;

5832 } else if (remaining_ == NULL) {

5833 return false;

5834 } else {

5835 return remaining_->Contains(value);

5836 }

5837 }

5838

5839

5840 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;

5841

5842

5843 void DispatchTable::AddRange(CharacterRange full_range, int value,

5844 Zone* zone) {

5845 CharacterRange current = full_range;

5846 if (tree()->is_empty()) {

5847 // If this is the first range we just insert into the table.

5848 ZoneSplayTree<Config>::Locator loc;

5849 bool inserted = tree()->Insert(current.from(), &loc);

5850 DCHECK(inserted);

5851 USE(inserted);

5852 loc.set_value(Entry(current.from(), current.to(),

5853 empty()->Extend(value, zone)));

5854 return;

5855 }

5856 // First see if there is a range to the left of this one that

5857 // overlaps.

5858 ZoneSplayTree<Config>::Locator loc;

5859 if (tree()->FindGreatestLessThan(current.from(), &loc)) {

5860 Entry* entry = &loc.value();

5861 // If we've found a range that overlaps with this one, and it

5862 // starts strictly to the left of this one, we have to fix it

5863 // because the following code only handles ranges that start on

5864 // or after the start point of the range we're adding.

5865 if (entry->from() < current.from() && entry->to() >= current.from()) {

5866 // Snap the overlapping range in half around the start point of

5867 // the range we're adding.

5868 CharacterRange left(entry->from(), current.from() - 1);

5869 CharacterRange right(current.from(), entry->to());

5870 // The left part of the overlapping range doesn't overlap.

5871 // Truncate the whole entry to be just the left part.

5872 entry->set_to(left.to());

5873 // The right part is the one that overlaps. We add this part

5874 // to the map and let the next step deal with merging it with

5875 // the range we're adding.

5876 ZoneSplayTree<Config>::Locator loc;

5877 bool inserted = tree()->Insert(right.from(), &loc);

5878 DCHECK(inserted);

5879 USE(inserted);

5880 loc.set_value(Entry(right.from(),

5881 right.to(),

5882 entry->out_set()));

5883 }

5884 }

5885 while (current.is_valid()) {

5886 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&

5887 (loc.value().from() <= current.to()) &&

5888 (loc.value().to() >= current.from())) {

5889 Entry* entry = &loc.value();

5890 // We have overlap. If there is space between the start point of

5891 // the range we're adding and where the overlapping range starts

5892 // then we have to add a range covering just that space.

5893 if (current.from() < entry->from()) {

5894 ZoneSplayTree<Config>::Locator ins;

5895 bool inserted = tree()->Insert(current.from(), &ins);

5896 DCHECK(inserted);

5897 USE(inserted);

5898 ins.set_value(Entry(current.from(),

5899 entry->from() - 1,

5900 empty()->Extend(value, zone)));

5901 current.set_from(entry->from());

5902 }

5903 DCHECK_EQ(current.from(), entry->from());

5904 // If the overlapping range extends beyond the one we want to add

5905 // we have to snap the right part off and add it separately.

5906 if (entry->to() > current.to()) {

5907 ZoneSplayTree<Config>::Locator ins;

5908 bool inserted = tree()->Insert(current.to() + 1, &ins);

5909 DCHECK(inserted);

5910 USE(inserted);

5911 ins.set_value(Entry(current.to() + 1,

5912 entry->to(),

5913 entry->out_set()));

5914 entry->set_to(current.to());

5915 }

5916 DCHECK(entry->to() <= current.to());

5917 // The overlapping range is now completely contained by the range

5918 // we're adding so we can just update it and move the start point

5919 // of the range we're adding just past it.

5920 entry->AddValue(value, zone);

5921 // Bail out if the last interval ended at 0xFFFF since otherwise

5922 // adding 1 will wrap around to 0.

5923 if (entry->to() == String::kMaxUtf16CodeUnit)

5924 break;

5925 DCHECK(entry->to() + 1 > current.from());

5926 current.set_from(entry->to() + 1);

5927 } else {

5928 // There is no overlap so we can just add the range

5929 ZoneSplayTree<Config>::Locator ins;

5930 bool inserted = tree()->Insert(current.from(), &ins);

5931 DCHECK(inserted);

5932 USE(inserted);

5933 ins.set_value(Entry(current.from(),

5934 current.to(),

5935 empty()->Extend(value, zone)));

5936 break;

5937 }

5938 }

5939 }

5940

5941

5942 OutSet* DispatchTable::Get(uc16 value) {

5943 ZoneSplayTree<Config>::Locator loc;

5944 if (!tree()->FindGreatestLessThan(value, &loc))

5945 return empty();

5946 Entry* entry = &loc.value();

5947 if (value <= entry->to())

5948 return entry->out_set();

5949 else

5950 return empty();

5951 }

5952

5953

5954 // -------------------------------------------------------------------

5955 // Analysis

5956

5957

5958 void Analysis::EnsureAnalyzed(RegExpNode* that) {

5959 StackLimitCheck check(isolate());

5960 if (check.HasOverflowed()) {

5961 fail("Stack overflow");

5962 return;

5963 }

5964 if (that->info()->been_analyzed \|\| that->info()->being_analyzed)

5965 return;

5966 that->info()->being_analyzed = true;

5967 that->Accept(this);

5968 that->info()->being_analyzed = false;

5969 that->info()->been_analyzed = true;

5970 }

5971

5972

5973 void Analysis::VisitEnd(EndNode* that) {

5974 // nothing to do

5975 }

5976

5977

5978 void TextNode::CalculateOffsets() {

5979 int element_count = elements()->length();

5980 // Set up the offsets of the elements relative to the start. This is a fixed

5981 // quantity since a TextNode can only contain fixed-width things.

5982 int cp_offset = 0;

5983 for (int i = 0; i < element_count; i++) {

5984 TextElement& elm = elements()->at(i);

5985 elm.set_cp_offset(cp_offset);

5986 cp_offset += elm.length();

5987 }

5988 }

5989

5990

5991 void Analysis::VisitText(TextNode* that) {

5992 if (ignore_case_) {

5993 that->MakeCaseIndependent(isolate(), is_one_byte_);

5994 }

5995 EnsureAnalyzed(that->on_success());

5996 if (!has_failed()) {

5997 that->CalculateOffsets();

5998 }

5999 }

6000

6001

6002 void Analysis::VisitAction(ActionNode* that) {

6003 RegExpNode* target = that->on_success();

6004 EnsureAnalyzed(target);

6005 if (!has_failed()) {

6006 // If the next node is interested in what it follows then this node

6007 // has to be interested too so it can pass the information on.

6008 that->info()->AddFromFollowing(target->info());

6009 }

6010 }

6011

6012

6013 void Analysis::VisitChoice(ChoiceNode* that) {

6014 NodeInfo* info = that->info();

6015 for (int i = 0; i < that->alternatives()->length(); i++) {

6016 RegExpNode* node = that->alternatives()->at(i).node();

6017 EnsureAnalyzed(node);

6018 if (has_failed()) return;

6019 // Anything the following nodes need to know has to be known by

6020 // this node also, so it can pass it on.

6021 info->AddFromFollowing(node->info());

6022 }

6023 }

6024

6025

6026 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {

6027 NodeInfo* info = that->info();

6028 for (int i = 0; i < that->alternatives()->length(); i++) {

6029 RegExpNode* node = that->alternatives()->at(i).node();

6030 if (node != that->loop_node()) {

6031 EnsureAnalyzed(node);

6032 if (has_failed()) return;

6033 info->AddFromFollowing(node->info());

6034 }

6035 }

6036 // Check the loop last since it may need the value of this node

6037 // to get a correct result.

6038 EnsureAnalyzed(that->loop_node());

6039 if (!has_failed()) {

6040 info->AddFromFollowing(that->loop_node()->info());

6041 }

6042 }

6043

6044

6045 void Analysis::VisitBackReference(BackReferenceNode* that) {

6046 EnsureAnalyzed(that->on_success());

6047 }

6048

6049

6050 void Analysis::VisitAssertion(AssertionNode* that) {

6051 EnsureAnalyzed(that->on_success());

6052 }

6053

6054

6055 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

6056 BoyerMooreLookahead* bm,

6057 bool not_at_start) {

6058 // Working out the set of characters that a backreference can match is too

6059 // hard, so we just say that any character can match.

6060 bm->SetRest(offset);

6061 SaveBMInfo(bm, not_at_start, offset);

6062 }

6063

6064

6065 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==

6066 RegExpMacroAssembler::kTableSize);

6067

6068

6069 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,

6070 BoyerMooreLookahead* bm, bool not_at_start) {

6071 ZoneList<GuardedAlternative>* alts = alternatives();

6072 budget = (budget - 1) / alts->length();

6073 for (int i = 0; i < alts->length(); i++) {

6074 GuardedAlternative& alt = alts->at(i);

6075 if (alt.guards() != NULL && alt.guards()->length() != 0) {

6076 bm->SetRest(offset); // Give up trying to fill in info.

6077 SaveBMInfo(bm, not_at_start, offset);

6078 return;

6079 }

6080 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);

6081 }

6082 SaveBMInfo(bm, not_at_start, offset);

6083 }

6084

6085

6086 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,

6087 BoyerMooreLookahead* bm, bool not_at_start) {

6088 if (initial_offset >= bm->length()) return;

6089 int offset = initial_offset;

6090 int max_char = bm->max_char();

6091 for (int i = 0; i < elements()->length(); i++) {

6092 if (offset >= bm->length()) {

6093 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6094 return;

6095 }

6096 TextElement text = elements()->at(i);

6097 if (text.text_type() == TextElement::ATOM) {

6098 RegExpAtom* atom = text.atom();

6099 for (int j = 0; j < atom->length(); j++, offset++) {

6100 if (offset >= bm->length()) {

6101 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6102 return;

6103 }

6104 uc16 character = atom->data()[j];

6105 if (bm->compiler()->ignore_case()) {

6106 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

6107 int length = GetCaseIndependentLetters(

6108 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,

6109 chars);

6110 for (int j = 0; j < length; j++) {

6111 bm->Set(offset, chars[j]);

6112 }

6113 } else {

6114 if (character <= max_char) bm->Set(offset, character);

6115 }

6116 }

6117 } else {

6118 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());

6119 RegExpCharacterClass* char_class = text.char_class();

6120 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());

6121 if (char_class->is_negated()) {

6122 bm->SetAll(offset);

6123 } else {

6124 for (int k = 0; k < ranges->length(); k++) {

6125 CharacterRange& range = ranges->at(k);

6126 if (range.from() > max_char) continue;

6127 int to = Min(max_char, static_cast<int>(range.to()));

6128 bm->SetInterval(offset, Interval(range.from(), to));

6129 }

6130 }

6131 offset++;

6132 }

6133 }

6134 if (offset >= bm->length()) {

6135 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6136 return;

6137 }

6138 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,

6139 true); // Not at start after a text node.

6140 if (initial_offset == 0) set_bm_info(not_at_start, bm);

6141 }

6142

6143

6144 // -------------------------------------------------------------------

6145 // Dispatch table construction

6146

6147

6148 void DispatchTableConstructor::VisitEnd(EndNode* that) {

6149 AddRange(CharacterRange::Everything());

6150 }

6151

6152

6153 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {

6154 node->set_being_calculated(true);

6155 ZoneList<GuardedAlternative>* alternatives = node->alternatives();

6156 for (int i = 0; i < alternatives->length(); i++) {

6157 set_choice_index(i);

6158 alternatives->at(i).node()->Accept(this);

6159 }

6160 node->set_being_calculated(false);

6161 }

6162

6163

6164 class AddDispatchRange {

6165 public:

6166 explicit AddDispatchRange(DispatchTableConstructor* constructor)

6167 : constructor_(constructor) { }

6168 void Call(uc32 from, DispatchTable::Entry entry);

6169 private:

6170 DispatchTableConstructor* constructor_;

6171 };

6172

6173

6174 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {

6175 CharacterRange range(from, entry.to());

6176 constructor_->AddRange(range);

6177 }

6178

6179

6180 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {

6181 if (node->being_calculated())

6182 return;

6183 DispatchTable* table = node->GetTable(ignore_case_);

6184 AddDispatchRange adder(this);

6185 table->ForEach(&adder);

6186 }

6187

6188

6189 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {

6190 // TODO(160): Find the node that we refer back to and propagate its start

6191 // set back to here. For now we just accept anything.

6192 AddRange(CharacterRange::Everything());

6193 }

6194

6195

6196 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {

6197 RegExpNode* target = that->on_success();

6198 target->Accept(this);

6199 }

6200

6201

6202 static int CompareRangeByFrom(const CharacterRange* a,

6203 const CharacterRange* b) {

6204 return Compare<uc16>(a->from(), b->from());

6205 }

6206

6207

6208 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {

6209 ranges->Sort(CompareRangeByFrom);

6210 uc16 last = 0;

6211 for (int i = 0; i < ranges->length(); i++) {

6212 CharacterRange range = ranges->at(i);

6213 if (last < range.from())

6214 AddRange(CharacterRange(last, range.from() - 1));

6215 if (range.to() >= last) {

6216 if (range.to() == String::kMaxUtf16CodeUnit) {

6217 return;

6218 } else {

6219 last = range.to() + 1;

6220 }

6221 }

6222 }

6223 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));

6224 }

6225

6226

6227 void DispatchTableConstructor::VisitText(TextNode* that) {

6228 TextElement elm = that->elements()->at(0);

6229 switch (elm.text_type()) {

6230 case TextElement::ATOM: {

6231 uc16 c = elm.atom()->data()[0];

6232 AddRange(CharacterRange(c, c));

6233 break;

6234 }

6235 case TextElement::CHAR_CLASS: {

6236 RegExpCharacterClass* tree = elm.char_class();

6237 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());

6238 if (tree->is_negated()) {

6239 AddInverse(ranges);

6240 } else {

6241 for (int i = 0; i < ranges->length(); i++)

6242 AddRange(ranges->at(i));

6243 }

6244 break;

6245 }

6246 default: {

6247 UNIMPLEMENTED();

6248 }

6249 }

6250 }

6251

6252

6253 void DispatchTableConstructor::VisitAction(ActionNode* that) {

6254 RegExpNode* target = that->on_success();

6255 target->Accept(this);

6256 }

6257

6258

6259 RegExpEngine::CompilationResult RegExpEngine::Compile(

6260 Isolate* isolate, Zone* zone, RegExpCompileData* data, bool ignore_case,

6261 bool is_global, bool is_multiline, bool is_sticky, Handle<String> pattern,

6262 Handle<String> sample_subject, bool is_one_byte) {

6263 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {

6264 return IrregexpRegExpTooBig(isolate);

6265 }

6266 RegExpCompiler compiler(isolate, zone, data->capture_count, ignore_case,

6267 is_one_byte);

6268

6269 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));

6270

6271 // Sample some characters from the middle of the string.

6272 static const int kSampleSize = 128;

6273

6274 sample_subject = String::Flatten(sample_subject);

6275 int chars_sampled = 0;

6276 int half_way = (sample_subject->length() - kSampleSize) / 2;

6277 for (int i = Max(0, half_way);

6278 i < sample_subject->length() && chars_sampled < kSampleSize;

6279 i++, chars_sampled++) {

6280 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));

6281 }

6282

6283 // Wrap the body of the regexp in capture #0.

6284 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,

6285 0,

6286 &compiler,

6287 compiler.accept());

6288 RegExpNode* node = captured_body;

6289 bool is_end_anchored = data->tree->IsAnchoredAtEnd();

6290 bool is_start_anchored = data->tree->IsAnchoredAtStart();

6291 int max_length = data->tree->max_match();

6292 if (!is_start_anchored && !is_sticky) {

6293 // Add a .*? at the beginning, outside the body capture, unless

6294 // this expression is anchored at the beginning or sticky.

6295 RegExpNode* loop_node =

6296 RegExpQuantifier::ToNode(0,

6297 RegExpTree::kInfinity,

6298 false,

6299 new(zone) RegExpCharacterClass('*'),

6300 &compiler,

6301 captured_body,

6302 data->contains_anchor);

6303

6304 if (data->contains_anchor) {

6305 // Unroll loop once, to take care of the case that might start

6306 // at the start of input.

6307 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);

6308 first_step_node->AddAlternative(GuardedAlternative(captured_body));

6309 first_step_node->AddAlternative(GuardedAlternative(

6310 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));

6311 node = first_step_node;

6312 } else {

6313 node = loop_node;

6314 }

6315 }

6316 if (is_one_byte) {

6317 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);

6318 // Do it again to propagate the new nodes to places where they were not

6319 // put because they had not been calculated yet.

6320 if (node != NULL) {

6321 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);

6322 }

6323 }

6324

6325 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);

6326 data->node = node;

6327 Analysis analysis(isolate, ignore_case, is_one_byte);

6328 analysis.EnsureAnalyzed(node);

6329 if (analysis.has_failed()) {

6330 const char* error_message = analysis.error_message();

6331 return CompilationResult(isolate, error_message);

6332 }

6333

6334 // Create the correct assembler for the architecture.

6335 #ifndef V8_INTERPRETED_REGEXP

6336 // Native regexp implementation.

6337

6338 NativeRegExpMacroAssembler::Mode mode =

6339 is_one_byte ? NativeRegExpMacroAssembler::LATIN1

6340 : NativeRegExpMacroAssembler::UC16;

6341

6342 #if V8_TARGET_ARCH_IA32

6343 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,

6344 (data->capture_count + 1) * 2);

6345 #elif V8_TARGET_ARCH_X64

6346 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,

6347 (data->capture_count + 1) * 2);

6348 #elif V8_TARGET_ARCH_ARM

6349 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,

6350 (data->capture_count + 1) * 2);

6351 #elif V8_TARGET_ARCH_ARM64

6352 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,

6353 (data->capture_count + 1) * 2);

6354 #elif V8_TARGET_ARCH_PPC

6355 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,

6356 (data->capture_count + 1) * 2);

6357 #elif V8_TARGET_ARCH_MIPS

6358 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,

6359 (data->capture_count + 1) * 2);

6360 #elif V8_TARGET_ARCH_MIPS64

6361 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,

6362 (data->capture_count + 1) * 2);

6363 #elif V8_TARGET_ARCH_X87

6364 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,

6365 (data->capture_count + 1) * 2);

6366 #else

6367 #error "Unsupported architecture"

6368 #endif

6369

6370 #else // V8_INTERPRETED_REGEXP

6371 // Interpreted regexp implementation.

6372 EmbeddedVector<byte, 1024> codes;

6373 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);

6374 #endif // V8_INTERPRETED_REGEXP

6375

6376 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));

6377

6378 // Inserted here, instead of in Assembler, because it depends on information

6379 // in the AST that isn't replicated in the Node structure.

6380 static const int kMaxBacksearchLimit = 1024;

6381 if (is_end_anchored &&

6382 !is_start_anchored &&

6383 max_length < kMaxBacksearchLimit) {

6384 macro_assembler.SetCurrentPositionFromEnd(max_length);

6385 }

6386

6387 if (is_global) {

6388 macro_assembler.set_global_mode(

6389 (data->tree->min_match() > 0)

6390 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK

6391 : RegExpMacroAssembler::GLOBAL);

6392 }

6393

6394 return compiler.Assemble(&macro_assembler,

6395 node,

6396 data->capture_count,

6397 pattern);

6398 }

6399

6400

6401 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {

6402 Heap* heap = pattern->GetHeap();

6403 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;

6404 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&

6405 heap->isolate()->memory_allocator()->SizeExecutable() >

6406 RegExpImpl::kRegExpExecutableMemoryLimit) {

6407 too_much = true;

6408 }

6409 return too_much;

6410 }

6411 } // namespace internal

6412 } // namespace v8

OLD	NEW

« no previous file with comments | « src/jsregexp.h ('k') | src/jsregexp-inl.h » ('j') | no next file with comments »