Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(454)

Side by Side Diff: src/jsregexp.cc

Issue 1285163003: Move regexp implementation into its own folder. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: addressed comment Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/jsregexp-inl.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/v8.h"
6
7 #include "src/ast.h"
8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h"
10 #include "src/compiler.h"
11 #include "src/execution.h"
12 #include "src/factory.h"
13 #include "src/jsregexp-inl.h"
14 #include "src/jsregexp.h"
15 #include "src/messages.h"
16 #include "src/ostreams.h"
17 #include "src/parser.h"
18 #include "src/regexp-macro-assembler.h"
19 #include "src/regexp-macro-assembler-irregexp.h"
20 #include "src/regexp-macro-assembler-tracer.h"
21 #include "src/regexp-stack.h"
22 #include "src/runtime/runtime.h"
23 #include "src/splay-tree-inl.h"
24 #include "src/string-search.h"
25 #include "src/unicode-decoder.h"
26
27 #ifndef V8_INTERPRETED_REGEXP
28 #if V8_TARGET_ARCH_IA32
29 #include "src/ia32/regexp-macro-assembler-ia32.h" // NOLINT
30 #elif V8_TARGET_ARCH_X64
31 #include "src/x64/regexp-macro-assembler-x64.h" // NOLINT
32 #elif V8_TARGET_ARCH_ARM64
33 #include "src/arm64/regexp-macro-assembler-arm64.h" // NOLINT
34 #elif V8_TARGET_ARCH_ARM
35 #include "src/arm/regexp-macro-assembler-arm.h" // NOLINT
36 #elif V8_TARGET_ARCH_PPC
37 #include "src/ppc/regexp-macro-assembler-ppc.h" // NOLINT
38 #elif V8_TARGET_ARCH_MIPS
39 #include "src/mips/regexp-macro-assembler-mips.h" // NOLINT
40 #elif V8_TARGET_ARCH_MIPS64
41 #include "src/mips64/regexp-macro-assembler-mips64.h" // NOLINT
42 #elif V8_TARGET_ARCH_X87
43 #include "src/x87/regexp-macro-assembler-x87.h" // NOLINT
44 #else
45 #error Unsupported target architecture.
46 #endif
47 #endif
48
49 #include "src/interpreter-irregexp.h"
50
51
52 namespace v8 {
53 namespace internal {
54
55 MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral(
56 Handle<JSFunction> constructor,
57 Handle<String> pattern,
58 Handle<String> flags) {
59 // Call the construct code with 2 arguments.
60 Handle<Object> argv[] = { pattern, flags };
61 return Execution::New(constructor, arraysize(argv), argv);
62 }
63
64
65 MUST_USE_RESULT
66 static inline MaybeHandle<Object> ThrowRegExpException(
67 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
68 Isolate* isolate = re->GetIsolate();
69 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
70 pattern, error_text),
71 Object);
72 }
73
74
75 inline void ThrowRegExpException(Handle<JSRegExp> re,
76 Handle<String> error_text) {
77 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
78 }
79
80
81 ContainedInLattice AddRange(ContainedInLattice containment,
82 const int* ranges,
83 int ranges_length,
84 Interval new_range) {
85 DCHECK((ranges_length & 1) == 1);
86 DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
87 if (containment == kLatticeUnknown) return containment;
88 bool inside = false;
89 int last = 0;
90 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
91 // Consider the range from last to ranges[i].
92 // We haven't got to the new range yet.
93 if (ranges[i] <= new_range.from()) continue;
94 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
95 // inclusive, but the values in ranges are not.
96 if (last <= new_range.from() && new_range.to() < ranges[i]) {
97 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
98 }
99 return kLatticeUnknown;
100 }
101 return containment;
102 }
103
104
105 // More makes code generation slower, less makes V8 benchmark score lower.
106 const int kMaxLookaheadForBoyerMoore = 8;
107 // In a 3-character pattern you can maximally step forwards 3 characters
108 // at a time, which is not always enough to pay for the extra logic.
109 const int kPatternTooShortForBoyerMoore = 2;
110
111
112 // Identifies the sort of regexps where the regexp engine is faster
113 // than the code used for atom matches.
114 static bool HasFewDifferentCharacters(Handle<String> pattern) {
115 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
116 if (length <= kPatternTooShortForBoyerMoore) return false;
117 const int kMod = 128;
118 bool character_found[kMod];
119 int different = 0;
120 memset(&character_found[0], 0, sizeof(character_found));
121 for (int i = 0; i < length; i++) {
122 int ch = (pattern->Get(i) & (kMod - 1));
123 if (!character_found[ch]) {
124 character_found[ch] = true;
125 different++;
126 // We declare a regexp low-alphabet if it has at least 3 times as many
127 // characters as it has different characters.
128 if (different * 3 > length) return false;
129 }
130 }
131 return true;
132 }
133
134
135 // Generic RegExp methods. Dispatches to implementation specific methods.
136
137
138 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
139 Handle<String> pattern,
140 JSRegExp::Flags flags) {
141 Isolate* isolate = re->GetIsolate();
142 Zone zone;
143 CompilationCache* compilation_cache = isolate->compilation_cache();
144 MaybeHandle<FixedArray> maybe_cached =
145 compilation_cache->LookupRegExp(pattern, flags);
146 Handle<FixedArray> cached;
147 bool in_cache = maybe_cached.ToHandle(&cached);
148 LOG(isolate, RegExpCompileEvent(re, in_cache));
149
150 Handle<Object> result;
151 if (in_cache) {
152 re->set_data(*cached);
153 return re;
154 }
155 pattern = String::Flatten(pattern);
156 PostponeInterruptsScope postpone(isolate);
157 RegExpCompileData parse_result;
158 FlatStringReader reader(isolate, pattern);
159 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader,
160 flags.is_multiline(), flags.is_unicode(),
161 &parse_result)) {
162 // Throw an exception if we fail to parse the pattern.
163 return ThrowRegExpException(re, pattern, parse_result.error);
164 }
165
166 bool has_been_compiled = false;
167
168 if (parse_result.simple &&
169 !flags.is_ignore_case() &&
170 !flags.is_sticky() &&
171 !HasFewDifferentCharacters(pattern)) {
172 // Parse-tree is a single atom that is equal to the pattern.
173 AtomCompile(re, pattern, flags, pattern);
174 has_been_compiled = true;
175 } else if (parse_result.tree->IsAtom() &&
176 !flags.is_ignore_case() &&
177 !flags.is_sticky() &&
178 parse_result.capture_count == 0) {
179 RegExpAtom* atom = parse_result.tree->AsAtom();
180 Vector<const uc16> atom_pattern = atom->data();
181 Handle<String> atom_string;
182 ASSIGN_RETURN_ON_EXCEPTION(
183 isolate, atom_string,
184 isolate->factory()->NewStringFromTwoByte(atom_pattern),
185 Object);
186 if (!HasFewDifferentCharacters(atom_string)) {
187 AtomCompile(re, pattern, flags, atom_string);
188 has_been_compiled = true;
189 }
190 }
191 if (!has_been_compiled) {
192 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
193 }
194 DCHECK(re->data()->IsFixedArray());
195 // Compilation succeeded so the data is set on the regexp
196 // and we can store it in the cache.
197 Handle<FixedArray> data(FixedArray::cast(re->data()));
198 compilation_cache->PutRegExp(pattern, flags, data);
199
200 return re;
201 }
202
203
204 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
205 Handle<String> subject,
206 int index,
207 Handle<JSArray> last_match_info) {
208 switch (regexp->TypeTag()) {
209 case JSRegExp::ATOM:
210 return AtomExec(regexp, subject, index, last_match_info);
211 case JSRegExp::IRREGEXP: {
212 return IrregexpExec(regexp, subject, index, last_match_info);
213 }
214 default:
215 UNREACHABLE();
216 return MaybeHandle<Object>();
217 }
218 }
219
220
221 // RegExp Atom implementation: Simple string search using indexOf.
222
223
224 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
225 Handle<String> pattern,
226 JSRegExp::Flags flags,
227 Handle<String> match_pattern) {
228 re->GetIsolate()->factory()->SetRegExpAtomData(re,
229 JSRegExp::ATOM,
230 pattern,
231 flags,
232 match_pattern);
233 }
234
235
236 static void SetAtomLastCapture(FixedArray* array,
237 String* subject,
238 int from,
239 int to) {
240 SealHandleScope shs(array->GetIsolate());
241 RegExpImpl::SetLastCaptureCount(array, 2);
242 RegExpImpl::SetLastSubject(array, subject);
243 RegExpImpl::SetLastInput(array, subject);
244 RegExpImpl::SetCapture(array, 0, from);
245 RegExpImpl::SetCapture(array, 1, to);
246 }
247
248
249 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
250 Handle<String> subject,
251 int index,
252 int32_t* output,
253 int output_size) {
254 Isolate* isolate = regexp->GetIsolate();
255
256 DCHECK(0 <= index);
257 DCHECK(index <= subject->length());
258
259 subject = String::Flatten(subject);
260 DisallowHeapAllocation no_gc; // ensure vectors stay valid
261
262 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
263 int needle_len = needle->length();
264 DCHECK(needle->IsFlat());
265 DCHECK_LT(0, needle_len);
266
267 if (index + needle_len > subject->length()) {
268 return RegExpImpl::RE_FAILURE;
269 }
270
271 for (int i = 0; i < output_size; i += 2) {
272 String::FlatContent needle_content = needle->GetFlatContent();
273 String::FlatContent subject_content = subject->GetFlatContent();
274 DCHECK(needle_content.IsFlat());
275 DCHECK(subject_content.IsFlat());
276 // dispatch on type of strings
277 index =
278 (needle_content.IsOneByte()
279 ? (subject_content.IsOneByte()
280 ? SearchString(isolate, subject_content.ToOneByteVector(),
281 needle_content.ToOneByteVector(), index)
282 : SearchString(isolate, subject_content.ToUC16Vector(),
283 needle_content.ToOneByteVector(), index))
284 : (subject_content.IsOneByte()
285 ? SearchString(isolate, subject_content.ToOneByteVector(),
286 needle_content.ToUC16Vector(), index)
287 : SearchString(isolate, subject_content.ToUC16Vector(),
288 needle_content.ToUC16Vector(), index)));
289 if (index == -1) {
290 return i / 2; // Return number of matches.
291 } else {
292 output[i] = index;
293 output[i+1] = index + needle_len;
294 index += needle_len;
295 }
296 }
297 return output_size / 2;
298 }
299
300
301 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
302 Handle<String> subject,
303 int index,
304 Handle<JSArray> last_match_info) {
305 Isolate* isolate = re->GetIsolate();
306
307 static const int kNumRegisters = 2;
308 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
309 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
310
311 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
312
313 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
314
315 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
316 SealHandleScope shs(isolate);
317 FixedArray* array = FixedArray::cast(last_match_info->elements());
318 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
319 return last_match_info;
320 }
321
322
323 // Irregexp implementation.
324
325 // Ensures that the regexp object contains a compiled version of the
326 // source for either one-byte or two-byte subject strings.
327 // If the compiled version doesn't already exist, it is compiled
328 // from the source pattern.
329 // If compilation fails, an exception is thrown and this function
330 // returns false.
331 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
332 Handle<String> sample_subject,
333 bool is_one_byte) {
334 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
335 #ifdef V8_INTERPRETED_REGEXP
336 if (compiled_code->IsByteArray()) return true;
337 #else // V8_INTERPRETED_REGEXP (RegExp native code)
338 if (compiled_code->IsCode()) return true;
339 #endif
340 // We could potentially have marked this as flushable, but have kept
341 // a saved version if we did not flush it yet.
342 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
343 if (saved_code->IsCode()) {
344 // Reinstate the code in the original place.
345 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
346 DCHECK(compiled_code->IsSmi());
347 return true;
348 }
349 return CompileIrregexp(re, sample_subject, is_one_byte);
350 }
351
352
353 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
354 Handle<String> sample_subject,
355 bool is_one_byte) {
356 // Compile the RegExp.
357 Isolate* isolate = re->GetIsolate();
358 Zone zone;
359 PostponeInterruptsScope postpone(isolate);
360 // If we had a compilation error the last time this is saved at the
361 // saved code index.
362 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
363 // When arriving here entry can only be a smi, either representing an
364 // uncompiled regexp, a previous compilation error, or code that has
365 // been flushed.
366 DCHECK(entry->IsSmi());
367 int entry_value = Smi::cast(entry)->value();
368 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
369 entry_value == JSRegExp::kCompilationErrorValue ||
370 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
371
372 if (entry_value == JSRegExp::kCompilationErrorValue) {
373 // A previous compilation failed and threw an error which we store in
374 // the saved code index (we store the error message, not the actual
375 // error). Recreate the error object and throw it.
376 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
377 DCHECK(error_string->IsString());
378 Handle<String> error_message(String::cast(error_string));
379 ThrowRegExpException(re, error_message);
380 return false;
381 }
382
383 JSRegExp::Flags flags = re->GetFlags();
384
385 Handle<String> pattern(re->Pattern());
386 pattern = String::Flatten(pattern);
387 RegExpCompileData compile_data;
388 FlatStringReader reader(isolate, pattern);
389 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags.is_multiline(),
390 flags.is_unicode(), &compile_data)) {
391 // Throw an exception if we fail to parse the pattern.
392 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
393 USE(ThrowRegExpException(re, pattern, compile_data.error));
394 return false;
395 }
396 RegExpEngine::CompilationResult result = RegExpEngine::Compile(
397 isolate, &zone, &compile_data, flags.is_ignore_case(), flags.is_global(),
398 flags.is_multiline(), flags.is_sticky(), pattern, sample_subject,
399 is_one_byte);
400 if (result.error_message != NULL) {
401 // Unable to compile regexp.
402 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
403 CStrVector(result.error_message)).ToHandleChecked();
404 ThrowRegExpException(re, error_message);
405 return false;
406 }
407
408 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
409 data->set(JSRegExp::code_index(is_one_byte), result.code);
410 int register_max = IrregexpMaxRegisterCount(*data);
411 if (result.num_registers > register_max) {
412 SetIrregexpMaxRegisterCount(*data, result.num_registers);
413 }
414
415 return true;
416 }
417
418
419 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
420 return Smi::cast(
421 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
422 }
423
424
425 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
426 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
427 }
428
429
430 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
431 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
432 }
433
434
435 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
436 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
437 }
438
439
440 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
441 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
442 }
443
444
445 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
446 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
447 }
448
449
450 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
451 Handle<String> pattern,
452 JSRegExp::Flags flags,
453 int capture_count) {
454 // Initialize compiled code entries to null.
455 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
456 JSRegExp::IRREGEXP,
457 pattern,
458 flags,
459 capture_count);
460 }
461
462
463 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
464 Handle<String> subject) {
465 subject = String::Flatten(subject);
466
467 // Check representation of the underlying storage.
468 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
469 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
470
471 #ifdef V8_INTERPRETED_REGEXP
472 // Byte-code regexp needs space allocated for all its registers.
473 // The result captures are copied to the start of the registers array
474 // if the match succeeds. This way those registers are not clobbered
475 // when we set the last match info from last successful match.
476 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
477 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
478 #else // V8_INTERPRETED_REGEXP
479 // Native regexp only needs room to output captures. Registers are handled
480 // internally.
481 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
482 #endif // V8_INTERPRETED_REGEXP
483 }
484
485
486 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
487 Handle<String> subject,
488 int index,
489 int32_t* output,
490 int output_size) {
491 Isolate* isolate = regexp->GetIsolate();
492
493 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
494
495 DCHECK(index >= 0);
496 DCHECK(index <= subject->length());
497 DCHECK(subject->IsFlat());
498
499 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
500
501 #ifndef V8_INTERPRETED_REGEXP
502 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
503 do {
504 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
505 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
506 // The stack is used to allocate registers for the compiled regexp code.
507 // This means that in case of failure, the output registers array is left
508 // untouched and contains the capture results from the previous successful
509 // match. We can use that to set the last match info lazily.
510 NativeRegExpMacroAssembler::Result res =
511 NativeRegExpMacroAssembler::Match(code,
512 subject,
513 output,
514 output_size,
515 index,
516 isolate);
517 if (res != NativeRegExpMacroAssembler::RETRY) {
518 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
519 isolate->has_pending_exception());
520 STATIC_ASSERT(
521 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
522 STATIC_ASSERT(
523 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
524 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
525 == RE_EXCEPTION);
526 return static_cast<IrregexpResult>(res);
527 }
528 // If result is RETRY, the string has changed representation, and we
529 // must restart from scratch.
530 // In this case, it means we must make sure we are prepared to handle
531 // the, potentially, different subject (the string can switch between
532 // being internal and external, and even between being Latin1 and UC16,
533 // but the characters are always the same).
534 IrregexpPrepare(regexp, subject);
535 is_one_byte = subject->IsOneByteRepresentationUnderneath();
536 } while (true);
537 UNREACHABLE();
538 return RE_EXCEPTION;
539 #else // V8_INTERPRETED_REGEXP
540
541 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
542 // We must have done EnsureCompiledIrregexp, so we can get the number of
543 // registers.
544 int number_of_capture_registers =
545 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
546 int32_t* raw_output = &output[number_of_capture_registers];
547 // We do not touch the actual capture result registers until we know there
548 // has been a match so that we can use those capture results to set the
549 // last match info.
550 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
551 raw_output[i] = -1;
552 }
553 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
554 isolate);
555
556 IrregexpResult result = IrregexpInterpreter::Match(isolate,
557 byte_codes,
558 subject,
559 raw_output,
560 index);
561 if (result == RE_SUCCESS) {
562 // Copy capture results to the start of the registers array.
563 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
564 }
565 if (result == RE_EXCEPTION) {
566 DCHECK(!isolate->has_pending_exception());
567 isolate->StackOverflow();
568 }
569 return result;
570 #endif // V8_INTERPRETED_REGEXP
571 }
572
573
574 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
575 Handle<String> subject,
576 int previous_index,
577 Handle<JSArray> last_match_info) {
578 Isolate* isolate = regexp->GetIsolate();
579 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
580
581 // Prepare space for the return values.
582 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
583 if (FLAG_trace_regexp_bytecodes) {
584 String* pattern = regexp->Pattern();
585 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
586 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
587 }
588 #endif
589 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
590 if (required_registers < 0) {
591 // Compiling failed with an exception.
592 DCHECK(isolate->has_pending_exception());
593 return MaybeHandle<Object>();
594 }
595
596 int32_t* output_registers = NULL;
597 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
598 output_registers = NewArray<int32_t>(required_registers);
599 }
600 base::SmartArrayPointer<int32_t> auto_release(output_registers);
601 if (output_registers == NULL) {
602 output_registers = isolate->jsregexp_static_offsets_vector();
603 }
604
605 int res = RegExpImpl::IrregexpExecRaw(
606 regexp, subject, previous_index, output_registers, required_registers);
607 if (res == RE_SUCCESS) {
608 int capture_count =
609 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
610 return SetLastMatchInfo(
611 last_match_info, subject, capture_count, output_registers);
612 }
613 if (res == RE_EXCEPTION) {
614 DCHECK(isolate->has_pending_exception());
615 return MaybeHandle<Object>();
616 }
617 DCHECK(res == RE_FAILURE);
618 return isolate->factory()->null_value();
619 }
620
621
622 static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
623 if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
624 JSArray::SetLength(array, minimum_size);
625 }
626 }
627
628
629 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
630 Handle<String> subject,
631 int capture_count,
632 int32_t* match) {
633 DCHECK(last_match_info->HasFastObjectElements());
634 int capture_register_count = (capture_count + 1) * 2;
635 EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
636 DisallowHeapAllocation no_allocation;
637 FixedArray* array = FixedArray::cast(last_match_info->elements());
638 if (match != NULL) {
639 for (int i = 0; i < capture_register_count; i += 2) {
640 SetCapture(array, i, match[i]);
641 SetCapture(array, i + 1, match[i + 1]);
642 }
643 }
644 SetLastCaptureCount(array, capture_register_count);
645 SetLastSubject(array, *subject);
646 SetLastInput(array, *subject);
647 return last_match_info;
648 }
649
650
651 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
652 Handle<String> subject,
653 bool is_global,
654 Isolate* isolate)
655 : register_array_(NULL),
656 register_array_size_(0),
657 regexp_(regexp),
658 subject_(subject) {
659 #ifdef V8_INTERPRETED_REGEXP
660 bool interpreted = true;
661 #else
662 bool interpreted = false;
663 #endif // V8_INTERPRETED_REGEXP
664
665 if (regexp_->TypeTag() == JSRegExp::ATOM) {
666 static const int kAtomRegistersPerMatch = 2;
667 registers_per_match_ = kAtomRegistersPerMatch;
668 // There is no distinction between interpreted and native for atom regexps.
669 interpreted = false;
670 } else {
671 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
672 if (registers_per_match_ < 0) {
673 num_matches_ = -1; // Signal exception.
674 return;
675 }
676 }
677
678 if (is_global && !interpreted) {
679 register_array_size_ =
680 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
681 max_matches_ = register_array_size_ / registers_per_match_;
682 } else {
683 // Global loop in interpreted regexp is not implemented. We choose
684 // the size of the offsets vector so that it can only store one match.
685 register_array_size_ = registers_per_match_;
686 max_matches_ = 1;
687 }
688
689 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
690 register_array_ = NewArray<int32_t>(register_array_size_);
691 } else {
692 register_array_ = isolate->jsregexp_static_offsets_vector();
693 }
694
695 // Set state so that fetching the results the first time triggers a call
696 // to the compiled regexp.
697 current_match_index_ = max_matches_ - 1;
698 num_matches_ = max_matches_;
699 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
700 DCHECK_GE(register_array_size_, registers_per_match_);
701 int32_t* last_match =
702 &register_array_[current_match_index_ * registers_per_match_];
703 last_match[0] = -1;
704 last_match[1] = 0;
705 }
706
707
708 // -------------------------------------------------------------------
709 // Implementation of the Irregexp regular expression engine.
710 //
711 // The Irregexp regular expression engine is intended to be a complete
712 // implementation of ECMAScript regular expressions. It generates either
713 // bytecodes or native code.
714
715 // The Irregexp regexp engine is structured in three steps.
716 // 1) The parser generates an abstract syntax tree. See ast.cc.
717 // 2) From the AST a node network is created. The nodes are all
718 // subclasses of RegExpNode. The nodes represent states when
719 // executing a regular expression. Several optimizations are
720 // performed on the node network.
721 // 3) From the nodes we generate either byte codes or native code
722 // that can actually execute the regular expression (perform
723 // the search). The code generation step is described in more
724 // detail below.
725
726 // Code generation.
727 //
728 // The nodes are divided into four main categories.
729 // * Choice nodes
730 // These represent places where the regular expression can
731 // match in more than one way. For example on entry to an
732 // alternation (foo|bar) or a repetition (*, +, ? or {}).
733 // * Action nodes
734 // These represent places where some action should be
735 // performed. Examples include recording the current position
736 // in the input string to a register (in order to implement
737 // captures) or other actions on register for example in order
738 // to implement the counters needed for {} repetitions.
739 // * Matching nodes
740 // These attempt to match some element part of the input string.
741 // Examples of elements include character classes, plain strings
742 // or back references.
743 // * End nodes
744 // These are used to implement the actions required on finding
745 // a successful match or failing to find a match.
746 //
747 // The code generated (whether as byte codes or native code) maintains
748 // some state as it runs. This consists of the following elements:
749 //
750 // * The capture registers. Used for string captures.
751 // * Other registers. Used for counters etc.
752 // * The current position.
753 // * The stack of backtracking information. Used when a matching node
754 // fails to find a match and needs to try an alternative.
755 //
756 // Conceptual regular expression execution model:
757 //
758 // There is a simple conceptual model of regular expression execution
759 // which will be presented first. The actual code generated is a more
760 // efficient simulation of the simple conceptual model:
761 //
762 // * Choice nodes are implemented as follows:
763 // For each choice except the last {
764 // push current position
765 // push backtrack code location
766 // <generate code to test for choice>
767 // backtrack code location:
768 // pop current position
769 // }
770 // <generate code to test for last choice>
771 //
772 // * Actions nodes are generated as follows
773 // <push affected registers on backtrack stack>
774 // <generate code to perform action>
775 // push backtrack code location
776 // <generate code to test for following nodes>
777 // backtrack code location:
778 // <pop affected registers to restore their state>
779 // <pop backtrack location from stack and go to it>
780 //
781 // * Matching nodes are generated as follows:
782 // if input string matches at current position
783 // update current position
784 // <generate code to test for following nodes>
785 // else
786 // <pop backtrack location from stack and go to it>
787 //
788 // Thus it can be seen that the current position is saved and restored
789 // by the choice nodes, whereas the registers are saved and restored by
790 // by the action nodes that manipulate them.
791 //
792 // The other interesting aspect of this model is that nodes are generated
793 // at the point where they are needed by a recursive call to Emit(). If
794 // the node has already been code generated then the Emit() call will
795 // generate a jump to the previously generated code instead. In order to
796 // limit recursion it is possible for the Emit() function to put the node
797 // on a work list for later generation and instead generate a jump. The
798 // destination of the jump is resolved later when the code is generated.
799 //
800 // Actual regular expression code generation.
801 //
802 // Code generation is actually more complicated than the above. In order
803 // to improve the efficiency of the generated code some optimizations are
804 // performed
805 //
806 // * Choice nodes have 1-character lookahead.
807 // A choice node looks at the following character and eliminates some of
808 // the choices immediately based on that character. This is not yet
809 // implemented.
810 // * Simple greedy loops store reduced backtracking information.
811 // A quantifier like /.*foo/m will greedily match the whole input. It will
812 // then need to backtrack to a point where it can match "foo". The naive
813 // implementation of this would push each character position onto the
814 // backtracking stack, then pop them off one by one. This would use space
815 // proportional to the length of the input string. However since the "."
816 // can only match in one way and always has a constant length (in this case
817 // of 1) it suffices to store the current position on the top of the stack
818 // once. Matching now becomes merely incrementing the current position and
819 // backtracking becomes decrementing the current position and checking the
820 // result against the stored current position. This is faster and saves
821 // space.
822 // * The current state is virtualized.
823 // This is used to defer expensive operations until it is clear that they
824 // are needed and to generate code for a node more than once, allowing
825 // specialized an efficient versions of the code to be created. This is
826 // explained in the section below.
827 //
828 // Execution state virtualization.
829 //
830 // Instead of emitting code, nodes that manipulate the state can record their
831 // manipulation in an object called the Trace. The Trace object can record a
832 // current position offset, an optional backtrack code location on the top of
833 // the virtualized backtrack stack and some register changes. When a node is
834 // to be emitted it can flush the Trace or update it. Flushing the Trace
835 // will emit code to bring the actual state into line with the virtual state.
836 // Avoiding flushing the state can postpone some work (e.g. updates of capture
837 // registers). Postponing work can save time when executing the regular
838 // expression since it may be found that the work never has to be done as a
839 // failure to match can occur. In addition it is much faster to jump to a
840 // known backtrack code location than it is to pop an unknown backtrack
841 // location from the stack and jump there.
842 //
843 // The virtual state found in the Trace affects code generation. For example
844 // the virtual state contains the difference between the actual current
845 // position and the virtual current position, and matching code needs to use
846 // this offset to attempt a match in the correct location of the input
847 // string. Therefore code generated for a non-trivial trace is specialized
848 // to that trace. The code generator therefore has the ability to generate
849 // code for each node several times. In order to limit the size of the
850 // generated code there is an arbitrary limit on how many specialized sets of
851 // code may be generated for a given node. If the limit is reached, the
852 // trace is flushed and a generic version of the code for a node is emitted.
853 // This is subsequently used for that node. The code emitted for non-generic
854 // trace is not recorded in the node and so it cannot currently be reused in
855 // the event that code generation is requested for an identical trace.
856
857
858 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
859 UNREACHABLE();
860 }
861
862
863 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
864 text->AddElement(TextElement::Atom(this), zone);
865 }
866
867
868 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
869 text->AddElement(TextElement::CharClass(this), zone);
870 }
871
872
873 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
874 for (int i = 0; i < elements()->length(); i++)
875 text->AddElement(elements()->at(i), zone);
876 }
877
878
879 TextElement TextElement::Atom(RegExpAtom* atom) {
880 return TextElement(ATOM, atom);
881 }
882
883
884 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
885 return TextElement(CHAR_CLASS, char_class);
886 }
887
888
889 int TextElement::length() const {
890 switch (text_type()) {
891 case ATOM:
892 return atom()->length();
893
894 case CHAR_CLASS:
895 return 1;
896 }
897 UNREACHABLE();
898 return 0;
899 }
900
901
902 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
903 if (table_ == NULL) {
904 table_ = new(zone()) DispatchTable(zone());
905 DispatchTableConstructor cons(table_, ignore_case, zone());
906 cons.BuildTable(this);
907 }
908 return table_;
909 }
910
911
912 class FrequencyCollator {
913 public:
914 FrequencyCollator() : total_samples_(0) {
915 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
916 frequencies_[i] = CharacterFrequency(i);
917 }
918 }
919
920 void CountCharacter(int character) {
921 int index = (character & RegExpMacroAssembler::kTableMask);
922 frequencies_[index].Increment();
923 total_samples_++;
924 }
925
926 // Does not measure in percent, but rather per-128 (the table size from the
927 // regexp macro assembler).
928 int Frequency(int in_character) {
929 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
930 if (total_samples_ < 1) return 1; // Division by zero.
931 int freq_in_per128 =
932 (frequencies_[in_character].counter() * 128) / total_samples_;
933 return freq_in_per128;
934 }
935
936 private:
937 class CharacterFrequency {
938 public:
939 CharacterFrequency() : counter_(0), character_(-1) { }
940 explicit CharacterFrequency(int character)
941 : counter_(0), character_(character) { }
942
943 void Increment() { counter_++; }
944 int counter() { return counter_; }
945 int character() { return character_; }
946
947 private:
948 int counter_;
949 int character_;
950 };
951
952
953 private:
954 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
955 int total_samples_;
956 };
957
958
959 class RegExpCompiler {
960 public:
961 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
962 bool ignore_case, bool is_one_byte);
963
964 int AllocateRegister() {
965 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
966 reg_exp_too_big_ = true;
967 return next_register_;
968 }
969 return next_register_++;
970 }
971
972 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
973 RegExpNode* start,
974 int capture_count,
975 Handle<String> pattern);
976
977 inline void AddWork(RegExpNode* node) {
978 if (!node->on_work_list() && !node->label()->is_bound()) {
979 node->set_on_work_list(true);
980 work_list_->Add(node);
981 }
982 }
983
984 static const int kImplementationOffset = 0;
985 static const int kNumberOfRegistersOffset = 0;
986 static const int kCodeOffset = 1;
987
988 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
989 EndNode* accept() { return accept_; }
990
991 static const int kMaxRecursion = 100;
992 inline int recursion_depth() { return recursion_depth_; }
993 inline void IncrementRecursionDepth() { recursion_depth_++; }
994 inline void DecrementRecursionDepth() { recursion_depth_--; }
995
996 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
997
998 inline bool ignore_case() { return ignore_case_; }
999 inline bool one_byte() { return one_byte_; }
1000 inline bool optimize() { return optimize_; }
1001 inline void set_optimize(bool value) { optimize_ = value; }
1002 inline bool limiting_recursion() { return limiting_recursion_; }
1003 inline void set_limiting_recursion(bool value) {
1004 limiting_recursion_ = value;
1005 }
1006 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1007
1008 int current_expansion_factor() { return current_expansion_factor_; }
1009 void set_current_expansion_factor(int value) {
1010 current_expansion_factor_ = value;
1011 }
1012
1013 Isolate* isolate() const { return isolate_; }
1014 Zone* zone() const { return zone_; }
1015
1016 static const int kNoRegister = -1;
1017
1018 private:
1019 EndNode* accept_;
1020 int next_register_;
1021 List<RegExpNode*>* work_list_;
1022 int recursion_depth_;
1023 RegExpMacroAssembler* macro_assembler_;
1024 bool ignore_case_;
1025 bool one_byte_;
1026 bool reg_exp_too_big_;
1027 bool limiting_recursion_;
1028 bool optimize_;
1029 int current_expansion_factor_;
1030 FrequencyCollator frequency_collator_;
1031 Isolate* isolate_;
1032 Zone* zone_;
1033 };
1034
1035
1036 class RecursionCheck {
1037 public:
1038 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1039 compiler->IncrementRecursionDepth();
1040 }
1041 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1042 private:
1043 RegExpCompiler* compiler_;
1044 };
1045
1046
1047 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1048 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1049 }
1050
1051
1052 // Attempts to compile the regexp using an Irregexp code generator. Returns
1053 // a fixed array or a null handle depending on whether it succeeded.
1054 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1055 bool ignore_case, bool one_byte)
1056 : next_register_(2 * (capture_count + 1)),
1057 work_list_(NULL),
1058 recursion_depth_(0),
1059 ignore_case_(ignore_case),
1060 one_byte_(one_byte),
1061 reg_exp_too_big_(false),
1062 limiting_recursion_(false),
1063 optimize_(FLAG_regexp_optimization),
1064 current_expansion_factor_(1),
1065 frequency_collator_(),
1066 isolate_(isolate),
1067 zone_(zone) {
1068 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1069 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1070 }
1071
1072
1073 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1074 RegExpMacroAssembler* macro_assembler,
1075 RegExpNode* start,
1076 int capture_count,
1077 Handle<String> pattern) {
1078 Heap* heap = pattern->GetHeap();
1079
1080 #ifdef DEBUG
1081 if (FLAG_trace_regexp_assembler)
1082 macro_assembler_ =
1083 new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
1084 else
1085 #endif
1086 macro_assembler_ = macro_assembler;
1087
1088 List <RegExpNode*> work_list(0);
1089 work_list_ = &work_list;
1090 Label fail;
1091 macro_assembler_->PushBacktrack(&fail);
1092 Trace new_trace;
1093 start->Emit(this, &new_trace);
1094 macro_assembler_->Bind(&fail);
1095 macro_assembler_->Fail();
1096 while (!work_list.is_empty()) {
1097 RegExpNode* node = work_list.RemoveLast();
1098 node->set_on_work_list(false);
1099 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1100 }
1101 if (reg_exp_too_big_) {
1102 macro_assembler_->AbortedCodeGeneration();
1103 return IrregexpRegExpTooBig(isolate_);
1104 }
1105
1106 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1107 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1108 work_list_ = NULL;
1109 #ifdef ENABLE_DISASSEMBLER
1110 if (FLAG_print_code) {
1111 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1112 OFStream os(trace_scope.file());
1113 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1114 }
1115 #endif
1116 #ifdef DEBUG
1117 if (FLAG_trace_regexp_assembler) {
1118 delete macro_assembler_;
1119 }
1120 #endif
1121 return RegExpEngine::CompilationResult(*code, next_register_);
1122 }
1123
1124
1125 bool Trace::DeferredAction::Mentions(int that) {
1126 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1127 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1128 return range.Contains(that);
1129 } else {
1130 return reg() == that;
1131 }
1132 }
1133
1134
1135 bool Trace::mentions_reg(int reg) {
1136 for (DeferredAction* action = actions_;
1137 action != NULL;
1138 action = action->next()) {
1139 if (action->Mentions(reg))
1140 return true;
1141 }
1142 return false;
1143 }
1144
1145
1146 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1147 DCHECK_EQ(0, *cp_offset);
1148 for (DeferredAction* action = actions_;
1149 action != NULL;
1150 action = action->next()) {
1151 if (action->Mentions(reg)) {
1152 if (action->action_type() == ActionNode::STORE_POSITION) {
1153 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1154 return true;
1155 } else {
1156 return false;
1157 }
1158 }
1159 }
1160 return false;
1161 }
1162
1163
1164 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1165 Zone* zone) {
1166 int max_register = RegExpCompiler::kNoRegister;
1167 for (DeferredAction* action = actions_;
1168 action != NULL;
1169 action = action->next()) {
1170 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1171 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1172 for (int i = range.from(); i <= range.to(); i++)
1173 affected_registers->Set(i, zone);
1174 if (range.to() > max_register) max_register = range.to();
1175 } else {
1176 affected_registers->Set(action->reg(), zone);
1177 if (action->reg() > max_register) max_register = action->reg();
1178 }
1179 }
1180 return max_register;
1181 }
1182
1183
1184 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1185 int max_register,
1186 const OutSet& registers_to_pop,
1187 const OutSet& registers_to_clear) {
1188 for (int reg = max_register; reg >= 0; reg--) {
1189 if (registers_to_pop.Get(reg)) {
1190 assembler->PopRegister(reg);
1191 } else if (registers_to_clear.Get(reg)) {
1192 int clear_to = reg;
1193 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1194 reg--;
1195 }
1196 assembler->ClearRegisters(reg, clear_to);
1197 }
1198 }
1199 }
1200
1201
1202 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1203 int max_register,
1204 const OutSet& affected_registers,
1205 OutSet* registers_to_pop,
1206 OutSet* registers_to_clear,
1207 Zone* zone) {
1208 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1209 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1210
1211 // Count pushes performed to force a stack limit check occasionally.
1212 int pushes = 0;
1213
1214 for (int reg = 0; reg <= max_register; reg++) {
1215 if (!affected_registers.Get(reg)) {
1216 continue;
1217 }
1218
1219 // The chronologically first deferred action in the trace
1220 // is used to infer the action needed to restore a register
1221 // to its previous state (or not, if it's safe to ignore it).
1222 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1223 DeferredActionUndoType undo_action = IGNORE;
1224
1225 int value = 0;
1226 bool absolute = false;
1227 bool clear = false;
1228 int store_position = -1;
1229 // This is a little tricky because we are scanning the actions in reverse
1230 // historical order (newest first).
1231 for (DeferredAction* action = actions_;
1232 action != NULL;
1233 action = action->next()) {
1234 if (action->Mentions(reg)) {
1235 switch (action->action_type()) {
1236 case ActionNode::SET_REGISTER: {
1237 Trace::DeferredSetRegister* psr =
1238 static_cast<Trace::DeferredSetRegister*>(action);
1239 if (!absolute) {
1240 value += psr->value();
1241 absolute = true;
1242 }
1243 // SET_REGISTER is currently only used for newly introduced loop
1244 // counters. They can have a significant previous value if they
1245 // occour in a loop. TODO(lrn): Propagate this information, so
1246 // we can set undo_action to IGNORE if we know there is no value to
1247 // restore.
1248 undo_action = RESTORE;
1249 DCHECK_EQ(store_position, -1);
1250 DCHECK(!clear);
1251 break;
1252 }
1253 case ActionNode::INCREMENT_REGISTER:
1254 if (!absolute) {
1255 value++;
1256 }
1257 DCHECK_EQ(store_position, -1);
1258 DCHECK(!clear);
1259 undo_action = RESTORE;
1260 break;
1261 case ActionNode::STORE_POSITION: {
1262 Trace::DeferredCapture* pc =
1263 static_cast<Trace::DeferredCapture*>(action);
1264 if (!clear && store_position == -1) {
1265 store_position = pc->cp_offset();
1266 }
1267
1268 // For captures we know that stores and clears alternate.
1269 // Other register, are never cleared, and if the occur
1270 // inside a loop, they might be assigned more than once.
1271 if (reg <= 1) {
1272 // Registers zero and one, aka "capture zero", is
1273 // always set correctly if we succeed. There is no
1274 // need to undo a setting on backtrack, because we
1275 // will set it again or fail.
1276 undo_action = IGNORE;
1277 } else {
1278 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1279 }
1280 DCHECK(!absolute);
1281 DCHECK_EQ(value, 0);
1282 break;
1283 }
1284 case ActionNode::CLEAR_CAPTURES: {
1285 // Since we're scanning in reverse order, if we've already
1286 // set the position we have to ignore historically earlier
1287 // clearing operations.
1288 if (store_position == -1) {
1289 clear = true;
1290 }
1291 undo_action = RESTORE;
1292 DCHECK(!absolute);
1293 DCHECK_EQ(value, 0);
1294 break;
1295 }
1296 default:
1297 UNREACHABLE();
1298 break;
1299 }
1300 }
1301 }
1302 // Prepare for the undo-action (e.g., push if it's going to be popped).
1303 if (undo_action == RESTORE) {
1304 pushes++;
1305 RegExpMacroAssembler::StackCheckFlag stack_check =
1306 RegExpMacroAssembler::kNoStackLimitCheck;
1307 if (pushes == push_limit) {
1308 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1309 pushes = 0;
1310 }
1311
1312 assembler->PushRegister(reg, stack_check);
1313 registers_to_pop->Set(reg, zone);
1314 } else if (undo_action == CLEAR) {
1315 registers_to_clear->Set(reg, zone);
1316 }
1317 // Perform the chronologically last action (or accumulated increment)
1318 // for the register.
1319 if (store_position != -1) {
1320 assembler->WriteCurrentPositionToRegister(reg, store_position);
1321 } else if (clear) {
1322 assembler->ClearRegisters(reg, reg);
1323 } else if (absolute) {
1324 assembler->SetRegister(reg, value);
1325 } else if (value != 0) {
1326 assembler->AdvanceRegister(reg, value);
1327 }
1328 }
1329 }
1330
1331
1332 // This is called as we come into a loop choice node and some other tricky
1333 // nodes. It normalizes the state of the code generator to ensure we can
1334 // generate generic code.
1335 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1336 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1337
1338 DCHECK(!is_trivial());
1339
1340 if (actions_ == NULL && backtrack() == NULL) {
1341 // Here we just have some deferred cp advances to fix and we are back to
1342 // a normal situation. We may also have to forget some information gained
1343 // through a quick check that was already performed.
1344 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1345 // Create a new trivial state and generate the node with that.
1346 Trace new_state;
1347 successor->Emit(compiler, &new_state);
1348 return;
1349 }
1350
1351 // Generate deferred actions here along with code to undo them again.
1352 OutSet affected_registers;
1353
1354 if (backtrack() != NULL) {
1355 // Here we have a concrete backtrack location. These are set up by choice
1356 // nodes and so they indicate that we have a deferred save of the current
1357 // position which we may need to emit here.
1358 assembler->PushCurrentPosition();
1359 }
1360
1361 int max_register = FindAffectedRegisters(&affected_registers,
1362 compiler->zone());
1363 OutSet registers_to_pop;
1364 OutSet registers_to_clear;
1365 PerformDeferredActions(assembler,
1366 max_register,
1367 affected_registers,
1368 &registers_to_pop,
1369 &registers_to_clear,
1370 compiler->zone());
1371 if (cp_offset_ != 0) {
1372 assembler->AdvanceCurrentPosition(cp_offset_);
1373 }
1374
1375 // Create a new trivial state and generate the node with that.
1376 Label undo;
1377 assembler->PushBacktrack(&undo);
1378 if (successor->KeepRecursing(compiler)) {
1379 Trace new_state;
1380 successor->Emit(compiler, &new_state);
1381 } else {
1382 compiler->AddWork(successor);
1383 assembler->GoTo(successor->label());
1384 }
1385
1386 // On backtrack we need to restore state.
1387 assembler->Bind(&undo);
1388 RestoreAffectedRegisters(assembler,
1389 max_register,
1390 registers_to_pop,
1391 registers_to_clear);
1392 if (backtrack() == NULL) {
1393 assembler->Backtrack();
1394 } else {
1395 assembler->PopCurrentPosition();
1396 assembler->GoTo(backtrack());
1397 }
1398 }
1399
1400
1401 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1402 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1403
1404 // Omit flushing the trace. We discard the entire stack frame anyway.
1405
1406 if (!label()->is_bound()) {
1407 // We are completely independent of the trace, since we ignore it,
1408 // so this code can be used as the generic version.
1409 assembler->Bind(label());
1410 }
1411
1412 // Throw away everything on the backtrack stack since the start
1413 // of the negative submatch and restore the character position.
1414 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1415 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1416 if (clear_capture_count_ > 0) {
1417 // Clear any captures that might have been performed during the success
1418 // of the body of the negative look-ahead.
1419 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1420 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1421 }
1422 // Now that we have unwound the stack we find at the top of the stack the
1423 // backtrack that the BeginSubmatch node got.
1424 assembler->Backtrack();
1425 }
1426
1427
1428 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1429 if (!trace->is_trivial()) {
1430 trace->Flush(compiler, this);
1431 return;
1432 }
1433 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1434 if (!label()->is_bound()) {
1435 assembler->Bind(label());
1436 }
1437 switch (action_) {
1438 case ACCEPT:
1439 assembler->Succeed();
1440 return;
1441 case BACKTRACK:
1442 assembler->GoTo(trace->backtrack());
1443 return;
1444 case NEGATIVE_SUBMATCH_SUCCESS:
1445 // This case is handled in a different virtual method.
1446 UNREACHABLE();
1447 }
1448 UNIMPLEMENTED();
1449 }
1450
1451
1452 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1453 if (guards_ == NULL)
1454 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1455 guards_->Add(guard, zone);
1456 }
1457
1458
1459 ActionNode* ActionNode::SetRegister(int reg,
1460 int val,
1461 RegExpNode* on_success) {
1462 ActionNode* result =
1463 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1464 result->data_.u_store_register.reg = reg;
1465 result->data_.u_store_register.value = val;
1466 return result;
1467 }
1468
1469
1470 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1471 ActionNode* result =
1472 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1473 result->data_.u_increment_register.reg = reg;
1474 return result;
1475 }
1476
1477
1478 ActionNode* ActionNode::StorePosition(int reg,
1479 bool is_capture,
1480 RegExpNode* on_success) {
1481 ActionNode* result =
1482 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1483 result->data_.u_position_register.reg = reg;
1484 result->data_.u_position_register.is_capture = is_capture;
1485 return result;
1486 }
1487
1488
1489 ActionNode* ActionNode::ClearCaptures(Interval range,
1490 RegExpNode* on_success) {
1491 ActionNode* result =
1492 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1493 result->data_.u_clear_captures.range_from = range.from();
1494 result->data_.u_clear_captures.range_to = range.to();
1495 return result;
1496 }
1497
1498
1499 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1500 int position_reg,
1501 RegExpNode* on_success) {
1502 ActionNode* result =
1503 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1504 result->data_.u_submatch.stack_pointer_register = stack_reg;
1505 result->data_.u_submatch.current_position_register = position_reg;
1506 return result;
1507 }
1508
1509
1510 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1511 int position_reg,
1512 int clear_register_count,
1513 int clear_register_from,
1514 RegExpNode* on_success) {
1515 ActionNode* result =
1516 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1517 result->data_.u_submatch.stack_pointer_register = stack_reg;
1518 result->data_.u_submatch.current_position_register = position_reg;
1519 result->data_.u_submatch.clear_register_count = clear_register_count;
1520 result->data_.u_submatch.clear_register_from = clear_register_from;
1521 return result;
1522 }
1523
1524
1525 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1526 int repetition_register,
1527 int repetition_limit,
1528 RegExpNode* on_success) {
1529 ActionNode* result =
1530 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1531 result->data_.u_empty_match_check.start_register = start_register;
1532 result->data_.u_empty_match_check.repetition_register = repetition_register;
1533 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1534 return result;
1535 }
1536
1537
1538 #define DEFINE_ACCEPT(Type) \
1539 void Type##Node::Accept(NodeVisitor* visitor) { \
1540 visitor->Visit##Type(this); \
1541 }
1542 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1543 #undef DEFINE_ACCEPT
1544
1545
1546 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1547 visitor->VisitLoopChoice(this);
1548 }
1549
1550
1551 // -------------------------------------------------------------------
1552 // Emit code.
1553
1554
1555 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1556 Guard* guard,
1557 Trace* trace) {
1558 switch (guard->op()) {
1559 case Guard::LT:
1560 DCHECK(!trace->mentions_reg(guard->reg()));
1561 macro_assembler->IfRegisterGE(guard->reg(),
1562 guard->value(),
1563 trace->backtrack());
1564 break;
1565 case Guard::GEQ:
1566 DCHECK(!trace->mentions_reg(guard->reg()));
1567 macro_assembler->IfRegisterLT(guard->reg(),
1568 guard->value(),
1569 trace->backtrack());
1570 break;
1571 }
1572 }
1573
1574
1575 // Returns the number of characters in the equivalence class, omitting those
1576 // that cannot occur in the source string because it is Latin1.
1577 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1578 bool one_byte_subject,
1579 unibrow::uchar* letters) {
1580 int length =
1581 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1582 // Unibrow returns 0 or 1 for characters where case independence is
1583 // trivial.
1584 if (length == 0) {
1585 letters[0] = character;
1586 length = 1;
1587 }
1588
1589 if (one_byte_subject) {
1590 int new_length = 0;
1591 for (int i = 0; i < length; i++) {
1592 if (letters[i] <= String::kMaxOneByteCharCode) {
1593 letters[new_length++] = letters[i];
1594 }
1595 }
1596 length = new_length;
1597 }
1598
1599 return length;
1600 }
1601
1602
1603 static inline bool EmitSimpleCharacter(Isolate* isolate,
1604 RegExpCompiler* compiler,
1605 uc16 c,
1606 Label* on_failure,
1607 int cp_offset,
1608 bool check,
1609 bool preloaded) {
1610 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1611 bool bound_checked = false;
1612 if (!preloaded) {
1613 assembler->LoadCurrentCharacter(
1614 cp_offset,
1615 on_failure,
1616 check);
1617 bound_checked = true;
1618 }
1619 assembler->CheckNotCharacter(c, on_failure);
1620 return bound_checked;
1621 }
1622
1623
1624 // Only emits non-letters (things that don't have case). Only used for case
1625 // independent matches.
1626 static inline bool EmitAtomNonLetter(Isolate* isolate,
1627 RegExpCompiler* compiler,
1628 uc16 c,
1629 Label* on_failure,
1630 int cp_offset,
1631 bool check,
1632 bool preloaded) {
1633 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1634 bool one_byte = compiler->one_byte();
1635 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1636 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1637 if (length < 1) {
1638 // This can't match. Must be an one-byte subject and a non-one-byte
1639 // character. We do not need to do anything since the one-byte pass
1640 // already handled this.
1641 return false; // Bounds not checked.
1642 }
1643 bool checked = false;
1644 // We handle the length > 1 case in a later pass.
1645 if (length == 1) {
1646 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1647 // Can't match - see above.
1648 return false; // Bounds not checked.
1649 }
1650 if (!preloaded) {
1651 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1652 checked = check;
1653 }
1654 macro_assembler->CheckNotCharacter(c, on_failure);
1655 }
1656 return checked;
1657 }
1658
1659
1660 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1661 bool one_byte, uc16 c1, uc16 c2,
1662 Label* on_failure) {
1663 uc16 char_mask;
1664 if (one_byte) {
1665 char_mask = String::kMaxOneByteCharCode;
1666 } else {
1667 char_mask = String::kMaxUtf16CodeUnit;
1668 }
1669 uc16 exor = c1 ^ c2;
1670 // Check whether exor has only one bit set.
1671 if (((exor - 1) & exor) == 0) {
1672 // If c1 and c2 differ only by one bit.
1673 // Ecma262UnCanonicalize always gives the highest number last.
1674 DCHECK(c2 > c1);
1675 uc16 mask = char_mask ^ exor;
1676 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1677 return true;
1678 }
1679 DCHECK(c2 > c1);
1680 uc16 diff = c2 - c1;
1681 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1682 // If the characters differ by 2^n but don't differ by one bit then
1683 // subtract the difference from the found character, then do the or
1684 // trick. We avoid the theoretical case where negative numbers are
1685 // involved in order to simplify code generation.
1686 uc16 mask = char_mask ^ diff;
1687 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1688 diff,
1689 mask,
1690 on_failure);
1691 return true;
1692 }
1693 return false;
1694 }
1695
1696
1697 typedef bool EmitCharacterFunction(Isolate* isolate,
1698 RegExpCompiler* compiler,
1699 uc16 c,
1700 Label* on_failure,
1701 int cp_offset,
1702 bool check,
1703 bool preloaded);
1704
1705 // Only emits letters (things that have case). Only used for case independent
1706 // matches.
1707 static inline bool EmitAtomLetter(Isolate* isolate,
1708 RegExpCompiler* compiler,
1709 uc16 c,
1710 Label* on_failure,
1711 int cp_offset,
1712 bool check,
1713 bool preloaded) {
1714 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1715 bool one_byte = compiler->one_byte();
1716 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1717 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1718 if (length <= 1) return false;
1719 // We may not need to check against the end of the input string
1720 // if this character lies before a character that matched.
1721 if (!preloaded) {
1722 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1723 }
1724 Label ok;
1725 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1726 switch (length) {
1727 case 2: {
1728 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1729 chars[1], on_failure)) {
1730 } else {
1731 macro_assembler->CheckCharacter(chars[0], &ok);
1732 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1733 macro_assembler->Bind(&ok);
1734 }
1735 break;
1736 }
1737 case 4:
1738 macro_assembler->CheckCharacter(chars[3], &ok);
1739 // Fall through!
1740 case 3:
1741 macro_assembler->CheckCharacter(chars[0], &ok);
1742 macro_assembler->CheckCharacter(chars[1], &ok);
1743 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1744 macro_assembler->Bind(&ok);
1745 break;
1746 default:
1747 UNREACHABLE();
1748 break;
1749 }
1750 return true;
1751 }
1752
1753
1754 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1755 int border,
1756 Label* fall_through,
1757 Label* above_or_equal,
1758 Label* below) {
1759 if (below != fall_through) {
1760 masm->CheckCharacterLT(border, below);
1761 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1762 } else {
1763 masm->CheckCharacterGT(border - 1, above_or_equal);
1764 }
1765 }
1766
1767
1768 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1769 int first,
1770 int last,
1771 Label* fall_through,
1772 Label* in_range,
1773 Label* out_of_range) {
1774 if (in_range == fall_through) {
1775 if (first == last) {
1776 masm->CheckNotCharacter(first, out_of_range);
1777 } else {
1778 masm->CheckCharacterNotInRange(first, last, out_of_range);
1779 }
1780 } else {
1781 if (first == last) {
1782 masm->CheckCharacter(first, in_range);
1783 } else {
1784 masm->CheckCharacterInRange(first, last, in_range);
1785 }
1786 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1787 }
1788 }
1789
1790
1791 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1792 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1793 static void EmitUseLookupTable(
1794 RegExpMacroAssembler* masm,
1795 ZoneList<int>* ranges,
1796 int start_index,
1797 int end_index,
1798 int min_char,
1799 Label* fall_through,
1800 Label* even_label,
1801 Label* odd_label) {
1802 static const int kSize = RegExpMacroAssembler::kTableSize;
1803 static const int kMask = RegExpMacroAssembler::kTableMask;
1804
1805 int base = (min_char & ~kMask);
1806 USE(base);
1807
1808 // Assert that everything is on one kTableSize page.
1809 for (int i = start_index; i <= end_index; i++) {
1810 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1811 }
1812 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1813
1814 char templ[kSize];
1815 Label* on_bit_set;
1816 Label* on_bit_clear;
1817 int bit;
1818 if (even_label == fall_through) {
1819 on_bit_set = odd_label;
1820 on_bit_clear = even_label;
1821 bit = 1;
1822 } else {
1823 on_bit_set = even_label;
1824 on_bit_clear = odd_label;
1825 bit = 0;
1826 }
1827 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1828 templ[i] = bit;
1829 }
1830 int j = 0;
1831 bit ^= 1;
1832 for (int i = start_index; i < end_index; i++) {
1833 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1834 templ[j] = bit;
1835 }
1836 bit ^= 1;
1837 }
1838 for (int i = j; i < kSize; i++) {
1839 templ[i] = bit;
1840 }
1841 Factory* factory = masm->isolate()->factory();
1842 // TODO(erikcorry): Cache these.
1843 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1844 for (int i = 0; i < kSize; i++) {
1845 ba->set(i, templ[i]);
1846 }
1847 masm->CheckBitInTable(ba, on_bit_set);
1848 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1849 }
1850
1851
1852 static void CutOutRange(RegExpMacroAssembler* masm,
1853 ZoneList<int>* ranges,
1854 int start_index,
1855 int end_index,
1856 int cut_index,
1857 Label* even_label,
1858 Label* odd_label) {
1859 bool odd = (((cut_index - start_index) & 1) == 1);
1860 Label* in_range_label = odd ? odd_label : even_label;
1861 Label dummy;
1862 EmitDoubleBoundaryTest(masm,
1863 ranges->at(cut_index),
1864 ranges->at(cut_index + 1) - 1,
1865 &dummy,
1866 in_range_label,
1867 &dummy);
1868 DCHECK(!dummy.is_linked());
1869 // Cut out the single range by rewriting the array. This creates a new
1870 // range that is a merger of the two ranges on either side of the one we
1871 // are cutting out. The oddity of the labels is preserved.
1872 for (int j = cut_index; j > start_index; j--) {
1873 ranges->at(j) = ranges->at(j - 1);
1874 }
1875 for (int j = cut_index + 1; j < end_index; j++) {
1876 ranges->at(j) = ranges->at(j + 1);
1877 }
1878 }
1879
1880
1881 // Unicode case. Split the search space into kSize spaces that are handled
1882 // with recursion.
1883 static void SplitSearchSpace(ZoneList<int>* ranges,
1884 int start_index,
1885 int end_index,
1886 int* new_start_index,
1887 int* new_end_index,
1888 int* border) {
1889 static const int kSize = RegExpMacroAssembler::kTableSize;
1890 static const int kMask = RegExpMacroAssembler::kTableMask;
1891
1892 int first = ranges->at(start_index);
1893 int last = ranges->at(end_index) - 1;
1894
1895 *new_start_index = start_index;
1896 *border = (ranges->at(start_index) & ~kMask) + kSize;
1897 while (*new_start_index < end_index) {
1898 if (ranges->at(*new_start_index) > *border) break;
1899 (*new_start_index)++;
1900 }
1901 // new_start_index is the index of the first edge that is beyond the
1902 // current kSize space.
1903
1904 // For very large search spaces we do a binary chop search of the non-Latin1
1905 // space instead of just going to the end of the current kSize space. The
1906 // heuristics are complicated a little by the fact that any 128-character
1907 // encoding space can be quickly tested with a table lookup, so we don't
1908 // wish to do binary chop search at a smaller granularity than that. A
1909 // 128-character space can take up a lot of space in the ranges array if,
1910 // for example, we only want to match every second character (eg. the lower
1911 // case characters on some Unicode pages).
1912 int binary_chop_index = (end_index + start_index) / 2;
1913 // The first test ensures that we get to the code that handles the Latin1
1914 // range with a single not-taken branch, speeding up this important
1915 // character range (even non-Latin1 charset-based text has spaces and
1916 // punctuation).
1917 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1918 end_index - start_index > (*new_start_index - start_index) * 2 &&
1919 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1920 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1921 int scan_forward_for_section_border = binary_chop_index;;
1922 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1923
1924 while (scan_forward_for_section_border < end_index) {
1925 if (ranges->at(scan_forward_for_section_border) > new_border) {
1926 *new_start_index = scan_forward_for_section_border;
1927 *border = new_border;
1928 break;
1929 }
1930 scan_forward_for_section_border++;
1931 }
1932 }
1933
1934 DCHECK(*new_start_index > start_index);
1935 *new_end_index = *new_start_index - 1;
1936 if (ranges->at(*new_end_index) == *border) {
1937 (*new_end_index)--;
1938 }
1939 if (*border >= ranges->at(end_index)) {
1940 *border = ranges->at(end_index);
1941 *new_start_index = end_index; // Won't be used.
1942 *new_end_index = end_index - 1;
1943 }
1944 }
1945
1946
1947 // Gets a series of segment boundaries representing a character class. If the
1948 // character is in the range between an even and an odd boundary (counting from
1949 // start_index) then go to even_label, otherwise go to odd_label. We already
1950 // know that the character is in the range of min_char to max_char inclusive.
1951 // Either label can be NULL indicating backtracking. Either label can also be
1952 // equal to the fall_through label.
1953 static void GenerateBranches(RegExpMacroAssembler* masm,
1954 ZoneList<int>* ranges,
1955 int start_index,
1956 int end_index,
1957 uc16 min_char,
1958 uc16 max_char,
1959 Label* fall_through,
1960 Label* even_label,
1961 Label* odd_label) {
1962 int first = ranges->at(start_index);
1963 int last = ranges->at(end_index) - 1;
1964
1965 DCHECK_LT(min_char, first);
1966
1967 // Just need to test if the character is before or on-or-after
1968 // a particular character.
1969 if (start_index == end_index) {
1970 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1971 return;
1972 }
1973
1974 // Another almost trivial case: There is one interval in the middle that is
1975 // different from the end intervals.
1976 if (start_index + 1 == end_index) {
1977 EmitDoubleBoundaryTest(
1978 masm, first, last, fall_through, even_label, odd_label);
1979 return;
1980 }
1981
1982 // It's not worth using table lookup if there are very few intervals in the
1983 // character class.
1984 if (end_index - start_index <= 6) {
1985 // It is faster to test for individual characters, so we look for those
1986 // first, then try arbitrary ranges in the second round.
1987 static int kNoCutIndex = -1;
1988 int cut = kNoCutIndex;
1989 for (int i = start_index; i < end_index; i++) {
1990 if (ranges->at(i) == ranges->at(i + 1) - 1) {
1991 cut = i;
1992 break;
1993 }
1994 }
1995 if (cut == kNoCutIndex) cut = start_index;
1996 CutOutRange(
1997 masm, ranges, start_index, end_index, cut, even_label, odd_label);
1998 DCHECK_GE(end_index - start_index, 2);
1999 GenerateBranches(masm,
2000 ranges,
2001 start_index + 1,
2002 end_index - 1,
2003 min_char,
2004 max_char,
2005 fall_through,
2006 even_label,
2007 odd_label);
2008 return;
2009 }
2010
2011 // If there are a lot of intervals in the regexp, then we will use tables to
2012 // determine whether the character is inside or outside the character class.
2013 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2014
2015 if ((max_char >> kBits) == (min_char >> kBits)) {
2016 EmitUseLookupTable(masm,
2017 ranges,
2018 start_index,
2019 end_index,
2020 min_char,
2021 fall_through,
2022 even_label,
2023 odd_label);
2024 return;
2025 }
2026
2027 if ((min_char >> kBits) != (first >> kBits)) {
2028 masm->CheckCharacterLT(first, odd_label);
2029 GenerateBranches(masm,
2030 ranges,
2031 start_index + 1,
2032 end_index,
2033 first,
2034 max_char,
2035 fall_through,
2036 odd_label,
2037 even_label);
2038 return;
2039 }
2040
2041 int new_start_index = 0;
2042 int new_end_index = 0;
2043 int border = 0;
2044
2045 SplitSearchSpace(ranges,
2046 start_index,
2047 end_index,
2048 &new_start_index,
2049 &new_end_index,
2050 &border);
2051
2052 Label handle_rest;
2053 Label* above = &handle_rest;
2054 if (border == last + 1) {
2055 // We didn't find any section that started after the limit, so everything
2056 // above the border is one of the terminal labels.
2057 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2058 DCHECK(new_end_index == end_index - 1);
2059 }
2060
2061 DCHECK_LE(start_index, new_end_index);
2062 DCHECK_LE(new_start_index, end_index);
2063 DCHECK_LT(start_index, new_start_index);
2064 DCHECK_LT(new_end_index, end_index);
2065 DCHECK(new_end_index + 1 == new_start_index ||
2066 (new_end_index + 2 == new_start_index &&
2067 border == ranges->at(new_end_index + 1)));
2068 DCHECK_LT(min_char, border - 1);
2069 DCHECK_LT(border, max_char);
2070 DCHECK_LT(ranges->at(new_end_index), border);
2071 DCHECK(border < ranges->at(new_start_index) ||
2072 (border == ranges->at(new_start_index) &&
2073 new_start_index == end_index &&
2074 new_end_index == end_index - 1 &&
2075 border == last + 1));
2076 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2077
2078 masm->CheckCharacterGT(border - 1, above);
2079 Label dummy;
2080 GenerateBranches(masm,
2081 ranges,
2082 start_index,
2083 new_end_index,
2084 min_char,
2085 border - 1,
2086 &dummy,
2087 even_label,
2088 odd_label);
2089 if (handle_rest.is_linked()) {
2090 masm->Bind(&handle_rest);
2091 bool flip = (new_start_index & 1) != (start_index & 1);
2092 GenerateBranches(masm,
2093 ranges,
2094 new_start_index,
2095 end_index,
2096 border,
2097 max_char,
2098 &dummy,
2099 flip ? odd_label : even_label,
2100 flip ? even_label : odd_label);
2101 }
2102 }
2103
2104
2105 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2106 RegExpCharacterClass* cc, bool one_byte,
2107 Label* on_failure, int cp_offset, bool check_offset,
2108 bool preloaded, Zone* zone) {
2109 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2110 if (!CharacterRange::IsCanonical(ranges)) {
2111 CharacterRange::Canonicalize(ranges);
2112 }
2113
2114 int max_char;
2115 if (one_byte) {
2116 max_char = String::kMaxOneByteCharCode;
2117 } else {
2118 max_char = String::kMaxUtf16CodeUnit;
2119 }
2120
2121 int range_count = ranges->length();
2122
2123 int last_valid_range = range_count - 1;
2124 while (last_valid_range >= 0) {
2125 CharacterRange& range = ranges->at(last_valid_range);
2126 if (range.from() <= max_char) {
2127 break;
2128 }
2129 last_valid_range--;
2130 }
2131
2132 if (last_valid_range < 0) {
2133 if (!cc->is_negated()) {
2134 macro_assembler->GoTo(on_failure);
2135 }
2136 if (check_offset) {
2137 macro_assembler->CheckPosition(cp_offset, on_failure);
2138 }
2139 return;
2140 }
2141
2142 if (last_valid_range == 0 &&
2143 ranges->at(0).IsEverything(max_char)) {
2144 if (cc->is_negated()) {
2145 macro_assembler->GoTo(on_failure);
2146 } else {
2147 // This is a common case hit by non-anchored expressions.
2148 if (check_offset) {
2149 macro_assembler->CheckPosition(cp_offset, on_failure);
2150 }
2151 }
2152 return;
2153 }
2154 if (last_valid_range == 0 &&
2155 !cc->is_negated() &&
2156 ranges->at(0).IsEverything(max_char)) {
2157 // This is a common case hit by non-anchored expressions.
2158 if (check_offset) {
2159 macro_assembler->CheckPosition(cp_offset, on_failure);
2160 }
2161 return;
2162 }
2163
2164 if (!preloaded) {
2165 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2166 }
2167
2168 if (cc->is_standard(zone) &&
2169 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2170 on_failure)) {
2171 return;
2172 }
2173
2174
2175 // A new list with ascending entries. Each entry is a code unit
2176 // where there is a boundary between code units that are part of
2177 // the class and code units that are not. Normally we insert an
2178 // entry at zero which goes to the failure label, but if there
2179 // was already one there we fall through for success on that entry.
2180 // Subsequent entries have alternating meaning (success/failure).
2181 ZoneList<int>* range_boundaries =
2182 new(zone) ZoneList<int>(last_valid_range, zone);
2183
2184 bool zeroth_entry_is_failure = !cc->is_negated();
2185
2186 for (int i = 0; i <= last_valid_range; i++) {
2187 CharacterRange& range = ranges->at(i);
2188 if (range.from() == 0) {
2189 DCHECK_EQ(i, 0);
2190 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2191 } else {
2192 range_boundaries->Add(range.from(), zone);
2193 }
2194 range_boundaries->Add(range.to() + 1, zone);
2195 }
2196 int end_index = range_boundaries->length() - 1;
2197 if (range_boundaries->at(end_index) > max_char) {
2198 end_index--;
2199 }
2200
2201 Label fall_through;
2202 GenerateBranches(macro_assembler,
2203 range_boundaries,
2204 0, // start_index.
2205 end_index,
2206 0, // min_char.
2207 max_char,
2208 &fall_through,
2209 zeroth_entry_is_failure ? &fall_through : on_failure,
2210 zeroth_entry_is_failure ? on_failure : &fall_through);
2211 macro_assembler->Bind(&fall_through);
2212 }
2213
2214
2215 RegExpNode::~RegExpNode() {
2216 }
2217
2218
2219 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2220 Trace* trace) {
2221 // If we are generating a greedy loop then don't stop and don't reuse code.
2222 if (trace->stop_node() != NULL) {
2223 return CONTINUE;
2224 }
2225
2226 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2227 if (trace->is_trivial()) {
2228 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2229 // If a generic version is already scheduled to be generated or we have
2230 // recursed too deeply then just generate a jump to that code.
2231 macro_assembler->GoTo(&label_);
2232 // This will queue it up for generation of a generic version if it hasn't
2233 // already been queued.
2234 compiler->AddWork(this);
2235 return DONE;
2236 }
2237 // Generate generic version of the node and bind the label for later use.
2238 macro_assembler->Bind(&label_);
2239 return CONTINUE;
2240 }
2241
2242 // We are being asked to make a non-generic version. Keep track of how many
2243 // non-generic versions we generate so as not to overdo it.
2244 trace_count_++;
2245 if (KeepRecursing(compiler) && compiler->optimize() &&
2246 trace_count_ < kMaxCopiesCodeGenerated) {
2247 return CONTINUE;
2248 }
2249
2250 // If we get here code has been generated for this node too many times or
2251 // recursion is too deep. Time to switch to a generic version. The code for
2252 // generic versions above can handle deep recursion properly.
2253 bool was_limiting = compiler->limiting_recursion();
2254 compiler->set_limiting_recursion(true);
2255 trace->Flush(compiler, this);
2256 compiler->set_limiting_recursion(was_limiting);
2257 return DONE;
2258 }
2259
2260
2261 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2262 return !compiler->limiting_recursion() &&
2263 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2264 }
2265
2266
2267 int ActionNode::EatsAtLeast(int still_to_find,
2268 int budget,
2269 bool not_at_start) {
2270 if (budget <= 0) return 0;
2271 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2272 return on_success()->EatsAtLeast(still_to_find,
2273 budget - 1,
2274 not_at_start);
2275 }
2276
2277
2278 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2279 BoyerMooreLookahead* bm, bool not_at_start) {
2280 if (action_type_ == BEGIN_SUBMATCH) {
2281 bm->SetRest(offset);
2282 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2283 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2284 }
2285 SaveBMInfo(bm, not_at_start, offset);
2286 }
2287
2288
2289 int AssertionNode::EatsAtLeast(int still_to_find,
2290 int budget,
2291 bool not_at_start) {
2292 if (budget <= 0) return 0;
2293 // If we know we are not at the start and we are asked "how many characters
2294 // will you match if you succeed?" then we can answer anything since false
2295 // implies false. So lets just return the max answer (still_to_find) since
2296 // that won't prevent us from preloading a lot of characters for the other
2297 // branches in the node graph.
2298 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2299 return on_success()->EatsAtLeast(still_to_find,
2300 budget - 1,
2301 not_at_start);
2302 }
2303
2304
2305 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2306 BoyerMooreLookahead* bm, bool not_at_start) {
2307 // Match the behaviour of EatsAtLeast on this node.
2308 if (assertion_type() == AT_START && not_at_start) return;
2309 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2310 SaveBMInfo(bm, not_at_start, offset);
2311 }
2312
2313
2314 int BackReferenceNode::EatsAtLeast(int still_to_find,
2315 int budget,
2316 bool not_at_start) {
2317 if (budget <= 0) return 0;
2318 return on_success()->EatsAtLeast(still_to_find,
2319 budget - 1,
2320 not_at_start);
2321 }
2322
2323
2324 int TextNode::EatsAtLeast(int still_to_find,
2325 int budget,
2326 bool not_at_start) {
2327 int answer = Length();
2328 if (answer >= still_to_find) return answer;
2329 if (budget <= 0) return answer;
2330 // We are not at start after this node so we set the last argument to 'true'.
2331 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2332 budget - 1,
2333 true);
2334 }
2335
2336
2337 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
2338 int budget,
2339 bool not_at_start) {
2340 if (budget <= 0) return 0;
2341 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2342 // afterwards.
2343 RegExpNode* node = alternatives_->at(1).node();
2344 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2345 }
2346
2347
2348 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2349 QuickCheckDetails* details,
2350 RegExpCompiler* compiler,
2351 int filled_in,
2352 bool not_at_start) {
2353 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2354 // afterwards.
2355 RegExpNode* node = alternatives_->at(1).node();
2356 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2357 }
2358
2359
2360 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2361 int budget,
2362 RegExpNode* ignore_this_node,
2363 bool not_at_start) {
2364 if (budget <= 0) return 0;
2365 int min = 100;
2366 int choice_count = alternatives_->length();
2367 budget = (budget - 1) / choice_count;
2368 for (int i = 0; i < choice_count; i++) {
2369 RegExpNode* node = alternatives_->at(i).node();
2370 if (node == ignore_this_node) continue;
2371 int node_eats_at_least =
2372 node->EatsAtLeast(still_to_find, budget, not_at_start);
2373 if (node_eats_at_least < min) min = node_eats_at_least;
2374 if (min == 0) return 0;
2375 }
2376 return min;
2377 }
2378
2379
2380 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2381 int budget,
2382 bool not_at_start) {
2383 return EatsAtLeastHelper(still_to_find,
2384 budget - 1,
2385 loop_node_,
2386 not_at_start);
2387 }
2388
2389
2390 int ChoiceNode::EatsAtLeast(int still_to_find,
2391 int budget,
2392 bool not_at_start) {
2393 return EatsAtLeastHelper(still_to_find,
2394 budget,
2395 NULL,
2396 not_at_start);
2397 }
2398
2399
2400 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
2401 static inline uint32_t SmearBitsRight(uint32_t v) {
2402 v |= v >> 1;
2403 v |= v >> 2;
2404 v |= v >> 4;
2405 v |= v >> 8;
2406 v |= v >> 16;
2407 return v;
2408 }
2409
2410
2411 bool QuickCheckDetails::Rationalize(bool asc) {
2412 bool found_useful_op = false;
2413 uint32_t char_mask;
2414 if (asc) {
2415 char_mask = String::kMaxOneByteCharCode;
2416 } else {
2417 char_mask = String::kMaxUtf16CodeUnit;
2418 }
2419 mask_ = 0;
2420 value_ = 0;
2421 int char_shift = 0;
2422 for (int i = 0; i < characters_; i++) {
2423 Position* pos = &positions_[i];
2424 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2425 found_useful_op = true;
2426 }
2427 mask_ |= (pos->mask & char_mask) << char_shift;
2428 value_ |= (pos->value & char_mask) << char_shift;
2429 char_shift += asc ? 8 : 16;
2430 }
2431 return found_useful_op;
2432 }
2433
2434
2435 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2436 Trace* bounds_check_trace,
2437 Trace* trace,
2438 bool preload_has_checked_bounds,
2439 Label* on_possible_success,
2440 QuickCheckDetails* details,
2441 bool fall_through_on_failure) {
2442 if (details->characters() == 0) return false;
2443 GetQuickCheckDetails(
2444 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2445 if (details->cannot_match()) return false;
2446 if (!details->Rationalize(compiler->one_byte())) return false;
2447 DCHECK(details->characters() == 1 ||
2448 compiler->macro_assembler()->CanReadUnaligned());
2449 uint32_t mask = details->mask();
2450 uint32_t value = details->value();
2451
2452 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2453
2454 if (trace->characters_preloaded() != details->characters()) {
2455 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2456 // We are attempting to preload the minimum number of characters
2457 // any choice would eat, so if the bounds check fails, then none of the
2458 // choices can succeed, so we can just immediately backtrack, rather
2459 // than go to the next choice.
2460 assembler->LoadCurrentCharacter(trace->cp_offset(),
2461 bounds_check_trace->backtrack(),
2462 !preload_has_checked_bounds,
2463 details->characters());
2464 }
2465
2466
2467 bool need_mask = true;
2468
2469 if (details->characters() == 1) {
2470 // If number of characters preloaded is 1 then we used a byte or 16 bit
2471 // load so the value is already masked down.
2472 uint32_t char_mask;
2473 if (compiler->one_byte()) {
2474 char_mask = String::kMaxOneByteCharCode;
2475 } else {
2476 char_mask = String::kMaxUtf16CodeUnit;
2477 }
2478 if ((mask & char_mask) == char_mask) need_mask = false;
2479 mask &= char_mask;
2480 } else {
2481 // For 2-character preloads in one-byte mode or 1-character preloads in
2482 // two-byte mode we also use a 16 bit load with zero extend.
2483 if (details->characters() == 2 && compiler->one_byte()) {
2484 if ((mask & 0xffff) == 0xffff) need_mask = false;
2485 } else if (details->characters() == 1 && !compiler->one_byte()) {
2486 if ((mask & 0xffff) == 0xffff) need_mask = false;
2487 } else {
2488 if (mask == 0xffffffff) need_mask = false;
2489 }
2490 }
2491
2492 if (fall_through_on_failure) {
2493 if (need_mask) {
2494 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2495 } else {
2496 assembler->CheckCharacter(value, on_possible_success);
2497 }
2498 } else {
2499 if (need_mask) {
2500 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2501 } else {
2502 assembler->CheckNotCharacter(value, trace->backtrack());
2503 }
2504 }
2505 return true;
2506 }
2507
2508
2509 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2510 // super-class in the .h file).
2511 //
2512 // We iterate along the text object, building up for each character a
2513 // mask and value that can be used to test for a quick failure to match.
2514 // The masks and values for the positions will be combined into a single
2515 // machine word for the current character width in order to be used in
2516 // generating a quick check.
2517 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2518 RegExpCompiler* compiler,
2519 int characters_filled_in,
2520 bool not_at_start) {
2521 Isolate* isolate = compiler->macro_assembler()->isolate();
2522 DCHECK(characters_filled_in < details->characters());
2523 int characters = details->characters();
2524 int char_mask;
2525 if (compiler->one_byte()) {
2526 char_mask = String::kMaxOneByteCharCode;
2527 } else {
2528 char_mask = String::kMaxUtf16CodeUnit;
2529 }
2530 for (int k = 0; k < elms_->length(); k++) {
2531 TextElement elm = elms_->at(k);
2532 if (elm.text_type() == TextElement::ATOM) {
2533 Vector<const uc16> quarks = elm.atom()->data();
2534 for (int i = 0; i < characters && i < quarks.length(); i++) {
2535 QuickCheckDetails::Position* pos =
2536 details->positions(characters_filled_in);
2537 uc16 c = quarks[i];
2538 if (compiler->ignore_case()) {
2539 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2540 int length = GetCaseIndependentLetters(isolate, c,
2541 compiler->one_byte(), chars);
2542 if (length == 0) {
2543 // This can happen because all case variants are non-Latin1, but we
2544 // know the input is Latin1.
2545 details->set_cannot_match();
2546 pos->determines_perfectly = false;
2547 return;
2548 }
2549 if (length == 1) {
2550 // This letter has no case equivalents, so it's nice and simple
2551 // and the mask-compare will determine definitely whether we have
2552 // a match at this character position.
2553 pos->mask = char_mask;
2554 pos->value = c;
2555 pos->determines_perfectly = true;
2556 } else {
2557 uint32_t common_bits = char_mask;
2558 uint32_t bits = chars[0];
2559 for (int j = 1; j < length; j++) {
2560 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2561 common_bits ^= differing_bits;
2562 bits &= common_bits;
2563 }
2564 // If length is 2 and common bits has only one zero in it then
2565 // our mask and compare instruction will determine definitely
2566 // whether we have a match at this character position. Otherwise
2567 // it can only be an approximate check.
2568 uint32_t one_zero = (common_bits | ~char_mask);
2569 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2570 pos->determines_perfectly = true;
2571 }
2572 pos->mask = common_bits;
2573 pos->value = bits;
2574 }
2575 } else {
2576 // Don't ignore case. Nice simple case where the mask-compare will
2577 // determine definitely whether we have a match at this character
2578 // position.
2579 if (c > char_mask) {
2580 details->set_cannot_match();
2581 pos->determines_perfectly = false;
2582 return;
2583 }
2584 pos->mask = char_mask;
2585 pos->value = c;
2586 pos->determines_perfectly = true;
2587 }
2588 characters_filled_in++;
2589 DCHECK(characters_filled_in <= details->characters());
2590 if (characters_filled_in == details->characters()) {
2591 return;
2592 }
2593 }
2594 } else {
2595 QuickCheckDetails::Position* pos =
2596 details->positions(characters_filled_in);
2597 RegExpCharacterClass* tree = elm.char_class();
2598 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2599 if (tree->is_negated()) {
2600 // A quick check uses multi-character mask and compare. There is no
2601 // useful way to incorporate a negative char class into this scheme
2602 // so we just conservatively create a mask and value that will always
2603 // succeed.
2604 pos->mask = 0;
2605 pos->value = 0;
2606 } else {
2607 int first_range = 0;
2608 while (ranges->at(first_range).from() > char_mask) {
2609 first_range++;
2610 if (first_range == ranges->length()) {
2611 details->set_cannot_match();
2612 pos->determines_perfectly = false;
2613 return;
2614 }
2615 }
2616 CharacterRange range = ranges->at(first_range);
2617 uc16 from = range.from();
2618 uc16 to = range.to();
2619 if (to > char_mask) {
2620 to = char_mask;
2621 }
2622 uint32_t differing_bits = (from ^ to);
2623 // A mask and compare is only perfect if the differing bits form a
2624 // number like 00011111 with one single block of trailing 1s.
2625 if ((differing_bits & (differing_bits + 1)) == 0 &&
2626 from + differing_bits == to) {
2627 pos->determines_perfectly = true;
2628 }
2629 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2630 uint32_t bits = (from & common_bits);
2631 for (int i = first_range + 1; i < ranges->length(); i++) {
2632 CharacterRange range = ranges->at(i);
2633 uc16 from = range.from();
2634 uc16 to = range.to();
2635 if (from > char_mask) continue;
2636 if (to > char_mask) to = char_mask;
2637 // Here we are combining more ranges into the mask and compare
2638 // value. With each new range the mask becomes more sparse and
2639 // so the chances of a false positive rise. A character class
2640 // with multiple ranges is assumed never to be equivalent to a
2641 // mask and compare operation.
2642 pos->determines_perfectly = false;
2643 uint32_t new_common_bits = (from ^ to);
2644 new_common_bits = ~SmearBitsRight(new_common_bits);
2645 common_bits &= new_common_bits;
2646 bits &= new_common_bits;
2647 uint32_t differing_bits = (from & common_bits) ^ bits;
2648 common_bits ^= differing_bits;
2649 bits &= common_bits;
2650 }
2651 pos->mask = common_bits;
2652 pos->value = bits;
2653 }
2654 characters_filled_in++;
2655 DCHECK(characters_filled_in <= details->characters());
2656 if (characters_filled_in == details->characters()) {
2657 return;
2658 }
2659 }
2660 }
2661 DCHECK(characters_filled_in != details->characters());
2662 if (!details->cannot_match()) {
2663 on_success()-> GetQuickCheckDetails(details,
2664 compiler,
2665 characters_filled_in,
2666 true);
2667 }
2668 }
2669
2670
2671 void QuickCheckDetails::Clear() {
2672 for (int i = 0; i < characters_; i++) {
2673 positions_[i].mask = 0;
2674 positions_[i].value = 0;
2675 positions_[i].determines_perfectly = false;
2676 }
2677 characters_ = 0;
2678 }
2679
2680
2681 void QuickCheckDetails::Advance(int by, bool one_byte) {
2682 DCHECK(by >= 0);
2683 if (by >= characters_) {
2684 Clear();
2685 return;
2686 }
2687 for (int i = 0; i < characters_ - by; i++) {
2688 positions_[i] = positions_[by + i];
2689 }
2690 for (int i = characters_ - by; i < characters_; i++) {
2691 positions_[i].mask = 0;
2692 positions_[i].value = 0;
2693 positions_[i].determines_perfectly = false;
2694 }
2695 characters_ -= by;
2696 // We could change mask_ and value_ here but we would never advance unless
2697 // they had already been used in a check and they won't be used again because
2698 // it would gain us nothing. So there's no point.
2699 }
2700
2701
2702 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2703 DCHECK(characters_ == other->characters_);
2704 if (other->cannot_match_) {
2705 return;
2706 }
2707 if (cannot_match_) {
2708 *this = *other;
2709 return;
2710 }
2711 for (int i = from_index; i < characters_; i++) {
2712 QuickCheckDetails::Position* pos = positions(i);
2713 QuickCheckDetails::Position* other_pos = other->positions(i);
2714 if (pos->mask != other_pos->mask ||
2715 pos->value != other_pos->value ||
2716 !other_pos->determines_perfectly) {
2717 // Our mask-compare operation will be approximate unless we have the
2718 // exact same operation on both sides of the alternation.
2719 pos->determines_perfectly = false;
2720 }
2721 pos->mask &= other_pos->mask;
2722 pos->value &= pos->mask;
2723 other_pos->value &= pos->mask;
2724 uc16 differing_bits = (pos->value ^ other_pos->value);
2725 pos->mask &= ~differing_bits;
2726 pos->value &= pos->mask;
2727 }
2728 }
2729
2730
2731 class VisitMarker {
2732 public:
2733 explicit VisitMarker(NodeInfo* info) : info_(info) {
2734 DCHECK(!info->visited);
2735 info->visited = true;
2736 }
2737 ~VisitMarker() {
2738 info_->visited = false;
2739 }
2740 private:
2741 NodeInfo* info_;
2742 };
2743
2744
2745 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2746 if (info()->replacement_calculated) return replacement();
2747 if (depth < 0) return this;
2748 DCHECK(!info()->visited);
2749 VisitMarker marker(info());
2750 return FilterSuccessor(depth - 1, ignore_case);
2751 }
2752
2753
2754 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2755 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2756 if (next == NULL) return set_replacement(NULL);
2757 on_success_ = next;
2758 return set_replacement(this);
2759 }
2760
2761
2762 // We need to check for the following characters: 0x39c 0x3bc 0x178.
2763 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2764 // TODO(dcarney): this could be a lot more efficient.
2765 return range.Contains(0x39c) ||
2766 range.Contains(0x3bc) || range.Contains(0x178);
2767 }
2768
2769
2770 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2771 for (int i = 0; i < ranges->length(); i++) {
2772 // TODO(dcarney): this could be a lot more efficient.
2773 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2774 }
2775 return false;
2776 }
2777
2778
2779 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2780 if (info()->replacement_calculated) return replacement();
2781 if (depth < 0) return this;
2782 DCHECK(!info()->visited);
2783 VisitMarker marker(info());
2784 int element_count = elms_->length();
2785 for (int i = 0; i < element_count; i++) {
2786 TextElement elm = elms_->at(i);
2787 if (elm.text_type() == TextElement::ATOM) {
2788 Vector<const uc16> quarks = elm.atom()->data();
2789 for (int j = 0; j < quarks.length(); j++) {
2790 uint16_t c = quarks[j];
2791 if (c <= String::kMaxOneByteCharCode) continue;
2792 if (!ignore_case) return set_replacement(NULL);
2793 // Here, we need to check for characters whose upper and lower cases
2794 // are outside the Latin-1 range.
2795 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2796 // Character is outside Latin-1 completely
2797 if (converted == 0) return set_replacement(NULL);
2798 // Convert quark to Latin-1 in place.
2799 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2800 copy[j] = converted;
2801 }
2802 } else {
2803 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2804 RegExpCharacterClass* cc = elm.char_class();
2805 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2806 if (!CharacterRange::IsCanonical(ranges)) {
2807 CharacterRange::Canonicalize(ranges);
2808 }
2809 // Now they are in order so we only need to look at the first.
2810 int range_count = ranges->length();
2811 if (cc->is_negated()) {
2812 if (range_count != 0 &&
2813 ranges->at(0).from() == 0 &&
2814 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2815 // This will be handled in a later filter.
2816 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2817 return set_replacement(NULL);
2818 }
2819 } else {
2820 if (range_count == 0 ||
2821 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2822 // This will be handled in a later filter.
2823 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2824 return set_replacement(NULL);
2825 }
2826 }
2827 }
2828 }
2829 return FilterSuccessor(depth - 1, ignore_case);
2830 }
2831
2832
2833 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2834 if (info()->replacement_calculated) return replacement();
2835 if (depth < 0) return this;
2836 if (info()->visited) return this;
2837 {
2838 VisitMarker marker(info());
2839
2840 RegExpNode* continue_replacement =
2841 continue_node_->FilterOneByte(depth - 1, ignore_case);
2842 // If we can't continue after the loop then there is no sense in doing the
2843 // loop.
2844 if (continue_replacement == NULL) return set_replacement(NULL);
2845 }
2846
2847 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2848 }
2849
2850
2851 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2852 if (info()->replacement_calculated) return replacement();
2853 if (depth < 0) return this;
2854 if (info()->visited) return this;
2855 VisitMarker marker(info());
2856 int choice_count = alternatives_->length();
2857
2858 for (int i = 0; i < choice_count; i++) {
2859 GuardedAlternative alternative = alternatives_->at(i);
2860 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2861 set_replacement(this);
2862 return this;
2863 }
2864 }
2865
2866 int surviving = 0;
2867 RegExpNode* survivor = NULL;
2868 for (int i = 0; i < choice_count; i++) {
2869 GuardedAlternative alternative = alternatives_->at(i);
2870 RegExpNode* replacement =
2871 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2872 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2873 if (replacement != NULL) {
2874 alternatives_->at(i).set_node(replacement);
2875 surviving++;
2876 survivor = replacement;
2877 }
2878 }
2879 if (surviving < 2) return set_replacement(survivor);
2880
2881 set_replacement(this);
2882 if (surviving == choice_count) {
2883 return this;
2884 }
2885 // Only some of the nodes survived the filtering. We need to rebuild the
2886 // alternatives list.
2887 ZoneList<GuardedAlternative>* new_alternatives =
2888 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2889 for (int i = 0; i < choice_count; i++) {
2890 RegExpNode* replacement =
2891 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2892 if (replacement != NULL) {
2893 alternatives_->at(i).set_node(replacement);
2894 new_alternatives->Add(alternatives_->at(i), zone());
2895 }
2896 }
2897 alternatives_ = new_alternatives;
2898 return this;
2899 }
2900
2901
2902 RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth,
2903 bool ignore_case) {
2904 if (info()->replacement_calculated) return replacement();
2905 if (depth < 0) return this;
2906 if (info()->visited) return this;
2907 VisitMarker marker(info());
2908 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2909 // afterwards.
2910 RegExpNode* node = alternatives_->at(1).node();
2911 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2912 if (replacement == NULL) return set_replacement(NULL);
2913 alternatives_->at(1).set_node(replacement);
2914
2915 RegExpNode* neg_node = alternatives_->at(0).node();
2916 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2917 // If the negative lookahead is always going to fail then
2918 // we don't need to check it.
2919 if (neg_replacement == NULL) return set_replacement(replacement);
2920 alternatives_->at(0).set_node(neg_replacement);
2921 return set_replacement(this);
2922 }
2923
2924
2925 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2926 RegExpCompiler* compiler,
2927 int characters_filled_in,
2928 bool not_at_start) {
2929 if (body_can_be_zero_length_ || info()->visited) return;
2930 VisitMarker marker(info());
2931 return ChoiceNode::GetQuickCheckDetails(details,
2932 compiler,
2933 characters_filled_in,
2934 not_at_start);
2935 }
2936
2937
2938 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2939 BoyerMooreLookahead* bm, bool not_at_start) {
2940 if (body_can_be_zero_length_ || budget <= 0) {
2941 bm->SetRest(offset);
2942 SaveBMInfo(bm, not_at_start, offset);
2943 return;
2944 }
2945 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2946 SaveBMInfo(bm, not_at_start, offset);
2947 }
2948
2949
2950 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2951 RegExpCompiler* compiler,
2952 int characters_filled_in,
2953 bool not_at_start) {
2954 not_at_start = (not_at_start || not_at_start_);
2955 int choice_count = alternatives_->length();
2956 DCHECK(choice_count > 0);
2957 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2958 compiler,
2959 characters_filled_in,
2960 not_at_start);
2961 for (int i = 1; i < choice_count; i++) {
2962 QuickCheckDetails new_details(details->characters());
2963 RegExpNode* node = alternatives_->at(i).node();
2964 node->GetQuickCheckDetails(&new_details, compiler,
2965 characters_filled_in,
2966 not_at_start);
2967 // Here we merge the quick match details of the two branches.
2968 details->Merge(&new_details, characters_filled_in);
2969 }
2970 }
2971
2972
2973 // Check for [0-9A-Z_a-z].
2974 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2975 Label* word,
2976 Label* non_word,
2977 bool fall_through_on_word) {
2978 if (assembler->CheckSpecialCharacterClass(
2979 fall_through_on_word ? 'w' : 'W',
2980 fall_through_on_word ? non_word : word)) {
2981 // Optimized implementation available.
2982 return;
2983 }
2984 assembler->CheckCharacterGT('z', non_word);
2985 assembler->CheckCharacterLT('0', non_word);
2986 assembler->CheckCharacterGT('a' - 1, word);
2987 assembler->CheckCharacterLT('9' + 1, word);
2988 assembler->CheckCharacterLT('A', non_word);
2989 assembler->CheckCharacterLT('Z' + 1, word);
2990 if (fall_through_on_word) {
2991 assembler->CheckNotCharacter('_', non_word);
2992 } else {
2993 assembler->CheckCharacter('_', word);
2994 }
2995 }
2996
2997
2998 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2999 // that matches newline or the start of input).
3000 static void EmitHat(RegExpCompiler* compiler,
3001 RegExpNode* on_success,
3002 Trace* trace) {
3003 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3004 // We will be loading the previous character into the current character
3005 // register.
3006 Trace new_trace(*trace);
3007 new_trace.InvalidateCurrentCharacter();
3008
3009 Label ok;
3010 if (new_trace.cp_offset() == 0) {
3011 // The start of input counts as a newline in this context, so skip to
3012 // ok if we are at the start.
3013 assembler->CheckAtStart(&ok);
3014 }
3015 // We already checked that we are not at the start of input so it must be
3016 // OK to load the previous character.
3017 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3018 new_trace.backtrack(),
3019 false);
3020 if (!assembler->CheckSpecialCharacterClass('n',
3021 new_trace.backtrack())) {
3022 // Newline means \n, \r, 0x2028 or 0x2029.
3023 if (!compiler->one_byte()) {
3024 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3025 }
3026 assembler->CheckCharacter('\n', &ok);
3027 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3028 }
3029 assembler->Bind(&ok);
3030 on_success->Emit(compiler, &new_trace);
3031 }
3032
3033
3034 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3035 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3036 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3037 Isolate* isolate = assembler->isolate();
3038 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3039 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3040 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3041 if (lookahead == NULL) {
3042 int eats_at_least =
3043 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3044 kRecursionBudget,
3045 not_at_start));
3046 if (eats_at_least >= 1) {
3047 BoyerMooreLookahead* bm =
3048 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3049 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3050 if (bm->at(0)->is_non_word())
3051 next_is_word_character = Trace::FALSE_VALUE;
3052 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3053 }
3054 } else {
3055 if (lookahead->at(0)->is_non_word())
3056 next_is_word_character = Trace::FALSE_VALUE;
3057 if (lookahead->at(0)->is_word())
3058 next_is_word_character = Trace::TRUE_VALUE;
3059 }
3060 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3061 if (next_is_word_character == Trace::UNKNOWN) {
3062 Label before_non_word;
3063 Label before_word;
3064 if (trace->characters_preloaded() != 1) {
3065 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3066 }
3067 // Fall through on non-word.
3068 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3069 // Next character is not a word character.
3070 assembler->Bind(&before_non_word);
3071 Label ok;
3072 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3073 assembler->GoTo(&ok);
3074
3075 assembler->Bind(&before_word);
3076 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3077 assembler->Bind(&ok);
3078 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3079 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3080 } else {
3081 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3082 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3083 }
3084 }
3085
3086
3087 void AssertionNode::BacktrackIfPrevious(
3088 RegExpCompiler* compiler,
3089 Trace* trace,
3090 AssertionNode::IfPrevious backtrack_if_previous) {
3091 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3092 Trace new_trace(*trace);
3093 new_trace.InvalidateCurrentCharacter();
3094
3095 Label fall_through, dummy;
3096
3097 Label* non_word = backtrack_if_previous == kIsNonWord ?
3098 new_trace.backtrack() :
3099 &fall_through;
3100 Label* word = backtrack_if_previous == kIsNonWord ?
3101 &fall_through :
3102 new_trace.backtrack();
3103
3104 if (new_trace.cp_offset() == 0) {
3105 // The start of input counts as a non-word character, so the question is
3106 // decided if we are at the start.
3107 assembler->CheckAtStart(non_word);
3108 }
3109 // We already checked that we are not at the start of input so it must be
3110 // OK to load the previous character.
3111 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3112 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3113
3114 assembler->Bind(&fall_through);
3115 on_success()->Emit(compiler, &new_trace);
3116 }
3117
3118
3119 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3120 RegExpCompiler* compiler,
3121 int filled_in,
3122 bool not_at_start) {
3123 if (assertion_type_ == AT_START && not_at_start) {
3124 details->set_cannot_match();
3125 return;
3126 }
3127 return on_success()->GetQuickCheckDetails(details,
3128 compiler,
3129 filled_in,
3130 not_at_start);
3131 }
3132
3133
3134 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3135 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3136 switch (assertion_type_) {
3137 case AT_END: {
3138 Label ok;
3139 assembler->CheckPosition(trace->cp_offset(), &ok);
3140 assembler->GoTo(trace->backtrack());
3141 assembler->Bind(&ok);
3142 break;
3143 }
3144 case AT_START: {
3145 if (trace->at_start() == Trace::FALSE_VALUE) {
3146 assembler->GoTo(trace->backtrack());
3147 return;
3148 }
3149 if (trace->at_start() == Trace::UNKNOWN) {
3150 assembler->CheckNotAtStart(trace->backtrack());
3151 Trace at_start_trace = *trace;
3152 at_start_trace.set_at_start(true);
3153 on_success()->Emit(compiler, &at_start_trace);
3154 return;
3155 }
3156 }
3157 break;
3158 case AFTER_NEWLINE:
3159 EmitHat(compiler, on_success(), trace);
3160 return;
3161 case AT_BOUNDARY:
3162 case AT_NON_BOUNDARY: {
3163 EmitBoundaryCheck(compiler, trace);
3164 return;
3165 }
3166 }
3167 on_success()->Emit(compiler, trace);
3168 }
3169
3170
3171 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3172 if (quick_check == NULL) return false;
3173 if (offset >= quick_check->characters()) return false;
3174 return quick_check->positions(offset)->determines_perfectly;
3175 }
3176
3177
3178 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3179 if (index > *checked_up_to) {
3180 *checked_up_to = index;
3181 }
3182 }
3183
3184
3185 // We call this repeatedly to generate code for each pass over the text node.
3186 // The passes are in increasing order of difficulty because we hope one
3187 // of the first passes will fail in which case we are saved the work of the
3188 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3189 // we will check the '%' in the first pass, the case independent 'a' in the
3190 // second pass and the character class in the last pass.
3191 //
3192 // The passes are done from right to left, so for example to test for /bar/
3193 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3194 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3195 // bounds check most of the time. In the example we only need to check for
3196 // end-of-input when loading the putative 'r'.
3197 //
3198 // A slight complication involves the fact that the first character may already
3199 // be fetched into a register by the previous node. In this case we want to
3200 // do the test for that character first. We do this in separate passes. The
3201 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3202 // pass has been performed then subsequent passes will have true in
3203 // first_element_checked to indicate that that character does not need to be
3204 // checked again.
3205 //
3206 // In addition to all this we are passed a Trace, which can
3207 // contain an AlternativeGeneration object. In this AlternativeGeneration
3208 // object we can see details of any quick check that was already passed in
3209 // order to get to the code we are now generating. The quick check can involve
3210 // loading characters, which means we do not need to recheck the bounds
3211 // up to the limit the quick check already checked. In addition the quick
3212 // check can have involved a mask and compare operation which may simplify
3213 // or obviate the need for further checks at some character positions.
3214 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3215 TextEmitPassType pass,
3216 bool preloaded,
3217 Trace* trace,
3218 bool first_element_checked,
3219 int* checked_up_to) {
3220 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3221 Isolate* isolate = assembler->isolate();
3222 bool one_byte = compiler->one_byte();
3223 Label* backtrack = trace->backtrack();
3224 QuickCheckDetails* quick_check = trace->quick_check_performed();
3225 int element_count = elms_->length();
3226 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3227 TextElement elm = elms_->at(i);
3228 int cp_offset = trace->cp_offset() + elm.cp_offset();
3229 if (elm.text_type() == TextElement::ATOM) {
3230 Vector<const uc16> quarks = elm.atom()->data();
3231 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3232 if (first_element_checked && i == 0 && j == 0) continue;
3233 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3234 EmitCharacterFunction* emit_function = NULL;
3235 switch (pass) {
3236 case NON_LATIN1_MATCH:
3237 DCHECK(one_byte);
3238 if (quarks[j] > String::kMaxOneByteCharCode) {
3239 assembler->GoTo(backtrack);
3240 return;
3241 }
3242 break;
3243 case NON_LETTER_CHARACTER_MATCH:
3244 emit_function = &EmitAtomNonLetter;
3245 break;
3246 case SIMPLE_CHARACTER_MATCH:
3247 emit_function = &EmitSimpleCharacter;
3248 break;
3249 case CASE_CHARACTER_MATCH:
3250 emit_function = &EmitAtomLetter;
3251 break;
3252 default:
3253 break;
3254 }
3255 if (emit_function != NULL) {
3256 bool bound_checked = emit_function(isolate,
3257 compiler,
3258 quarks[j],
3259 backtrack,
3260 cp_offset + j,
3261 *checked_up_to < cp_offset + j,
3262 preloaded);
3263 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3264 }
3265 }
3266 } else {
3267 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3268 if (pass == CHARACTER_CLASS_MATCH) {
3269 if (first_element_checked && i == 0) continue;
3270 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3271 RegExpCharacterClass* cc = elm.char_class();
3272 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3273 *checked_up_to < cp_offset, preloaded, zone());
3274 UpdateBoundsCheck(cp_offset, checked_up_to);
3275 }
3276 }
3277 }
3278 }
3279
3280
3281 int TextNode::Length() {
3282 TextElement elm = elms_->last();
3283 DCHECK(elm.cp_offset() >= 0);
3284 return elm.cp_offset() + elm.length();
3285 }
3286
3287
3288 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3289 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3290 if (ignore_case) {
3291 return pass == SIMPLE_CHARACTER_MATCH;
3292 } else {
3293 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3294 }
3295 }
3296
3297
3298 // This generates the code to match a text node. A text node can contain
3299 // straight character sequences (possibly to be matched in a case-independent
3300 // way) and character classes. For efficiency we do not do this in a single
3301 // pass from left to right. Instead we pass over the text node several times,
3302 // emitting code for some character positions every time. See the comment on
3303 // TextEmitPass for details.
3304 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3305 LimitResult limit_result = LimitVersions(compiler, trace);
3306 if (limit_result == DONE) return;
3307 DCHECK(limit_result == CONTINUE);
3308
3309 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3310 compiler->SetRegExpTooBig();
3311 return;
3312 }
3313
3314 if (compiler->one_byte()) {
3315 int dummy = 0;
3316 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3317 }
3318
3319 bool first_elt_done = false;
3320 int bound_checked_to = trace->cp_offset() - 1;
3321 bound_checked_to += trace->bound_checked_up_to();
3322
3323 // If a character is preloaded into the current character register then
3324 // check that now.
3325 if (trace->characters_preloaded() == 1) {
3326 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3327 if (!SkipPass(pass, compiler->ignore_case())) {
3328 TextEmitPass(compiler,
3329 static_cast<TextEmitPassType>(pass),
3330 true,
3331 trace,
3332 false,
3333 &bound_checked_to);
3334 }
3335 }
3336 first_elt_done = true;
3337 }
3338
3339 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3340 if (!SkipPass(pass, compiler->ignore_case())) {
3341 TextEmitPass(compiler,
3342 static_cast<TextEmitPassType>(pass),
3343 false,
3344 trace,
3345 first_elt_done,
3346 &bound_checked_to);
3347 }
3348 }
3349
3350 Trace successor_trace(*trace);
3351 successor_trace.set_at_start(false);
3352 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
3353 RecursionCheck rc(compiler);
3354 on_success()->Emit(compiler, &successor_trace);
3355 }
3356
3357
3358 void Trace::InvalidateCurrentCharacter() {
3359 characters_preloaded_ = 0;
3360 }
3361
3362
3363 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3364 DCHECK(by > 0);
3365 // We don't have an instruction for shifting the current character register
3366 // down or for using a shifted value for anything so lets just forget that
3367 // we preloaded any characters into it.
3368 characters_preloaded_ = 0;
3369 // Adjust the offsets of the quick check performed information. This
3370 // information is used to find out what we already determined about the
3371 // characters by means of mask and compare.
3372 quick_check_performed_.Advance(by, compiler->one_byte());
3373 cp_offset_ += by;
3374 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3375 compiler->SetRegExpTooBig();
3376 cp_offset_ = 0;
3377 }
3378 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3379 }
3380
3381
3382 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3383 int element_count = elms_->length();
3384 for (int i = 0; i < element_count; i++) {
3385 TextElement elm = elms_->at(i);
3386 if (elm.text_type() == TextElement::CHAR_CLASS) {
3387 RegExpCharacterClass* cc = elm.char_class();
3388 // None of the standard character classes is different in the case
3389 // independent case and it slows us down if we don't know that.
3390 if (cc->is_standard(zone())) continue;
3391 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3392 int range_count = ranges->length();
3393 for (int j = 0; j < range_count; j++) {
3394 ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
3395 }
3396 }
3397 }
3398 }
3399
3400
3401 int TextNode::GreedyLoopTextLength() {
3402 TextElement elm = elms_->at(elms_->length() - 1);
3403 return elm.cp_offset() + elm.length();
3404 }
3405
3406
3407 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3408 RegExpCompiler* compiler) {
3409 if (elms_->length() != 1) return NULL;
3410 TextElement elm = elms_->at(0);
3411 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3412 RegExpCharacterClass* node = elm.char_class();
3413 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3414 if (!CharacterRange::IsCanonical(ranges)) {
3415 CharacterRange::Canonicalize(ranges);
3416 }
3417 if (node->is_negated()) {
3418 return ranges->length() == 0 ? on_success() : NULL;
3419 }
3420 if (ranges->length() != 1) return NULL;
3421 uint32_t max_char;
3422 if (compiler->one_byte()) {
3423 max_char = String::kMaxOneByteCharCode;
3424 } else {
3425 max_char = String::kMaxUtf16CodeUnit;
3426 }
3427 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3428 }
3429
3430
3431 // Finds the fixed match length of a sequence of nodes that goes from
3432 // this alternative and back to this choice node. If there are variable
3433 // length nodes or other complications in the way then return a sentinel
3434 // value indicating that a greedy loop cannot be constructed.
3435 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3436 GuardedAlternative* alternative) {
3437 int length = 0;
3438 RegExpNode* node = alternative->node();
3439 // Later we will generate code for all these text nodes using recursion
3440 // so we have to limit the max number.
3441 int recursion_depth = 0;
3442 while (node != this) {
3443 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3444 return kNodeIsTooComplexForGreedyLoops;
3445 }
3446 int node_length = node->GreedyLoopTextLength();
3447 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3448 return kNodeIsTooComplexForGreedyLoops;
3449 }
3450 length += node_length;
3451 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3452 node = seq_node->on_success();
3453 }
3454 return length;
3455 }
3456
3457
3458 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3459 DCHECK_NULL(loop_node_);
3460 AddAlternative(alt);
3461 loop_node_ = alt.node();
3462 }
3463
3464
3465 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3466 DCHECK_NULL(continue_node_);
3467 AddAlternative(alt);
3468 continue_node_ = alt.node();
3469 }
3470
3471
3472 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3473 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3474 if (trace->stop_node() == this) {
3475 // Back edge of greedy optimized loop node graph.
3476 int text_length =
3477 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3478 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3479 // Update the counter-based backtracking info on the stack. This is an
3480 // optimization for greedy loops (see below).
3481 DCHECK(trace->cp_offset() == text_length);
3482 macro_assembler->AdvanceCurrentPosition(text_length);
3483 macro_assembler->GoTo(trace->loop_label());
3484 return;
3485 }
3486 DCHECK_NULL(trace->stop_node());
3487 if (!trace->is_trivial()) {
3488 trace->Flush(compiler, this);
3489 return;
3490 }
3491 ChoiceNode::Emit(compiler, trace);
3492 }
3493
3494
3495 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3496 int eats_at_least) {
3497 int preload_characters = Min(4, eats_at_least);
3498 if (compiler->macro_assembler()->CanReadUnaligned()) {
3499 bool one_byte = compiler->one_byte();
3500 if (one_byte) {
3501 if (preload_characters > 4) preload_characters = 4;
3502 // We can't preload 3 characters because there is no machine instruction
3503 // to do that. We can't just load 4 because we could be reading
3504 // beyond the end of the string, which could cause a memory fault.
3505 if (preload_characters == 3) preload_characters = 2;
3506 } else {
3507 if (preload_characters > 2) preload_characters = 2;
3508 }
3509 } else {
3510 if (preload_characters > 1) preload_characters = 1;
3511 }
3512 return preload_characters;
3513 }
3514
3515
3516 // This class is used when generating the alternatives in a choice node. It
3517 // records the way the alternative is being code generated.
3518 class AlternativeGeneration: public Malloced {
3519 public:
3520 AlternativeGeneration()
3521 : possible_success(),
3522 expects_preload(false),
3523 after(),
3524 quick_check_details() { }
3525 Label possible_success;
3526 bool expects_preload;
3527 Label after;
3528 QuickCheckDetails quick_check_details;
3529 };
3530
3531
3532 // Creates a list of AlternativeGenerations. If the list has a reasonable
3533 // size then it is on the stack, otherwise the excess is on the heap.
3534 class AlternativeGenerationList {
3535 public:
3536 AlternativeGenerationList(int count, Zone* zone)
3537 : alt_gens_(count, zone) {
3538 for (int i = 0; i < count && i < kAFew; i++) {
3539 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3540 }
3541 for (int i = kAFew; i < count; i++) {
3542 alt_gens_.Add(new AlternativeGeneration(), zone);
3543 }
3544 }
3545 ~AlternativeGenerationList() {
3546 for (int i = kAFew; i < alt_gens_.length(); i++) {
3547 delete alt_gens_[i];
3548 alt_gens_[i] = NULL;
3549 }
3550 }
3551
3552 AlternativeGeneration* at(int i) {
3553 return alt_gens_[i];
3554 }
3555
3556 private:
3557 static const int kAFew = 10;
3558 ZoneList<AlternativeGeneration*> alt_gens_;
3559 AlternativeGeneration a_few_alt_gens_[kAFew];
3560 };
3561
3562
3563 // The '2' variant is has inclusive from and exclusive to.
3564 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3565 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3566 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
3567 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
3568 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
3569 0xFEFF, 0xFF00, 0x10000 };
3570 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3571
3572 static const int kWordRanges[] = {
3573 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3574 static const int kWordRangeCount = arraysize(kWordRanges);
3575 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3576 static const int kDigitRangeCount = arraysize(kDigitRanges);
3577 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3578 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3579 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3580 0x2028, 0x202A, 0x10000 };
3581 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3582
3583
3584 void BoyerMoorePositionInfo::Set(int character) {
3585 SetInterval(Interval(character, character));
3586 }
3587
3588
3589 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3590 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3591 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3592 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3593 surrogate_ =
3594 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3595 if (interval.to() - interval.from() >= kMapSize - 1) {
3596 if (map_count_ != kMapSize) {
3597 map_count_ = kMapSize;
3598 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3599 }
3600 return;
3601 }
3602 for (int i = interval.from(); i <= interval.to(); i++) {
3603 int mod_character = (i & kMask);
3604 if (!map_->at(mod_character)) {
3605 map_count_++;
3606 map_->at(mod_character) = true;
3607 }
3608 if (map_count_ == kMapSize) return;
3609 }
3610 }
3611
3612
3613 void BoyerMoorePositionInfo::SetAll() {
3614 s_ = w_ = d_ = kLatticeUnknown;
3615 if (map_count_ != kMapSize) {
3616 map_count_ = kMapSize;
3617 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3618 }
3619 }
3620
3621
3622 BoyerMooreLookahead::BoyerMooreLookahead(
3623 int length, RegExpCompiler* compiler, Zone* zone)
3624 : length_(length),
3625 compiler_(compiler) {
3626 if (compiler->one_byte()) {
3627 max_char_ = String::kMaxOneByteCharCode;
3628 } else {
3629 max_char_ = String::kMaxUtf16CodeUnit;
3630 }
3631 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3632 for (int i = 0; i < length; i++) {
3633 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3634 }
3635 }
3636
3637
3638 // Find the longest range of lookahead that has the fewest number of different
3639 // characters that can occur at a given position. Since we are optimizing two
3640 // different parameters at once this is a tradeoff.
3641 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3642 int biggest_points = 0;
3643 // If more than 32 characters out of 128 can occur it is unlikely that we can
3644 // be lucky enough to step forwards much of the time.
3645 const int kMaxMax = 32;
3646 for (int max_number_of_chars = 4;
3647 max_number_of_chars < kMaxMax;
3648 max_number_of_chars *= 2) {
3649 biggest_points =
3650 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3651 }
3652 if (biggest_points == 0) return false;
3653 return true;
3654 }
3655
3656
3657 // Find the highest-points range between 0 and length_ where the character
3658 // information is not too vague. 'Too vague' means that there are more than
3659 // max_number_of_chars that can occur at this position. Calculates the number
3660 // of points as the product of width-of-the-range and
3661 // probability-of-finding-one-of-the-characters, where the probability is
3662 // calculated using the frequency distribution of the sample subject string.
3663 int BoyerMooreLookahead::FindBestInterval(
3664 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3665 int biggest_points = old_biggest_points;
3666 static const int kSize = RegExpMacroAssembler::kTableSize;
3667 for (int i = 0; i < length_; ) {
3668 while (i < length_ && Count(i) > max_number_of_chars) i++;
3669 if (i == length_) break;
3670 int remembered_from = i;
3671 bool union_map[kSize];
3672 for (int j = 0; j < kSize; j++) union_map[j] = false;
3673 while (i < length_ && Count(i) <= max_number_of_chars) {
3674 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3675 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3676 i++;
3677 }
3678 int frequency = 0;
3679 for (int j = 0; j < kSize; j++) {
3680 if (union_map[j]) {
3681 // Add 1 to the frequency to give a small per-character boost for
3682 // the cases where our sampling is not good enough and many
3683 // characters have a frequency of zero. This means the frequency
3684 // can theoretically be up to 2*kSize though we treat it mostly as
3685 // a fraction of kSize.
3686 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3687 }
3688 }
3689 // We use the probability of skipping times the distance we are skipping to
3690 // judge the effectiveness of this. Actually we have a cut-off: By
3691 // dividing by 2 we switch off the skipping if the probability of skipping
3692 // is less than 50%. This is because the multibyte mask-and-compare
3693 // skipping in quickcheck is more likely to do well on this case.
3694 bool in_quickcheck_range =
3695 ((i - remembered_from < 4) ||
3696 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3697 // Called 'probability' but it is only a rough estimate and can actually
3698 // be outside the 0-kSize range.
3699 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3700 int points = (i - remembered_from) * probability;
3701 if (points > biggest_points) {
3702 *from = remembered_from;
3703 *to = i - 1;
3704 biggest_points = points;
3705 }
3706 }
3707 return biggest_points;
3708 }
3709
3710
3711 // Take all the characters that will not prevent a successful match if they
3712 // occur in the subject string in the range between min_lookahead and
3713 // max_lookahead (inclusive) measured from the current position. If the
3714 // character at max_lookahead offset is not one of these characters, then we
3715 // can safely skip forwards by the number of characters in the range.
3716 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3717 int max_lookahead,
3718 Handle<ByteArray> boolean_skip_table) {
3719 const int kSize = RegExpMacroAssembler::kTableSize;
3720
3721 const int kSkipArrayEntry = 0;
3722 const int kDontSkipArrayEntry = 1;
3723
3724 for (int i = 0; i < kSize; i++) {
3725 boolean_skip_table->set(i, kSkipArrayEntry);
3726 }
3727 int skip = max_lookahead + 1 - min_lookahead;
3728
3729 for (int i = max_lookahead; i >= min_lookahead; i--) {
3730 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3731 for (int j = 0; j < kSize; j++) {
3732 if (map->at(j)) {
3733 boolean_skip_table->set(j, kDontSkipArrayEntry);
3734 }
3735 }
3736 }
3737
3738 return skip;
3739 }
3740
3741
3742 // See comment above on the implementation of GetSkipTable.
3743 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3744 const int kSize = RegExpMacroAssembler::kTableSize;
3745
3746 int min_lookahead = 0;
3747 int max_lookahead = 0;
3748
3749 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3750
3751 bool found_single_character = false;
3752 int single_character = 0;
3753 for (int i = max_lookahead; i >= min_lookahead; i--) {
3754 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3755 if (map->map_count() > 1 ||
3756 (found_single_character && map->map_count() != 0)) {
3757 found_single_character = false;
3758 break;
3759 }
3760 for (int j = 0; j < kSize; j++) {
3761 if (map->at(j)) {
3762 found_single_character = true;
3763 single_character = j;
3764 break;
3765 }
3766 }
3767 }
3768
3769 int lookahead_width = max_lookahead + 1 - min_lookahead;
3770
3771 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3772 // The mask-compare can probably handle this better.
3773 return;
3774 }
3775
3776 if (found_single_character) {
3777 Label cont, again;
3778 masm->Bind(&again);
3779 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3780 if (max_char_ > kSize) {
3781 masm->CheckCharacterAfterAnd(single_character,
3782 RegExpMacroAssembler::kTableMask,
3783 &cont);
3784 } else {
3785 masm->CheckCharacter(single_character, &cont);
3786 }
3787 masm->AdvanceCurrentPosition(lookahead_width);
3788 masm->GoTo(&again);
3789 masm->Bind(&cont);
3790 return;
3791 }
3792
3793 Factory* factory = masm->isolate()->factory();
3794 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3795 int skip_distance = GetSkipTable(
3796 min_lookahead, max_lookahead, boolean_skip_table);
3797 DCHECK(skip_distance != 0);
3798
3799 Label cont, again;
3800 masm->Bind(&again);
3801 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3802 masm->CheckBitInTable(boolean_skip_table, &cont);
3803 masm->AdvanceCurrentPosition(skip_distance);
3804 masm->GoTo(&again);
3805 masm->Bind(&cont);
3806 }
3807
3808
3809 /* Code generation for choice nodes.
3810 *
3811 * We generate quick checks that do a mask and compare to eliminate a
3812 * choice. If the quick check succeeds then it jumps to the continuation to
3813 * do slow checks and check subsequent nodes. If it fails (the common case)
3814 * it falls through to the next choice.
3815 *
3816 * Here is the desired flow graph. Nodes directly below each other imply
3817 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3818 * 3 doesn't have a quick check so we have to call the slow check.
3819 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3820 * regexp continuation is generated directly after the Sn node, up to the
3821 * next GoTo if we decide to reuse some already generated code. Some
3822 * nodes expect preload_characters to be preloaded into the current
3823 * character register. R nodes do this preloading. Vertices are marked
3824 * F for failures and S for success (possible success in the case of quick
3825 * nodes). L, V, < and > are used as arrow heads.
3826 *
3827 * ----------> R
3828 * |
3829 * V
3830 * Q1 -----> S1
3831 * | S /
3832 * F| /
3833 * | F/
3834 * | /
3835 * | R
3836 * | /
3837 * V L
3838 * Q2 -----> S2
3839 * | S /
3840 * F| /
3841 * | F/
3842 * | /
3843 * | R
3844 * | /
3845 * V L
3846 * S3
3847 * |
3848 * F|
3849 * |
3850 * R
3851 * |
3852 * backtrack V
3853 * <----------Q4
3854 * \ F |
3855 * \ |S
3856 * \ F V
3857 * \-----S4
3858 *
3859 * For greedy loops we push the current position, then generate the code that
3860 * eats the input specially in EmitGreedyLoop. The other choice (the
3861 * continuation) is generated by the normal code in EmitChoices, and steps back
3862 * in the input to the starting position when it fails to match. The loop code
3863 * looks like this (U is the unwind code that steps back in the greedy loop).
3864 *
3865 * _____
3866 * / \
3867 * V |
3868 * ----------> S1 |
3869 * /| |
3870 * / |S |
3871 * F/ \_____/
3872 * /
3873 * |<-----
3874 * | \
3875 * V |S
3876 * Q2 ---> U----->backtrack
3877 * | F /
3878 * S| /
3879 * V F /
3880 * S2--/
3881 */
3882
3883 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3884 counter_backtrack_trace_.set_backtrack(&label_);
3885 if (not_at_start) counter_backtrack_trace_.set_at_start(false);
3886 }
3887
3888
3889 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3890 #ifdef DEBUG
3891 int choice_count = alternatives_->length();
3892 for (int i = 0; i < choice_count - 1; i++) {
3893 GuardedAlternative alternative = alternatives_->at(i);
3894 ZoneList<Guard*>* guards = alternative.guards();
3895 int guard_count = (guards == NULL) ? 0 : guards->length();
3896 for (int j = 0; j < guard_count; j++) {
3897 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3898 }
3899 }
3900 #endif
3901 }
3902
3903
3904 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3905 Trace* current_trace,
3906 PreloadState* state) {
3907 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3908 // Save some time by looking at most one machine word ahead.
3909 state->eats_at_least_ =
3910 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3911 current_trace->at_start() == Trace::FALSE_VALUE);
3912 }
3913 state->preload_characters_ =
3914 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3915
3916 state->preload_is_current_ =
3917 (current_trace->characters_preloaded() == state->preload_characters_);
3918 state->preload_has_checked_bounds_ = state->preload_is_current_;
3919 }
3920
3921
3922 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3923 int choice_count = alternatives_->length();
3924
3925 AssertGuardsMentionRegisters(trace);
3926
3927 LimitResult limit_result = LimitVersions(compiler, trace);
3928 if (limit_result == DONE) return;
3929 DCHECK(limit_result == CONTINUE);
3930
3931 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3932 // other choice nodes we only flush if we are out of code size budget.
3933 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3934 trace->Flush(compiler, this);
3935 return;
3936 }
3937
3938 RecursionCheck rc(compiler);
3939
3940 PreloadState preload;
3941 preload.init();
3942 GreedyLoopState greedy_loop_state(not_at_start());
3943
3944 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3945 AlternativeGenerationList alt_gens(choice_count, zone());
3946
3947 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3948 trace = EmitGreedyLoop(compiler,
3949 trace,
3950 &alt_gens,
3951 &preload,
3952 &greedy_loop_state,
3953 text_length);
3954 } else {
3955 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3956 // match the traces produced pre-cleanup.
3957 Label second_choice;
3958 compiler->macro_assembler()->Bind(&second_choice);
3959
3960 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3961
3962 EmitChoices(compiler,
3963 &alt_gens,
3964 0,
3965 trace,
3966 &preload);
3967 }
3968
3969 // At this point we need to generate slow checks for the alternatives where
3970 // the quick check was inlined. We can recognize these because the associated
3971 // label was bound.
3972 int new_flush_budget = trace->flush_budget() / choice_count;
3973 for (int i = 0; i < choice_count; i++) {
3974 AlternativeGeneration* alt_gen = alt_gens.at(i);
3975 Trace new_trace(*trace);
3976 // If there are actions to be flushed we have to limit how many times
3977 // they are flushed. Take the budget of the parent trace and distribute
3978 // it fairly amongst the children.
3979 if (new_trace.actions() != NULL) {
3980 new_trace.set_flush_budget(new_flush_budget);
3981 }
3982 bool next_expects_preload =
3983 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3984 EmitOutOfLineContinuation(compiler,
3985 &new_trace,
3986 alternatives_->at(i),
3987 alt_gen,
3988 preload.preload_characters_,
3989 next_expects_preload);
3990 }
3991 }
3992
3993
3994 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3995 Trace* trace,
3996 AlternativeGenerationList* alt_gens,
3997 PreloadState* preload,
3998 GreedyLoopState* greedy_loop_state,
3999 int text_length) {
4000 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4001 // Here we have special handling for greedy loops containing only text nodes
4002 // and other simple nodes. These are handled by pushing the current
4003 // position on the stack and then incrementing the current position each
4004 // time around the switch. On backtrack we decrement the current position
4005 // and check it against the pushed value. This avoids pushing backtrack
4006 // information for each iteration of the loop, which could take up a lot of
4007 // space.
4008 DCHECK(trace->stop_node() == NULL);
4009 macro_assembler->PushCurrentPosition();
4010 Label greedy_match_failed;
4011 Trace greedy_match_trace;
4012 if (not_at_start()) greedy_match_trace.set_at_start(false);
4013 greedy_match_trace.set_backtrack(&greedy_match_failed);
4014 Label loop_label;
4015 macro_assembler->Bind(&loop_label);
4016 greedy_match_trace.set_stop_node(this);
4017 greedy_match_trace.set_loop_label(&loop_label);
4018 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4019 macro_assembler->Bind(&greedy_match_failed);
4020
4021 Label second_choice; // For use in greedy matches.
4022 macro_assembler->Bind(&second_choice);
4023
4024 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4025
4026 EmitChoices(compiler,
4027 alt_gens,
4028 1,
4029 new_trace,
4030 preload);
4031
4032 macro_assembler->Bind(greedy_loop_state->label());
4033 // If we have unwound to the bottom then backtrack.
4034 macro_assembler->CheckGreedyLoop(trace->backtrack());
4035 // Otherwise try the second priority at an earlier position.
4036 macro_assembler->AdvanceCurrentPosition(-text_length);
4037 macro_assembler->GoTo(&second_choice);
4038 return new_trace;
4039 }
4040
4041 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4042 Trace* trace) {
4043 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4044 if (alternatives_->length() != 2) return eats_at_least;
4045
4046 GuardedAlternative alt1 = alternatives_->at(1);
4047 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4048 return eats_at_least;
4049 }
4050 RegExpNode* eats_anything_node = alt1.node();
4051 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4052 return eats_at_least;
4053 }
4054
4055 // Really we should be creating a new trace when we execute this function,
4056 // but there is no need, because the code it generates cannot backtrack, and
4057 // we always arrive here with a trivial trace (since it's the entry to a
4058 // loop. That also implies that there are no preloaded characters, which is
4059 // good, because it means we won't be violating any assumptions by
4060 // overwriting those characters with new load instructions.
4061 DCHECK(trace->is_trivial());
4062
4063 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4064 Isolate* isolate = macro_assembler->isolate();
4065 // At this point we know that we are at a non-greedy loop that will eat
4066 // any character one at a time. Any non-anchored regexp has such a
4067 // loop prepended to it in order to find where it starts. We look for
4068 // a pattern of the form ...abc... where we can look 6 characters ahead
4069 // and step forwards 3 if the character is not one of abc. Abc need
4070 // not be atoms, they can be any reasonably limited character class or
4071 // small alternation.
4072 BoyerMooreLookahead* bm = bm_info(false);
4073 if (bm == NULL) {
4074 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4075 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4076 kRecursionBudget,
4077 false));
4078 if (eats_at_least >= 1) {
4079 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4080 compiler,
4081 zone());
4082 GuardedAlternative alt0 = alternatives_->at(0);
4083 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4084 }
4085 }
4086 if (bm != NULL) {
4087 bm->EmitSkipInstructions(macro_assembler);
4088 }
4089 return eats_at_least;
4090 }
4091
4092
4093 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4094 AlternativeGenerationList* alt_gens,
4095 int first_choice,
4096 Trace* trace,
4097 PreloadState* preload) {
4098 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4099 SetUpPreLoad(compiler, trace, preload);
4100
4101 // For now we just call all choices one after the other. The idea ultimately
4102 // is to use the Dispatch table to try only the relevant ones.
4103 int choice_count = alternatives_->length();
4104
4105 int new_flush_budget = trace->flush_budget() / choice_count;
4106
4107 for (int i = first_choice; i < choice_count; i++) {
4108 bool is_last = i == choice_count - 1;
4109 bool fall_through_on_failure = !is_last;
4110 GuardedAlternative alternative = alternatives_->at(i);
4111 AlternativeGeneration* alt_gen = alt_gens->at(i);
4112 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4113 ZoneList<Guard*>* guards = alternative.guards();
4114 int guard_count = (guards == NULL) ? 0 : guards->length();
4115 Trace new_trace(*trace);
4116 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4117 preload->preload_characters_ :
4118 0);
4119 if (preload->preload_has_checked_bounds_) {
4120 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4121 }
4122 new_trace.quick_check_performed()->Clear();
4123 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4124 if (!is_last) {
4125 new_trace.set_backtrack(&alt_gen->after);
4126 }
4127 alt_gen->expects_preload = preload->preload_is_current_;
4128 bool generate_full_check_inline = false;
4129 if (compiler->optimize() &&
4130 try_to_emit_quick_check_for_alternative(i == 0) &&
4131 alternative.node()->EmitQuickCheck(
4132 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4133 &alt_gen->possible_success, &alt_gen->quick_check_details,
4134 fall_through_on_failure)) {
4135 // Quick check was generated for this choice.
4136 preload->preload_is_current_ = true;
4137 preload->preload_has_checked_bounds_ = true;
4138 // If we generated the quick check to fall through on possible success,
4139 // we now need to generate the full check inline.
4140 if (!fall_through_on_failure) {
4141 macro_assembler->Bind(&alt_gen->possible_success);
4142 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4143 new_trace.set_characters_preloaded(preload->preload_characters_);
4144 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4145 generate_full_check_inline = true;
4146 }
4147 } else if (alt_gen->quick_check_details.cannot_match()) {
4148 if (!fall_through_on_failure) {
4149 macro_assembler->GoTo(trace->backtrack());
4150 }
4151 continue;
4152 } else {
4153 // No quick check was generated. Put the full code here.
4154 // If this is not the first choice then there could be slow checks from
4155 // previous cases that go here when they fail. There's no reason to
4156 // insist that they preload characters since the slow check we are about
4157 // to generate probably can't use it.
4158 if (i != first_choice) {
4159 alt_gen->expects_preload = false;
4160 new_trace.InvalidateCurrentCharacter();
4161 }
4162 generate_full_check_inline = true;
4163 }
4164 if (generate_full_check_inline) {
4165 if (new_trace.actions() != NULL) {
4166 new_trace.set_flush_budget(new_flush_budget);
4167 }
4168 for (int j = 0; j < guard_count; j++) {
4169 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4170 }
4171 alternative.node()->Emit(compiler, &new_trace);
4172 preload->preload_is_current_ = false;
4173 }
4174 macro_assembler->Bind(&alt_gen->after);
4175 }
4176 }
4177
4178
4179 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4180 Trace* trace,
4181 GuardedAlternative alternative,
4182 AlternativeGeneration* alt_gen,
4183 int preload_characters,
4184 bool next_expects_preload) {
4185 if (!alt_gen->possible_success.is_linked()) return;
4186
4187 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4188 macro_assembler->Bind(&alt_gen->possible_success);
4189 Trace out_of_line_trace(*trace);
4190 out_of_line_trace.set_characters_preloaded(preload_characters);
4191 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4192 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4193 ZoneList<Guard*>* guards = alternative.guards();
4194 int guard_count = (guards == NULL) ? 0 : guards->length();
4195 if (next_expects_preload) {
4196 Label reload_current_char;
4197 out_of_line_trace.set_backtrack(&reload_current_char);
4198 for (int j = 0; j < guard_count; j++) {
4199 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4200 }
4201 alternative.node()->Emit(compiler, &out_of_line_trace);
4202 macro_assembler->Bind(&reload_current_char);
4203 // Reload the current character, since the next quick check expects that.
4204 // We don't need to check bounds here because we only get into this
4205 // code through a quick check which already did the checked load.
4206 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4207 NULL,
4208 false,
4209 preload_characters);
4210 macro_assembler->GoTo(&(alt_gen->after));
4211 } else {
4212 out_of_line_trace.set_backtrack(&(alt_gen->after));
4213 for (int j = 0; j < guard_count; j++) {
4214 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4215 }
4216 alternative.node()->Emit(compiler, &out_of_line_trace);
4217 }
4218 }
4219
4220
4221 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4222 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4223 LimitResult limit_result = LimitVersions(compiler, trace);
4224 if (limit_result == DONE) return;
4225 DCHECK(limit_result == CONTINUE);
4226
4227 RecursionCheck rc(compiler);
4228
4229 switch (action_type_) {
4230 case STORE_POSITION: {
4231 Trace::DeferredCapture
4232 new_capture(data_.u_position_register.reg,
4233 data_.u_position_register.is_capture,
4234 trace);
4235 Trace new_trace = *trace;
4236 new_trace.add_action(&new_capture);
4237 on_success()->Emit(compiler, &new_trace);
4238 break;
4239 }
4240 case INCREMENT_REGISTER: {
4241 Trace::DeferredIncrementRegister
4242 new_increment(data_.u_increment_register.reg);
4243 Trace new_trace = *trace;
4244 new_trace.add_action(&new_increment);
4245 on_success()->Emit(compiler, &new_trace);
4246 break;
4247 }
4248 case SET_REGISTER: {
4249 Trace::DeferredSetRegister
4250 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4251 Trace new_trace = *trace;
4252 new_trace.add_action(&new_set);
4253 on_success()->Emit(compiler, &new_trace);
4254 break;
4255 }
4256 case CLEAR_CAPTURES: {
4257 Trace::DeferredClearCaptures
4258 new_capture(Interval(data_.u_clear_captures.range_from,
4259 data_.u_clear_captures.range_to));
4260 Trace new_trace = *trace;
4261 new_trace.add_action(&new_capture);
4262 on_success()->Emit(compiler, &new_trace);
4263 break;
4264 }
4265 case BEGIN_SUBMATCH:
4266 if (!trace->is_trivial()) {
4267 trace->Flush(compiler, this);
4268 } else {
4269 assembler->WriteCurrentPositionToRegister(
4270 data_.u_submatch.current_position_register, 0);
4271 assembler->WriteStackPointerToRegister(
4272 data_.u_submatch.stack_pointer_register);
4273 on_success()->Emit(compiler, trace);
4274 }
4275 break;
4276 case EMPTY_MATCH_CHECK: {
4277 int start_pos_reg = data_.u_empty_match_check.start_register;
4278 int stored_pos = 0;
4279 int rep_reg = data_.u_empty_match_check.repetition_register;
4280 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4281 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4282 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4283 // If we know we haven't advanced and there is no minimum we
4284 // can just backtrack immediately.
4285 assembler->GoTo(trace->backtrack());
4286 } else if (know_dist && stored_pos < trace->cp_offset()) {
4287 // If we know we've advanced we can generate the continuation
4288 // immediately.
4289 on_success()->Emit(compiler, trace);
4290 } else if (!trace->is_trivial()) {
4291 trace->Flush(compiler, this);
4292 } else {
4293 Label skip_empty_check;
4294 // If we have a minimum number of repetitions we check the current
4295 // number first and skip the empty check if it's not enough.
4296 if (has_minimum) {
4297 int limit = data_.u_empty_match_check.repetition_limit;
4298 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4299 }
4300 // If the match is empty we bail out, otherwise we fall through
4301 // to the on-success continuation.
4302 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4303 trace->backtrack());
4304 assembler->Bind(&skip_empty_check);
4305 on_success()->Emit(compiler, trace);
4306 }
4307 break;
4308 }
4309 case POSITIVE_SUBMATCH_SUCCESS: {
4310 if (!trace->is_trivial()) {
4311 trace->Flush(compiler, this);
4312 return;
4313 }
4314 assembler->ReadCurrentPositionFromRegister(
4315 data_.u_submatch.current_position_register);
4316 assembler->ReadStackPointerFromRegister(
4317 data_.u_submatch.stack_pointer_register);
4318 int clear_register_count = data_.u_submatch.clear_register_count;
4319 if (clear_register_count == 0) {
4320 on_success()->Emit(compiler, trace);
4321 return;
4322 }
4323 int clear_registers_from = data_.u_submatch.clear_register_from;
4324 Label clear_registers_backtrack;
4325 Trace new_trace = *trace;
4326 new_trace.set_backtrack(&clear_registers_backtrack);
4327 on_success()->Emit(compiler, &new_trace);
4328
4329 assembler->Bind(&clear_registers_backtrack);
4330 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4331 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4332
4333 DCHECK(trace->backtrack() == NULL);
4334 assembler->Backtrack();
4335 return;
4336 }
4337 default:
4338 UNREACHABLE();
4339 }
4340 }
4341
4342
4343 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4344 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4345 if (!trace->is_trivial()) {
4346 trace->Flush(compiler, this);
4347 return;
4348 }
4349
4350 LimitResult limit_result = LimitVersions(compiler, trace);
4351 if (limit_result == DONE) return;
4352 DCHECK(limit_result == CONTINUE);
4353
4354 RecursionCheck rc(compiler);
4355
4356 DCHECK_EQ(start_reg_ + 1, end_reg_);
4357 if (compiler->ignore_case()) {
4358 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4359 trace->backtrack());
4360 } else {
4361 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4362 }
4363 on_success()->Emit(compiler, trace);
4364 }
4365
4366
4367 // -------------------------------------------------------------------
4368 // Dot/dotty output
4369
4370
4371 #ifdef DEBUG
4372
4373
4374 class DotPrinter: public NodeVisitor {
4375 public:
4376 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4377 : os_(os),
4378 ignore_case_(ignore_case) {}
4379 void PrintNode(const char* label, RegExpNode* node);
4380 void Visit(RegExpNode* node);
4381 void PrintAttributes(RegExpNode* from);
4382 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4383 #define DECLARE_VISIT(Type) \
4384 virtual void Visit##Type(Type##Node* that);
4385 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4386 #undef DECLARE_VISIT
4387 private:
4388 std::ostream& os_;
4389 bool ignore_case_;
4390 };
4391
4392
4393 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4394 os_ << "digraph G {\n graph [label=\"";
4395 for (int i = 0; label[i]; i++) {
4396 switch (label[i]) {
4397 case '\\':
4398 os_ << "\\\\";
4399 break;
4400 case '"':
4401 os_ << "\"";
4402 break;
4403 default:
4404 os_ << label[i];
4405 break;
4406 }
4407 }
4408 os_ << "\"];\n";
4409 Visit(node);
4410 os_ << "}" << std::endl;
4411 }
4412
4413
4414 void DotPrinter::Visit(RegExpNode* node) {
4415 if (node->info()->visited) return;
4416 node->info()->visited = true;
4417 node->Accept(this);
4418 }
4419
4420
4421 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4422 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4423 Visit(on_failure);
4424 }
4425
4426
4427 class TableEntryBodyPrinter {
4428 public:
4429 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4430 : os_(os),
4431 choice_(choice) {}
4432 void Call(uc16 from, DispatchTable::Entry entry) {
4433 OutSet* out_set = entry.out_set();
4434 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4435 if (out_set->Get(i)) {
4436 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4437 << choice()->alternatives()->at(i).node() << ";\n";
4438 }
4439 }
4440 }
4441 private:
4442 ChoiceNode* choice() { return choice_; }
4443 std::ostream& os_;
4444 ChoiceNode* choice_;
4445 };
4446
4447
4448 class TableEntryHeaderPrinter {
4449 public:
4450 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4451 : first_(true),
4452 os_(os) {}
4453 void Call(uc16 from, DispatchTable::Entry entry) {
4454 if (first_) {
4455 first_ = false;
4456 } else {
4457 os_ << "|";
4458 }
4459 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4460 OutSet* out_set = entry.out_set();
4461 int priority = 0;
4462 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4463 if (out_set->Get(i)) {
4464 if (priority > 0) os_ << "|";
4465 os_ << "<s" << from << "o" << i << "> " << priority;
4466 priority++;
4467 }
4468 }
4469 os_ << "}}";
4470 }
4471
4472 private:
4473 bool first_;
4474 std::ostream& os_;
4475 };
4476
4477
4478 class AttributePrinter {
4479 public:
4480 explicit AttributePrinter(std::ostream& os) // NOLINT
4481 : os_(os),
4482 first_(true) {}
4483 void PrintSeparator() {
4484 if (first_) {
4485 first_ = false;
4486 } else {
4487 os_ << "|";
4488 }
4489 }
4490 void PrintBit(const char* name, bool value) {
4491 if (!value) return;
4492 PrintSeparator();
4493 os_ << "{" << name << "}";
4494 }
4495 void PrintPositive(const char* name, int value) {
4496 if (value < 0) return;
4497 PrintSeparator();
4498 os_ << "{" << name << "|" << value << "}";
4499 }
4500
4501 private:
4502 std::ostream& os_;
4503 bool first_;
4504 };
4505
4506
4507 void DotPrinter::PrintAttributes(RegExpNode* that) {
4508 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4509 << "margin=0.1, fontsize=10, label=\"{";
4510 AttributePrinter printer(os_);
4511 NodeInfo* info = that->info();
4512 printer.PrintBit("NI", info->follows_newline_interest);
4513 printer.PrintBit("WI", info->follows_word_interest);
4514 printer.PrintBit("SI", info->follows_start_interest);
4515 Label* label = that->label();
4516 if (label->is_bound())
4517 printer.PrintPositive("@", label->pos());
4518 os_ << "}\"];\n"
4519 << " a" << that << " -> n" << that
4520 << " [style=dashed, color=grey, arrowhead=none];\n";
4521 }
4522
4523
4524 static const bool kPrintDispatchTable = false;
4525 void DotPrinter::VisitChoice(ChoiceNode* that) {
4526 if (kPrintDispatchTable) {
4527 os_ << " n" << that << " [shape=Mrecord, label=\"";
4528 TableEntryHeaderPrinter header_printer(os_);
4529 that->GetTable(ignore_case_)->ForEach(&header_printer);
4530 os_ << "\"]\n";
4531 PrintAttributes(that);
4532 TableEntryBodyPrinter body_printer(os_, that);
4533 that->GetTable(ignore_case_)->ForEach(&body_printer);
4534 } else {
4535 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4536 for (int i = 0; i < that->alternatives()->length(); i++) {
4537 GuardedAlternative alt = that->alternatives()->at(i);
4538 os_ << " n" << that << " -> n" << alt.node();
4539 }
4540 }
4541 for (int i = 0; i < that->alternatives()->length(); i++) {
4542 GuardedAlternative alt = that->alternatives()->at(i);
4543 alt.node()->Accept(this);
4544 }
4545 }
4546
4547
4548 void DotPrinter::VisitText(TextNode* that) {
4549 Zone* zone = that->zone();
4550 os_ << " n" << that << " [label=\"";
4551 for (int i = 0; i < that->elements()->length(); i++) {
4552 if (i > 0) os_ << " ";
4553 TextElement elm = that->elements()->at(i);
4554 switch (elm.text_type()) {
4555 case TextElement::ATOM: {
4556 Vector<const uc16> data = elm.atom()->data();
4557 for (int i = 0; i < data.length(); i++) {
4558 os_ << static_cast<char>(data[i]);
4559 }
4560 break;
4561 }
4562 case TextElement::CHAR_CLASS: {
4563 RegExpCharacterClass* node = elm.char_class();
4564 os_ << "[";
4565 if (node->is_negated()) os_ << "^";
4566 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4567 CharacterRange range = node->ranges(zone)->at(j);
4568 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4569 }
4570 os_ << "]";
4571 break;
4572 }
4573 default:
4574 UNREACHABLE();
4575 }
4576 }
4577 os_ << "\", shape=box, peripheries=2];\n";
4578 PrintAttributes(that);
4579 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4580 Visit(that->on_success());
4581 }
4582
4583
4584 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4585 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4586 << that->end_register() << "\", shape=doubleoctagon];\n";
4587 PrintAttributes(that);
4588 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4589 Visit(that->on_success());
4590 }
4591
4592
4593 void DotPrinter::VisitEnd(EndNode* that) {
4594 os_ << " n" << that << " [style=bold, shape=point];\n";
4595 PrintAttributes(that);
4596 }
4597
4598
4599 void DotPrinter::VisitAssertion(AssertionNode* that) {
4600 os_ << " n" << that << " [";
4601 switch (that->assertion_type()) {
4602 case AssertionNode::AT_END:
4603 os_ << "label=\"$\", shape=septagon";
4604 break;
4605 case AssertionNode::AT_START:
4606 os_ << "label=\"^\", shape=septagon";
4607 break;
4608 case AssertionNode::AT_BOUNDARY:
4609 os_ << "label=\"\\b\", shape=septagon";
4610 break;
4611 case AssertionNode::AT_NON_BOUNDARY:
4612 os_ << "label=\"\\B\", shape=septagon";
4613 break;
4614 case AssertionNode::AFTER_NEWLINE:
4615 os_ << "label=\"(?<=\\n)\", shape=septagon";
4616 break;
4617 }
4618 os_ << "];\n";
4619 PrintAttributes(that);
4620 RegExpNode* successor = that->on_success();
4621 os_ << " n" << that << " -> n" << successor << ";\n";
4622 Visit(successor);
4623 }
4624
4625
4626 void DotPrinter::VisitAction(ActionNode* that) {
4627 os_ << " n" << that << " [";
4628 switch (that->action_type_) {
4629 case ActionNode::SET_REGISTER:
4630 os_ << "label=\"$" << that->data_.u_store_register.reg
4631 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4632 break;
4633 case ActionNode::INCREMENT_REGISTER:
4634 os_ << "label=\"$" << that->data_.u_increment_register.reg
4635 << "++\", shape=octagon";
4636 break;
4637 case ActionNode::STORE_POSITION:
4638 os_ << "label=\"$" << that->data_.u_position_register.reg
4639 << ":=$pos\", shape=octagon";
4640 break;
4641 case ActionNode::BEGIN_SUBMATCH:
4642 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4643 << ":=$pos,begin\", shape=septagon";
4644 break;
4645 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4646 os_ << "label=\"escape\", shape=septagon";
4647 break;
4648 case ActionNode::EMPTY_MATCH_CHECK:
4649 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4650 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4651 << "<" << that->data_.u_empty_match_check.repetition_limit
4652 << "?\", shape=septagon";
4653 break;
4654 case ActionNode::CLEAR_CAPTURES: {
4655 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4656 << " to $" << that->data_.u_clear_captures.range_to
4657 << "\", shape=septagon";
4658 break;
4659 }
4660 }
4661 os_ << "];\n";
4662 PrintAttributes(that);
4663 RegExpNode* successor = that->on_success();
4664 os_ << " n" << that << " -> n" << successor << ";\n";
4665 Visit(successor);
4666 }
4667
4668
4669 class DispatchTableDumper {
4670 public:
4671 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4672 void Call(uc16 key, DispatchTable::Entry entry);
4673 private:
4674 std::ostream& os_;
4675 };
4676
4677
4678 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4679 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4680 OutSet* set = entry.out_set();
4681 bool first = true;
4682 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4683 if (set->Get(i)) {
4684 if (first) {
4685 first = false;
4686 } else {
4687 os_ << ", ";
4688 }
4689 os_ << i;
4690 }
4691 }
4692 os_ << "}\n";
4693 }
4694
4695
4696 void DispatchTable::Dump() {
4697 OFStream os(stderr);
4698 DispatchTableDumper dumper(os);
4699 tree()->ForEach(&dumper);
4700 }
4701
4702
4703 void RegExpEngine::DotPrint(const char* label,
4704 RegExpNode* node,
4705 bool ignore_case) {
4706 OFStream os(stdout);
4707 DotPrinter printer(os, ignore_case);
4708 printer.PrintNode(label, node);
4709 }
4710
4711
4712 #endif // DEBUG
4713
4714
4715 // -------------------------------------------------------------------
4716 // Tree to graph conversion
4717
4718 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4719 RegExpNode* on_success) {
4720 ZoneList<TextElement>* elms =
4721 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4722 elms->Add(TextElement::Atom(this), compiler->zone());
4723 return new(compiler->zone()) TextNode(elms, on_success);
4724 }
4725
4726
4727 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4728 RegExpNode* on_success) {
4729 return new(compiler->zone()) TextNode(elements(), on_success);
4730 }
4731
4732
4733 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4734 const int* special_class,
4735 int length) {
4736 length--; // Remove final 0x10000.
4737 DCHECK(special_class[length] == 0x10000);
4738 DCHECK(ranges->length() != 0);
4739 DCHECK(length != 0);
4740 DCHECK(special_class[0] != 0);
4741 if (ranges->length() != (length >> 1) + 1) {
4742 return false;
4743 }
4744 CharacterRange range = ranges->at(0);
4745 if (range.from() != 0) {
4746 return false;
4747 }
4748 for (int i = 0; i < length; i += 2) {
4749 if (special_class[i] != (range.to() + 1)) {
4750 return false;
4751 }
4752 range = ranges->at((i >> 1) + 1);
4753 if (special_class[i+1] != range.from()) {
4754 return false;
4755 }
4756 }
4757 if (range.to() != 0xffff) {
4758 return false;
4759 }
4760 return true;
4761 }
4762
4763
4764 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4765 const int* special_class,
4766 int length) {
4767 length--; // Remove final 0x10000.
4768 DCHECK(special_class[length] == 0x10000);
4769 if (ranges->length() * 2 != length) {
4770 return false;
4771 }
4772 for (int i = 0; i < length; i += 2) {
4773 CharacterRange range = ranges->at(i >> 1);
4774 if (range.from() != special_class[i] ||
4775 range.to() != special_class[i + 1] - 1) {
4776 return false;
4777 }
4778 }
4779 return true;
4780 }
4781
4782
4783 bool RegExpCharacterClass::is_standard(Zone* zone) {
4784 // TODO(lrn): Remove need for this function, by not throwing away information
4785 // along the way.
4786 if (is_negated_) {
4787 return false;
4788 }
4789 if (set_.is_standard()) {
4790 return true;
4791 }
4792 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4793 set_.set_standard_set_type('s');
4794 return true;
4795 }
4796 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4797 set_.set_standard_set_type('S');
4798 return true;
4799 }
4800 if (CompareInverseRanges(set_.ranges(zone),
4801 kLineTerminatorRanges,
4802 kLineTerminatorRangeCount)) {
4803 set_.set_standard_set_type('.');
4804 return true;
4805 }
4806 if (CompareRanges(set_.ranges(zone),
4807 kLineTerminatorRanges,
4808 kLineTerminatorRangeCount)) {
4809 set_.set_standard_set_type('n');
4810 return true;
4811 }
4812 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4813 set_.set_standard_set_type('w');
4814 return true;
4815 }
4816 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4817 set_.set_standard_set_type('W');
4818 return true;
4819 }
4820 return false;
4821 }
4822
4823
4824 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4825 RegExpNode* on_success) {
4826 return new(compiler->zone()) TextNode(this, on_success);
4827 }
4828
4829
4830 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
4831 RegExpAtom* atom1 = (*a)->AsAtom();
4832 RegExpAtom* atom2 = (*b)->AsAtom();
4833 uc16 character1 = atom1->data().at(0);
4834 uc16 character2 = atom2->data().at(0);
4835 if (character1 < character2) return -1;
4836 if (character1 > character2) return 1;
4837 return 0;
4838 }
4839
4840
4841 static unibrow::uchar Canonical(
4842 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
4843 unibrow::uchar c) {
4844 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
4845 int length = canonicalize->get(c, '\0', chars);
4846 DCHECK_LE(length, 1);
4847 unibrow::uchar canonical = c;
4848 if (length == 1) canonical = chars[0];
4849 return canonical;
4850 }
4851
4852
4853 int CompareFirstCharCaseIndependent(
4854 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
4855 RegExpTree* const* a, RegExpTree* const* b) {
4856 RegExpAtom* atom1 = (*a)->AsAtom();
4857 RegExpAtom* atom2 = (*b)->AsAtom();
4858 unibrow::uchar character1 = atom1->data().at(0);
4859 unibrow::uchar character2 = atom2->data().at(0);
4860 if (character1 == character2) return 0;
4861 if (character1 >= 'a' || character2 >= 'a') {
4862 character1 = Canonical(canonicalize, character1);
4863 character2 = Canonical(canonicalize, character2);
4864 }
4865 return static_cast<int>(character1) - static_cast<int>(character2);
4866 }
4867
4868
4869 // We can stable sort runs of atoms, since the order does not matter if they
4870 // start with different characters.
4871 // Returns true if any consecutive atoms were found.
4872 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
4873 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4874 int length = alternatives->length();
4875 bool found_consecutive_atoms = false;
4876 for (int i = 0; i < length; i++) {
4877 while (i < length) {
4878 RegExpTree* alternative = alternatives->at(i);
4879 if (alternative->IsAtom()) break;
4880 i++;
4881 }
4882 // i is length or it is the index of an atom.
4883 if (i == length) break;
4884 int first_atom = i;
4885 i++;
4886 while (i < length) {
4887 RegExpTree* alternative = alternatives->at(i);
4888 if (!alternative->IsAtom()) break;
4889 i++;
4890 }
4891 // Sort atoms to get ones with common prefixes together.
4892 // This step is more tricky if we are in a case-independent regexp,
4893 // because it would change /is|I/ to /I|is/, and order matters when
4894 // the regexp parts don't match only disjoint starting points. To fix
4895 // this we have a version of CompareFirstChar that uses case-
4896 // independent character classes for comparison.
4897 DCHECK_LT(first_atom, alternatives->length());
4898 DCHECK_LE(i, alternatives->length());
4899 DCHECK_LE(first_atom, i);
4900 if (compiler->ignore_case()) {
4901 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
4902 compiler->isolate()->regexp_macro_assembler_canonicalize();
4903 auto compare_closure =
4904 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
4905 return CompareFirstCharCaseIndependent(canonicalize, a, b);
4906 };
4907 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
4908 } else {
4909 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
4910 }
4911 if (i - first_atom > 1) found_consecutive_atoms = true;
4912 }
4913 return found_consecutive_atoms;
4914 }
4915
4916
4917 // Optimizes ab|ac|az to a(?:b|c|d).
4918 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
4919 Zone* zone = compiler->zone();
4920 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4921 int length = alternatives->length();
4922
4923 int write_posn = 0;
4924 int i = 0;
4925 while (i < length) {
4926 RegExpTree* alternative = alternatives->at(i);
4927 if (!alternative->IsAtom()) {
4928 alternatives->at(write_posn++) = alternatives->at(i);
4929 i++;
4930 continue;
4931 }
4932 RegExpAtom* atom = alternative->AsAtom();
4933 unibrow::uchar common_prefix = atom->data().at(0);
4934 int first_with_prefix = i;
4935 int prefix_length = atom->length();
4936 i++;
4937 while (i < length) {
4938 alternative = alternatives->at(i);
4939 if (!alternative->IsAtom()) break;
4940 atom = alternative->AsAtom();
4941 unibrow::uchar new_prefix = atom->data().at(0);
4942 if (new_prefix != common_prefix) {
4943 if (!compiler->ignore_case()) break;
4944 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
4945 compiler->isolate()->regexp_macro_assembler_canonicalize();
4946 new_prefix = Canonical(canonicalize, new_prefix);
4947 common_prefix = Canonical(canonicalize, common_prefix);
4948 if (new_prefix != common_prefix) break;
4949 }
4950 prefix_length = Min(prefix_length, atom->length());
4951 i++;
4952 }
4953 if (i > first_with_prefix + 2) {
4954 // Found worthwhile run of alternatives with common prefix of at least one
4955 // character. The sorting function above did not sort on more than one
4956 // character for reasons of correctness, but there may still be a longer
4957 // common prefix if the terms were similar or presorted in the input.
4958 // Find out how long the common prefix is.
4959 int run_length = i - first_with_prefix;
4960 atom = alternatives->at(first_with_prefix)->AsAtom();
4961 for (int j = 1; j < run_length && prefix_length > 1; j++) {
4962 RegExpAtom* old_atom =
4963 alternatives->at(j + first_with_prefix)->AsAtom();
4964 for (int k = 1; k < prefix_length; k++) {
4965 if (atom->data().at(k) != old_atom->data().at(k)) {
4966 prefix_length = k;
4967 break;
4968 }
4969 }
4970 }
4971 RegExpAtom* prefix =
4972 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
4973 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
4974 pair->Add(prefix, zone);
4975 ZoneList<RegExpTree*>* suffixes =
4976 new (zone) ZoneList<RegExpTree*>(run_length, zone);
4977 for (int j = 0; j < run_length; j++) {
4978 RegExpAtom* old_atom =
4979 alternatives->at(j + first_with_prefix)->AsAtom();
4980 int len = old_atom->length();
4981 if (len == prefix_length) {
4982 suffixes->Add(new (zone) RegExpEmpty(), zone);
4983 } else {
4984 RegExpTree* suffix = new (zone) RegExpAtom(
4985 old_atom->data().SubVector(prefix_length, old_atom->length()));
4986 suffixes->Add(suffix, zone);
4987 }
4988 }
4989 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
4990 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
4991 } else {
4992 // Just copy any non-worthwhile alternatives.
4993 for (int j = first_with_prefix; j < i; j++) {
4994 alternatives->at(write_posn++) = alternatives->at(j);
4995 }
4996 }
4997 }
4998 alternatives->Rewind(write_posn); // Trim end of array.
4999 }
5000
5001
5002 // Optimizes b|c|z to [bcz].
5003 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5004 RegExpCompiler* compiler) {
5005 Zone* zone = compiler->zone();
5006 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5007 int length = alternatives->length();
5008
5009 int write_posn = 0;
5010 int i = 0;
5011 while (i < length) {
5012 RegExpTree* alternative = alternatives->at(i);
5013 if (!alternative->IsAtom()) {
5014 alternatives->at(write_posn++) = alternatives->at(i);
5015 i++;
5016 continue;
5017 }
5018 RegExpAtom* atom = alternative->AsAtom();
5019 if (atom->length() != 1) {
5020 alternatives->at(write_posn++) = alternatives->at(i);
5021 i++;
5022 continue;
5023 }
5024 int first_in_run = i;
5025 i++;
5026 while (i < length) {
5027 alternative = alternatives->at(i);
5028 if (!alternative->IsAtom()) break;
5029 atom = alternative->AsAtom();
5030 if (atom->length() != 1) break;
5031 i++;
5032 }
5033 if (i > first_in_run + 1) {
5034 // Found non-trivial run of single-character alternatives.
5035 int run_length = i - first_in_run;
5036 ZoneList<CharacterRange>* ranges =
5037 new (zone) ZoneList<CharacterRange>(2, zone);
5038 for (int j = 0; j < run_length; j++) {
5039 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5040 DCHECK_EQ(old_atom->length(), 1);
5041 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5042 }
5043 alternatives->at(write_posn++) =
5044 new (zone) RegExpCharacterClass(ranges, false);
5045 } else {
5046 // Just copy any trivial alternatives.
5047 for (int j = first_in_run; j < i; j++) {
5048 alternatives->at(write_posn++) = alternatives->at(j);
5049 }
5050 }
5051 }
5052 alternatives->Rewind(write_posn); // Trim end of array.
5053 }
5054
5055
5056 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5057 RegExpNode* on_success) {
5058 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5059
5060 if (alternatives->length() > 2) {
5061 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5062 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5063 FixSingleCharacterDisjunctions(compiler);
5064 if (alternatives->length() == 1) {
5065 return alternatives->at(0)->ToNode(compiler, on_success);
5066 }
5067 }
5068
5069 int length = alternatives->length();
5070
5071 ChoiceNode* result =
5072 new(compiler->zone()) ChoiceNode(length, compiler->zone());
5073 for (int i = 0; i < length; i++) {
5074 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5075 on_success));
5076 result->AddAlternative(alternative);
5077 }
5078 return result;
5079 }
5080
5081
5082 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5083 RegExpNode* on_success) {
5084 return ToNode(min(),
5085 max(),
5086 is_greedy(),
5087 body(),
5088 compiler,
5089 on_success);
5090 }
5091
5092
5093 // Scoped object to keep track of how much we unroll quantifier loops in the
5094 // regexp graph generator.
5095 class RegExpExpansionLimiter {
5096 public:
5097 static const int kMaxExpansionFactor = 6;
5098 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5099 : compiler_(compiler),
5100 saved_expansion_factor_(compiler->current_expansion_factor()),
5101 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5102 DCHECK(factor > 0);
5103 if (ok_to_expand_) {
5104 if (factor > kMaxExpansionFactor) {
5105 // Avoid integer overflow of the current expansion factor.
5106 ok_to_expand_ = false;
5107 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5108 } else {
5109 int new_factor = saved_expansion_factor_ * factor;
5110 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5111 compiler->set_current_expansion_factor(new_factor);
5112 }
5113 }
5114 }
5115
5116 ~RegExpExpansionLimiter() {
5117 compiler_->set_current_expansion_factor(saved_expansion_factor_);
5118 }
5119
5120 bool ok_to_expand() { return ok_to_expand_; }
5121
5122 private:
5123 RegExpCompiler* compiler_;
5124 int saved_expansion_factor_;
5125 bool ok_to_expand_;
5126
5127 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5128 };
5129
5130
5131 RegExpNode* RegExpQuantifier::ToNode(int min,
5132 int max,
5133 bool is_greedy,
5134 RegExpTree* body,
5135 RegExpCompiler* compiler,
5136 RegExpNode* on_success,
5137 bool not_at_start) {
5138 // x{f, t} becomes this:
5139 //
5140 // (r++)<-.
5141 // | `
5142 // | (x)
5143 // v ^
5144 // (r=0)-->(?)---/ [if r < t]
5145 // |
5146 // [if r >= f] \----> ...
5147 //
5148
5149 // 15.10.2.5 RepeatMatcher algorithm.
5150 // The parser has already eliminated the case where max is 0. In the case
5151 // where max_match is zero the parser has removed the quantifier if min was
5152 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5153
5154 // If we know that we cannot match zero length then things are a little
5155 // simpler since we don't need to make the special zero length match check
5156 // from step 2.1. If the min and max are small we can unroll a little in
5157 // this case.
5158 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5159 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5160 if (max == 0) return on_success; // This can happen due to recursion.
5161 bool body_can_be_empty = (body->min_match() == 0);
5162 int body_start_reg = RegExpCompiler::kNoRegister;
5163 Interval capture_registers = body->CaptureRegisters();
5164 bool needs_capture_clearing = !capture_registers.is_empty();
5165 Zone* zone = compiler->zone();
5166
5167 if (body_can_be_empty) {
5168 body_start_reg = compiler->AllocateRegister();
5169 } else if (compiler->optimize() && !needs_capture_clearing) {
5170 // Only unroll if there are no captures and the body can't be
5171 // empty.
5172 {
5173 RegExpExpansionLimiter limiter(
5174 compiler, min + ((max != min) ? 1 : 0));
5175 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5176 int new_max = (max == kInfinity) ? max : max - min;
5177 // Recurse once to get the loop or optional matches after the fixed
5178 // ones.
5179 RegExpNode* answer = ToNode(
5180 0, new_max, is_greedy, body, compiler, on_success, true);
5181 // Unroll the forced matches from 0 to min. This can cause chains of
5182 // TextNodes (which the parser does not generate). These should be
5183 // combined if it turns out they hinder good code generation.
5184 for (int i = 0; i < min; i++) {
5185 answer = body->ToNode(compiler, answer);
5186 }
5187 return answer;
5188 }
5189 }
5190 if (max <= kMaxUnrolledMaxMatches && min == 0) {
5191 DCHECK(max > 0); // Due to the 'if' above.
5192 RegExpExpansionLimiter limiter(compiler, max);
5193 if (limiter.ok_to_expand()) {
5194 // Unroll the optional matches up to max.
5195 RegExpNode* answer = on_success;
5196 for (int i = 0; i < max; i++) {
5197 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5198 if (is_greedy) {
5199 alternation->AddAlternative(
5200 GuardedAlternative(body->ToNode(compiler, answer)));
5201 alternation->AddAlternative(GuardedAlternative(on_success));
5202 } else {
5203 alternation->AddAlternative(GuardedAlternative(on_success));
5204 alternation->AddAlternative(
5205 GuardedAlternative(body->ToNode(compiler, answer)));
5206 }
5207 answer = alternation;
5208 if (not_at_start) alternation->set_not_at_start();
5209 }
5210 return answer;
5211 }
5212 }
5213 }
5214 bool has_min = min > 0;
5215 bool has_max = max < RegExpTree::kInfinity;
5216 bool needs_counter = has_min || has_max;
5217 int reg_ctr = needs_counter
5218 ? compiler->AllocateRegister()
5219 : RegExpCompiler::kNoRegister;
5220 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
5221 zone);
5222 if (not_at_start) center->set_not_at_start();
5223 RegExpNode* loop_return = needs_counter
5224 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5225 : static_cast<RegExpNode*>(center);
5226 if (body_can_be_empty) {
5227 // If the body can be empty we need to check if it was and then
5228 // backtrack.
5229 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5230 reg_ctr,
5231 min,
5232 loop_return);
5233 }
5234 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5235 if (body_can_be_empty) {
5236 // If the body can be empty we need to store the start position
5237 // so we can bail out if it was empty.
5238 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5239 }
5240 if (needs_capture_clearing) {
5241 // Before entering the body of this loop we need to clear captures.
5242 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5243 }
5244 GuardedAlternative body_alt(body_node);
5245 if (has_max) {
5246 Guard* body_guard =
5247 new(zone) Guard(reg_ctr, Guard::LT, max);
5248 body_alt.AddGuard(body_guard, zone);
5249 }
5250 GuardedAlternative rest_alt(on_success);
5251 if (has_min) {
5252 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5253 rest_alt.AddGuard(rest_guard, zone);
5254 }
5255 if (is_greedy) {
5256 center->AddLoopAlternative(body_alt);
5257 center->AddContinueAlternative(rest_alt);
5258 } else {
5259 center->AddContinueAlternative(rest_alt);
5260 center->AddLoopAlternative(body_alt);
5261 }
5262 if (needs_counter) {
5263 return ActionNode::SetRegister(reg_ctr, 0, center);
5264 } else {
5265 return center;
5266 }
5267 }
5268
5269
5270 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5271 RegExpNode* on_success) {
5272 NodeInfo info;
5273 Zone* zone = compiler->zone();
5274
5275 switch (assertion_type()) {
5276 case START_OF_LINE:
5277 return AssertionNode::AfterNewline(on_success);
5278 case START_OF_INPUT:
5279 return AssertionNode::AtStart(on_success);
5280 case BOUNDARY:
5281 return AssertionNode::AtBoundary(on_success);
5282 case NON_BOUNDARY:
5283 return AssertionNode::AtNonBoundary(on_success);
5284 case END_OF_INPUT:
5285 return AssertionNode::AtEnd(on_success);
5286 case END_OF_LINE: {
5287 // Compile $ in multiline regexps as an alternation with a positive
5288 // lookahead in one side and an end-of-input on the other side.
5289 // We need two registers for the lookahead.
5290 int stack_pointer_register = compiler->AllocateRegister();
5291 int position_register = compiler->AllocateRegister();
5292 // The ChoiceNode to distinguish between a newline and end-of-input.
5293 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5294 // Create a newline atom.
5295 ZoneList<CharacterRange>* newline_ranges =
5296 new(zone) ZoneList<CharacterRange>(3, zone);
5297 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5298 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5299 TextNode* newline_matcher = new(zone) TextNode(
5300 newline_atom,
5301 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5302 position_register,
5303 0, // No captures inside.
5304 -1, // Ignored if no captures.
5305 on_success));
5306 // Create an end-of-input matcher.
5307 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5308 stack_pointer_register,
5309 position_register,
5310 newline_matcher);
5311 // Add the two alternatives to the ChoiceNode.
5312 GuardedAlternative eol_alternative(end_of_line);
5313 result->AddAlternative(eol_alternative);
5314 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5315 result->AddAlternative(end_alternative);
5316 return result;
5317 }
5318 default:
5319 UNREACHABLE();
5320 }
5321 return on_success;
5322 }
5323
5324
5325 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5326 RegExpNode* on_success) {
5327 return new(compiler->zone())
5328 BackReferenceNode(RegExpCapture::StartRegister(index()),
5329 RegExpCapture::EndRegister(index()),
5330 on_success);
5331 }
5332
5333
5334 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5335 RegExpNode* on_success) {
5336 return on_success;
5337 }
5338
5339
5340 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
5341 RegExpNode* on_success) {
5342 int stack_pointer_register = compiler->AllocateRegister();
5343 int position_register = compiler->AllocateRegister();
5344
5345 const int registers_per_capture = 2;
5346 const int register_of_first_capture = 2;
5347 int register_count = capture_count_ * registers_per_capture;
5348 int register_start =
5349 register_of_first_capture + capture_from_ * registers_per_capture;
5350
5351 RegExpNode* success;
5352 if (is_positive()) {
5353 RegExpNode* node = ActionNode::BeginSubmatch(
5354 stack_pointer_register,
5355 position_register,
5356 body()->ToNode(
5357 compiler,
5358 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5359 position_register,
5360 register_count,
5361 register_start,
5362 on_success)));
5363 return node;
5364 } else {
5365 // We use a ChoiceNode for a negative lookahead because it has most of
5366 // the characteristics we need. It has the body of the lookahead as its
5367 // first alternative and the expression after the lookahead of the second
5368 // alternative. If the first alternative succeeds then the
5369 // NegativeSubmatchSuccess will unwind the stack including everything the
5370 // choice node set up and backtrack. If the first alternative fails then
5371 // the second alternative is tried, which is exactly the desired result
5372 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5373 // ChoiceNode that knows to ignore the first exit when calculating quick
5374 // checks.
5375 Zone* zone = compiler->zone();
5376
5377 GuardedAlternative body_alt(
5378 body()->ToNode(
5379 compiler,
5380 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5381 position_register,
5382 register_count,
5383 register_start,
5384 zone)));
5385 ChoiceNode* choice_node =
5386 new(zone) NegativeLookaheadChoiceNode(body_alt,
5387 GuardedAlternative(on_success),
5388 zone);
5389 return ActionNode::BeginSubmatch(stack_pointer_register,
5390 position_register,
5391 choice_node);
5392 }
5393 }
5394
5395
5396 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5397 RegExpNode* on_success) {
5398 return ToNode(body(), index(), compiler, on_success);
5399 }
5400
5401
5402 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5403 int index,
5404 RegExpCompiler* compiler,
5405 RegExpNode* on_success) {
5406 int start_reg = RegExpCapture::StartRegister(index);
5407 int end_reg = RegExpCapture::EndRegister(index);
5408 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5409 RegExpNode* body_node = body->ToNode(compiler, store_end);
5410 return ActionNode::StorePosition(start_reg, true, body_node);
5411 }
5412
5413
5414 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5415 RegExpNode* on_success) {
5416 ZoneList<RegExpTree*>* children = nodes();
5417 RegExpNode* current = on_success;
5418 for (int i = children->length() - 1; i >= 0; i--) {
5419 current = children->at(i)->ToNode(compiler, current);
5420 }
5421 return current;
5422 }
5423
5424
5425 static void AddClass(const int* elmv,
5426 int elmc,
5427 ZoneList<CharacterRange>* ranges,
5428 Zone* zone) {
5429 elmc--;
5430 DCHECK(elmv[elmc] == 0x10000);
5431 for (int i = 0; i < elmc; i += 2) {
5432 DCHECK(elmv[i] < elmv[i + 1]);
5433 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5434 }
5435 }
5436
5437
5438 static void AddClassNegated(const int *elmv,
5439 int elmc,
5440 ZoneList<CharacterRange>* ranges,
5441 Zone* zone) {
5442 elmc--;
5443 DCHECK(elmv[elmc] == 0x10000);
5444 DCHECK(elmv[0] != 0x0000);
5445 DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
5446 uc16 last = 0x0000;
5447 for (int i = 0; i < elmc; i += 2) {
5448 DCHECK(last <= elmv[i] - 1);
5449 DCHECK(elmv[i] < elmv[i + 1]);
5450 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5451 last = elmv[i + 1];
5452 }
5453 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
5454 }
5455
5456
5457 void CharacterRange::AddClassEscape(uc16 type,
5458 ZoneList<CharacterRange>* ranges,
5459 Zone* zone) {
5460 switch (type) {
5461 case 's':
5462 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5463 break;
5464 case 'S':
5465 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5466 break;
5467 case 'w':
5468 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5469 break;
5470 case 'W':
5471 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5472 break;
5473 case 'd':
5474 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5475 break;
5476 case 'D':
5477 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5478 break;
5479 case '.':
5480 AddClassNegated(kLineTerminatorRanges,
5481 kLineTerminatorRangeCount,
5482 ranges,
5483 zone);
5484 break;
5485 // This is not a character range as defined by the spec but a
5486 // convenient shorthand for a character class that matches any
5487 // character.
5488 case '*':
5489 ranges->Add(CharacterRange::Everything(), zone);
5490 break;
5491 // This is the set of characters matched by the $ and ^ symbols
5492 // in multiline mode.
5493 case 'n':
5494 AddClass(kLineTerminatorRanges,
5495 kLineTerminatorRangeCount,
5496 ranges,
5497 zone);
5498 break;
5499 default:
5500 UNREACHABLE();
5501 }
5502 }
5503
5504
5505 Vector<const int> CharacterRange::GetWordBounds() {
5506 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5507 }
5508
5509
5510 class CharacterRangeSplitter {
5511 public:
5512 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
5513 ZoneList<CharacterRange>** excluded,
5514 Zone* zone)
5515 : included_(included),
5516 excluded_(excluded),
5517 zone_(zone) { }
5518 void Call(uc16 from, DispatchTable::Entry entry);
5519
5520 static const int kInBase = 0;
5521 static const int kInOverlay = 1;
5522
5523 private:
5524 ZoneList<CharacterRange>** included_;
5525 ZoneList<CharacterRange>** excluded_;
5526 Zone* zone_;
5527 };
5528
5529
5530 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5531 if (!entry.out_set()->Get(kInBase)) return;
5532 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5533 ? included_
5534 : excluded_;
5535 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5536 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5537 }
5538
5539
5540 void CharacterRange::Split(ZoneList<CharacterRange>* base,
5541 Vector<const int> overlay,
5542 ZoneList<CharacterRange>** included,
5543 ZoneList<CharacterRange>** excluded,
5544 Zone* zone) {
5545 DCHECK_NULL(*included);
5546 DCHECK_NULL(*excluded);
5547 DispatchTable table(zone);
5548 for (int i = 0; i < base->length(); i++)
5549 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5550 for (int i = 0; i < overlay.length(); i += 2) {
5551 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5552 CharacterRangeSplitter::kInOverlay, zone);
5553 }
5554 CharacterRangeSplitter callback(included, excluded, zone);
5555 table.ForEach(&callback);
5556 }
5557
5558
5559 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5560 ZoneList<CharacterRange>* ranges,
5561 bool is_one_byte) {
5562 uc16 bottom = from();
5563 uc16 top = to();
5564 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
5565 if (bottom > String::kMaxOneByteCharCode) return;
5566 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5567 }
5568 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5569 if (top == bottom) {
5570 // If this is a singleton we just expand the one character.
5571 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5572 for (int i = 0; i < length; i++) {
5573 uc32 chr = chars[i];
5574 if (chr != bottom) {
5575 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5576 }
5577 }
5578 } else {
5579 // If this is a range we expand the characters block by block,
5580 // expanding contiguous subranges (blocks) one at a time.
5581 // The approach is as follows. For a given start character we
5582 // look up the remainder of the block that contains it (represented
5583 // by the end point), for instance we find 'z' if the character
5584 // is 'c'. A block is characterized by the property
5585 // that all characters uncanonicalize in the same way, except that
5586 // each entry in the result is incremented by the distance from the first
5587 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5588 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5589 // Once we've found the end point we look up its uncanonicalization
5590 // and produce a range for each element. For instance for [c-f]
5591 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
5592 // add a range if it is not already contained in the input, so [c-f]
5593 // will be skipped but [C-F] will be added. If this range is not
5594 // completely contained in a block we do this for all the blocks
5595 // covered by the range (handling characters that is not in a block
5596 // as a "singleton block").
5597 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5598 int pos = bottom;
5599 while (pos <= top) {
5600 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5601 uc16 block_end;
5602 if (length == 0) {
5603 block_end = pos;
5604 } else {
5605 DCHECK_EQ(1, length);
5606 block_end = range[0];
5607 }
5608 int end = (block_end > top) ? top : block_end;
5609 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5610 for (int i = 0; i < length; i++) {
5611 uc32 c = range[i];
5612 uc16 range_from = c - (block_end - pos);
5613 uc16 range_to = c - (block_end - end);
5614 if (!(bottom <= range_from && range_to <= top)) {
5615 ranges->Add(CharacterRange(range_from, range_to), zone);
5616 }
5617 }
5618 pos = end + 1;
5619 }
5620 }
5621 }
5622
5623
5624 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5625 DCHECK_NOT_NULL(ranges);
5626 int n = ranges->length();
5627 if (n <= 1) return true;
5628 int max = ranges->at(0).to();
5629 for (int i = 1; i < n; i++) {
5630 CharacterRange next_range = ranges->at(i);
5631 if (next_range.from() <= max + 1) return false;
5632 max = next_range.to();
5633 }
5634 return true;
5635 }
5636
5637
5638 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5639 if (ranges_ == NULL) {
5640 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5641 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5642 }
5643 return ranges_;
5644 }
5645
5646
5647 // Move a number of elements in a zonelist to another position
5648 // in the same list. Handles overlapping source and target areas.
5649 static void MoveRanges(ZoneList<CharacterRange>* list,
5650 int from,
5651 int to,
5652 int count) {
5653 // Ranges are potentially overlapping.
5654 if (from < to) {
5655 for (int i = count - 1; i >= 0; i--) {
5656 list->at(to + i) = list->at(from + i);
5657 }
5658 } else {
5659 for (int i = 0; i < count; i++) {
5660 list->at(to + i) = list->at(from + i);
5661 }
5662 }
5663 }
5664
5665
5666 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5667 int count,
5668 CharacterRange insert) {
5669 // Inserts a range into list[0..count[, which must be sorted
5670 // by from value and non-overlapping and non-adjacent, using at most
5671 // list[0..count] for the result. Returns the number of resulting
5672 // canonicalized ranges. Inserting a range may collapse existing ranges into
5673 // fewer ranges, so the return value can be anything in the range 1..count+1.
5674 uc16 from = insert.from();
5675 uc16 to = insert.to();
5676 int start_pos = 0;
5677 int end_pos = count;
5678 for (int i = count - 1; i >= 0; i--) {
5679 CharacterRange current = list->at(i);
5680 if (current.from() > to + 1) {
5681 end_pos = i;
5682 } else if (current.to() + 1 < from) {
5683 start_pos = i + 1;
5684 break;
5685 }
5686 }
5687
5688 // Inserted range overlaps, or is adjacent to, ranges at positions
5689 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5690 // not affected by the insertion.
5691 // If start_pos == end_pos, the range must be inserted before start_pos.
5692 // if start_pos < end_pos, the entire range from start_pos to end_pos
5693 // must be merged with the insert range.
5694
5695 if (start_pos == end_pos) {
5696 // Insert between existing ranges at position start_pos.
5697 if (start_pos < count) {
5698 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5699 }
5700 list->at(start_pos) = insert;
5701 return count + 1;
5702 }
5703 if (start_pos + 1 == end_pos) {
5704 // Replace single existing range at position start_pos.
5705 CharacterRange to_replace = list->at(start_pos);
5706 int new_from = Min(to_replace.from(), from);
5707 int new_to = Max(to_replace.to(), to);
5708 list->at(start_pos) = CharacterRange(new_from, new_to);
5709 return count;
5710 }
5711 // Replace a number of existing ranges from start_pos to end_pos - 1.
5712 // Move the remaining ranges down.
5713
5714 int new_from = Min(list->at(start_pos).from(), from);
5715 int new_to = Max(list->at(end_pos - 1).to(), to);
5716 if (end_pos < count) {
5717 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5718 }
5719 list->at(start_pos) = CharacterRange(new_from, new_to);
5720 return count - (end_pos - start_pos) + 1;
5721 }
5722
5723
5724 void CharacterSet::Canonicalize() {
5725 // Special/default classes are always considered canonical. The result
5726 // of calling ranges() will be sorted.
5727 if (ranges_ == NULL) return;
5728 CharacterRange::Canonicalize(ranges_);
5729 }
5730
5731
5732 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
5733 if (character_ranges->length() <= 1) return;
5734 // Check whether ranges are already canonical (increasing, non-overlapping,
5735 // non-adjacent).
5736 int n = character_ranges->length();
5737 int max = character_ranges->at(0).to();
5738 int i = 1;
5739 while (i < n) {
5740 CharacterRange current = character_ranges->at(i);
5741 if (current.from() <= max + 1) {
5742 break;
5743 }
5744 max = current.to();
5745 i++;
5746 }
5747 // Canonical until the i'th range. If that's all of them, we are done.
5748 if (i == n) return;
5749
5750 // The ranges at index i and forward are not canonicalized. Make them so by
5751 // doing the equivalent of insertion sort (inserting each into the previous
5752 // list, in order).
5753 // Notice that inserting a range can reduce the number of ranges in the
5754 // result due to combining of adjacent and overlapping ranges.
5755 int read = i; // Range to insert.
5756 int num_canonical = i; // Length of canonicalized part of list.
5757 do {
5758 num_canonical = InsertRangeInCanonicalList(character_ranges,
5759 num_canonical,
5760 character_ranges->at(read));
5761 read++;
5762 } while (read < n);
5763 character_ranges->Rewind(num_canonical);
5764
5765 DCHECK(CharacterRange::IsCanonical(character_ranges));
5766 }
5767
5768
5769 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
5770 ZoneList<CharacterRange>* negated_ranges,
5771 Zone* zone) {
5772 DCHECK(CharacterRange::IsCanonical(ranges));
5773 DCHECK_EQ(0, negated_ranges->length());
5774 int range_count = ranges->length();
5775 uc16 from = 0;
5776 int i = 0;
5777 if (range_count > 0 && ranges->at(0).from() == 0) {
5778 from = ranges->at(0).to();
5779 i = 1;
5780 }
5781 while (i < range_count) {
5782 CharacterRange range = ranges->at(i);
5783 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
5784 from = range.to();
5785 i++;
5786 }
5787 if (from < String::kMaxUtf16CodeUnit) {
5788 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5789 zone);
5790 }
5791 }
5792
5793
5794 // -------------------------------------------------------------------
5795 // Splay tree
5796
5797
5798 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
5799 if (Get(value))
5800 return this;
5801 if (successors(zone) != NULL) {
5802 for (int i = 0; i < successors(zone)->length(); i++) {
5803 OutSet* successor = successors(zone)->at(i);
5804 if (successor->Get(value))
5805 return successor;
5806 }
5807 } else {
5808 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
5809 }
5810 OutSet* result = new(zone) OutSet(first_, remaining_);
5811 result->Set(value, zone);
5812 successors(zone)->Add(result, zone);
5813 return result;
5814 }
5815
5816
5817 void OutSet::Set(unsigned value, Zone *zone) {
5818 if (value < kFirstLimit) {
5819 first_ |= (1 << value);
5820 } else {
5821 if (remaining_ == NULL)
5822 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
5823 if (remaining_->is_empty() || !remaining_->Contains(value))
5824 remaining_->Add(value, zone);
5825 }
5826 }
5827
5828
5829 bool OutSet::Get(unsigned value) const {
5830 if (value < kFirstLimit) {
5831 return (first_ & (1 << value)) != 0;
5832 } else if (remaining_ == NULL) {
5833 return false;
5834 } else {
5835 return remaining_->Contains(value);
5836 }
5837 }
5838
5839
5840 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
5841
5842
5843 void DispatchTable::AddRange(CharacterRange full_range, int value,
5844 Zone* zone) {
5845 CharacterRange current = full_range;
5846 if (tree()->is_empty()) {
5847 // If this is the first range we just insert into the table.
5848 ZoneSplayTree<Config>::Locator loc;
5849 bool inserted = tree()->Insert(current.from(), &loc);
5850 DCHECK(inserted);
5851 USE(inserted);
5852 loc.set_value(Entry(current.from(), current.to(),
5853 empty()->Extend(value, zone)));
5854 return;
5855 }
5856 // First see if there is a range to the left of this one that
5857 // overlaps.
5858 ZoneSplayTree<Config>::Locator loc;
5859 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5860 Entry* entry = &loc.value();
5861 // If we've found a range that overlaps with this one, and it
5862 // starts strictly to the left of this one, we have to fix it
5863 // because the following code only handles ranges that start on
5864 // or after the start point of the range we're adding.
5865 if (entry->from() < current.from() && entry->to() >= current.from()) {
5866 // Snap the overlapping range in half around the start point of
5867 // the range we're adding.
5868 CharacterRange left(entry->from(), current.from() - 1);
5869 CharacterRange right(current.from(), entry->to());
5870 // The left part of the overlapping range doesn't overlap.
5871 // Truncate the whole entry to be just the left part.
5872 entry->set_to(left.to());
5873 // The right part is the one that overlaps. We add this part
5874 // to the map and let the next step deal with merging it with
5875 // the range we're adding.
5876 ZoneSplayTree<Config>::Locator loc;
5877 bool inserted = tree()->Insert(right.from(), &loc);
5878 DCHECK(inserted);
5879 USE(inserted);
5880 loc.set_value(Entry(right.from(),
5881 right.to(),
5882 entry->out_set()));
5883 }
5884 }
5885 while (current.is_valid()) {
5886 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5887 (loc.value().from() <= current.to()) &&
5888 (loc.value().to() >= current.from())) {
5889 Entry* entry = &loc.value();
5890 // We have overlap. If there is space between the start point of
5891 // the range we're adding and where the overlapping range starts
5892 // then we have to add a range covering just that space.
5893 if (current.from() < entry->from()) {
5894 ZoneSplayTree<Config>::Locator ins;
5895 bool inserted = tree()->Insert(current.from(), &ins);
5896 DCHECK(inserted);
5897 USE(inserted);
5898 ins.set_value(Entry(current.from(),
5899 entry->from() - 1,
5900 empty()->Extend(value, zone)));
5901 current.set_from(entry->from());
5902 }
5903 DCHECK_EQ(current.from(), entry->from());
5904 // If the overlapping range extends beyond the one we want to add
5905 // we have to snap the right part off and add it separately.
5906 if (entry->to() > current.to()) {
5907 ZoneSplayTree<Config>::Locator ins;
5908 bool inserted = tree()->Insert(current.to() + 1, &ins);
5909 DCHECK(inserted);
5910 USE(inserted);
5911 ins.set_value(Entry(current.to() + 1,
5912 entry->to(),
5913 entry->out_set()));
5914 entry->set_to(current.to());
5915 }
5916 DCHECK(entry->to() <= current.to());
5917 // The overlapping range is now completely contained by the range
5918 // we're adding so we can just update it and move the start point
5919 // of the range we're adding just past it.
5920 entry->AddValue(value, zone);
5921 // Bail out if the last interval ended at 0xFFFF since otherwise
5922 // adding 1 will wrap around to 0.
5923 if (entry->to() == String::kMaxUtf16CodeUnit)
5924 break;
5925 DCHECK(entry->to() + 1 > current.from());
5926 current.set_from(entry->to() + 1);
5927 } else {
5928 // There is no overlap so we can just add the range
5929 ZoneSplayTree<Config>::Locator ins;
5930 bool inserted = tree()->Insert(current.from(), &ins);
5931 DCHECK(inserted);
5932 USE(inserted);
5933 ins.set_value(Entry(current.from(),
5934 current.to(),
5935 empty()->Extend(value, zone)));
5936 break;
5937 }
5938 }
5939 }
5940
5941
5942 OutSet* DispatchTable::Get(uc16 value) {
5943 ZoneSplayTree<Config>::Locator loc;
5944 if (!tree()->FindGreatestLessThan(value, &loc))
5945 return empty();
5946 Entry* entry = &loc.value();
5947 if (value <= entry->to())
5948 return entry->out_set();
5949 else
5950 return empty();
5951 }
5952
5953
5954 // -------------------------------------------------------------------
5955 // Analysis
5956
5957
5958 void Analysis::EnsureAnalyzed(RegExpNode* that) {
5959 StackLimitCheck check(isolate());
5960 if (check.HasOverflowed()) {
5961 fail("Stack overflow");
5962 return;
5963 }
5964 if (that->info()->been_analyzed || that->info()->being_analyzed)
5965 return;
5966 that->info()->being_analyzed = true;
5967 that->Accept(this);
5968 that->info()->being_analyzed = false;
5969 that->info()->been_analyzed = true;
5970 }
5971
5972
5973 void Analysis::VisitEnd(EndNode* that) {
5974 // nothing to do
5975 }
5976
5977
5978 void TextNode::CalculateOffsets() {
5979 int element_count = elements()->length();
5980 // Set up the offsets of the elements relative to the start. This is a fixed
5981 // quantity since a TextNode can only contain fixed-width things.
5982 int cp_offset = 0;
5983 for (int i = 0; i < element_count; i++) {
5984 TextElement& elm = elements()->at(i);
5985 elm.set_cp_offset(cp_offset);
5986 cp_offset += elm.length();
5987 }
5988 }
5989
5990
5991 void Analysis::VisitText(TextNode* that) {
5992 if (ignore_case_) {
5993 that->MakeCaseIndependent(isolate(), is_one_byte_);
5994 }
5995 EnsureAnalyzed(that->on_success());
5996 if (!has_failed()) {
5997 that->CalculateOffsets();
5998 }
5999 }
6000
6001
6002 void Analysis::VisitAction(ActionNode* that) {
6003 RegExpNode* target = that->on_success();
6004 EnsureAnalyzed(target);
6005 if (!has_failed()) {
6006 // If the next node is interested in what it follows then this node
6007 // has to be interested too so it can pass the information on.
6008 that->info()->AddFromFollowing(target->info());
6009 }
6010 }
6011
6012
6013 void Analysis::VisitChoice(ChoiceNode* that) {
6014 NodeInfo* info = that->info();
6015 for (int i = 0; i < that->alternatives()->length(); i++) {
6016 RegExpNode* node = that->alternatives()->at(i).node();
6017 EnsureAnalyzed(node);
6018 if (has_failed()) return;
6019 // Anything the following nodes need to know has to be known by
6020 // this node also, so it can pass it on.
6021 info->AddFromFollowing(node->info());
6022 }
6023 }
6024
6025
6026 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6027 NodeInfo* info = that->info();
6028 for (int i = 0; i < that->alternatives()->length(); i++) {
6029 RegExpNode* node = that->alternatives()->at(i).node();
6030 if (node != that->loop_node()) {
6031 EnsureAnalyzed(node);
6032 if (has_failed()) return;
6033 info->AddFromFollowing(node->info());
6034 }
6035 }
6036 // Check the loop last since it may need the value of this node
6037 // to get a correct result.
6038 EnsureAnalyzed(that->loop_node());
6039 if (!has_failed()) {
6040 info->AddFromFollowing(that->loop_node()->info());
6041 }
6042 }
6043
6044
6045 void Analysis::VisitBackReference(BackReferenceNode* that) {
6046 EnsureAnalyzed(that->on_success());
6047 }
6048
6049
6050 void Analysis::VisitAssertion(AssertionNode* that) {
6051 EnsureAnalyzed(that->on_success());
6052 }
6053
6054
6055 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6056 BoyerMooreLookahead* bm,
6057 bool not_at_start) {
6058 // Working out the set of characters that a backreference can match is too
6059 // hard, so we just say that any character can match.
6060 bm->SetRest(offset);
6061 SaveBMInfo(bm, not_at_start, offset);
6062 }
6063
6064
6065 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6066 RegExpMacroAssembler::kTableSize);
6067
6068
6069 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6070 BoyerMooreLookahead* bm, bool not_at_start) {
6071 ZoneList<GuardedAlternative>* alts = alternatives();
6072 budget = (budget - 1) / alts->length();
6073 for (int i = 0; i < alts->length(); i++) {
6074 GuardedAlternative& alt = alts->at(i);
6075 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6076 bm->SetRest(offset); // Give up trying to fill in info.
6077 SaveBMInfo(bm, not_at_start, offset);
6078 return;
6079 }
6080 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6081 }
6082 SaveBMInfo(bm, not_at_start, offset);
6083 }
6084
6085
6086 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6087 BoyerMooreLookahead* bm, bool not_at_start) {
6088 if (initial_offset >= bm->length()) return;
6089 int offset = initial_offset;
6090 int max_char = bm->max_char();
6091 for (int i = 0; i < elements()->length(); i++) {
6092 if (offset >= bm->length()) {
6093 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6094 return;
6095 }
6096 TextElement text = elements()->at(i);
6097 if (text.text_type() == TextElement::ATOM) {
6098 RegExpAtom* atom = text.atom();
6099 for (int j = 0; j < atom->length(); j++, offset++) {
6100 if (offset >= bm->length()) {
6101 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6102 return;
6103 }
6104 uc16 character = atom->data()[j];
6105 if (bm->compiler()->ignore_case()) {
6106 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6107 int length = GetCaseIndependentLetters(
6108 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6109 chars);
6110 for (int j = 0; j < length; j++) {
6111 bm->Set(offset, chars[j]);
6112 }
6113 } else {
6114 if (character <= max_char) bm->Set(offset, character);
6115 }
6116 }
6117 } else {
6118 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6119 RegExpCharacterClass* char_class = text.char_class();
6120 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6121 if (char_class->is_negated()) {
6122 bm->SetAll(offset);
6123 } else {
6124 for (int k = 0; k < ranges->length(); k++) {
6125 CharacterRange& range = ranges->at(k);
6126 if (range.from() > max_char) continue;
6127 int to = Min(max_char, static_cast<int>(range.to()));
6128 bm->SetInterval(offset, Interval(range.from(), to));
6129 }
6130 }
6131 offset++;
6132 }
6133 }
6134 if (offset >= bm->length()) {
6135 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6136 return;
6137 }
6138 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6139 true); // Not at start after a text node.
6140 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6141 }
6142
6143
6144 // -------------------------------------------------------------------
6145 // Dispatch table construction
6146
6147
6148 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6149 AddRange(CharacterRange::Everything());
6150 }
6151
6152
6153 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6154 node->set_being_calculated(true);
6155 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6156 for (int i = 0; i < alternatives->length(); i++) {
6157 set_choice_index(i);
6158 alternatives->at(i).node()->Accept(this);
6159 }
6160 node->set_being_calculated(false);
6161 }
6162
6163
6164 class AddDispatchRange {
6165 public:
6166 explicit AddDispatchRange(DispatchTableConstructor* constructor)
6167 : constructor_(constructor) { }
6168 void Call(uc32 from, DispatchTable::Entry entry);
6169 private:
6170 DispatchTableConstructor* constructor_;
6171 };
6172
6173
6174 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6175 CharacterRange range(from, entry.to());
6176 constructor_->AddRange(range);
6177 }
6178
6179
6180 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6181 if (node->being_calculated())
6182 return;
6183 DispatchTable* table = node->GetTable(ignore_case_);
6184 AddDispatchRange adder(this);
6185 table->ForEach(&adder);
6186 }
6187
6188
6189 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6190 // TODO(160): Find the node that we refer back to and propagate its start
6191 // set back to here. For now we just accept anything.
6192 AddRange(CharacterRange::Everything());
6193 }
6194
6195
6196 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6197 RegExpNode* target = that->on_success();
6198 target->Accept(this);
6199 }
6200
6201
6202 static int CompareRangeByFrom(const CharacterRange* a,
6203 const CharacterRange* b) {
6204 return Compare<uc16>(a->from(), b->from());
6205 }
6206
6207
6208 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6209 ranges->Sort(CompareRangeByFrom);
6210 uc16 last = 0;
6211 for (int i = 0; i < ranges->length(); i++) {
6212 CharacterRange range = ranges->at(i);
6213 if (last < range.from())
6214 AddRange(CharacterRange(last, range.from() - 1));
6215 if (range.to() >= last) {
6216 if (range.to() == String::kMaxUtf16CodeUnit) {
6217 return;
6218 } else {
6219 last = range.to() + 1;
6220 }
6221 }
6222 }
6223 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
6224 }
6225
6226
6227 void DispatchTableConstructor::VisitText(TextNode* that) {
6228 TextElement elm = that->elements()->at(0);
6229 switch (elm.text_type()) {
6230 case TextElement::ATOM: {
6231 uc16 c = elm.atom()->data()[0];
6232 AddRange(CharacterRange(c, c));
6233 break;
6234 }
6235 case TextElement::CHAR_CLASS: {
6236 RegExpCharacterClass* tree = elm.char_class();
6237 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6238 if (tree->is_negated()) {
6239 AddInverse(ranges);
6240 } else {
6241 for (int i = 0; i < ranges->length(); i++)
6242 AddRange(ranges->at(i));
6243 }
6244 break;
6245 }
6246 default: {
6247 UNIMPLEMENTED();
6248 }
6249 }
6250 }
6251
6252
6253 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6254 RegExpNode* target = that->on_success();
6255 target->Accept(this);
6256 }
6257
6258
6259 RegExpEngine::CompilationResult RegExpEngine::Compile(
6260 Isolate* isolate, Zone* zone, RegExpCompileData* data, bool ignore_case,
6261 bool is_global, bool is_multiline, bool is_sticky, Handle<String> pattern,
6262 Handle<String> sample_subject, bool is_one_byte) {
6263 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6264 return IrregexpRegExpTooBig(isolate);
6265 }
6266 RegExpCompiler compiler(isolate, zone, data->capture_count, ignore_case,
6267 is_one_byte);
6268
6269 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6270
6271 // Sample some characters from the middle of the string.
6272 static const int kSampleSize = 128;
6273
6274 sample_subject = String::Flatten(sample_subject);
6275 int chars_sampled = 0;
6276 int half_way = (sample_subject->length() - kSampleSize) / 2;
6277 for (int i = Max(0, half_way);
6278 i < sample_subject->length() && chars_sampled < kSampleSize;
6279 i++, chars_sampled++) {
6280 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6281 }
6282
6283 // Wrap the body of the regexp in capture #0.
6284 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6285 0,
6286 &compiler,
6287 compiler.accept());
6288 RegExpNode* node = captured_body;
6289 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6290 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6291 int max_length = data->tree->max_match();
6292 if (!is_start_anchored && !is_sticky) {
6293 // Add a .*? at the beginning, outside the body capture, unless
6294 // this expression is anchored at the beginning or sticky.
6295 RegExpNode* loop_node =
6296 RegExpQuantifier::ToNode(0,
6297 RegExpTree::kInfinity,
6298 false,
6299 new(zone) RegExpCharacterClass('*'),
6300 &compiler,
6301 captured_body,
6302 data->contains_anchor);
6303
6304 if (data->contains_anchor) {
6305 // Unroll loop once, to take care of the case that might start
6306 // at the start of input.
6307 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6308 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6309 first_step_node->AddAlternative(GuardedAlternative(
6310 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
6311 node = first_step_node;
6312 } else {
6313 node = loop_node;
6314 }
6315 }
6316 if (is_one_byte) {
6317 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6318 // Do it again to propagate the new nodes to places where they were not
6319 // put because they had not been calculated yet.
6320 if (node != NULL) {
6321 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6322 }
6323 }
6324
6325 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6326 data->node = node;
6327 Analysis analysis(isolate, ignore_case, is_one_byte);
6328 analysis.EnsureAnalyzed(node);
6329 if (analysis.has_failed()) {
6330 const char* error_message = analysis.error_message();
6331 return CompilationResult(isolate, error_message);
6332 }
6333
6334 // Create the correct assembler for the architecture.
6335 #ifndef V8_INTERPRETED_REGEXP
6336 // Native regexp implementation.
6337
6338 NativeRegExpMacroAssembler::Mode mode =
6339 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6340 : NativeRegExpMacroAssembler::UC16;
6341
6342 #if V8_TARGET_ARCH_IA32
6343 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6344 (data->capture_count + 1) * 2);
6345 #elif V8_TARGET_ARCH_X64
6346 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6347 (data->capture_count + 1) * 2);
6348 #elif V8_TARGET_ARCH_ARM
6349 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6350 (data->capture_count + 1) * 2);
6351 #elif V8_TARGET_ARCH_ARM64
6352 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6353 (data->capture_count + 1) * 2);
6354 #elif V8_TARGET_ARCH_PPC
6355 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6356 (data->capture_count + 1) * 2);
6357 #elif V8_TARGET_ARCH_MIPS
6358 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6359 (data->capture_count + 1) * 2);
6360 #elif V8_TARGET_ARCH_MIPS64
6361 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6362 (data->capture_count + 1) * 2);
6363 #elif V8_TARGET_ARCH_X87
6364 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6365 (data->capture_count + 1) * 2);
6366 #else
6367 #error "Unsupported architecture"
6368 #endif
6369
6370 #else // V8_INTERPRETED_REGEXP
6371 // Interpreted regexp implementation.
6372 EmbeddedVector<byte, 1024> codes;
6373 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6374 #endif // V8_INTERPRETED_REGEXP
6375
6376 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6377
6378 // Inserted here, instead of in Assembler, because it depends on information
6379 // in the AST that isn't replicated in the Node structure.
6380 static const int kMaxBacksearchLimit = 1024;
6381 if (is_end_anchored &&
6382 !is_start_anchored &&
6383 max_length < kMaxBacksearchLimit) {
6384 macro_assembler.SetCurrentPositionFromEnd(max_length);
6385 }
6386
6387 if (is_global) {
6388 macro_assembler.set_global_mode(
6389 (data->tree->min_match() > 0)
6390 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
6391 : RegExpMacroAssembler::GLOBAL);
6392 }
6393
6394 return compiler.Assemble(&macro_assembler,
6395 node,
6396 data->capture_count,
6397 pattern);
6398 }
6399
6400
6401 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6402 Heap* heap = pattern->GetHeap();
6403 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6404 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6405 heap->isolate()->memory_allocator()->SizeExecutable() >
6406 RegExpImpl::kRegExpExecutableMemoryLimit) {
6407 too_much = true;
6408 }
6409 return too_much;
6410 }
6411 } // namespace internal
6412 } // namespace v8
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/jsregexp-inl.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698