Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(134)

Side by Side Diff: src/jsregexp.cc

Issue 11000: Periodic merge of bleeding_edge to experimental code generator branch.... (Closed) Base URL: http://v8.googlecode.com/svn/branches/experimental/toiger/
Patch Set: Created 12 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/jsregexp.h ('k') | src/list.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution. 11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its 12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived 13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission. 14 // from this software without specific prior written permission.
15 // 15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 27
28 #define _HAS_EXCEPTIONS 0
29 #include <set>
30
28 #include "v8.h" 31 #include "v8.h"
29 32
33 #include "ast.h"
30 #include "execution.h" 34 #include "execution.h"
31 #include "factory.h" 35 #include "factory.h"
32 #include "jsregexp.h" 36 #include "jsregexp-inl.h"
33 #include "platform.h" 37 #include "platform.h"
34 #include "runtime.h" 38 #include "runtime.h"
35 #include "top.h" 39 #include "top.h"
36 #include "compilation-cache.h" 40 #include "compilation-cache.h"
41 #include "string-stream.h"
42 #include "parser.h"
43 #include "regexp-macro-assembler.h"
44 #include "regexp-macro-assembler-tracer.h"
45 #include "regexp-macro-assembler-irregexp.h"
46
47 #ifdef ARM
48 #include "regexp-macro-assembler-arm.h"
49 #else // IA32
50 #include "macro-assembler-ia32.h"
51 #include "regexp-macro-assembler-ia32.h"
52 #endif
53
54 #include "interpreter-irregexp.h"
37 55
38 // Including pcre.h undefines DEBUG to avoid getting debug output from 56 // Including pcre.h undefines DEBUG to avoid getting debug output from
39 // the JSCRE implementation. Make sure to redefine it in debug mode 57 // the JSCRE implementation. Make sure to redefine it in debug mode
40 // after having included the header file. 58 // after having included the header file.
41 #ifdef DEBUG 59 #ifdef DEBUG
42 #include "third_party/jscre/pcre.h" 60 #include "third_party/jscre/pcre.h"
43 #define DEBUG 61 #define DEBUG
44 #else 62 #else
45 #include "third_party/jscre/pcre.h" 63 #include "third_party/jscre/pcre.h"
46 #endif 64 #endif
47 65
66
48 namespace v8 { namespace internal { 67 namespace v8 { namespace internal {
49 68
50 69
51 #define CAPTURE_INDEX 0
52 #define INTERNAL_INDEX 1
53
54 static Failure* malloc_failure; 70 static Failure* malloc_failure;
55 71
56 static void* JSREMalloc(size_t size) { 72 static void* JSREMalloc(size_t size) {
57 Object* obj = Heap::AllocateByteArray(size); 73 Object* obj = Heap::AllocateByteArray(size);
58 74
59 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile 75 // If allocation failed, return a NULL pointer to JSRE, and jsRegExpCompile
60 // will return NULL to the caller, performs GC there. 76 // will return NULL to the caller, performs GC there.
61 // Also pass failure information to the caller. 77 // Also pass failure information to the caller.
62 if (obj->IsFailure()) { 78 if (obj->IsFailure()) {
63 malloc_failure = Failure::cast(obj); 79 malloc_failure = Failure::cast(obj);
(...skipping 105 matching lines...) Expand 10 before | Expand all | Expand 10 after
169 break; 185 break;
170 case 'm': 186 case 'm':
171 flags |= JSRegExp::MULTILINE; 187 flags |= JSRegExp::MULTILINE;
172 break; 188 break;
173 } 189 }
174 } 190 }
175 return JSRegExp::Flags(flags); 191 return JSRegExp::Flags(flags);
176 } 192 }
177 193
178 194
179 unibrow::Predicate<unibrow::RegExpSpecialChar, 128> is_reg_exp_special_char; 195 static inline void ThrowRegExpException(Handle<JSRegExp> re,
196 Handle<String> pattern,
197 Handle<String> error_text,
198 const char* message) {
199 Handle<JSArray> array = Factory::NewJSArray(2);
200 SetElement(array, 0, pattern);
201 SetElement(array, 1, error_text);
202 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
203 Top::Throw(*regexp_err);
204 }
180 205
181 206
182 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re, 207 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
183 Handle<String> pattern, 208 Handle<String> pattern,
184 Handle<String> flag_str) { 209 Handle<String> flag_str) {
185 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str); 210 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
186 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags); 211 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
187 bool in_cache = !cached.is_null(); 212 bool in_cache = !cached.is_null();
188 Handle<Object> result; 213 Handle<Object> result;
189 StringShape shape(*pattern);
190 if (in_cache) { 214 if (in_cache) {
191 re->set_data(*cached); 215 re->set_data(*cached);
192 result = re; 216 result = re;
193 } else { 217 } else {
194 bool is_atom = !flags.is_ignore_case(); 218 FlattenString(pattern);
195 for (int i = 0; is_atom && i < pattern->length(shape); i++) { 219 RegExpParseResult parse_result;
196 if (is_reg_exp_special_char.get(pattern->Get(shape, i))) 220 FlatStringReader reader(pattern);
197 is_atom = false; 221 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
222 // Throw an exception if we fail to parse the pattern.
223 ThrowRegExpException(re,
224 pattern,
225 parse_result.error,
226 "malformed_regexp");
227 return Handle<Object>();
198 } 228 }
199 if (is_atom) { 229 RegExpAtom* atom = parse_result.tree->AsAtom();
200 result = AtomCompile(re, pattern, flags); 230 if (atom != NULL && !flags.is_ignore_case()) {
231 if (parse_result.has_character_escapes) {
232 Vector<const uc16> atom_pattern = atom->data();
233 Handle<String> atom_string =
234 Factory::NewStringFromTwoByte(atom_pattern);
235 result = AtomCompile(re, pattern, flags, atom_string);
236 } else {
237 result = AtomCompile(re, pattern, flags, pattern);
238 }
201 } else { 239 } else {
202 result = JsreCompile(re, pattern, flags); 240 RegExpNode* node = NULL;
241 Handle<FixedArray> irregexp_data =
242 RegExpEngine::Compile(&parse_result,
243 &node,
244 flags.is_ignore_case(),
245 flags.is_multiline());
246 if (irregexp_data.is_null()) {
247 if (FLAG_disable_jscre) {
248 UNIMPLEMENTED();
249 }
250 result = JscrePrepare(re, pattern, flags);
251 } else {
252 result = IrregexpPrepare(re, pattern, flags, irregexp_data);
253 }
203 } 254 }
204 Object* data = re->data(); 255 Object* data = re->data();
205 if (data->IsFixedArray()) { 256 if (data->IsFixedArray()) {
206 // If compilation succeeded then the data is set on the regexp 257 // If compilation succeeded then the data is set on the regexp
207 // and we can store it in the cache. 258 // and we can store it in the cache.
208 Handle<FixedArray> data(FixedArray::cast(re->data())); 259 Handle<FixedArray> data(FixedArray::cast(re->data()));
209 CompilationCache::PutRegExp(pattern, flags, data); 260 CompilationCache::PutRegExp(pattern, flags, data);
210 } 261 }
211 } 262 }
212 263
213 LOG(RegExpCompileEvent(re, in_cache)); 264 LOG(RegExpCompileEvent(re, in_cache));
214 return result; 265 return result;
215 } 266 }
216 267
217 268
218 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp, 269 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
219 Handle<String> subject, 270 Handle<String> subject,
220 Handle<Object> index) { 271 Handle<Object> index) {
221 switch (regexp->TypeTag()) { 272 switch (regexp->TypeTag()) {
222 case JSRegExp::JSCRE: 273 case JSRegExp::JSCRE:
223 return JsreExec(regexp, subject, index); 274 if (FLAG_disable_jscre) {
275 UNIMPLEMENTED();
276 }
277 return JscreExec(regexp, subject, index);
224 case JSRegExp::ATOM: 278 case JSRegExp::ATOM:
225 return AtomExec(regexp, subject, index); 279 return AtomExec(regexp, subject, index);
280 case JSRegExp::IRREGEXP:
281 return IrregexpExec(regexp, subject, index);
226 default: 282 default:
227 UNREACHABLE(); 283 UNREACHABLE();
228 return Handle<Object>(); 284 return Handle<Object>();
229 } 285 }
230 } 286 }
231 287
232 288
233 Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp, 289 Handle<Object> RegExpImpl::ExecGlobal(Handle<JSRegExp> regexp,
234 Handle<String> subject) { 290 Handle<String> subject) {
235 switch (regexp->TypeTag()) { 291 switch (regexp->TypeTag()) {
236 case JSRegExp::JSCRE: 292 case JSRegExp::JSCRE:
237 return JsreExecGlobal(regexp, subject); 293 if (FLAG_disable_jscre) {
294 UNIMPLEMENTED();
295 }
296 return JscreExecGlobal(regexp, subject);
238 case JSRegExp::ATOM: 297 case JSRegExp::ATOM:
239 return AtomExecGlobal(regexp, subject); 298 return AtomExecGlobal(regexp, subject);
299 case JSRegExp::IRREGEXP:
300 return IrregexpExecGlobal(regexp, subject);
240 default: 301 default:
241 UNREACHABLE(); 302 UNREACHABLE();
242 return Handle<Object>(); 303 return Handle<Object>();
243 } 304 }
244 } 305 }
245 306
246 307
247 Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re, 308 Handle<Object> RegExpImpl::AtomCompile(Handle<JSRegExp> re,
248 Handle<String> pattern, 309 Handle<String> pattern,
249 JSRegExp::Flags flags) { 310 JSRegExp::Flags flags,
250 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, pattern); 311 Handle<String> match_pattern) {
312 Factory::SetRegExpData(re, JSRegExp::ATOM, pattern, flags, match_pattern);
251 return re; 313 return re;
252 } 314 }
253 315
254 316
255 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, 317 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
256 Handle<String> subject, 318 Handle<String> subject,
257 Handle<Object> index) { 319 Handle<Object> index) {
258 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex))); 320 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
259 321
260 uint32_t start_index; 322 uint32_t start_index;
261 if (!Array::IndexFromObject(*index, &start_index)) { 323 if (!Array::IndexFromObject(*index, &start_index)) {
262 return Handle<Smi>(Smi::FromInt(-1)); 324 return Handle<Smi>(Smi::FromInt(-1));
263 } 325 }
264 326
265 LOG(RegExpExecEvent(re, start_index, subject)); 327 LOG(RegExpExecEvent(re, start_index, subject));
266 int value = Runtime::StringMatch(subject, needle, start_index); 328 int value = Runtime::StringMatch(subject, needle, start_index);
267 if (value == -1) return Factory::null_value(); 329 if (value == -1) return Factory::null_value();
268 330
269 Handle<FixedArray> array = Factory::NewFixedArray(2); 331 Handle<FixedArray> array = Factory::NewFixedArray(2);
270 array->set(0, 332 array->set(0, Smi::FromInt(value));
271 Smi::FromInt(value), 333 array->set(1, Smi::FromInt(value + needle->length()));
272 SKIP_WRITE_BARRIER);
273 array->set(1,
274 Smi::FromInt(value + needle->length()),
275 SKIP_WRITE_BARRIER);
276 return Factory::NewJSArrayWithElements(array); 334 return Factory::NewJSArrayWithElements(array);
277 } 335 }
278 336
279 337
280 Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re, 338 Handle<Object> RegExpImpl::AtomExecGlobal(Handle<JSRegExp> re,
281 Handle<String> subject) { 339 Handle<String> subject) {
282 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex))); 340 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
283 Handle<JSArray> result = Factory::NewJSArray(1); 341 Handle<JSArray> result = Factory::NewJSArray(1);
284 int index = 0; 342 int index = 0;
285 int match_count = 0; 343 int match_count = 0;
286 int subject_length = subject->length(); 344 int subject_length = subject->length();
287 int needle_length = needle->length(); 345 int needle_length = needle->length();
288 while (true) { 346 while (true) {
289 LOG(RegExpExecEvent(re, index, subject)); 347 LOG(RegExpExecEvent(re, index, subject));
290 int value = -1; 348 int value = -1;
291 if (index + needle_length <= subject_length) { 349 if (index + needle_length <= subject_length) {
292 value = Runtime::StringMatch(subject, needle, index); 350 value = Runtime::StringMatch(subject, needle, index);
293 } 351 }
294 if (value == -1) break; 352 if (value == -1) break;
295 HandleScope scope; 353 HandleScope scope;
296 int end = value + needle_length; 354 int end = value + needle_length;
297 355
298 Handle<FixedArray> array = Factory::NewFixedArray(2); 356 Handle<FixedArray> array = Factory::NewFixedArray(2);
299 array->set(0, 357 array->set(0, Smi::FromInt(value));
300 Smi::FromInt(value), 358 array->set(1, Smi::FromInt(end));
301 SKIP_WRITE_BARRIER);
302 array->set(1,
303 Smi::FromInt(end),
304 SKIP_WRITE_BARRIER);
305 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array); 359 Handle<JSArray> pair = Factory::NewJSArrayWithElements(array);
306 SetElement(result, match_count, pair); 360 SetElement(result, match_count, pair);
307 match_count++; 361 match_count++;
308 index = end; 362 index = end;
309 if (needle_length == 0) index++; 363 if (needle_length == 0) index++;
310 } 364 }
311 return result; 365 return result;
312 } 366 }
313 367
314 368
369 Handle<Object>RegExpImpl::JscrePrepare(Handle<JSRegExp> re,
370 Handle<String> pattern,
371 JSRegExp::Flags flags) {
372 Handle<Object> value(Heap::undefined_value());
373 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
374 return re;
375 }
376
377
378 Handle<Object>RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
379 Handle<String> pattern,
380 JSRegExp::Flags flags,
381 Handle<FixedArray> irregexp_data) {
382 Factory::SetRegExpData(re, JSRegExp::IRREGEXP, pattern, flags, irregexp_data);
383 return re;
384 }
385
386
315 static inline Object* DoCompile(String* pattern, 387 static inline Object* DoCompile(String* pattern,
316 JSRegExp::Flags flags, 388 JSRegExp::Flags flags,
317 unsigned* number_of_captures, 389 unsigned* number_of_captures,
318 const char** error_message, 390 const char** error_message,
319 JscreRegExp** code) { 391 v8::jscre::JscreRegExp** code) {
320 JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case() 392 v8::jscre::JSRegExpIgnoreCaseOption case_option = flags.is_ignore_case()
321 ? JSRegExpIgnoreCase 393 ? v8::jscre::JSRegExpIgnoreCase
322 : JSRegExpDoNotIgnoreCase; 394 : v8::jscre::JSRegExpDoNotIgnoreCase;
323 JSRegExpMultilineOption multiline_option = flags.is_multiline() 395 v8::jscre::JSRegExpMultilineOption multiline_option = flags.is_multiline()
324 ? JSRegExpMultiline 396 ? v8::jscre::JSRegExpMultiline
325 : JSRegExpSingleLine; 397 : v8::jscre::JSRegExpSingleLine;
326 *error_message = NULL; 398 *error_message = NULL;
327 malloc_failure = Failure::Exception(); 399 malloc_failure = Failure::Exception();
328 *code = jsRegExpCompile(pattern->GetTwoByteData(), 400 *code = v8::jscre::jsRegExpCompile(pattern->GetTwoByteData(),
329 pattern->length(), 401 pattern->length(),
330 case_option, 402 case_option,
331 multiline_option, 403 multiline_option,
332 number_of_captures, 404 number_of_captures,
333 error_message, 405 error_message,
334 &JSREMalloc, 406 &JSREMalloc,
335 &JSREFree); 407 &JSREFree);
336 if (*code == NULL && (malloc_failure->IsRetryAfterGC() || 408 if (*code == NULL && (malloc_failure->IsRetryAfterGC() ||
337 malloc_failure->IsOutOfMemoryFailure())) { 409 malloc_failure->IsOutOfMemoryFailure())) {
338 return malloc_failure; 410 return malloc_failure;
339 } else { 411 } else {
340 // It doesn't matter which object we return here, we just need to return 412 // It doesn't matter which object we return here, we just need to return
341 // a non-failure to indicate to the GC-retry code that there was no 413 // a non-failure to indicate to the GC-retry code that there was no
342 // allocation failure. 414 // allocation failure.
343 return pattern; 415 return pattern;
344 } 416 }
345 } 417 }
346 418
347 419
348 void CompileWithRetryAfterGC(Handle<String> pattern, 420 void CompileWithRetryAfterGC(Handle<String> pattern,
349 JSRegExp::Flags flags, 421 JSRegExp::Flags flags,
350 unsigned* number_of_captures, 422 unsigned* number_of_captures,
351 const char** error_message, 423 const char** error_message,
352 JscreRegExp** code) { 424 v8::jscre::JscreRegExp** code) {
353 CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern, 425 CALL_HEAP_FUNCTION_VOID(DoCompile(*pattern,
354 flags, 426 flags,
355 number_of_captures, 427 number_of_captures,
356 error_message, 428 error_message,
357 code)); 429 code));
358 } 430 }
359 431
360 432
361 Handle<Object> RegExpImpl::JsreCompile(Handle<JSRegExp> re, 433 Handle<Object> RegExpImpl::JscreCompile(Handle<JSRegExp> re) {
362 Handle<String> pattern, 434 ASSERT_EQ(re->TypeTag(), JSRegExp::JSCRE);
363 JSRegExp::Flags flags) { 435 ASSERT(re->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined());
436
437 Handle<String> pattern(re->Pattern());
438 JSRegExp::Flags flags = re->GetFlags();
439
364 Handle<String> two_byte_pattern = StringToTwoByte(pattern); 440 Handle<String> two_byte_pattern = StringToTwoByte(pattern);
365 441
366 unsigned number_of_captures; 442 unsigned number_of_captures;
367 const char* error_message = NULL; 443 const char* error_message = NULL;
368 444
369 JscreRegExp* code = NULL; 445 v8::jscre::JscreRegExp* code = NULL;
370 FlattenString(pattern); 446 FlattenString(pattern);
371 447
372 CompileWithRetryAfterGC(two_byte_pattern, 448 CompileWithRetryAfterGC(two_byte_pattern,
373 flags, 449 flags,
374 &number_of_captures, 450 &number_of_captures,
375 &error_message, 451 &error_message,
376 &code); 452 &code);
377 453
378 if (code == NULL) { 454 if (code == NULL) {
379 // Throw an exception. 455 // Throw an exception.
380 Handle<JSArray> array = Factory::NewJSArray(2); 456 Handle<JSArray> array = Factory::NewJSArray(2);
381 SetElement(array, 0, pattern); 457 SetElement(array, 0, pattern);
382 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector( 458 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(
383 (error_message == NULL) ? "Unknown regexp error" : error_message))); 459 (error_message == NULL) ? "Unknown regexp error" : error_message)));
384 Handle<Object> regexp_err = 460 Handle<Object> regexp_err =
385 Factory::NewSyntaxError("malformed_regexp", array); 461 Factory::NewSyntaxError("malformed_regexp", array);
386 Top::Throw(*regexp_err); 462 Top::Throw(*regexp_err);
387 return Handle<Object>(); 463 return Handle<Object>();
388 } 464 }
389 465
390 // Convert the return address to a ByteArray pointer. 466 // Convert the return address to a ByteArray pointer.
391 Handle<ByteArray> internal( 467 Handle<ByteArray> internal(
392 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code))); 468 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
393 469
394 Handle<FixedArray> value = Factory::NewFixedArray(2); 470 Handle<FixedArray> value = Factory::NewFixedArray(kJscreDataLength);
395 value->set(CAPTURE_INDEX, Smi::FromInt(number_of_captures)); 471 value->set(kJscreNumberOfCapturesIndex, Smi::FromInt(number_of_captures));
396 value->set(INTERNAL_INDEX, *internal); 472 value->set(kJscreInternalIndex, *internal);
397 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value); 473 Factory::SetRegExpData(re, JSRegExp::JSCRE, pattern, flags, value);
398 474
399 return re; 475 return re;
400 } 476 }
401 477
402 478
403 Handle<Object> RegExpImpl::JsreExecOnce(Handle<JSRegExp> regexp, 479 Handle<Object> RegExpImpl::IrregexpExecOnce(Handle<JSRegExp> regexp,
404 int num_captures, 480 int num_captures,
405 Handle<String> subject, 481 Handle<String> two_byte_subject,
406 int previous_index, 482 int previous_index,
407 const uc16* two_byte_subject, 483 int* offsets_vector,
408 int* offsets_vector, 484 int offsets_vector_length) {
409 int offsets_vector_length) { 485 #ifdef DEBUG
486 if (FLAG_trace_regexp_bytecodes) {
487 String* pattern = regexp->Pattern();
488 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
489 PrintF("\n\nSubject string: '%s'\n\n", *(two_byte_subject->ToCString()));
490 }
491 #endif
492 ASSERT(StringShape(*two_byte_subject).IsTwoByteRepresentation());
493 ASSERT(two_byte_subject->IsFlat(StringShape(*two_byte_subject)));
494 bool rc;
495
496 for (int i = (num_captures + 1) * 2 - 1; i >= 0; i--) {
497 offsets_vector[i] = -1;
498 }
499
500 LOG(RegExpExecEvent(regexp, previous_index, two_byte_subject));
501
502 FixedArray* irregexp =
503 FixedArray::cast(regexp->DataAt(JSRegExp::kIrregexpDataIndex));
504 int tag = Smi::cast(irregexp->get(kIrregexpImplementationIndex))->value();
505
506 switch (tag) {
507 case RegExpMacroAssembler::kIA32Implementation: {
508 Code* code = Code::cast(irregexp->get(kIrregexpCodeIndex));
509 Address start_addr =
510 Handle<SeqTwoByteString>::cast(two_byte_subject)->GetCharsAddress();
511 int string_offset =
512 start_addr - reinterpret_cast<Address>(*two_byte_subject);
513 int start_offset = string_offset + previous_index * sizeof(uc16);
514 int end_offset =
515 string_offset + two_byte_subject->length() * sizeof(uc16);
516 typedef bool testfunc(String**, int, int, int*);
517 testfunc* test = FUNCTION_CAST<testfunc*>(code->entry());
518 rc = test(two_byte_subject.location(),
519 start_offset,
520 end_offset,
521 offsets_vector);
522 if (rc) {
523 // Capture values are relative to start_offset only.
524 for (int i = 0; i < offsets_vector_length; i++) {
525 if (offsets_vector[i] >= 0) {
526 offsets_vector[i] += previous_index;
527 }
528 }
529 }
530 break;
531 }
532 case RegExpMacroAssembler::kBytecodeImplementation: {
533 Handle<ByteArray> byte_codes = IrregexpCode(regexp);
534
535 rc = IrregexpInterpreter::Match(byte_codes,
536 two_byte_subject,
537 offsets_vector,
538 previous_index);
539 break;
540 }
541 case RegExpMacroAssembler::kARMImplementation:
542 default:
543 UNREACHABLE();
544 rc = false;
545 break;
546 }
547
548 if (!rc) {
549 return Factory::null_value();
550 }
551
552 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
553 // The captures come in (start, end+1) pairs.
554 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
555 array->set(i, Smi::FromInt(offsets_vector[i]));
556 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
557 }
558 return Factory::NewJSArrayWithElements(array);
559 }
560
561
562 Handle<Object> RegExpImpl::JscreExecOnce(Handle<JSRegExp> regexp,
563 int num_captures,
564 Handle<String> subject,
565 int previous_index,
566 const uc16* two_byte_subject,
567 int* offsets_vector,
568 int offsets_vector_length) {
410 int rc; 569 int rc;
411 { 570 {
412 AssertNoAllocation a; 571 AssertNoAllocation a;
413 ByteArray* internal = JsreInternal(regexp); 572 ByteArray* internal = JscreInternal(regexp);
414 const JscreRegExp* js_regexp = 573 const v8::jscre::JscreRegExp* js_regexp =
415 reinterpret_cast<JscreRegExp*>(internal->GetDataStartAddress()); 574 reinterpret_cast<v8::jscre::JscreRegExp*>(
575 internal->GetDataStartAddress());
416 576
417 LOG(RegExpExecEvent(regexp, previous_index, subject)); 577 LOG(RegExpExecEvent(regexp, previous_index, subject));
418 578
419 rc = jsRegExpExecute(js_regexp, 579 rc = v8::jscre::jsRegExpExecute(js_regexp,
420 two_byte_subject, 580 two_byte_subject,
421 subject->length(), 581 subject->length(),
422 previous_index, 582 previous_index,
423 offsets_vector, 583 offsets_vector,
424 offsets_vector_length); 584 offsets_vector_length);
425 } 585 }
426 586
427 // The KJS JavaScript engine returns null (ie, a failed match) when 587 // The KJS JavaScript engine returns null (ie, a failed match) when
428 // JSRE's internal match limit is exceeded. We duplicate that behavior here. 588 // JSRE's internal match limit is exceeded. We duplicate that behavior here.
429 if (rc == JSRegExpErrorNoMatch 589 if (rc == v8::jscre::JSRegExpErrorNoMatch
430 || rc == JSRegExpErrorHitLimit) { 590 || rc == v8::jscre::JSRegExpErrorHitLimit) {
431 return Factory::null_value(); 591 return Factory::null_value();
432 } 592 }
433 593
434 // Other JSRE errors: 594 // Other JSRE errors:
435 if (rc < 0) { 595 if (rc < 0) {
436 // Throw an exception. 596 // Throw an exception.
437 Handle<Object> code(Smi::FromInt(rc)); 597 Handle<Object> code(Smi::FromInt(rc));
438 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code }; 598 Handle<Object> args[2] = { Factory::LookupAsciiSymbol("jsre_exec"), code };
439 Handle<Object> regexp_err( 599 Handle<Object> regexp_err(
440 Factory::NewTypeError("jsre_error", HandleVector(args, 2))); 600 Factory::NewTypeError("jsre_error", HandleVector(args, 2)));
441 return Handle<Object>(Top::Throw(*regexp_err)); 601 return Handle<Object>(Top::Throw(*regexp_err));
442 } 602 }
443 603
444 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1)); 604 Handle<FixedArray> array = Factory::NewFixedArray(2 * (num_captures+1));
445 // The captures come in (start, end+1) pairs. 605 // The captures come in (start, end+1) pairs.
446 for (int i = 0; i < 2 * (num_captures+1); i += 2) { 606 for (int i = 0; i < 2 * (num_captures+1); i += 2) {
447 array->set(i, 607 array->set(i, Smi::FromInt(offsets_vector[i]));
448 Smi::FromInt(offsets_vector[i]), 608 array->set(i+1, Smi::FromInt(offsets_vector[i+1]));
449 SKIP_WRITE_BARRIER);
450 array->set(i+1,
451 Smi::FromInt(offsets_vector[i+1]),
452 SKIP_WRITE_BARRIER);
453 } 609 }
454 return Factory::NewJSArrayWithElements(array); 610 return Factory::NewJSArrayWithElements(array);
455 } 611 }
456 612
457 613
458 class OffsetsVector { 614 class OffsetsVector {
459 public: 615 public:
460 inline OffsetsVector(int num_captures) { 616 inline OffsetsVector(int num_registers)
461 offsets_vector_length_ = (num_captures + 1) * 3; 617 : offsets_vector_length_(num_registers) {
462 if (offsets_vector_length_ > kStaticOffsetsVectorSize) { 618 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
463 vector_ = NewArray<int>(offsets_vector_length_); 619 vector_ = NewArray<int>(offsets_vector_length_);
464 } else { 620 } else {
465 vector_ = static_offsets_vector_; 621 vector_ = static_offsets_vector_;
466 } 622 }
467 } 623 }
468 624
469 625
470 inline ~OffsetsVector() { 626 inline ~OffsetsVector() {
471 if (offsets_vector_length_ > kStaticOffsetsVectorSize) { 627 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
472 DeleteArray(vector_); 628 DeleteArray(vector_);
473 vector_ = NULL; 629 vector_ = NULL;
474 } 630 }
475 } 631 }
476 632
477 633
478 inline int* vector() { 634 inline int* vector() {
479 return vector_; 635 return vector_;
480 } 636 }
481 637
482 638
483 inline int length() { 639 inline int length() {
484 return offsets_vector_length_; 640 return offsets_vector_length_;
485 } 641 }
486 642
487 private: 643 private:
488 int* vector_; 644 int* vector_;
489 int offsets_vector_length_; 645 int offsets_vector_length_;
490 static const int kStaticOffsetsVectorSize = 30; 646 static const int kStaticOffsetsVectorSize = 50;
491 static int static_offsets_vector_[kStaticOffsetsVectorSize]; 647 static int static_offsets_vector_[kStaticOffsetsVectorSize];
492 }; 648 };
493 649
494 650
495 int OffsetsVector::static_offsets_vector_[ 651 int OffsetsVector::static_offsets_vector_[
496 OffsetsVector::kStaticOffsetsVectorSize]; 652 OffsetsVector::kStaticOffsetsVectorSize];
497 653
498 654
499 Handle<Object> RegExpImpl::JsreExec(Handle<JSRegExp> regexp, 655 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
500 Handle<String> subject, 656 Handle<String> subject,
501 Handle<Object> index) { 657 Handle<Object> index) {
658 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
659 ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());
660
502 // Prepare space for the return values. 661 // Prepare space for the return values.
503 int num_captures = JsreCapture(regexp); 662 int number_of_registers = IrregexpNumberOfRegisters(regexp);
663 OffsetsVector offsets(number_of_registers);
504 664
505 OffsetsVector offsets(num_captures); 665 int num_captures = IrregexpNumberOfCaptures(regexp);
506 666
507 int previous_index = static_cast<int>(DoubleToInteger(index->Number())); 667 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
508 668
509 Handle<String> subject16 = CachedStringToTwoByte(subject); 669 Handle<String> subject16 = CachedStringToTwoByte(subject);
510 670
511 Handle<Object> result(JsreExecOnce(regexp, num_captures, subject, 671 Handle<Object> result(IrregexpExecOnce(regexp,
512 previous_index, 672 num_captures,
513 subject16->GetTwoByteData(), 673 subject16,
514 offsets.vector(), offsets.length())); 674 previous_index,
675 offsets.vector(),
676 offsets.length()));
677 return result;
678 }
679
680
681 Handle<Object> RegExpImpl::JscreExec(Handle<JSRegExp> regexp,
682 Handle<String> subject,
683 Handle<Object> index) {
684 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
685 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
686 Handle<Object> compile_result = JscreCompile(regexp);
687 if (compile_result.is_null()) return compile_result;
688 }
689 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
690
691 int num_captures = JscreNumberOfCaptures(regexp);
692
693 OffsetsVector offsets((num_captures + 1) * 3);
694
695 int previous_index = static_cast<int>(DoubleToInteger(index->Number()));
696
697 Handle<String> subject16 = CachedStringToTwoByte(subject);
698
699 Handle<Object> result(JscreExecOnce(regexp,
700 num_captures,
701 subject,
702 previous_index,
703 subject16->GetTwoByteData(),
704 offsets.vector(),
705 offsets.length()));
515 706
516 return result; 707 return result;
517 } 708 }
518 709
519 710
520 Handle<Object> RegExpImpl::JsreExecGlobal(Handle<JSRegExp> regexp, 711 Handle<Object> RegExpImpl::IrregexpExecGlobal(Handle<JSRegExp> regexp,
521 Handle<String> subject) { 712 Handle<String> subject) {
713 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
714 ASSERT(!regexp->DataAt(JSRegExp::kIrregexpDataIndex)->IsUndefined());
715
522 // Prepare space for the return values. 716 // Prepare space for the return values.
523 int num_captures = JsreCapture(regexp); 717 int number_of_registers = IrregexpNumberOfRegisters(regexp);
524 718 OffsetsVector offsets(number_of_registers);
525 OffsetsVector offsets(num_captures);
526 719
527 int previous_index = 0; 720 int previous_index = 0;
528 721
529 Handle<JSArray> result = Factory::NewJSArray(0); 722 Handle<JSArray> result = Factory::NewJSArray(0);
723 int i = 0;
724 Handle<Object> matches;
725
726 Handle<String> subject16 = CachedStringToTwoByte(subject);
727
728 do {
729 if (previous_index > subject->length() || previous_index < 0) {
730 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
731 // string length, there is no match.
732 matches = Factory::null_value();
733 } else {
734 matches = IrregexpExecOnce(regexp,
735 IrregexpNumberOfCaptures(regexp),
736 subject16,
737 previous_index,
738 offsets.vector(),
739 offsets.length());
740
741 if (matches->IsJSArray()) {
742 SetElement(result, i, matches);
743 i++;
744 previous_index = offsets.vector()[1];
745 if (offsets.vector()[0] == offsets.vector()[1]) {
746 previous_index++;
747 }
748 }
749 }
750 } while (matches->IsJSArray());
751
752 // If we exited the loop with an exception, throw it.
753 if (matches->IsNull()) {
754 // Exited loop normally.
755 return result;
756 } else {
757 // Exited loop with the exception in matches.
758 return matches;
759 }
760 }
761
762
763 Handle<Object> RegExpImpl::JscreExecGlobal(Handle<JSRegExp> regexp,
764 Handle<String> subject) {
765 ASSERT_EQ(regexp->TypeTag(), JSRegExp::JSCRE);
766 if (regexp->DataAt(JSRegExp::kJscreDataIndex)->IsUndefined()) {
767 Handle<Object> compile_result = JscreCompile(regexp);
768 if (compile_result.is_null()) return compile_result;
769 }
770 ASSERT(regexp->DataAt(JSRegExp::kJscreDataIndex)->IsFixedArray());
771
772 // Prepare space for the return values.
773 int num_captures = JscreNumberOfCaptures(regexp);
774
775 OffsetsVector offsets((num_captures + 1) * 3);
776
777 int previous_index = 0;
778
779 Handle<JSArray> result = Factory::NewJSArray(0);
530 int i = 0; 780 int i = 0;
531 Handle<Object> matches; 781 Handle<Object> matches;
532 782
533 Handle<String> subject16 = CachedStringToTwoByte(subject); 783 Handle<String> subject16 = CachedStringToTwoByte(subject);
534 784
535 do { 785 do {
536 if (previous_index > subject->length() || previous_index < 0) { 786 if (previous_index > subject->length() || previous_index < 0) {
537 // Per ECMA-262 15.10.6.2, if the previous index is greater than the 787 // Per ECMA-262 15.10.6.2, if the previous index is greater than the
538 // string length, there is no match. 788 // string length, there is no match.
539 matches = Factory::null_value(); 789 matches = Factory::null_value();
540 } else { 790 } else {
541 matches = JsreExecOnce(regexp, num_captures, subject, previous_index, 791 matches = JscreExecOnce(regexp,
542 subject16->GetTwoByteData(), 792 num_captures,
543 offsets.vector(), offsets.length()); 793 subject,
794 previous_index,
795 subject16->GetTwoByteData(),
796 offsets.vector(),
797 offsets.length());
544 798
545 if (matches->IsJSArray()) { 799 if (matches->IsJSArray()) {
546 SetElement(result, i, matches); 800 SetElement(result, i, matches);
547 i++; 801 i++;
548 previous_index = offsets.vector()[1]; 802 previous_index = offsets.vector()[1];
549 if (offsets.vector()[0] == offsets.vector()[1]) { 803 if (offsets.vector()[0] == offsets.vector()[1]) {
550 previous_index++; 804 previous_index++;
551 } 805 }
552 } 806 }
553 } 807 }
554 } while (matches->IsJSArray()); 808 } while (matches->IsJSArray());
555 809
556 // If we exited the loop with an exception, throw it. 810 // If we exited the loop with an exception, throw it.
557 if (matches->IsNull()) { // Exited loop normally. 811 if (matches->IsNull()) {
812 // Exited loop normally.
558 return result; 813 return result;
559 } else { // Exited loop with the exception in matches. 814 } else {
815 // Exited loop with the exception in matches.
560 return matches; 816 return matches;
561 } 817 }
562 } 818 }
563 819
564 820
565 int RegExpImpl::JsreCapture(Handle<JSRegExp> re) { 821 int RegExpImpl::JscreNumberOfCaptures(Handle<JSRegExp> re) {
566 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); 822 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
567 return Smi::cast(value->get(CAPTURE_INDEX))->value(); 823 return Smi::cast(value->get(kJscreNumberOfCapturesIndex))->value();
568 } 824 }
569 825
570 826
571 ByteArray* RegExpImpl::JsreInternal(Handle<JSRegExp> re) { 827 ByteArray* RegExpImpl::JscreInternal(Handle<JSRegExp> re) {
572 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex)); 828 FixedArray* value = FixedArray::cast(re->DataAt(JSRegExp::kJscreDataIndex));
573 return ByteArray::cast(value->get(INTERNAL_INDEX)); 829 return ByteArray::cast(value->get(kJscreInternalIndex));
574 } 830 }
831
832
833 int RegExpImpl::IrregexpNumberOfCaptures(Handle<JSRegExp> re) {
834 FixedArray* value =
835 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
836 return Smi::cast(value->get(kIrregexpNumberOfCapturesIndex))->value();
837 }
838
839
840 int RegExpImpl::IrregexpNumberOfRegisters(Handle<JSRegExp> re) {
841 FixedArray* value =
842 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
843 return Smi::cast(value->get(kIrregexpNumberOfRegistersIndex))->value();
844 }
845
846
847 Handle<ByteArray> RegExpImpl::IrregexpCode(Handle<JSRegExp> re) {
848 FixedArray* value =
849 FixedArray::cast(re->DataAt(JSRegExp::kIrregexpDataIndex));
850 return Handle<ByteArray>(ByteArray::cast(value->get(kIrregexpCodeIndex)));
851 }
852
853
854 // -------------------------------------------------------------------
855 // Implmentation of the Irregexp regular expression engine.
856
857
858 void RegExpTree::AppendToText(RegExpText* text) {
859 UNREACHABLE();
860 }
861
862
863 void RegExpAtom::AppendToText(RegExpText* text) {
864 text->AddElement(TextElement::Atom(this));
865 }
866
867
868 void RegExpCharacterClass::AppendToText(RegExpText* text) {
869 text->AddElement(TextElement::CharClass(this));
870 }
871
872
873 void RegExpText::AppendToText(RegExpText* text) {
874 for (int i = 0; i < elements()->length(); i++)
875 text->AddElement(elements()->at(i));
876 }
877
878
879 TextElement TextElement::Atom(RegExpAtom* atom) {
880 TextElement result = TextElement(ATOM);
881 result.data.u_atom = atom;
882 return result;
883 }
884
885
886 TextElement TextElement::CharClass(
887 RegExpCharacterClass* char_class) {
888 TextElement result = TextElement(CHAR_CLASS);
889 result.data.u_char_class = char_class;
890 return result;
891 }
892
893
894 class RegExpCompiler {
895 public:
896 RegExpCompiler(int capture_count, bool ignore_case);
897
898 int AllocateRegister() { return next_register_++; }
899
900 Handle<FixedArray> Assemble(RegExpMacroAssembler* assembler,
901 RegExpNode* start,
902 int capture_count);
903
904 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
905
906 static const int kImplementationOffset = 0;
907 static const int kNumberOfRegistersOffset = 0;
908 static const int kCodeOffset = 1;
909
910 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
911 EndNode* accept() { return accept_; }
912 EndNode* backtrack() { return backtrack_; }
913
914 static const int kMaxRecursion = 100;
915 inline int recursion_depth() { return recursion_depth_; }
916 inline void IncrementRecursionDepth() { recursion_depth_++; }
917 inline void DecrementRecursionDepth() { recursion_depth_--; }
918
919 inline bool ignore_case() { return ignore_case_; }
920
921 private:
922 EndNode* accept_;
923 EndNode* backtrack_;
924 int next_register_;
925 List<RegExpNode*>* work_list_;
926 int recursion_depth_;
927 RegExpMacroAssembler* macro_assembler_;
928 bool ignore_case_;
929 };
930
931
932 // Attempts to compile the regexp using an Irregexp code generator. Returns
933 // a fixed array or a null handle depending on whether it succeeded.
934 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case)
935 : next_register_(2 * (capture_count + 1)),
936 work_list_(NULL),
937 recursion_depth_(0),
938 ignore_case_(ignore_case) {
939 accept_ = new EndNode(EndNode::ACCEPT);
940 backtrack_ = new EndNode(EndNode::BACKTRACK);
941 }
942
943
944 Handle<FixedArray> RegExpCompiler::Assemble(
945 RegExpMacroAssembler* macro_assembler,
946 RegExpNode* start,
947 int capture_count) {
948 #ifdef DEBUG
949 if (FLAG_trace_regexp_assembler)
950 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
951 else
952 #endif
953 macro_assembler_ = macro_assembler;
954 List <RegExpNode*> work_list(0);
955 work_list_ = &work_list;
956 Label fail;
957 macro_assembler_->PushBacktrack(&fail);
958 if (!start->GoTo(this)) {
959 fail.Unuse();
960 return Handle<FixedArray>::null();
961 }
962 while (!work_list.is_empty()) {
963 if (!work_list.RemoveLast()->GoTo(this)) {
964 fail.Unuse();
965 return Handle<FixedArray>::null();
966 }
967 }
968 macro_assembler_->Bind(&fail);
969 macro_assembler_->Fail();
970 Handle<FixedArray> array =
971 Factory::NewFixedArray(RegExpImpl::kIrregexpDataLength);
972 array->set(RegExpImpl::kIrregexpImplementationIndex,
973 Smi::FromInt(macro_assembler_->Implementation()));
974 array->set(RegExpImpl::kIrregexpNumberOfRegistersIndex,
975 Smi::FromInt(next_register_));
976 array->set(RegExpImpl::kIrregexpNumberOfCapturesIndex,
977 Smi::FromInt(capture_count));
978 Handle<Object> code = macro_assembler_->GetCode();
979 array->set(RegExpImpl::kIrregexpCodeIndex, *code);
980 work_list_ = NULL;
981 #ifdef DEBUG
982 if (FLAG_trace_regexp_assembler) {
983 delete macro_assembler_;
984 }
985 #endif
986 return array;
987 }
988
989
990 bool RegExpNode::GoTo(RegExpCompiler* compiler) {
991 // TODO(erikcorry): Implement support.
992 if (info_.follows_word_interest ||
993 info_.follows_newline_interest ||
994 info_.follows_start_interest) {
995 return false;
996 }
997 if (label_.is_bound()) {
998 compiler->macro_assembler()->GoTo(&label_);
999 return true;
1000 } else {
1001 if (compiler->recursion_depth() > RegExpCompiler::kMaxRecursion) {
1002 compiler->macro_assembler()->GoTo(&label_);
1003 compiler->AddWork(this);
1004 return true;
1005 } else {
1006 compiler->IncrementRecursionDepth();
1007 bool how_it_went = Emit(compiler);
1008 compiler->DecrementRecursionDepth();
1009 return how_it_went;
1010 }
1011 }
1012 }
1013
1014
1015 // EndNodes are special. Because they can be very common and they are very
1016 // short we normally inline them. That is, if we are asked to emit a GoTo
1017 // we just emit the entire node. Since they don't have successors this
1018 // works.
1019 bool EndNode::GoTo(RegExpCompiler* compiler) {
1020 if (info()->follows_word_interest ||
1021 info()->follows_newline_interest ||
1022 info()->follows_start_interest) {
1023 return false;
1024 }
1025 return Emit(compiler);
1026 }
1027
1028
1029 Label* RegExpNode::label() {
1030 return &label_;
1031 }
1032
1033
1034 bool EndNode::Emit(RegExpCompiler* compiler) {
1035 RegExpMacroAssembler* macro = compiler->macro_assembler();
1036 switch (action_) {
1037 case ACCEPT:
1038 if (!label()->is_bound()) Bind(macro);
1039 if (info()->at_end) {
1040 Label succeed;
1041 // LoadCurrentCharacter will go to the label if we are at the end of the
1042 // input string.
1043 macro->LoadCurrentCharacter(0, &succeed);
1044 macro->Backtrack();
1045 macro->Bind(&succeed);
1046 }
1047 macro->Succeed();
1048 return true;
1049 case BACKTRACK:
1050 if (!label()->is_bound()) Bind(macro);
1051 ASSERT(!info()->at_end);
1052 macro->Backtrack();
1053 return true;
1054 }
1055 return false;
1056 }
1057
1058
1059 void GuardedAlternative::AddGuard(Guard* guard) {
1060 if (guards_ == NULL)
1061 guards_ = new ZoneList<Guard*>(1);
1062 guards_->Add(guard);
1063 }
1064
1065
1066 ActionNode* ActionNode::StoreRegister(int reg,
1067 int val,
1068 RegExpNode* on_success) {
1069 ActionNode* result = new ActionNode(STORE_REGISTER, on_success);
1070 result->data_.u_store_register.reg = reg;
1071 result->data_.u_store_register.value = val;
1072 return result;
1073 }
1074
1075
1076 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1077 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1078 result->data_.u_increment_register.reg = reg;
1079 return result;
1080 }
1081
1082
1083 ActionNode* ActionNode::StorePosition(int reg, RegExpNode* on_success) {
1084 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1085 result->data_.u_position_register.reg = reg;
1086 return result;
1087 }
1088
1089
1090 ActionNode* ActionNode::RestorePosition(int reg, RegExpNode* on_success) {
1091 ActionNode* result = new ActionNode(RESTORE_POSITION, on_success);
1092 result->data_.u_position_register.reg = reg;
1093 return result;
1094 }
1095
1096
1097 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1098 int position_reg,
1099 RegExpNode* on_success) {
1100 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1101 result->data_.u_submatch.stack_pointer_register = stack_reg;
1102 result->data_.u_submatch.current_position_register = position_reg;
1103 return result;
1104 }
1105
1106
1107 ActionNode* ActionNode::EscapeSubmatch(int stack_reg,
1108 bool restore_position,
1109 int position_reg,
1110 RegExpNode* on_success) {
1111 ActionNode* result = new ActionNode(ESCAPE_SUBMATCH, on_success);
1112 result->data_.u_submatch.stack_pointer_register = stack_reg;
1113 if (restore_position) {
1114 result->data_.u_submatch.current_position_register = position_reg;
1115 } else {
1116 result->data_.u_submatch.current_position_register = -1;
1117 }
1118 return result;
1119 }
1120
1121
1122 #define DEFINE_ACCEPT(Type) \
1123 void Type##Node::Accept(NodeVisitor* visitor) { \
1124 visitor->Visit##Type(this); \
1125 }
1126 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1127 #undef DEFINE_ACCEPT
1128
1129
1130 // -------------------------------------------------------------------
1131 // Emit code.
1132
1133
1134 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1135 Guard* guard,
1136 Label* on_failure) {
1137 switch (guard->op()) {
1138 case Guard::LT:
1139 macro_assembler->IfRegisterGE(guard->reg(), guard->value(), on_failure);
1140 break;
1141 case Guard::GEQ:
1142 macro_assembler->IfRegisterLT(guard->reg(), guard->value(), on_failure);
1143 break;
1144 }
1145 }
1146
1147
1148 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1149 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1150
1151
1152 static inline void EmitAtomNonLetters(
1153 RegExpMacroAssembler* macro_assembler,
1154 TextElement elm,
1155 Vector<const uc16> quarks,
1156 Label* on_failure,
1157 int cp_offset) {
1158 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1159 for (int i = quarks.length() - 1; i >= 0; i--) {
1160 uc16 c = quarks[i];
1161 int length = uncanonicalize.get(c, '\0', chars);
1162 if (length <= 1) {
1163 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
1164 macro_assembler->CheckNotCharacter(c, on_failure);
1165 }
1166 }
1167 }
1168
1169
1170 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1171 uc16 c1,
1172 uc16 c2,
1173 Label* on_failure) {
1174 uc16 exor = c1 ^ c2;
1175 // Check whether exor has only one bit set.
1176 if (((exor - 1) & exor) == 0) {
1177 // If c1 and c2 differ only by one bit.
1178 // Ecma262UnCanonicalize always gives the highest number last.
1179 ASSERT(c2 > c1);
1180 macro_assembler->CheckNotCharacterAfterOr(c2, exor, on_failure);
1181 return true;
1182 }
1183 ASSERT(c2 > c1);
1184 uc16 diff = c2 - c1;
1185 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1186 // If the characters differ by 2^n but don't differ by one bit then
1187 // subtract the difference from the found character, then do the or
1188 // trick. We avoid the theoretical case where negative numbers are
1189 // involved in order to simplify code generation.
1190 macro_assembler->CheckNotCharacterAfterMinusOr(c2 - diff,
1191 diff,
1192 on_failure);
1193 return true;
1194 }
1195 return false;
1196 }
1197
1198
1199 static inline void EmitAtomLetters(
1200 RegExpMacroAssembler* macro_assembler,
1201 TextElement elm,
1202 Vector<const uc16> quarks,
1203 Label* on_failure,
1204 int cp_offset) {
1205 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1206 for (int i = quarks.length() - 1; i >= 0; i--) {
1207 uc16 c = quarks[i];
1208 int length = uncanonicalize.get(c, '\0', chars);
1209 if (length <= 1) continue;
1210 macro_assembler->LoadCurrentCharacter(cp_offset + i, on_failure);
1211 Label ok;
1212 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1213 switch (length) {
1214 case 2: {
1215 if (ShortCutEmitCharacterPair(macro_assembler,
1216 chars[0],
1217 chars[1],
1218 on_failure)) {
1219 ok.Unuse();
1220 } else {
1221 macro_assembler->CheckCharacter(chars[0], &ok);
1222 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1223 macro_assembler->Bind(&ok);
1224 }
1225 break;
1226 }
1227 case 4:
1228 macro_assembler->CheckCharacter(chars[3], &ok);
1229 // Fall through!
1230 case 3:
1231 macro_assembler->CheckCharacter(chars[0], &ok);
1232 macro_assembler->CheckCharacter(chars[1], &ok);
1233 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1234 macro_assembler->Bind(&ok);
1235 break;
1236 default:
1237 UNREACHABLE();
1238 break;
1239 }
1240 }
1241 }
1242
1243
1244 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1245 RegExpCharacterClass* cc,
1246 int cp_offset,
1247 Label* on_failure) {
1248 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure);
1249 cp_offset++;
1250
1251 ZoneList<CharacterRange>* ranges = cc->ranges();
1252
1253 Label success;
1254
1255 Label* char_is_in_class =
1256 cc->is_negated() ? on_failure : &success;
1257
1258 int range_count = ranges->length();
1259
1260 if (range_count == 0) {
1261 if (!cc->is_negated()) {
1262 macro_assembler->GoTo(on_failure);
1263 }
1264 return;
1265 }
1266
1267 for (int i = 0; i < range_count - 1; i++) {
1268 CharacterRange& range = ranges->at(i);
1269 Label next_range;
1270 uc16 from = range.from();
1271 uc16 to = range.to();
1272 if (to == from) {
1273 macro_assembler->CheckCharacter(to, char_is_in_class);
1274 } else {
1275 if (from != 0) {
1276 macro_assembler->CheckCharacterLT(from, &next_range);
1277 }
1278 if (to != 0xffff) {
1279 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1280 } else {
1281 macro_assembler->GoTo(char_is_in_class);
1282 }
1283 }
1284 macro_assembler->Bind(&next_range);
1285 }
1286
1287 CharacterRange& range = ranges->at(range_count - 1);
1288 uc16 from = range.from();
1289 uc16 to = range.to();
1290
1291 if (to == from) {
1292 if (cc->is_negated()) {
1293 macro_assembler->CheckCharacter(to, on_failure);
1294 } else {
1295 macro_assembler->CheckNotCharacter(to, on_failure);
1296 }
1297 } else {
1298 if (from != 0) {
1299 if (cc->is_negated()) {
1300 macro_assembler->CheckCharacterLT(from, &success);
1301 } else {
1302 macro_assembler->CheckCharacterLT(from, on_failure);
1303 }
1304 }
1305 if (to != 0xffff) {
1306 if (cc->is_negated()) {
1307 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1308 } else {
1309 macro_assembler->CheckCharacterGT(to, on_failure);
1310 }
1311 } else {
1312 if (cc->is_negated()) {
1313 macro_assembler->GoTo(on_failure);
1314 }
1315 }
1316 }
1317 macro_assembler->Bind(&success);
1318 }
1319
1320
1321
1322 bool TextNode::Emit(RegExpCompiler* compiler) {
1323 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1324 Bind(macro_assembler);
1325 int element_count = elms_->length();
1326 ASSERT(element_count != 0);
1327 int cp_offset = 0;
1328 if (info()->at_end) {
1329 macro_assembler->Backtrack();
1330 return true;
1331 }
1332 // First, handle straight character matches.
1333 for (int i = 0; i < element_count; i++) {
1334 TextElement elm = elms_->at(i);
1335 if (elm.type == TextElement::ATOM) {
1336 Vector<const uc16> quarks = elm.data.u_atom->data();
1337 if (compiler->ignore_case()) {
1338 EmitAtomNonLetters(macro_assembler,
1339 elm,
1340 quarks,
1341 on_failure_->label(),
1342 cp_offset);
1343 } else {
1344 macro_assembler->CheckCharacters(quarks,
1345 cp_offset,
1346 on_failure_->label());
1347 }
1348 cp_offset += quarks.length();
1349 } else {
1350 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
1351 cp_offset++;
1352 }
1353 }
1354 // Second, handle case independent letter matches if any.
1355 if (compiler->ignore_case()) {
1356 cp_offset = 0;
1357 for (int i = 0; i < element_count; i++) {
1358 TextElement elm = elms_->at(i);
1359 if (elm.type == TextElement::ATOM) {
1360 Vector<const uc16> quarks = elm.data.u_atom->data();
1361 EmitAtomLetters(macro_assembler,
1362 elm,
1363 quarks,
1364 on_failure_->label(),
1365 cp_offset);
1366 cp_offset += quarks.length();
1367 } else {
1368 cp_offset++;
1369 }
1370 }
1371 }
1372 // If the fast character matches passed then do the character classes.
1373 cp_offset = 0;
1374 for (int i = 0; i < element_count; i++) {
1375 TextElement elm = elms_->at(i);
1376 if (elm.type == TextElement::CHAR_CLASS) {
1377 RegExpCharacterClass* cc = elm.data.u_char_class;
1378 EmitCharClass(macro_assembler, cc, cp_offset, on_failure_->label());
1379 cp_offset++;
1380 } else {
1381 cp_offset += elm.data.u_atom->data().length();
1382 }
1383 }
1384
1385 compiler->AddWork(on_failure_);
1386 macro_assembler->AdvanceCurrentPosition(cp_offset);
1387 return on_success()->GoTo(compiler);
1388 }
1389
1390
1391 void TextNode::MakeCaseIndependent() {
1392 int element_count = elms_->length();
1393 for (int i = 0; i < element_count; i++) {
1394 TextElement elm = elms_->at(i);
1395 if (elm.type == TextElement::CHAR_CLASS) {
1396 RegExpCharacterClass* cc = elm.data.u_char_class;
1397 ZoneList<CharacterRange>* ranges = cc->ranges();
1398 int range_count = ranges->length();
1399 for (int i = 0; i < range_count; i++) {
1400 ranges->at(i).AddCaseEquivalents(ranges);
1401 }
1402 }
1403 }
1404 }
1405
1406
1407 bool ChoiceNode::Emit(RegExpCompiler* compiler) {
1408 int choice_count = alternatives_->length();
1409 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1410 Bind(macro_assembler);
1411 // For now we just call all choices one after the other. The idea ultimately
1412 // is to use the Dispatch table to try only the relevant ones.
1413 for (int i = 0; i < choice_count - 1; i++) {
1414 GuardedAlternative alternative = alternatives_->at(i);
1415 Label after;
1416 Label after_no_pop_cp;
1417 ZoneList<Guard*>* guards = alternative.guards();
1418 if (guards != NULL) {
1419 int guard_count = guards->length();
1420 for (int j = 0; j < guard_count; j++) {
1421 GenerateGuard(macro_assembler, guards->at(j), &after_no_pop_cp);
1422 }
1423 }
1424 macro_assembler->PushCurrentPosition();
1425 macro_assembler->PushBacktrack(&after);
1426 if (!alternative.node()->GoTo(compiler)) {
1427 after.Unuse();
1428 after_no_pop_cp.Unuse();
1429 return false;
1430 }
1431 macro_assembler->Bind(&after);
1432 macro_assembler->PopCurrentPosition();
1433 macro_assembler->Bind(&after_no_pop_cp);
1434 }
1435 GuardedAlternative alternative = alternatives_->at(choice_count - 1);
1436 ZoneList<Guard*>* guards = alternative.guards();
1437 if (guards != NULL) {
1438 int guard_count = guards->length();
1439 for (int j = 0; j < guard_count; j++) {
1440 GenerateGuard(macro_assembler, guards->at(j), on_failure_->label());
1441 }
1442 }
1443 if (!on_failure_->IsBacktrack()) {
1444 ASSERT_NOT_NULL(on_failure_ -> label());
1445 macro_assembler->PushBacktrack(on_failure_->label());
1446 compiler->AddWork(on_failure_);
1447 }
1448 if (!alternative.node()->GoTo(compiler)) {
1449 return false;
1450 }
1451 return true;
1452 }
1453
1454
1455 bool ActionNode::Emit(RegExpCompiler* compiler) {
1456 RegExpMacroAssembler* macro = compiler->macro_assembler();
1457 Bind(macro);
1458 switch (type_) {
1459 case STORE_REGISTER:
1460 macro->SetRegister(data_.u_store_register.reg,
1461 data_.u_store_register.value);
1462 break;
1463 case INCREMENT_REGISTER: {
1464 Label undo;
1465 macro->PushBacktrack(&undo);
1466 macro->AdvanceRegister(data_.u_increment_register.reg, 1);
1467 bool ok = on_success()->GoTo(compiler);
1468 if (!ok) {
1469 undo.Unuse();
1470 return false;
1471 }
1472 macro->Bind(&undo);
1473 macro->AdvanceRegister(data_.u_increment_register.reg, -1);
1474 macro->Backtrack();
1475 break;
1476 }
1477 case STORE_POSITION: {
1478 Label undo;
1479 macro->PushRegister(data_.u_position_register.reg);
1480 macro->PushBacktrack(&undo);
1481 macro->WriteCurrentPositionToRegister(data_.u_position_register.reg);
1482 bool ok = on_success()->GoTo(compiler);
1483 if (!ok) {
1484 undo.Unuse();
1485 return false;
1486 }
1487 macro->Bind(&undo);
1488 macro->PopRegister(data_.u_position_register.reg);
1489 macro->Backtrack();
1490 break;
1491 }
1492 case RESTORE_POSITION:
1493 macro->ReadCurrentPositionFromRegister(
1494 data_.u_position_register.reg);
1495 break;
1496 case BEGIN_SUBMATCH:
1497 macro->WriteCurrentPositionToRegister(
1498 data_.u_submatch.current_position_register);
1499 macro->WriteStackPointerToRegister(
1500 data_.u_submatch.stack_pointer_register);
1501 break;
1502 case ESCAPE_SUBMATCH:
1503 if (info()->at_end) {
1504 Label at_end;
1505 // Load current character jumps to the label if we are beyond the string
1506 // end.
1507 macro->LoadCurrentCharacter(0, &at_end);
1508 macro->Backtrack();
1509 macro->Bind(&at_end);
1510 }
1511 if (data_.u_submatch.current_position_register != -1) {
1512 macro->ReadCurrentPositionFromRegister(
1513 data_.u_submatch.current_position_register);
1514 }
1515 macro->ReadStackPointerFromRegister(
1516 data_.u_submatch.stack_pointer_register);
1517 break;
1518 default:
1519 UNREACHABLE();
1520 return false;
1521 }
1522 return on_success()->GoTo(compiler);
1523 }
1524
1525
1526 bool BackReferenceNode::Emit(RegExpCompiler* compiler) {
1527 RegExpMacroAssembler* macro = compiler->macro_assembler();
1528 Bind(macro);
1529 // Check whether the registers are uninitialized and always
1530 // succeed if they are.
1531 macro->IfRegisterLT(start_reg_, 0, on_success()->label());
1532 macro->IfRegisterLT(end_reg_, 0, on_success()->label());
1533 ASSERT_EQ(start_reg_ + 1, end_reg_);
1534 if (info()->at_end) {
1535 // If we are constrained to match at the end of the input then succeed
1536 // iff the back reference is empty.
1537 macro->CheckNotRegistersEqual(start_reg_, end_reg_, on_failure_->label());
1538 } else {
1539 if (compiler->ignore_case()) {
1540 macro->CheckNotBackReferenceIgnoreCase(start_reg_, on_failure_->label());
1541 } else {
1542 macro->CheckNotBackReference(start_reg_, on_failure_->label());
1543 }
1544 }
1545 return on_success()->GoTo(compiler);
1546 }
1547
1548
1549 // -------------------------------------------------------------------
1550 // Dot/dotty output
1551
1552
1553 #ifdef DEBUG
1554
1555
1556 class DotPrinter: public NodeVisitor {
1557 public:
1558 DotPrinter() : stream_(&alloc_) { }
1559 void PrintNode(const char* label, RegExpNode* node);
1560 void Visit(RegExpNode* node);
1561 void PrintOnFailure(RegExpNode* from, RegExpNode* on_failure);
1562 void PrintAttributes(RegExpNode* from);
1563 StringStream* stream() { return &stream_; }
1564 #define DECLARE_VISIT(Type) \
1565 virtual void Visit##Type(Type##Node* that);
1566 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1567 #undef DECLARE_VISIT
1568 private:
1569 HeapStringAllocator alloc_;
1570 StringStream stream_;
1571 std::set<RegExpNode*> seen_;
1572 };
1573
1574
1575 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
1576 stream()->Add("digraph G {\n graph [label=\"");
1577 for (int i = 0; label[i]; i++) {
1578 switch (label[i]) {
1579 case '\\':
1580 stream()->Add("\\\\");
1581 break;
1582 case '"':
1583 stream()->Add("\"");
1584 break;
1585 default:
1586 stream()->Put(label[i]);
1587 break;
1588 }
1589 }
1590 stream()->Add("\"];\n");
1591 Visit(node);
1592 stream()->Add("}\n");
1593 printf("%s", *(stream()->ToCString()));
1594 }
1595
1596
1597 void DotPrinter::Visit(RegExpNode* node) {
1598 if (seen_.find(node) != seen_.end())
1599 return;
1600 seen_.insert(node);
1601 node->Accept(this);
1602 }
1603
1604
1605 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
1606 if (on_failure->IsBacktrack()) return;
1607 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
1608 Visit(on_failure);
1609 }
1610
1611
1612 class TableEntryBodyPrinter {
1613 public:
1614 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
1615 : stream_(stream), choice_(choice) { }
1616 void Call(uc16 from, DispatchTable::Entry entry) {
1617 OutSet* out_set = entry.out_set();
1618 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1619 if (out_set->Get(i)) {
1620 stream()->Add(" n%p:s%io%i -> n%p;\n",
1621 choice(),
1622 from,
1623 i,
1624 choice()->alternatives()->at(i).node());
1625 }
1626 }
1627 }
1628 private:
1629 StringStream* stream() { return stream_; }
1630 ChoiceNode* choice() { return choice_; }
1631 StringStream* stream_;
1632 ChoiceNode* choice_;
1633 };
1634
1635
1636 class TableEntryHeaderPrinter {
1637 public:
1638 explicit TableEntryHeaderPrinter(StringStream* stream)
1639 : first_(true), stream_(stream) { }
1640 void Call(uc16 from, DispatchTable::Entry entry) {
1641 if (first_) {
1642 first_ = false;
1643 } else {
1644 stream()->Add("|");
1645 }
1646 stream()->Add("{\\%k-\\%k|{", from, entry.to());
1647 OutSet* out_set = entry.out_set();
1648 int priority = 0;
1649 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1650 if (out_set->Get(i)) {
1651 if (priority > 0) stream()->Add("|");
1652 stream()->Add("<s%io%i> %i", from, i, priority);
1653 priority++;
1654 }
1655 }
1656 stream()->Add("}}");
1657 }
1658 private:
1659 bool first_;
1660 StringStream* stream() { return stream_; }
1661 StringStream* stream_;
1662 };
1663
1664
1665 void DotPrinter::PrintAttributes(RegExpNode* that) {
1666 stream()->Add(" a%p [shape=Mrecord, style=dashed, color=lightgrey, "
1667 "fontcolor=lightgrey, margin=0.1, fontsize=10, label=\"{",
1668 that);
1669 NodeInfo* info = that->info();
1670 stream()->Add("{NI|%i}|{WI|%i}|{SI|%i}",
1671 info->follows_newline_interest,
1672 info->follows_word_interest,
1673 info->follows_start_interest);
1674 stream()->Add("|{DN|%i}|{DW|%i}|{DS|%i}|{AE|%i}",
1675 info->determine_newline,
1676 info->determine_word,
1677 info->determine_start,
1678 info->at_end);
1679 if (info->follows_newline != NodeInfo::UNKNOWN)
1680 stream()->Add("|{FN|%i}", info->follows_newline);
1681 if (info->follows_word != NodeInfo::UNKNOWN)
1682 stream()->Add("|{FW|%i}", info->follows_word);
1683 if (info->follows_start != NodeInfo::UNKNOWN)
1684 stream()->Add("|{FS|%i}", info->follows_start);
1685 Label* label = that->label();
1686 if (label->is_bound())
1687 stream()->Add("|{@|%x}", label->pos());
1688 stream()->Add("}\"];\n");
1689 stream()->Add(" a%p -> n%p [style=dashed, color=lightgrey, "
1690 "arrowhead=none];\n", that, that);
1691 }
1692
1693
1694 void DotPrinter::VisitChoice(ChoiceNode* that) {
1695 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
1696 TableEntryHeaderPrinter header_printer(stream());
1697 that->table()->ForEach(&header_printer);
1698 stream()->Add("\"]\n", that);
1699 PrintAttributes(that);
1700 TableEntryBodyPrinter body_printer(stream(), that);
1701 that->table()->ForEach(&body_printer);
1702 PrintOnFailure(that, that->on_failure());
1703 for (int i = 0; i < that->alternatives()->length(); i++) {
1704 GuardedAlternative alt = that->alternatives()->at(i);
1705 alt.node()->Accept(this);
1706 }
1707 }
1708
1709
1710 void DotPrinter::VisitText(TextNode* that) {
1711 stream()->Add(" n%p [label=\"", that);
1712 for (int i = 0; i < that->elements()->length(); i++) {
1713 if (i > 0) stream()->Add(" ");
1714 TextElement elm = that->elements()->at(i);
1715 switch (elm.type) {
1716 case TextElement::ATOM: {
1717 stream()->Add("'%w'", elm.data.u_atom->data());
1718 break;
1719 }
1720 case TextElement::CHAR_CLASS: {
1721 RegExpCharacterClass* node = elm.data.u_char_class;
1722 stream()->Add("[");
1723 if (node->is_negated())
1724 stream()->Add("^");
1725 for (int j = 0; j < node->ranges()->length(); j++) {
1726 CharacterRange range = node->ranges()->at(j);
1727 stream()->Add("%k-%k", range.from(), range.to());
1728 }
1729 stream()->Add("]");
1730 break;
1731 }
1732 default:
1733 UNREACHABLE();
1734 }
1735 }
1736 stream()->Add("\", shape=box, peripheries=2];\n");
1737 PrintAttributes(that);
1738 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1739 Visit(that->on_success());
1740 PrintOnFailure(that, that->on_failure());
1741 }
1742
1743
1744 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
1745 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
1746 that,
1747 that->start_register(),
1748 that->end_register());
1749 PrintAttributes(that);
1750 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1751 Visit(that->on_success());
1752 PrintOnFailure(that, that->on_failure());
1753 }
1754
1755
1756 void DotPrinter::VisitEnd(EndNode* that) {
1757 stream()->Add(" n%p [style=bold, shape=point];\n", that);
1758 PrintAttributes(that);
1759 }
1760
1761
1762 void DotPrinter::VisitAction(ActionNode* that) {
1763 stream()->Add(" n%p [", that);
1764 switch (that->type_) {
1765 case ActionNode::STORE_REGISTER:
1766 stream()->Add("label=\"$%i:=%i\", shape=octagon",
1767 that->data_.u_store_register.reg,
1768 that->data_.u_store_register.value);
1769 break;
1770 case ActionNode::INCREMENT_REGISTER:
1771 stream()->Add("label=\"$%i++\", shape=octagon",
1772 that->data_.u_increment_register.reg);
1773 break;
1774 case ActionNode::STORE_POSITION:
1775 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
1776 that->data_.u_position_register.reg);
1777 break;
1778 case ActionNode::RESTORE_POSITION:
1779 stream()->Add("label=\"$pos:=$%i\", shape=octagon",
1780 that->data_.u_position_register.reg);
1781 break;
1782 case ActionNode::BEGIN_SUBMATCH:
1783 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
1784 that->data_.u_submatch.current_position_register);
1785 break;
1786 case ActionNode::ESCAPE_SUBMATCH:
1787 stream()->Add("label=\"escape\", shape=septagon");
1788 break;
1789 }
1790 stream()->Add("];\n");
1791 PrintAttributes(that);
1792 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
1793 Visit(that->on_success());
1794 }
1795
1796
1797 class DispatchTableDumper {
1798 public:
1799 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
1800 void Call(uc16 key, DispatchTable::Entry entry);
1801 StringStream* stream() { return stream_; }
1802 private:
1803 StringStream* stream_;
1804 };
1805
1806
1807 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
1808 stream()->Add("[%k-%k]: {", key, entry.to());
1809 OutSet* set = entry.out_set();
1810 bool first = true;
1811 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
1812 if (set->Get(i)) {
1813 if (first) {
1814 first = false;
1815 } else {
1816 stream()->Add(", ");
1817 }
1818 stream()->Add("%i", i);
1819 }
1820 }
1821 stream()->Add("}\n");
1822 }
1823
1824
1825 void DispatchTable::Dump() {
1826 HeapStringAllocator alloc;
1827 StringStream stream(&alloc);
1828 DispatchTableDumper dumper(&stream);
1829 tree()->ForEach(&dumper);
1830 OS::PrintError("%s", *stream.ToCString());
1831 }
1832
1833
1834 void RegExpEngine::DotPrint(const char* label, RegExpNode* node) {
1835 DotPrinter printer;
1836 printer.PrintNode(label, node);
1837 }
1838
1839
1840 #endif // DEBUG
1841
1842
1843 // -------------------------------------------------------------------
1844 // Tree to graph conversion
1845
1846
1847 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
1848 RegExpNode* on_success,
1849 RegExpNode* on_failure) {
1850 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
1851 elms->Add(TextElement::Atom(this));
1852 return new TextNode(elms, on_success, on_failure);
1853 }
1854
1855
1856 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
1857 RegExpNode* on_success,
1858 RegExpNode* on_failure) {
1859 return new TextNode(elements(), on_success, on_failure);
1860 }
1861
1862
1863 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
1864 RegExpNode* on_success,
1865 RegExpNode* on_failure) {
1866 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
1867 elms->Add(TextElement::CharClass(this));
1868 return new TextNode(elms, on_success, on_failure);
1869 }
1870
1871
1872 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
1873 RegExpNode* on_success,
1874 RegExpNode* on_failure) {
1875 ZoneList<RegExpTree*>* alternatives = this->alternatives();
1876 int length = alternatives->length();
1877 ChoiceNode* result = new ChoiceNode(length, on_failure);
1878 for (int i = 0; i < length; i++) {
1879 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
1880 on_success,
1881 on_failure));
1882 result->AddAlternative(alternative);
1883 }
1884 return result;
1885 }
1886
1887
1888 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
1889 RegExpNode* on_success,
1890 RegExpNode* on_failure) {
1891 return ToNode(min(),
1892 max(),
1893 is_greedy(),
1894 body(),
1895 compiler,
1896 on_success,
1897 on_failure);
1898 }
1899
1900
1901 RegExpNode* RegExpQuantifier::ToNode(int min,
1902 int max,
1903 bool is_greedy,
1904 RegExpTree* body,
1905 RegExpCompiler* compiler,
1906 RegExpNode* on_success,
1907 RegExpNode* on_failure) {
1908 // x{f, t} becomes this:
1909 //
1910 // (r++)<-.
1911 // | `
1912 // | (x)
1913 // v ^
1914 // (r=0)-->(?)---/ [if r < t]
1915 // |
1916 // [if r >= f] \----> ...
1917 //
1918 //
1919 // TODO(someone): clear captures on repetition and handle empty
1920 // matches.
1921 bool has_min = min > 0;
1922 bool has_max = max < RegExpQuantifier::kInfinity;
1923 bool needs_counter = has_min || has_max;
1924 int reg_ctr = needs_counter ? compiler->AllocateRegister() : -1;
1925 ChoiceNode* center = new ChoiceNode(2, on_failure);
1926 RegExpNode* loop_return = needs_counter
1927 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
1928 : static_cast<RegExpNode*>(center);
1929 RegExpNode* body_node = body->ToNode(compiler, loop_return, on_failure);
1930 GuardedAlternative body_alt(body_node);
1931 if (has_max) {
1932 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
1933 body_alt.AddGuard(body_guard);
1934 }
1935 GuardedAlternative rest_alt(on_success);
1936 if (has_min) {
1937 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
1938 rest_alt.AddGuard(rest_guard);
1939 }
1940 if (is_greedy) {
1941 center->AddAlternative(body_alt);
1942 center->AddAlternative(rest_alt);
1943 } else {
1944 center->AddAlternative(rest_alt);
1945 center->AddAlternative(body_alt);
1946 }
1947 if (needs_counter) {
1948 return ActionNode::StoreRegister(reg_ctr, 0, center);
1949 } else {
1950 return center;
1951 }
1952 }
1953
1954
1955 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
1956 RegExpNode* on_success,
1957 RegExpNode* on_failure) {
1958 NodeInfo info;
1959 switch (type()) {
1960 case START_OF_LINE:
1961 info.follows_newline_interest = true;
1962 break;
1963 case START_OF_INPUT:
1964 info.follows_start_interest = true;
1965 break;
1966 case BOUNDARY: case NON_BOUNDARY:
1967 info.follows_word_interest = true;
1968 break;
1969 case END_OF_INPUT:
1970 info.at_end = true;
1971 break;
1972 case END_OF_LINE:
1973 // This is wrong but has the effect of making the compiler abort.
1974 info.at_end = true;
1975 }
1976 return on_success->PropagateForward(&info);
1977 }
1978
1979
1980 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
1981 RegExpNode* on_success,
1982 RegExpNode* on_failure) {
1983 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
1984 RegExpCapture::EndRegister(index()),
1985 on_success,
1986 on_failure);
1987 }
1988
1989
1990 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
1991 RegExpNode* on_success,
1992 RegExpNode* on_failure) {
1993 return on_success;
1994 }
1995
1996
1997 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
1998 RegExpNode* on_success,
1999 RegExpNode* on_failure) {
2000 int stack_pointer_register = compiler->AllocateRegister();
2001 int position_register = compiler->AllocateRegister();
2002 if (is_positive()) {
2003 // begin submatch scope
2004 // $reg = $pos
2005 // if [body]
2006 // then
2007 // $pos = $reg
2008 // escape submatch scope (drop all backtracks created in scope)
2009 // succeed
2010 // else
2011 // end submatch scope (nothing to clean up, just exit the scope)
2012 // fail
2013 return ActionNode::BeginSubmatch(
2014 stack_pointer_register,
2015 position_register,
2016 body()->ToNode(
2017 compiler,
2018 ActionNode::EscapeSubmatch(
2019 stack_pointer_register,
2020 true, // Also restore input position.
2021 position_register,
2022 on_success),
2023 on_failure));
2024 } else {
2025 // begin submatch scope
2026 // try
2027 // first if (body)
2028 // then
2029 // escape submatch scope
2030 // fail
2031 // else
2032 // backtrack
2033 // second
2034 // end submatch scope
2035 // restore current position
2036 // succeed
2037 ChoiceNode* try_node =
2038 new ChoiceNode(1, ActionNode::RestorePosition(position_register,
2039 on_success));
2040 RegExpNode* body_node = body()->ToNode(
2041 compiler,
2042 ActionNode::EscapeSubmatch(stack_pointer_register,
2043 false, // Don't also restore position
2044 0, // Unused arguments.
2045 on_failure),
2046 compiler->backtrack());
2047 GuardedAlternative body_alt(body_node);
2048 try_node->AddAlternative(body_alt);
2049 return ActionNode::BeginSubmatch(stack_pointer_register,
2050 position_register,
2051 try_node);
2052 }
2053 }
2054
2055
2056 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
2057 RegExpNode* on_success,
2058 RegExpNode* on_failure) {
2059 return ToNode(body(), index(), compiler, on_success, on_failure);
2060 }
2061
2062
2063 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
2064 int index,
2065 RegExpCompiler* compiler,
2066 RegExpNode* on_success,
2067 RegExpNode* on_failure) {
2068 int start_reg = RegExpCapture::StartRegister(index);
2069 int end_reg = RegExpCapture::EndRegister(index);
2070 RegExpNode* store_end = ActionNode::StorePosition(end_reg, on_success);
2071 RegExpNode* body_node = body->ToNode(compiler, store_end, on_failure);
2072 return ActionNode::StorePosition(start_reg, body_node);
2073 }
2074
2075
2076 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
2077 RegExpNode* on_success,
2078 RegExpNode* on_failure) {
2079 ZoneList<RegExpTree*>* children = nodes();
2080 RegExpNode* current = on_success;
2081 for (int i = children->length() - 1; i >= 0; i--) {
2082 current = children->at(i)->ToNode(compiler, current, on_failure);
2083 }
2084 return current;
2085 }
2086
2087
2088 static const int kSpaceRangeCount = 20;
2089 static const uc16 kSpaceRanges[kSpaceRangeCount] = {
2090 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680,
2091 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
2092 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000
2093 };
2094
2095
2096 static const int kWordRangeCount = 8;
2097 static const uc16 kWordRanges[kWordRangeCount] = {
2098 '0', '9', 'A', 'Z', '_', '_', 'a', 'z'
2099 };
2100
2101
2102 static const int kDigitRangeCount = 2;
2103 static const uc16 kDigitRanges[kDigitRangeCount] = {
2104 '0', '9'
2105 };
2106
2107
2108 static const int kLineTerminatorRangeCount = 6;
2109 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {
2110 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029
2111 };
2112
2113
2114 static void AddClass(const uc16* elmv,
2115 int elmc,
2116 ZoneList<CharacterRange>* ranges) {
2117 for (int i = 0; i < elmc; i += 2) {
2118 ASSERT(elmv[i] <= elmv[i + 1]);
2119 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
2120 }
2121 }
2122
2123
2124 static void AddClassNegated(const uc16 *elmv,
2125 int elmc,
2126 ZoneList<CharacterRange>* ranges) {
2127 ASSERT(elmv[0] != 0x0000);
2128 ASSERT(elmv[elmc-1] != 0xFFFF);
2129 uc16 last = 0x0000;
2130 for (int i = 0; i < elmc; i += 2) {
2131 ASSERT(last <= elmv[i] - 1);
2132 ASSERT(elmv[i] <= elmv[i + 1]);
2133 ranges->Add(CharacterRange(last, elmv[i] - 1));
2134 last = elmv[i + 1] + 1;
2135 }
2136 ranges->Add(CharacterRange(last, 0xFFFF));
2137 }
2138
2139
2140 void CharacterRange::AddClassEscape(uc16 type,
2141 ZoneList<CharacterRange>* ranges) {
2142 switch (type) {
2143 case 's':
2144 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
2145 break;
2146 case 'S':
2147 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
2148 break;
2149 case 'w':
2150 AddClass(kWordRanges, kWordRangeCount, ranges);
2151 break;
2152 case 'W':
2153 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
2154 break;
2155 case 'd':
2156 AddClass(kDigitRanges, kDigitRangeCount, ranges);
2157 break;
2158 case 'D':
2159 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
2160 break;
2161 case '.':
2162 AddClassNegated(kLineTerminatorRanges,
2163 kLineTerminatorRangeCount,
2164 ranges);
2165 break;
2166 // This is not a character range as defined by the spec but a
2167 // convenient shorthand for a character class that matches any
2168 // character.
2169 case '*':
2170 ranges->Add(CharacterRange::Everything());
2171 break;
2172 default:
2173 UNREACHABLE();
2174 }
2175 }
2176
2177
2178 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
2179 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2180 if (IsSingleton()) {
2181 // If this is a singleton we just expand the one character.
2182 int length = uncanonicalize.get(from(), '\0', chars);
2183 for (int i = 0; i < length; i++) {
2184 uc32 chr = chars[i];
2185 if (chr != from()) {
2186 ranges->Add(CharacterRange::Singleton(chars[i]));
2187 }
2188 }
2189 } else if (from() <= kRangeCanonicalizeMax &&
2190 to() <= kRangeCanonicalizeMax) {
2191 // If this is a range we expand the characters block by block,
2192 // expanding contiguous subranges (blocks) one at a time.
2193 // The approach is as follows. For a given start character we
2194 // look up the block that contains it, for instance 'a' if the
2195 // start character is 'c'. A block is characterized by the property
2196 // that all characters uncanonicalize in the same way as the first
2197 // element, except that each entry in the result is incremented
2198 // by the distance from the first element. So a-z is a block
2199 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
2200 // uncanonicalizes to ['a' + k, 'A' + k].
2201 // Once we've found the start point we look up its uncanonicalization
2202 // and produce a range for each element. For instance for [c-f]
2203 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
2204 // add a range if it is not already contained in the input, so [c-f]
2205 // will be skipped but [C-F] will be added. If this range is not
2206 // completely contained in a block we do this for all the blocks
2207 // covered by the range.
2208 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2209 // First, look up the block that contains the 'from' character.
2210 int length = canonrange.get(from(), '\0', range);
2211 if (length == 0) {
2212 range[0] = from();
2213 } else {
2214 ASSERT_EQ(1, length);
2215 }
2216 int pos = from();
2217 // The start of the current block. Note that except for the first
2218 // iteration 'start' is always equal to 'pos'.
2219 int start;
2220 // If it is not the start point of a block the entry contains the
2221 // offset of the character from the start point.
2222 if ((range[0] & kStartMarker) == 0) {
2223 start = pos - range[0];
2224 } else {
2225 start = pos;
2226 }
2227 // Then we add the ranges on at a time, incrementing the current
2228 // position to be after the last block each time. The position
2229 // always points to the start of a block.
2230 while (pos < to()) {
2231 length = canonrange.get(start, '\0', range);
2232 if (length == 0) {
2233 range[0] = start;
2234 } else {
2235 ASSERT_EQ(1, length);
2236 }
2237 ASSERT((range[0] & kStartMarker) != 0);
2238 // The start point of a block contains the distance to the end
2239 // of the range.
2240 int block_end = start + (range[0] & kPayloadMask) - 1;
2241 int end = (block_end > to()) ? to() : block_end;
2242 length = uncanonicalize.get(start, '\0', range);
2243 for (int i = 0; i < length; i++) {
2244 uc32 c = range[i];
2245 uc16 range_from = c + (pos - start);
2246 uc16 range_to = c + (end - start);
2247 if (!(from() <= range_from && range_to <= to())) {
2248 ranges->Add(CharacterRange(range_from, range_to));
2249 }
2250 }
2251 start = pos = block_end + 1;
2252 }
2253 } else {
2254 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
2255 }
2256 }
2257
2258
2259 // -------------------------------------------------------------------
2260 // Interest propagation
2261
2262
2263 RegExpNode* RegExpNode::GetSibling(NodeInfo* info) {
2264 for (int i = 0; i < siblings_.length(); i++) {
2265 RegExpNode* sibling = siblings_.Get(i);
2266 if (sibling->info()->HasSameForwardInterests(info))
2267 return sibling;
2268 }
2269 return NULL;
2270 }
2271
2272
2273 template <class C>
2274 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
2275 NodeInfo full_info(*node->info());
2276 full_info.AddFromPreceding(info);
2277 RegExpNode* sibling = node->GetSibling(&full_info);
2278 if (sibling != NULL) return sibling;
2279 node->EnsureSiblings();
2280 sibling = new C(*node);
2281 sibling->info()->AddFromPreceding(&full_info);
2282 node->AddSibling(sibling);
2283 return sibling;
2284 }
2285
2286
2287 RegExpNode* ActionNode::PropagateForward(NodeInfo* info) {
2288 NodeInfo full_info(*this->info());
2289 full_info.AddFromPreceding(info);
2290 RegExpNode* sibling = GetSibling(&full_info);
2291 if (sibling != NULL) return sibling;
2292 EnsureSiblings();
2293 ActionNode* action = new ActionNode(*this);
2294 action->info()->AddFromPreceding(&full_info);
2295 AddSibling(action);
2296 if (type_ != ESCAPE_SUBMATCH) {
2297 action->set_on_success(action->on_success()->PropagateForward(info));
2298 }
2299 return action;
2300 }
2301
2302
2303 RegExpNode* ChoiceNode::PropagateForward(NodeInfo* info) {
2304 NodeInfo full_info(*this->info());
2305 full_info.AddFromPreceding(info);
2306 RegExpNode* sibling = GetSibling(&full_info);
2307 if (sibling != NULL) return sibling;
2308 EnsureSiblings();
2309 ChoiceNode* choice = new ChoiceNode(*this);
2310 choice->info()->AddFromPreceding(&full_info);
2311 AddSibling(choice);
2312 ZoneList<GuardedAlternative>* old_alternatives = alternatives();
2313 int count = old_alternatives->length();
2314 choice->alternatives_ = new ZoneList<GuardedAlternative>(count);
2315 for (int i = 0; i < count; i++) {
2316 GuardedAlternative alternative = old_alternatives->at(i);
2317 alternative.set_node(alternative.node()->PropagateForward(info));
2318 choice->alternatives()->Add(alternative);
2319 }
2320 if (!choice->on_failure_->IsBacktrack()) {
2321 choice->on_failure_ = choice->on_failure_->PropagateForward(info);
2322 }
2323 return choice;
2324 }
2325
2326
2327 RegExpNode* EndNode::PropagateForward(NodeInfo* info) {
2328 return PropagateToEndpoint(this, info);
2329 }
2330
2331
2332 RegExpNode* BackReferenceNode::PropagateForward(NodeInfo* info) {
2333 NodeInfo full_info(*this->info());
2334 full_info.AddFromPreceding(info);
2335 RegExpNode* sibling = GetSibling(&full_info);
2336 if (sibling != NULL) return sibling;
2337 EnsureSiblings();
2338 BackReferenceNode* back_ref = new BackReferenceNode(*this);
2339 back_ref->info()->AddFromPreceding(&full_info);
2340 AddSibling(back_ref);
2341 // TODO(erikcorry): A back reference has to have two successors (by default
2342 // the same node). The first is used if the back reference matches a non-
2343 // empty back reference, the second if it matches an empty one. This doesn't
2344 // matter for at_end, which is the only one implemented right now, but it will
2345 // matter for other pieces of info.
2346 back_ref->set_on_success(back_ref->on_success()->PropagateForward(info));
2347 return back_ref;
2348 }
2349
2350
2351 RegExpNode* TextNode::PropagateForward(NodeInfo* info) {
2352 return PropagateToEndpoint(this, info);
2353 }
2354
2355
2356 // -------------------------------------------------------------------
2357 // Splay tree
2358
2359
2360 OutSet* OutSet::Extend(unsigned value) {
2361 if (Get(value))
2362 return this;
2363 if (successors() != NULL) {
2364 for (int i = 0; i < successors()->length(); i++) {
2365 OutSet* successor = successors()->at(i);
2366 if (successor->Get(value))
2367 return successor;
2368 }
2369 } else {
2370 successors_ = new ZoneList<OutSet*>(2);
2371 }
2372 OutSet* result = new OutSet(first_, remaining_);
2373 result->Set(value);
2374 successors()->Add(result);
2375 return result;
2376 }
2377
2378
2379 void OutSet::Set(unsigned value) {
2380 if (value < kFirstLimit) {
2381 first_ |= (1 << value);
2382 } else {
2383 if (remaining_ == NULL)
2384 remaining_ = new ZoneList<unsigned>(1);
2385 if (remaining_->is_empty() || !remaining_->Contains(value))
2386 remaining_->Add(value);
2387 }
2388 }
2389
2390
2391 bool OutSet::Get(unsigned value) {
2392 if (value < kFirstLimit) {
2393 return (first_ & (1 << value)) != 0;
2394 } else if (remaining_ == NULL) {
2395 return false;
2396 } else {
2397 return remaining_->Contains(value);
2398 }
2399 }
2400
2401
2402 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
2403 const DispatchTable::Entry DispatchTable::Config::kNoValue;
2404
2405
2406 void DispatchTable::AddRange(CharacterRange full_range, int value) {
2407 CharacterRange current = full_range;
2408 if (tree()->is_empty()) {
2409 // If this is the first range we just insert into the table.
2410 ZoneSplayTree<Config>::Locator loc;
2411 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
2412 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
2413 return;
2414 }
2415 // First see if there is a range to the left of this one that
2416 // overlaps.
2417 ZoneSplayTree<Config>::Locator loc;
2418 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
2419 Entry* entry = &loc.value();
2420 // If we've found a range that overlaps with this one, and it
2421 // starts strictly to the left of this one, we have to fix it
2422 // because the following code only handles ranges that start on
2423 // or after the start point of the range we're adding.
2424 if (entry->from() < current.from() && entry->to() >= current.from()) {
2425 // Snap the overlapping range in half around the start point of
2426 // the range we're adding.
2427 CharacterRange left(entry->from(), current.from() - 1);
2428 CharacterRange right(current.from(), entry->to());
2429 // The left part of the overlapping range doesn't overlap.
2430 // Truncate the whole entry to be just the left part.
2431 entry->set_to(left.to());
2432 // The right part is the one that overlaps. We add this part
2433 // to the map and let the next step deal with merging it with
2434 // the range we're adding.
2435 ZoneSplayTree<Config>::Locator loc;
2436 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
2437 loc.set_value(Entry(right.from(),
2438 right.to(),
2439 entry->out_set()));
2440 }
2441 }
2442 while (current.is_valid()) {
2443 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
2444 (loc.value().from() <= current.to()) &&
2445 (loc.value().to() >= current.from())) {
2446 Entry* entry = &loc.value();
2447 // We have overlap. If there is space between the start point of
2448 // the range we're adding and where the overlapping range starts
2449 // then we have to add a range covering just that space.
2450 if (current.from() < entry->from()) {
2451 ZoneSplayTree<Config>::Locator ins;
2452 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
2453 ins.set_value(Entry(current.from(),
2454 entry->from() - 1,
2455 empty()->Extend(value)));
2456 current.set_from(entry->from());
2457 }
2458 ASSERT_EQ(current.from(), entry->from());
2459 // If the overlapping range extends beyond the one we want to add
2460 // we have to snap the right part off and add it separately.
2461 if (entry->to() > current.to()) {
2462 ZoneSplayTree<Config>::Locator ins;
2463 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
2464 ins.set_value(Entry(current.to() + 1,
2465 entry->to(),
2466 entry->out_set()));
2467 entry->set_to(current.to());
2468 }
2469 ASSERT(entry->to() <= current.to());
2470 // The overlapping range is now completely contained by the range
2471 // we're adding so we can just update it and move the start point
2472 // of the range we're adding just past it.
2473 entry->AddValue(value);
2474 // Bail out if the last interval ended at 0xFFFF since otherwise
2475 // adding 1 will wrap around to 0.
2476 if (entry->to() == 0xFFFF)
2477 break;
2478 ASSERT(entry->to() + 1 > current.from());
2479 current.set_from(entry->to() + 1);
2480 } else {
2481 // There is no overlap so we can just add the range
2482 ZoneSplayTree<Config>::Locator ins;
2483 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
2484 ins.set_value(Entry(current.from(),
2485 current.to(),
2486 empty()->Extend(value)));
2487 break;
2488 }
2489 }
2490 }
2491
2492
2493 OutSet* DispatchTable::Get(uc16 value) {
2494 ZoneSplayTree<Config>::Locator loc;
2495 if (!tree()->FindGreatestLessThan(value, &loc))
2496 return empty();
2497 Entry* entry = &loc.value();
2498 if (value <= entry->to())
2499 return entry->out_set();
2500 else
2501 return empty();
2502 }
2503
2504
2505 // -------------------------------------------------------------------
2506 // Analysis
2507
2508
2509 void Analysis::EnsureAnalyzed(RegExpNode* that) {
2510 if (that->info()->been_analyzed || that->info()->being_analyzed)
2511 return;
2512 that->info()->being_analyzed = true;
2513 that->Accept(this);
2514 that->info()->being_analyzed = false;
2515 that->info()->been_analyzed = true;
2516 }
2517
2518
2519 void Analysis::VisitEnd(EndNode* that) {
2520 // nothing to do
2521 }
2522
2523
2524 void Analysis::VisitText(TextNode* that) {
2525 if (ignore_case_) {
2526 that->MakeCaseIndependent();
2527 }
2528 EnsureAnalyzed(that->on_success());
2529 EnsureAnalyzed(that->on_failure());
2530 NodeInfo* info = that->info();
2531 NodeInfo* next_info = that->on_success()->info();
2532 // If the following node is interested in what it follows then this
2533 // node must determine it.
2534 info->determine_newline = next_info->follows_newline_interest;
2535 info->determine_word = next_info->follows_word_interest;
2536 info->determine_start = next_info->follows_start_interest;
2537 }
2538
2539
2540 void Analysis::VisitAction(ActionNode* that) {
2541 EnsureAnalyzed(that->on_success());
2542 // If the next node is interested in what it follows then this node
2543 // has to be interested too so it can pass the information on.
2544 that->info()->AddFromFollowing(that->on_success()->info());
2545 }
2546
2547
2548 void Analysis::VisitChoice(ChoiceNode* that) {
2549 NodeInfo* info = that->info();
2550 for (int i = 0; i < that->alternatives()->length(); i++) {
2551 RegExpNode* node = that->alternatives()->at(i).node();
2552 EnsureAnalyzed(node);
2553 // Anything the following nodes need to know has to be known by
2554 // this node also, so it can pass it on.
2555 info->AddFromFollowing(node->info());
2556 }
2557 if (!that->table_calculated()) {
2558 DispatchTableConstructor cons(that->table());
2559 cons.BuildTable(that);
2560 }
2561 EnsureAnalyzed(that->on_failure());
2562 }
2563
2564
2565 void Analysis::VisitBackReference(BackReferenceNode* that) {
2566 EnsureAnalyzed(that->on_success());
2567 EnsureAnalyzed(that->on_failure());
2568 }
2569
2570
2571 // -------------------------------------------------------------------
2572 // Dispatch table construction
2573
2574
2575 void DispatchTableConstructor::VisitEnd(EndNode* that) {
2576 AddRange(CharacterRange::Everything());
2577 }
2578
2579
2580 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
2581 ASSERT(!node->table_calculated());
2582 node->set_being_calculated(true);
2583 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
2584 for (int i = 0; i < alternatives->length(); i++) {
2585 set_choice_index(i);
2586 alternatives->at(i).node()->Accept(this);
2587 }
2588 node->set_being_calculated(false);
2589 node->set_table_calculated(true);
2590 }
2591
2592
2593 class AddDispatchRange {
2594 public:
2595 explicit AddDispatchRange(DispatchTableConstructor* constructor)
2596 : constructor_(constructor) { }
2597 void Call(uc32 from, DispatchTable::Entry entry);
2598 private:
2599 DispatchTableConstructor* constructor_;
2600 };
2601
2602
2603 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
2604 CharacterRange range(from, entry.to());
2605 constructor_->AddRange(range);
2606 }
2607
2608
2609 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
2610 if (node->being_calculated())
2611 return;
2612 if (!node->table_calculated()) {
2613 DispatchTableConstructor constructor(node->table());
2614 constructor.BuildTable(node);
2615 }
2616 ASSERT(node->table_calculated());
2617 AddDispatchRange adder(this);
2618 node->table()->ForEach(&adder);
2619 }
2620
2621
2622 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
2623 // TODO(160): Find the node that we refer back to and propagate its start
2624 // set back to here. For now we just accept anything.
2625 AddRange(CharacterRange::Everything());
2626 }
2627
2628
2629
2630 static int CompareRangeByFrom(const CharacterRange* a,
2631 const CharacterRange* b) {
2632 return Compare<uc16>(a->from(), b->from());
2633 }
2634
2635
2636 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
2637 ranges->Sort(CompareRangeByFrom);
2638 uc16 last = 0;
2639 for (int i = 0; i < ranges->length(); i++) {
2640 CharacterRange range = ranges->at(i);
2641 if (last < range.from())
2642 AddRange(CharacterRange(last, range.from() - 1));
2643 if (range.to() >= last) {
2644 if (range.to() == 0xFFFF) {
2645 return;
2646 } else {
2647 last = range.to() + 1;
2648 }
2649 }
2650 }
2651 AddRange(CharacterRange(last, 0xFFFF));
2652 }
2653
2654
2655 void DispatchTableConstructor::VisitText(TextNode* that) {
2656 TextElement elm = that->elements()->at(0);
2657 switch (elm.type) {
2658 case TextElement::ATOM: {
2659 uc16 c = elm.data.u_atom->data()[0];
2660 AddRange(CharacterRange(c, c));
2661 break;
2662 }
2663 case TextElement::CHAR_CLASS: {
2664 RegExpCharacterClass* tree = elm.data.u_char_class;
2665 ZoneList<CharacterRange>* ranges = tree->ranges();
2666 if (tree->is_negated()) {
2667 AddInverse(ranges);
2668 } else {
2669 for (int i = 0; i < ranges->length(); i++)
2670 AddRange(ranges->at(i));
2671 }
2672 break;
2673 }
2674 default: {
2675 UNIMPLEMENTED();
2676 }
2677 }
2678 }
2679
2680
2681 void DispatchTableConstructor::VisitAction(ActionNode* that) {
2682 that->on_success()->Accept(this);
2683 }
2684
2685
2686 Handle<FixedArray> RegExpEngine::Compile(RegExpParseResult* input,
2687 RegExpNode** node_return,
2688 bool ignore_case,
2689 bool is_multiline) {
2690 RegExpCompiler compiler(input->capture_count, ignore_case);
2691 // Wrap the body of the regexp in capture #0.
2692 RegExpNode* captured_body = RegExpCapture::ToNode(input->tree,
2693 0,
2694 &compiler,
2695 compiler.accept(),
2696 compiler.backtrack());
2697 // Add a .*? at the beginning, outside the body capture.
2698 // Note: We could choose to not add this if the regexp is anchored at
2699 // the start of the input but I'm not sure how best to do that and
2700 // since we don't even handle ^ yet I'm saving that optimization for
2701 // later.
2702 RegExpNode* node = RegExpQuantifier::ToNode(0,
2703 RegExpQuantifier::kInfinity,
2704 false,
2705 new RegExpCharacterClass('*'),
2706 &compiler,
2707 captured_body,
2708 compiler.backtrack());
2709 if (node_return != NULL) *node_return = node;
2710 Analysis analysis(ignore_case);
2711 analysis.EnsureAnalyzed(node);
2712
2713 if (!FLAG_irregexp) {
2714 return Handle<FixedArray>::null();
2715 }
2716
2717 if (is_multiline && !FLAG_attempt_multiline_irregexp) {
2718 return Handle<FixedArray>::null();
2719 }
2720
2721 if (FLAG_irregexp_native) {
2722 #ifdef ARM
2723 UNIMPLEMENTED();
2724 #else // IA32
2725 RegExpMacroAssemblerIA32 macro_assembler(RegExpMacroAssemblerIA32::UC16,
2726 (input->capture_count + 1) * 2);
2727 return compiler.Assemble(&macro_assembler,
2728 node,
2729 input->capture_count);
2730 #endif
2731 }
2732 EmbeddedVector<byte, 1024> codes;
2733 RegExpMacroAssemblerIrregexp macro_assembler(codes);
2734 return compiler.Assemble(&macro_assembler,
2735 node,
2736 input->capture_count);
2737 }
2738
575 2739
576 }} // namespace v8::internal 2740 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/list.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698