Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(178)

Side by Side Diff: regexp2000/src/parser.cc

Issue 9110: Experimental: Fixed bug in RegExp Parser. Added feature counting in parser. (Closed)
Patch Set: Merged changes to tip of experimental branch. Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « regexp2000/src/parser.h ('k') | regexp2000/test/cctest/test-regexp.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after
221 Handle<String> type, 221 Handle<String> type,
222 Vector< Handle<Object> > arguments); 222 Vector< Handle<Object> > arguments);
223 223
224 friend class Target; 224 friend class Target;
225 friend class TargetScope; 225 friend class TargetScope;
226 friend class LexicalScope; 226 friend class LexicalScope;
227 friend class TemporaryScope; 227 friend class TemporaryScope;
228 }; 228 };
229 229
230 230
231 template <typename T, int initial_size>
232 class BufferedZoneList {
233 public:
234
235 BufferedZoneList() :
236 list_(NULL), last_(NULL) {}
237
238 // Adds an element at the end of the list. The element stays buffered and can
239 // be read using last() or removed using RemoveLast() until the next call to
240 // Add, RemoveLast or GetList.
241 void Add(T* value) {
242 if (last_ != NULL) {
243 if (list_ == NULL) {
244 list_ = new ZoneList<T*>(initial_size);
245 }
246 list_->Add(last_);
247 }
248 last_ = value;
249 }
250
251 T* last() {
252 ASSERT(last_ != NULL);
253 return last_;
254 }
255
256 T* RemoveLast() {
257 ASSERT(last_ != NULL);
258 T* result = last_;
259 last_ = NULL;
260 return result;
261 }
262
263 void Clear() {
264 list_ = NULL;
265 last_ = NULL;
266 }
267
268 int length() {
269 int length = (list_ == NULL) ? 0 : list_->length();
270 return length + ((last_ == NULL) ? 0 : 1);
271 }
272
273 ZoneList<T*>* GetList() {
274 if (list_ == NULL) {
275 list_ = new ZoneList<T*>(initial_size);
276 }
277 if (last_ != NULL) {
278 list_->Add(last_);
279 last_ = NULL;
280 }
281 return list_;
282 }
283
284 private:
285 ZoneList<T*>* list_;
286 T* last_;
287 };
288
289 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
290 class RegExpBuilder {
291 public:
292 RegExpBuilder();
293 void AddCharacter(uc16 character);
294 void AddAtom(RegExpTree* tree);
295 void AddAssertion(RegExpTree* tree);
296 void NewAlternative(); // '|'
297 void AddQuantifierToAtom(int min, int max, bool is_greedy);
298 RegExpTree* ToRegExp();
299 private:
300 void FlushCharacters();
301 bool FlushTerms();
302 ZoneList<uc16>* characters_;
303 BufferedZoneList<RegExpTree, 2> terms_;
304 BufferedZoneList<RegExpTree, 2> alternatives_;
305 #ifdef DEBUG
306 enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
307 #define LAST(x) last_added_ = x;
308 #else
309 #define LAST(x)
310 #endif
311 };
312
313
314 RegExpBuilder::RegExpBuilder() : characters_(NULL), terms_(), alternatives_()
315 #ifdef DEBUG
316 , last_added_(ADD_NONE)
317 #endif
318 {}
319
320
321 void RegExpBuilder::FlushCharacters() {
322 if (characters_ != NULL) {
323 RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());
324 characters_ = NULL;
325 terms_.Add(atom);
326 LAST(ADD_ATOM);
327 }
328 }
329
330
331 void RegExpBuilder::AddCharacter(uc16 c) {
332 if (characters_ == NULL) {
333 characters_ = new ZoneList<uc16>(4);
334 }
335 characters_->Add(c);
336 LAST(ADD_CHAR);
337 }
338
339
340 void RegExpBuilder::AddAtom(RegExpTree* atom) {
341 FlushCharacters();
342 terms_.Add(atom);
343 LAST(ADD_ATOM);
344 }
345
346
347 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
348 FlushCharacters();
349 terms_.Add(assert);
350 LAST(ADD_ASSERT);
351 }
352
353
354 void RegExpBuilder::NewAlternative() {
355 if (!FlushTerms()) {
356 alternatives_.Add(RegExpEmpty::GetInstance());
357 }
358 }
359
360
361 bool RegExpBuilder::FlushTerms() {
362 FlushCharacters();
363 int num_terms = terms_.length();
364 if (num_terms == 0) {
365 return false;
366 }
367 RegExpTree* alternative;
368 if (num_terms == 1) {
369 alternative = terms_.last();
370 } else {
371 alternative = new RegExpAlternative(terms_.GetList());
372 }
373 alternatives_.Add(alternative);
374 terms_.Clear();
375 LAST(ADD_NONE);
376 return true;
377 }
378
379
380 RegExpTree* RegExpBuilder::ToRegExp() {
381 FlushTerms();
382 int num_alternatives = alternatives_.length();
383 if (num_alternatives == 0) {
384 return RegExpEmpty::GetInstance();
385 }
386 if (num_alternatives == 1) {
387 return alternatives_.last();
388 }
389 return new RegExpDisjunction(alternatives_.GetList());
390 }
391
392
393 void RegExpBuilder::AddQuantifierToAtom(int min, int max, bool is_greedy) {
394 RegExpTree* atom;
395 if (characters_ != NULL) {
396 ASSERT(last_added_ == ADD_CHAR);
397 // Last atom was character.
398 Vector<const uc16> char_vector = characters_->ToConstVector();
399 int num_chars = char_vector.length();
400 if (num_chars > 1) {
401 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
402 terms_.Add(new RegExpAtom(prefix));
403 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
404 }
405 characters_ = NULL;
406 atom = new RegExpAtom(char_vector);
407 } else if (terms_.length() > 0) {
408 ASSERT(last_added_ == ADD_ATOM);
409 atom = terms_.RemoveLast();
410 } else {
411 // Only call immediately after adding an atom or character!
412 UNREACHABLE();
413 return;
414 }
415 terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));
416 LAST(ADD_TERM);
417 }
418
419
231 class RegExpParser { 420 class RegExpParser {
232 public: 421 public:
233 RegExpParser(unibrow::CharacterStream* in, 422 RegExpParser(unibrow::CharacterStream* in,
234 Handle<String>* error, 423 Handle<String>* error,
235 bool multiline_mode); 424 bool multiline_mode);
236 RegExpTree* ParsePattern(bool* ok); 425 RegExpTree* ParsePattern(bool* ok);
237 RegExpTree* ParseDisjunction(bool* ok); 426 RegExpTree* ParseDisjunction(bool* ok);
238 RegExpTree* ParseAlternative(bool* ok);
239 RegExpTree* ParseTerm(bool* ok);
240 RegExpTree* ParseAtom(bool* ok);
241 RegExpTree* ParseGroup(bool* ok); 427 RegExpTree* ParseGroup(bool* ok);
242 RegExpTree* ParseCharacterClass(bool* ok); 428 RegExpTree* ParseCharacterClass(bool* ok);
243 429
244 // Parses a {...,...} quantifier and stores the range in the given 430 // Parses a {...,...} quantifier and stores the range in the given
245 // out parameters. 431 // out parameters.
246 void* ParseIntervalQuantifier(int* min_out, int* max_out, bool* ok); 432 void* ParseIntervalQuantifier(int* min_out, int* max_out, bool* ok);
247 433
248 // Parses and returns a single escaped character. The character 434 // Parses and returns a single escaped character. The character
249 // must not be 'b' or 'B' since they are usually handled specially. 435 // must not be 'b' or 'B' since they are usually handled specially.
250 uc32 ParseCharacterEscape(bool* ok); 436 uc32 ParseClassCharacterEscape(bool* ok);
251 437
252 // Checks whether the following is a length-digit hexadecimal number, 438 // Checks whether the following is a length-digit hexadecimal number,
253 // and sets the value if it is. 439 // and sets the value if it is.
254 bool ParseHexEscape(int length, uc32* value); 440 bool ParseHexEscape(int length, uc32* value);
255 441
256 uc32 ParseControlEscape(bool* ok); 442 uc32 ParseControlLetterEscape(bool* ok);
257 uc32 ParseOctalLiteral(bool* ok); 443 uc32 ParseOctalLiteral();
258 444
259 // Tries to parse the input as a backreference. If successful it 445 // Tries to parse the input as a backreference. If successful it
260 // stores the result in the output parameter and returns true. If 446 // stores the result in the output parameter and returns true. If
261 // it fails it will push back the characters read so the same characters 447 // it fails it will push back the characters read so the same characters
262 // can be reparsed. 448 // can be reparsed.
263 bool ParseBackreferenceIndex(int* index_out); 449 bool ParseBackreferenceIndex(int* index_out);
264 450
265 CharacterRange ParseClassAtom(bool* is_char_class, 451 CharacterRange ParseClassAtom(bool* is_char_class,
266 ZoneList<CharacterRange>* ranges, 452 ZoneList<CharacterRange>* ranges,
267 bool* ok); 453 bool* ok);
268 RegExpTree* ReportError(Vector<const char> message, bool* ok); 454 RegExpTree* ReportError(Vector<const char> message, bool* ok);
269 void Advance(); 455 void Advance();
270 void Advance(int dist); 456 void Advance(int dist);
271 // Pushes a read character (or potentially some other character) back 457 // Pushes a read character (or potentially some other character) back
272 // on the input stream. After pushing it back, it becomes the character 458 // on the input stream. After pushing it back, it becomes the character
273 // returned by current(). There is a limited amount of push-back buffer. 459 // returned by current(). There is a limited amount of push-back buffer.
274 // A function using PushBack should check that it doesn't push back more 460 // A function using PushBack should check that it doesn't push back more
275 // than kMaxPushback characters, and it should not push back more characters 461 // than kMaxPushback characters, and it should not push back more characters
276 // than it has read, or that it knows had been read prior to calling it. 462 // than it has read.
277 void PushBack(uc32 character); 463 void PushBack(uc32 character);
278 bool CanPushBack(); 464 bool CanPushBack();
465
466 bool HasCharacterEscapes();
467
279 static const uc32 kEndMarker = unibrow::Utf8::kBadChar; 468 static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
280 private: 469 private:
281 uc32 current() { return current_; } 470 uc32 current() { return current_; }
282 uc32 next() { return next_; } 471 uc32 next() { return next_; }
283 bool has_more() { return has_more_; } 472 bool has_more() { return has_more_; }
284 bool has_next() { return has_next_; } 473 bool has_next() { return has_next_; }
285 unibrow::CharacterStream* in() { return in_; } 474 unibrow::CharacterStream* in() { return in_; }
286 uc32 current_; 475 uc32 current_;
287 uc32 next_; 476 uc32 next_;
288 bool has_more_; 477 bool has_more_;
289 bool has_next_; 478 bool has_next_;
290 bool multiline_mode_; 479 bool multiline_mode_;
291 int captures_seen_; 480 int captures_started_;
292 unibrow::CharacterStream* in_; 481 unibrow::CharacterStream* in_;
293 Handle<String>* error_; 482 Handle<String>* error_;
294 static const int kMaxPushback = 5; 483 static const int kMaxPushback = 5;
295 int pushback_count_; 484 int pushback_count_;
296 uc32 pushback_buffer_[kMaxPushback]; 485 uc32 pushback_buffer_[kMaxPushback];
486 bool has_character_escapes_;
297 }; 487 };
298 488
299 489
300 // A temporary scope stores information during parsing, just like 490 // A temporary scope stores information during parsing, just like
301 // a plain scope. However, temporary scopes are not kept around 491 // a plain scope. However, temporary scopes are not kept around
302 // after parsing or referenced by syntax trees so they can be stack- 492 // after parsing or referenced by syntax trees so they can be stack-
303 // allocated and hence used by the pre-parser. 493 // allocated and hence used by the pre-parser.
304 class TemporaryScope BASE_EMBEDDED { 494 class TemporaryScope BASE_EMBEDDED {
305 public: 495 public:
306 explicit TemporaryScope(Parser* parser); 496 explicit TemporaryScope(Parser* parser);
(...skipping 2931 matching lines...) Expand 10 before | Expand all | Expand 10 after
3238 3428
3239 3429
3240 RegExpParser::RegExpParser(unibrow::CharacterStream* in, 3430 RegExpParser::RegExpParser(unibrow::CharacterStream* in,
3241 Handle<String>* error, 3431 Handle<String>* error,
3242 bool multiline_mode) 3432 bool multiline_mode)
3243 : current_(kEndMarker), 3433 : current_(kEndMarker),
3244 next_(kEndMarker), 3434 next_(kEndMarker),
3245 has_more_(true), 3435 has_more_(true),
3246 has_next_(true), 3436 has_next_(true),
3247 multiline_mode_(multiline_mode), 3437 multiline_mode_(multiline_mode),
3248 captures_seen_(0), 3438 captures_started_(0),
3249 in_(in), 3439 in_(in),
3250 error_(error), 3440 error_(error),
3251 pushback_count_(0) { 3441 pushback_count_(0),
3442 has_character_escapes_(false) {
3252 Advance(2); 3443 Advance(2);
3253 } 3444 }
3254 3445
3255 3446
3256 void RegExpParser::Advance() { 3447 void RegExpParser::Advance() {
3257 current_ = next_; 3448 current_ = next_;
3258 has_more_ = has_next_; 3449 has_more_ = has_next_;
3259 if (pushback_count_ > 0) { 3450 if (pushback_count_ > 0) {
3260 pushback_count_--; 3451 pushback_count_--;
3261 next_ = pushback_buffer_[pushback_count_]; 3452 next_ = pushback_buffer_[pushback_count_];
3262 has_next_ = true;
3263 } else if (in()->has_more()) { 3453 } else if (in()->has_more()) {
3264 next_ = in()->GetNext(); 3454 next_ = in()->GetNext();
3265 } else { 3455 } else {
3266 next_ = kEndMarker; 3456 next_ = kEndMarker;
3267 has_next_ = false; 3457 has_next_ = false;
3268 } 3458 }
3269 } 3459 }
3270 3460
3271 3461
3272 void RegExpParser::Advance(int dist) { 3462 void RegExpParser::Advance(int dist) {
3273 for (int i = 0; i < dist; i++) 3463 for (int i = 0; i < dist; i++)
3274 Advance(); 3464 Advance();
3275 } 3465 }
3276 3466
3277 3467
3278 void RegExpParser::PushBack(uc32 character) { 3468 void RegExpParser::PushBack(uc32 character) {
3279 if (has_next_) { 3469 if (has_next_) {
3280 ASSERT(pushback_count_ < kMaxPushback); 3470 ASSERT(pushback_count_ < kMaxPushback);
3281 pushback_buffer_[pushback_count_] = next_; 3471 pushback_buffer_[pushback_count_] = next_;
3282 pushback_count_++; 3472 pushback_count_++;
3283 } 3473 }
3284 if (has_more_) { 3474
3285 next_ = current_; 3475 next_ = current_;
3286 has_next_ = true; 3476 has_next_ = has_more_;
3287 } 3477
3288 current_ = character; 3478 current_ = character;
3289 has_more_ = true; 3479 has_more_ = true;
3290 } 3480 }
3291 3481
3292 3482
3293 bool RegExpParser::CanPushBack() { 3483 bool RegExpParser::CanPushBack() {
3294 return (pushback_count_ < kMaxPushback); 3484 return (pushback_count_ < kMaxPushback);
3295 } 3485 }
3296 3486
3487 // Reports whether the parsed string atoms contain any characters that were
3488 // escaped in the original pattern. If not, all atoms are proper substrings
3489 // of the original pattern.
3490 bool RegExpParser::HasCharacterEscapes() {
3491 return has_character_escapes_;
3492 }
3297 3493
3298 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) { 3494 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {
3299 *ok = false; 3495 *ok = false;
3300 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); 3496 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
3301 return NULL; 3497 return NULL;
3302 } 3498 }
3303 3499
3304 3500
3305 // Pattern :: 3501 // Pattern ::
3306 // Disjunction 3502 // Disjunction
3307 RegExpTree* RegExpParser::ParsePattern(bool* ok) { 3503 RegExpTree* RegExpParser::ParsePattern(bool* ok) {
3308 return ParseDisjunction(ok); 3504 RegExpTree* result = ParseDisjunction(CHECK_OK);
3505 if (has_more()) {
3506 ReportError(CStrVector("Unmatched ')'"), CHECK_OK);
3507 }
3508 return result;
3309 } 3509 }
3310 3510
3311 3511
3312 // Disjunction :: 3512 // Disjunction ::
3313 // Alternative 3513 // Alternative
3314 // Alternative | Disjunction 3514 // Alternative | Disjunction
3515 // Alternative ::
3516 // [empty]
3517 // Term Alternative
3518 // Term ::
3519 // Assertion
3520 // Atom
3521 // Atom Quantifier
3315 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) { 3522 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
3316 RegExpTree* first = ParseAlternative(CHECK_OK); 3523 RegExpBuilder builder;
3317 if (current() == '|') { 3524 while (true) {
3318 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2); 3525 switch (current()) {
3319 nodes->Add(first); 3526 case kEndMarker:
3320 while (current() == '|') { 3527 case ')':
3321 Advance(); 3528 return builder.ToRegExp();
3322 RegExpTree* next = ParseAlternative(CHECK_OK); 3529 case '|':
3323 nodes->Add(next); 3530 Advance();
3324 } 3531 builder.NewAlternative();
3325 return new RegExpDisjunction(nodes); 3532 continue;
3326 } else { 3533 case '*':
3327 return first; 3534 case '+':
3535 case '?':
3536 case '{':
3537 ReportError(CStrVector("Nothing to repeat."), CHECK_OK);
3538 case '^': {
3539 Advance();
3540 RegExpAssertion::Type type =
3541 multiline_mode_ ? RegExpAssertion::START_OF_LINE :
3542 RegExpAssertion::START_OF_INPUT;
3543 builder.AddAssertion(new RegExpAssertion(type));
3544 continue;
3545 }
3546 case '$': {
3547 Advance();
3548 RegExpAssertion::Type type =
3549 multiline_mode_ ? RegExpAssertion::END_OF_LINE :
3550 RegExpAssertion::END_OF_INPUT;
3551 builder.AddAssertion(new RegExpAssertion(type));
3552 continue;
3553 }
3554 case '.': {
3555 Advance();
3556 // everything except \x0a, \x0d, \u2028 and \u2029
3557 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
3558 CharacterRange::AddClassEscape('.', ranges);
3559 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
3560 builder.AddAtom(atom);
3561 break;
3562 }
3563 case '(': {
3564 RegExpTree* atom = ParseGroup(CHECK_OK);
3565 builder.AddAtom(atom);
3566 break;
3567 }
3568 case '[': {
3569 RegExpTree* atom = ParseCharacterClass(CHECK_OK);
3570 builder.AddAtom(atom);
3571 break;
3572 }
3573 // Atom ::
3574 // \ AtomEscape
3575 case '\\':
3576 switch (next()) {
3577 case kEndMarker:
3578 ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);
3579 case 'b':
3580 Advance(2);
3581 builder.AddAssertion(
3582 new RegExpAssertion(RegExpAssertion::BOUNDARY));
3583 continue;
3584 case 'B':
3585 Advance(2);
3586 builder.AddAssertion(
3587 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
3588 continue;
3589 // AtomEscape ::
3590 // CharacterClassEscape
3591 //
3592 // CharacterClassEscape :: one of
3593 // d D s S w W
3594 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
3595 uc32 c = next();
3596 Advance(2);
3597 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
3598 CharacterRange::AddClassEscape(c, ranges);
3599 RegExpTree* atom = new RegExpCharacterClass(ranges, false);
3600 builder.AddAtom(atom);
3601 goto has_read_atom; // Avoid setting has_character_escapes_.
3602 }
3603 case '1': case '2': case '3': case '4': case '5': case '6':
3604 case '7': case '8': case '9': {
3605 int index = 0;
3606 if (ParseBackreferenceIndex(&index)) {
3607 RegExpTree* atom = new RegExpBackreference(index);
3608 builder.AddAtom(atom);
3609 goto has_read_atom; // Avoid setting has_character_escapes_.
3610 }
3611 uc32 first_digit = next();
3612 if (first_digit == '8' || first_digit == '9') {
3613 // Treat as identity escape
3614 builder.AddCharacter(first_digit);
3615 Advance(2);
3616 break;
3617 }
3618 }
3619 // FALLTHROUGH
3620 case '0': {
3621 Advance();
3622 uc32 octal = ParseOctalLiteral();
3623 builder.AddCharacter(octal);
3624 break;
3625 }
3626 // ControlEscape :: one of
3627 // f n r t v
3628 case 'f':
3629 Advance(2);
3630 builder.AddCharacter('\f');
3631 break;
3632 case 'n':
3633 Advance(2);
3634 builder.AddCharacter('\n');
3635 break;
3636 case 'r':
3637 Advance(2);
3638 builder.AddCharacter('\r');
3639 break;
3640 case 't':
3641 Advance(2);
3642 builder.AddCharacter('\t');
3643 break;
3644 case 'v':
3645 Advance(2);
3646 builder.AddCharacter('\v');
3647 break;
3648 case 'c': {
3649 Advance(2);
3650 uc32 control = ParseControlLetterEscape(ok);
3651 builder.AddCharacter(control);
3652 break;
3653 }
3654 case 'x': {
3655 Advance(2);
3656 uc32 value;
3657 if (ParseHexEscape(2, &value)) {
3658 builder.AddCharacter(value);
3659 } else {
3660 builder.AddCharacter('x');
3661 }
3662 break;
3663 }
3664 case 'u': {
3665 Advance(2);
3666 uc32 value;
3667 if (ParseHexEscape(4, &value)) {
3668 builder.AddCharacter(value);
3669 } else {
3670 builder.AddCharacter('u');
3671 }
3672 break;
3673 }
3674 default:
3675 // Identity escape.
3676 builder.AddCharacter(next());
3677 Advance(2);
3678 break;
3679 }
3680 has_character_escapes_ = true;
3681 break;
3682 default:
3683 builder.AddCharacter(current());
3684 Advance();
3685 break;
3686 } // end switch(current())
3687
3688 has_read_atom:
3689 int min;
3690 int max;
3691 switch (current()) {
3692 // QuantifierPrefix ::
3693 // *
3694 // +
3695 // ?
3696 // {
3697 case '*':
3698 min = 0;
3699 max = RegExpQuantifier::kInfinity;
3700 Advance();
3701 break;
3702 case '+':
3703 min = 1;
3704 max = RegExpQuantifier::kInfinity;
3705 Advance();
3706 break;
3707 case '?':
3708 min = 0;
3709 max = 1;
3710 Advance();
3711 break;
3712 case '{':
3713 ParseIntervalQuantifier(&min, &max, CHECK_OK);
3714 break;
3715 default:
3716 continue;
3717 }
3718 bool is_greedy = true;
3719 if (current() == '?') {
3720 is_greedy = false;
3721 Advance();
3722 }
3723 builder.AddQuantifierToAtom(min, max, is_greedy);
3328 } 3724 }
3329 } 3725 }
3330 3726
3331
3332 static bool IsAlternativeTerminator(uc32 c) {
3333 return c == '|' || c == ')' || c == RegExpParser::kEndMarker;
3334 }
3335
3336
3337 // Alternative ::
3338 // [empty]
3339 // Alternative Term
3340 RegExpTree* RegExpParser::ParseAlternative(bool* ok) {
3341 if (!IsAlternativeTerminator(current())) {
3342 RegExpTree* first = ParseTerm(CHECK_OK);
3343 if (!IsAlternativeTerminator(current())) {
3344 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2);
3345 nodes->Add(first);
3346 while (!IsAlternativeTerminator(current())) {
3347 RegExpTree* next = ParseTerm(CHECK_OK);
3348 nodes->Add(next);
3349 }
3350 return new RegExpAlternative(nodes);
3351 } else {
3352 return first;
3353 }
3354 } else {
3355 return RegExpEmpty::GetInstance();
3356 }
3357 }
3358
3359
3360 class SourceCharacter { 3727 class SourceCharacter {
3361 public: 3728 public:
3362 static bool Is(uc32 c) { 3729 static bool Is(uc32 c) {
3363 switch (c) { 3730 switch (c) {
3364 // case ']': case '}': 3731 // case ']': case '}':
3365 // In spidermonkey and jsc these are treated as source characters 3732 // In spidermonkey and jsc these are treated as source characters
3366 // so we do too. 3733 // so we do too.
3367 case '^': case '$': case '\\': case '.': case '*': case '+': 3734 case '^': case '$': case '\\': case '.': case '*': case '+':
3368 case '?': case '(': case ')': case '[': case '{': case '|': 3735 case '?': case '(': case ')': case '[': case '{': case '|':
3369 case RegExpParser::kEndMarker: 3736 case RegExpParser::kEndMarker:
3370 return false; 3737 return false;
3371 default: 3738 default:
3372 return true; 3739 return true;
3373 } 3740 }
3374 } 3741 }
3375 }; 3742 };
3376 3743
3377 3744
3378 static unibrow::Predicate<SourceCharacter> source_character; 3745 static unibrow::Predicate<SourceCharacter> source_character;
3379 3746
3380 3747
3381 static inline bool IsSourceCharacter(uc32 c) { 3748 static inline bool IsSourceCharacter(uc32 c) {
3382 return source_character.get(c); 3749 return source_character.get(c);
3383 } 3750 }
3384 3751
3385 3752 #ifdef DEBUG
3386 static bool IsSpecialEscape(uc32 c) { 3753 // Currently only used in an ASSERT.
3754 static bool IsSpecialClassEscape(uc32 c) {
3387 switch (c) { 3755 switch (c) {
3388 case 'b': case 'B': case 'd': case 'D': case 's': case 'S': 3756 case 'd': case 'D':
3757 case 's': case 'S':
3389 case 'w': case 'W': 3758 case 'w': case 'W':
3390 return true; 3759 return true;
3391 default: 3760 default:
3392 return false; 3761 return false;
3393 } 3762 }
3394 } 3763 }
3764 #endif
3395 3765
3396 3766
3397 bool RegExpParser::ParseBackreferenceIndex(int* index_out) { 3767 bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
3398 ASSERT_EQ('\\', current()); 3768 ASSERT_EQ('\\', current());
3399 ASSERT('1' <= next() && next() <= '9'); 3769 ASSERT('1' <= next() && next() <= '9');
3400 ASSERT_EQ(0, pushback_count_); 3770 ASSERT_EQ(0, pushback_count_);
3401 // Try to parse a decimal literal that is less than then number 3771 // Try to parse a decimal literal that is no greater than the number
3402 // of previously encountered left capturing parentheses. 3772 // of previously encountered left capturing parentheses.
3403 // This is not according to the ECMAScript specification. According to 3773 // This is not according to the ECMAScript specification. According to
3404 // that, one must accept values up to the total number of left capturing 3774 // that, one must accept values up to the total number of left capturing
3405 // parentheses in the entire input, even if they are meaningless. 3775 // parentheses in the entire input, even if they are meaningless.
3406 if (captures_seen_ == 0) 3776 if (captures_started_ == 0)
3407 return false; 3777 return false;
3408 int value = next() - '0'; 3778 int value = next() - '0';
3409 if (value > captures_seen_) 3779 if (value > captures_started_)
3410 return false; 3780 return false;
3411 static const int kMaxChars = kMaxPushback - 2; 3781 static const int kMaxChars = kMaxPushback - 2;
3412 EmbeddedVector<uc32, kMaxChars> chars_seen; 3782 EmbeddedVector<uc32, kMaxChars> chars_seen;
3413 chars_seen[0] = next(); 3783 chars_seen[0] = next();
3414 int char_count = 1; 3784 int char_count = 1;
3415 Advance(2); 3785 Advance(2);
3416 while (true) { 3786 while (true) {
3417 uc32 c = current(); 3787 uc32 c = current();
3418 if (IsDecimalDigit(c)) { 3788 if (IsDecimalDigit(c)) {
3419 int next_value = 10 * value + (c - '0'); 3789 value = 10 * value + (c - '0');
3420 // To avoid reading past the end of the stack-allocated pushback 3790 // To avoid reading past the end of the stack-allocated pushback
3421 // buffers we only read kMaxChars before giving up. 3791 // buffers we only read kMaxChars before giving up.
3422 if (next_value > captures_seen_ || char_count > kMaxChars) { 3792 if (value > captures_started_ || char_count > kMaxChars) {
3423 // If we give up we have to push the characters we read back 3793 // If we give up we have to push the characters we read back
3424 // onto the pushback buffer in the reverse order. 3794 // onto the pushback buffer in the reverse order.
3425 for (int i = 0; i < char_count; i++) { 3795 for (int i = 0; i < char_count; i++) {
3426 PushBack(chars_seen[char_count - i - 1]); 3796 PushBack(chars_seen[char_count - i - 1]);
3427 } 3797 }
3428 PushBack('\\'); 3798 PushBack('\\');
3429 return false; 3799 return false;
3430 } 3800 }
3431 value = next_value;
3432 chars_seen[char_count++] = current(); 3801 chars_seen[char_count++] = current();
3433 Advance(); 3802 Advance();
3434 } else { 3803 } else {
3435 *index_out = value; 3804 break;
3436 return true;
3437 } 3805 }
3438 } 3806 }
3807 *index_out = value;
3808 return true;
3439 } 3809 }
3440 3810
3441 3811
3442 // Term ::
3443 // Assertion
3444 // Atom
3445 // Atom Quantifier
3446 RegExpTree* RegExpParser::ParseTerm(bool* ok) {
3447 RegExpTree* atom = NULL;
3448 switch (current()) {
3449 // Assertion ::
3450 // ^
3451 // $
3452 // \ b
3453 // \ B
3454 case '^':
3455 Advance();
3456 return new RegExpAssertion(
3457 multiline_mode_ ? RegExpAssertion::START_OF_LINE
3458 : RegExpAssertion::START_OF_INPUT);
3459 case '$':
3460 Advance();
3461 return new RegExpAssertion(
3462 multiline_mode_ ? RegExpAssertion::END_OF_LINE
3463 : RegExpAssertion::END_OF_INPUT);
3464 case '.': {
3465 Advance();
3466 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
3467 CharacterRange::AddClassEscape('.', ranges);
3468 atom = new RegExpCharacterClass(ranges, false);
3469 break;
3470 }
3471 case '(':
3472 atom = ParseGroup(CHECK_OK);
3473 break;
3474 case '[':
3475 atom = ParseCharacterClass(CHECK_OK);
3476 break;
3477 // Atom ::
3478 // \ AtomEscape
3479 case '\\':
3480 if (has_next()) {
3481 switch (next()) {
3482 case 'b':
3483 Advance(2);
3484 return new RegExpAssertion(RegExpAssertion::BOUNDARY);
3485 case 'B':
3486 Advance(2);
3487 return new RegExpAssertion(RegExpAssertion::NON_BOUNDARY);
3488 // AtomEscape ::
3489 // CharacterClassEscape
3490 //
3491 // CharacterClassEscape :: one of
3492 // d D s S w W
3493 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
3494 uc32 c = next();
3495 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
3496 CharacterRange::AddClassEscape(c, ranges);
3497 Advance(2);
3498 atom = new RegExpCharacterClass(ranges, false);
3499 goto has_read_atom;
3500 }
3501 case '1': case '2': case '3': case '4': case '5': case '6':
3502 case '7': case '8': case '9': {
3503 int index = 0;
3504 if (ParseBackreferenceIndex(&index)) {
3505 atom = new RegExpBackreference(index);
3506 goto has_read_atom;
3507 } else {
3508 // If this is not a backreference we go to the atom parser
3509 // which will read it as an octal escape or identity escape.
3510 goto parse_atom;
3511 }
3512 }
3513 default:
3514 goto parse_atom;
3515 }
3516 }
3517 // All other escapes fall through to the default case since
3518 // they correspond to single characters that can be
3519 // represented within atoms.
3520 default: {
3521 parse_atom:
3522 atom = ParseAtom(CHECK_OK);
3523 break;
3524 }
3525 }
3526 has_read_atom:
3527 int min;
3528 int max;
3529 switch (current()) {
3530 // QuantifierPrefix ::
3531 // *
3532 // +
3533 // ?
3534 // {
3535 case '*':
3536 min = 0;
3537 max = RegExpQuantifier::kInfinity;
3538 Advance();
3539 break;
3540 case '+':
3541 min = 1;
3542 max = RegExpQuantifier::kInfinity;
3543 Advance();
3544 break;
3545 case '?':
3546 min = 0;
3547 max = 1;
3548 Advance();
3549 break;
3550 case '{':
3551 ParseIntervalQuantifier(&min, &max, CHECK_OK);
3552 break;
3553 default:
3554 return atom;
3555 }
3556 bool is_greedy = true;
3557 if (current() == '?') {
3558 is_greedy = false;
3559 Advance();
3560 }
3561 return new RegExpQuantifier(min, max, is_greedy, atom);
3562 }
3563
3564
3565 // QuantifierPrefix :: 3812 // QuantifierPrefix ::
3566 // { DecimalDigits } 3813 // { DecimalDigits }
3567 // { DecimalDigits , } 3814 // { DecimalDigits , }
3568 // { DecimalDigits , DecimalDigits } 3815 // { DecimalDigits , DecimalDigits }
3569 void* RegExpParser::ParseIntervalQuantifier(int* min_out, 3816 void* RegExpParser::ParseIntervalQuantifier(int* min_out,
3570 int* max_out, 3817 int* max_out,
3571 bool* ok) { 3818 bool* ok) {
3572 ASSERT_EQ(current(), '{'); 3819 ASSERT_EQ(current(), '{');
3573 static const char* kInvalidQuantifier = "Invalid quantifier"; 3820 static const char* kInvalidQuantifier = "Invalid quantifier";
3574 Advance(); 3821 Advance();
(...skipping 30 matching lines...) Expand all
3605 } 3852 }
3606 } else { 3853 } else {
3607 ReportError(CStrVector(kInvalidQuantifier), CHECK_OK); 3854 ReportError(CStrVector(kInvalidQuantifier), CHECK_OK);
3608 } 3855 }
3609 *min_out = min; 3856 *min_out = min;
3610 *max_out = max; 3857 *max_out = max;
3611 return NULL; 3858 return NULL;
3612 } 3859 }
3613 3860
3614 3861
3615 RegExpTree* RegExpParser::ParseAtom(bool* ok) {
3616 ASSERT(current() == '\\' || IsSourceCharacter(current()));
3617 ZoneList<uc16>* buf = new ZoneList<uc16>(4);
3618 while (true) {
3619 if (IsSourceCharacter(current())) {
3620 buf->Add(current());
3621 Advance();
3622 } else if (current() == '\\') {
3623 if (!has_next()) {
3624 ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);
3625 } else if (IsSpecialEscape(next())) {
3626 // If the next thing we see is a special escape we stop
3627 // reading this atom.
3628 break;
3629 } else {
3630 uc32 escape = ParseCharacterEscape(CHECK_OK);
3631 buf->Add(escape);
3632 }
3633 } else {
3634 break;
3635 }
3636 }
3637 return new RegExpAtom(buf->ToConstVector());
3638 }
3639
3640 // Upper and lower case letters differ by one bit. 3862 // Upper and lower case letters differ by one bit.
3641 STATIC_CHECK('a'^'A' == 0x20); 3863 STATIC_CHECK('a'^'A' == 0x20);
3642 3864
3643 uc32 RegExpParser::ParseControlEscape(bool* ok) { 3865 uc32 RegExpParser::ParseControlLetterEscape(bool* ok) {
3644 ASSERT(current() == 'c');
3645 Advance();
3646 if (!has_more()) { 3866 if (!has_more()) {
3647 ReportError(CStrVector("\\c at end of pattern"), ok); 3867 ReportError(CStrVector("\\c at end of pattern"), ok);
3648 return '\0'; 3868 return '\0';
3649 } 3869 }
3650 uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters. 3870 uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.
3651 if (letter < 'A' || 'Z' < letter) { 3871 if (letter < 'A' || 'Z' < letter) {
3652 // Non-spec error-correction: "\c" followed by non-control letter is 3872 // Non-spec error-correction: "\c" followed by non-control letter is
3653 // interpreted as an IdentityEscape. 3873 // interpreted as an IdentityEscape of 'c'.
3654 return 'c'; 3874 return 'c';
3655 } 3875 }
3656 Advance(); 3876 Advance();
3657 return letter & 0x1f; // Remainder modulo 32, per specification. 3877 return letter & 0x1f; // Remainder modulo 32, per specification.
3658 } 3878 }
3659 3879
3660 3880
3661 uc32 RegExpParser::ParseOctalLiteral(bool* ok) { 3881 uc32 RegExpParser::ParseOctalLiteral() {
3662 ASSERT('0' <= current() && current() <= '7'); 3882 ASSERT('0' <= current() && current() <= '7');
3663 // For compatibility with some other browsers (not all), we parse 3883 // For compatibility with some other browsers (not all), we parse
3664 // up to three octal digits with a value below 256. 3884 // up to three octal digits with a value below 256.
3665 uc32 value = current() - '0'; 3885 uc32 value = current() - '0';
3666 Advance(); 3886 Advance();
3667 if ('0' <= current() && current() <= '7') { 3887 if ('0' <= current() && current() <= '7') {
3668 value = value * 8 + current() - '0'; 3888 value = value * 8 + current() - '0';
3669 Advance(); 3889 Advance();
3670 if (value < 32 && '0' <= current() && current() <= '7') { 3890 if (value < 32 && '0' <= current() && current() <= '7') {
3671 value = value * 8 + current() - '0'; 3891 value = value * 8 + current() - '0';
3672 Advance(); 3892 Advance();
3673 } 3893 }
3674 } 3894 }
3675 return value; 3895 return value;
3676 } 3896 }
3677 3897
3898
3678 bool RegExpParser::ParseHexEscape(int length, uc32 *value) { 3899 bool RegExpParser::ParseHexEscape(int length, uc32 *value) {
3679 static const int kMaxChars = kMaxPushback; 3900 static const int kMaxChars = kMaxPushback;
3680 EmbeddedVector<uc32, kMaxChars> chars_seen; 3901 EmbeddedVector<uc32, kMaxChars> chars_seen;
3681 ASSERT(length <= kMaxChars); 3902 ASSERT(length <= kMaxChars);
3682 uc32 val = 0; 3903 uc32 val = 0;
3683 bool done = false; 3904 bool done = false;
3684 for (int i = 0; !done; i++) { 3905 for (int i = 0; !done; i++) {
3685 uc32 c = current(); 3906 uc32 c = current();
3686 int d = HexValue(c); 3907 int d = HexValue(c);
3687 if (d < 0) { 3908 if (d < 0) {
3688 while (i > 0) { 3909 while (i > 0) {
3689 i--; 3910 i--;
3690 PushBack(chars_seen[i]); 3911 PushBack(chars_seen[i]);
3691 } 3912 }
3692 return false; 3913 return false;
3693 } 3914 }
3694 val = val * 16 + d; 3915 val = val * 16 + d;
3695 Advance(); 3916 Advance();
3696 if (i < length - 1) { 3917 if (i < length - 1) {
3697 chars_seen[i] = c; 3918 chars_seen[i] = c;
3698 } else { 3919 } else {
3699 done = true; 3920 done = true;
3700 } 3921 }
3701 } 3922 }
3702 *value = val; 3923 *value = val;
3703 return true; 3924 return true;
3704 } 3925 }
3705 3926
3706 3927
3707 uc32 RegExpParser::ParseCharacterEscape(bool* ok) { 3928 uc32 RegExpParser::ParseClassCharacterEscape(bool* ok) {
3708 ASSERT(current() == '\\'); 3929 ASSERT(current() == '\\');
3709 ASSERT(has_next() && !IsSpecialEscape(next())); 3930 ASSERT(has_next() && !IsSpecialClassEscape(next()));
3710 Advance(); 3931 Advance();
3711 ASSERT(current() != 'b' && current() != 'B');
3712 switch (current()) { 3932 switch (current()) {
3713 // ControlEscape :: one of 3933 // ControlEscape :: one of
3714 // f n r t v 3934 // f n r t v
3715 case 'f': 3935 case 'f':
3716 Advance(); 3936 Advance();
3717 return '\f'; 3937 return '\f';
3718 case 'n': 3938 case 'n':
3719 Advance(); 3939 Advance();
3720 return '\n'; 3940 return '\n';
3721 case 'r': 3941 case 'r':
3722 Advance(); 3942 Advance();
3723 return '\r'; 3943 return '\r';
3724 case 't': 3944 case 't':
3725 Advance(); 3945 Advance();
3726 return '\t'; 3946 return '\t';
3727 case 'v': 3947 case 'v':
3728 Advance(); 3948 Advance();
3729 return '\v'; 3949 return '\v';
3730 case 'c': 3950 case 'c':
3731 // Spec mandates that next character is ASCII letter. 3951 return ParseControlLetterEscape(ok);
3732 // If not, we error-correct by interpreting "\c" as "c".
3733 return ParseControlEscape(ok);
3734 case '0': case '1': case '2': case '3': case '4': case '5': 3952 case '0': case '1': case '2': case '3': case '4': case '5':
3735 case '6': case '7': 3953 case '6': case '7':
3736 // For compatibility, we interpret a decimal escape that isn't 3954 // For compatibility, we interpret a decimal escape that isn't
3737 // a back reference (and therefore either \0 or not valid according 3955 // a back reference (and therefore either \0 or not valid according
3738 // to the specification) as a 1..3 digit octal character code. 3956 // to the specification) as a 1..3 digit octal character code.
3739 return ParseOctalLiteral(ok); 3957 return ParseOctalLiteral();
3740 case 'x': { 3958 case 'x': {
3741 Advance(); 3959 Advance();
3742 uc32 value; 3960 uc32 value;
3743 if (ParseHexEscape(2, &value)) { 3961 if (ParseHexEscape(2, &value)) {
3744 return value; 3962 return value;
3745 } 3963 }
3746 // If \x is not followed by a two-digit hexadecimal, treat it 3964 // If \x is not followed by a two-digit hexadecimal, treat it
3747 // as an identity escape. 3965 // as an identity escape.
3748 return 'x'; 3966 return 'x';
3749 } 3967 }
(...skipping 27 matching lines...) Expand all
3777 if (current() == '?') { 3995 if (current() == '?') {
3778 switch (next()) { 3996 switch (next()) {
3779 case ':': case '=': case '!': 3997 case ':': case '=': case '!':
3780 type = next(); 3998 type = next();
3781 Advance(2); 3999 Advance(2);
3782 break; 4000 break;
3783 default: 4001 default:
3784 ReportError(CStrVector("Invalid group"), CHECK_OK); 4002 ReportError(CStrVector("Invalid group"), CHECK_OK);
3785 break; 4003 break;
3786 } 4004 }
4005 } else {
4006 captures_started_++;
3787 } 4007 }
4008 int capture_index = captures_started_;
3788 RegExpTree* body = ParseDisjunction(CHECK_OK); 4009 RegExpTree* body = ParseDisjunction(CHECK_OK);
3789 if (current() != ')') { 4010 if (current() != ')') {
3790 ReportError(CStrVector("Unterminated group"), CHECK_OK); 4011 ReportError(CStrVector("Unterminated group"), CHECK_OK);
3791 } 4012 }
3792 Advance(); 4013 Advance();
3793 if (type == '(') { 4014 if (type == '(') {
3794 captures_seen_++; 4015 return new RegExpCapture(body, capture_index);
3795 return new RegExpCapture(body);
3796 } else if (type == ':') { 4016 } else if (type == ':') {
3797 return body; 4017 return body;
3798 } else { 4018 } else {
3799 ASSERT(type == '=' || type == '!'); 4019 ASSERT(type == '=' || type == '!');
3800 bool is_positive = (type == '='); 4020 bool is_positive = (type == '=');
3801 return new RegExpLookahead(body, is_positive); 4021 return new RegExpLookahead(body, is_positive);
3802 } 4022 }
3803 } 4023 }
3804 4024
3805 4025
3806 CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class, 4026 CharacterRange RegExpParser::ParseClassAtom(bool* is_char_class,
3807 ZoneList<CharacterRange>* ranges, 4027 ZoneList<CharacterRange>* ranges,
3808 bool* ok) { 4028 bool* ok) {
3809 ASSERT_EQ(false, *is_char_class); 4029 ASSERT_EQ(false, *is_char_class);
3810 uc32 first = current(); 4030 uc32 first = current();
3811 if (first == '\\') { 4031 if (first == '\\') {
3812 switch (next()) { 4032 switch (next()) {
3813 case 'b':
3814 Advance(2);
3815 return CharacterRange::Singleton('\b');
3816 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': { 4033 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {
3817 *is_char_class = true; 4034 *is_char_class = true;
3818 uc32 c = next(); 4035 uc32 c = next();
3819 CharacterRange::AddClassEscape(c, ranges); 4036 CharacterRange::AddClassEscape(c, ranges);
3820 Advance(2); 4037 Advance(2);
3821 return NULL; 4038 return NULL;
3822 } 4039 }
3823 default: 4040 default:
3824 uc32 c = ParseCharacterEscape(CHECK_OK); 4041 uc32 c = ParseClassCharacterEscape(CHECK_OK);
3825 return CharacterRange::Singleton(c); 4042 return CharacterRange::Singleton(c);
3826 } 4043 }
3827 } else { 4044 } else {
3828 Advance(); 4045 Advance();
3829 return CharacterRange::Singleton(first); 4046 return CharacterRange::Singleton(first);
3830 } 4047 }
3831 } 4048 }
3832 4049
3833 4050
3834 RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) { 4051 RegExpTree* RegExpParser::ParseCharacterClass(bool* ok) {
(...skipping 12 matching lines...) Expand all
3847 while (has_more() && current() != ']') { 4064 while (has_more() && current() != ']') {
3848 if (current() == '-') { 4065 if (current() == '-') {
3849 Advance(); 4066 Advance();
3850 ranges->Add(CharacterRange::Singleton('-')); 4067 ranges->Add(CharacterRange::Singleton('-'));
3851 } else { 4068 } else {
3852 bool is_char_class = false; 4069 bool is_char_class = false;
3853 CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK); 4070 CharacterRange first = ParseClassAtom(&is_char_class, ranges, CHECK_OK);
3854 if (!is_char_class) { 4071 if (!is_char_class) {
3855 if (current() == '-') { 4072 if (current() == '-') {
3856 Advance(); 4073 Advance();
3857 CharacterRange next = ParseClassAtom(&is_char_class, ranges, CHECK_OK) ; 4074 CharacterRange next =
4075 ParseClassAtom(&is_char_class, ranges, CHECK_OK);
3858 if (is_char_class) { 4076 if (is_char_class) {
3859 return ReportError(CStrVector(kIllegal), CHECK_OK); 4077 return ReportError(CStrVector(kIllegal), CHECK_OK);
3860 } 4078 }
3861 if (first.from() > next.to()) { 4079 if (first.from() > next.to()) {
3862 return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK); 4080 return ReportError(CStrVector(kRangeOutOfOrder), CHECK_OK);
3863 } 4081 }
3864 ranges->Add(CharacterRange::Range(first.from(), next.to())); 4082 ranges->Add(CharacterRange::Range(first.from(), next.to()));
3865 } else { 4083 } else {
3866 ranges->Add(first); 4084 ranges->Add(first);
3867 } 4085 }
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
3922 if (!parser.PreParseProgram(stream)) return NULL; 4140 if (!parser.PreParseProgram(stream)) return NULL;
3923 // The list owns the backing store so we need to clone the vector. 4141 // The list owns the backing store so we need to clone the vector.
3924 // That way, the result will be exactly the right size rather than 4142 // That way, the result will be exactly the right size rather than
3925 // the expected 50% too large. 4143 // the expected 50% too large.
3926 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone(); 4144 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone();
3927 return new ScriptDataImpl(store); 4145 return new ScriptDataImpl(store);
3928 } 4146 }
3929 4147
3930 4148
3931 RegExpTree* ParseRegExp(unibrow::CharacterStream* stream, 4149 RegExpTree* ParseRegExp(unibrow::CharacterStream* stream,
3932 Handle<String>* error) { 4150 Handle<String>* error,
4151 bool* has_character_escapes) {
3933 ASSERT(error->is_null()); 4152 ASSERT(error->is_null());
3934 RegExpParser parser(stream, error, false); // Get multiline flag somehow 4153 RegExpParser parser(stream, error, false); // Get multiline flag somehow
3935 bool ok = true; 4154 bool ok = true;
3936 RegExpTree* result = parser.ParsePattern(&ok); 4155 RegExpTree* result = parser.ParsePattern(&ok);
3937 if (!ok) { 4156 if (!ok) {
3938 ASSERT(result == NULL); 4157 ASSERT(result == NULL);
3939 ASSERT(!error->is_null()); 4158 ASSERT(!error->is_null());
3940 } else { 4159 } else {
3941 ASSERT(result != NULL); 4160 ASSERT(result != NULL);
3942 ASSERT(error->is_null()); 4161 ASSERT(error->is_null());
3943 } 4162 }
4163 if (ok && has_character_escapes != NULL) {
4164 *has_character_escapes = parser.HasCharacterEscapes();
4165 }
3944 return result; 4166 return result;
3945 } 4167 }
3946 4168
3947 4169
3948 FunctionLiteral* MakeAST(bool compile_in_global_context, 4170 FunctionLiteral* MakeAST(bool compile_in_global_context,
3949 Handle<Script> script, 4171 Handle<Script> script,
3950 v8::Extension* extension, 4172 v8::Extension* extension,
3951 ScriptDataImpl* pre_data) { 4173 ScriptDataImpl* pre_data) {
3952 bool allow_natives_syntax = 4174 bool allow_natives_syntax =
3953 always_allow_natives_syntax || 4175 always_allow_natives_syntax ||
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
3990 start_position, 4212 start_position,
3991 is_expression); 4213 is_expression);
3992 return result; 4214 return result;
3993 } 4215 }
3994 4216
3995 4217
3996 #undef NEW 4218 #undef NEW
3997 4219
3998 4220
3999 } } // namespace v8::internal 4221 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « regexp2000/src/parser.h ('k') | regexp2000/test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698