Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(273)

Side by Side Diff: regexp2000/src/parser.cc

Issue 8871: Experimental RegExp: changed handling of non-standard escape sequences. (Closed)
Patch Set: RegExp escape handling, with review comments. Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | regexp2000/test/cctest/test-regexp.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 212 matching lines...) Expand 10 before | Expand all | Expand 10 after
223 223
224 friend class Target; 224 friend class Target;
225 friend class TargetScope; 225 friend class TargetScope;
226 friend class LexicalScope; 226 friend class LexicalScope;
227 friend class TemporaryScope; 227 friend class TemporaryScope;
228 }; 228 };
229 229
230 230
231 class RegExpParser { 231 class RegExpParser {
232 public: 232 public:
233 RegExpParser(unibrow::CharacterStream* in, Handle<String>* error); 233 RegExpParser(unibrow::CharacterStream* in,
234 Handle<String>* error,
235 bool multiline_mode);
234 RegExpTree* ParsePattern(bool* ok); 236 RegExpTree* ParsePattern(bool* ok);
235 RegExpTree* ParseDisjunction(bool* ok); 237 RegExpTree* ParseDisjunction(bool* ok);
236 RegExpTree* ParseAlternative(bool* ok); 238 RegExpTree* ParseAlternative(bool* ok);
237 RegExpTree* ParseTerm(bool* ok); 239 RegExpTree* ParseTerm(bool* ok);
238 RegExpTree* ParseAtom(bool* ok); 240 RegExpTree* ParseAtom(bool* ok);
239 RegExpTree* ParseGroup(bool* ok); 241 RegExpTree* ParseGroup(bool* ok);
240 RegExpTree* ParseCharacterClass(bool* ok); 242 RegExpTree* ParseCharacterClass(bool* ok);
241 243
242 // Parses a {...,...} quantifier and stores the range in the given 244 // Parses a {...,...} quantifier and stores the range in the given
243 // out parameters. 245 // out parameters.
244 void* ParseIntervalQuantifier(int* min_out, int* max_out, bool* ok); 246 void* ParseIntervalQuantifier(int* min_out, int* max_out, bool* ok);
245 247
246 // Parses and returns a single escaped character. The character 248 // Parses and returns a single escaped character. The character
247 // must not be 'b' or 'B' since they are usually handled specially. 249 // must not be 'b' or 'B' since they are usually handled specially.
248 uc32 ParseCharacterEscape(bool* ok); 250 uc32 ParseCharacterEscape(bool* ok);
249 251
250 uc32 ParseHexEscape(int length); 252 // Checks whether the following is a length-digit hexadecimal number,
253 // and sets the value if it is.
254 bool ParseHexEscape(int length, uc32* value);
251 255
252 uc32 ParseControlEscape(bool* ok); 256 uc32 ParseControlEscape(bool* ok);
253 uc32 ParseOctalLiteral(bool* ok); 257 uc32 ParseOctalLiteral(bool* ok);
254 258
255 // Tries to parse the input as a backreference. If successful it 259 // Tries to parse the input as a backreference. If successful it
256 // stores the result in the output parameter and returns true. If 260 // stores the result in the output parameter and returns true. If
257 // it fails it will push back the characters read so the same characters 261 // it fails it will push back the characters read so the same characters
258 // can be reparsed. 262 // can be reparsed.
259 bool ParseBackreferenceIndex(int* index_out); 263 bool ParseBackreferenceIndex(int* index_out);
260 264
261 CharacterRange ParseClassAtom(bool* ok); 265 CharacterRange ParseClassAtom(bool* ok);
262 RegExpTree* ReportError(Vector<const char> message, bool* ok); 266 RegExpTree* ReportError(Vector<const char> message, bool* ok);
263 void Advance(); 267 void Advance();
264 void Advance(int dist); 268 void Advance(int dist);
269 // Pushes a read character (or potentially some other character) back
270 // on the input stream. After pushing it back, it becomes the character
271 // returned by current(). There is a limited amount of push-back buffer.
272 // A function using PushBack should check that it doesn't push back more
273 // than kMaxPushback characters, and it should not push back more characters
274 // than it has read, or that it knows had been read prior to calling it.
275 void PushBack(uc32 character);
276 bool CanPushBack();
265 static const uc32 kEndMarker = unibrow::Utf8::kBadChar; 277 static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
266 private: 278 private:
267 uc32 current() { return current_; } 279 uc32 current() { return current_; }
268 uc32 next() { return next_; } 280 uc32 next() { return next_; }
269 bool has_more() { return has_more_; } 281 bool has_more() { return has_more_; }
270 bool has_next() { return has_next_; } 282 bool has_next() { return has_next_; }
271 unibrow::CharacterStream* in() { return in_; } 283 unibrow::CharacterStream* in() { return in_; }
272 uc32 current_; 284 uc32 current_;
273 uc32 next_; 285 uc32 next_;
274 bool has_more_; 286 bool has_more_;
275 bool has_next_; 287 bool has_next_;
288 bool multiline_mode_;
276 int captures_seen_; 289 int captures_seen_;
277 unibrow::CharacterStream* in_; 290 unibrow::CharacterStream* in_;
278 Handle<String>* error_; 291 Handle<String>* error_;
279 static const int kMaxPushback = 5; 292 static const int kMaxPushback = 5;
280 int pushback_count_; 293 int pushback_count_;
281 uc32 pushback_buffer_[kMaxPushback]; 294 uc32 pushback_buffer_[kMaxPushback];
282 }; 295 };
283 296
284 297
285 // A temporary scope stores information during parsing, just like 298 // A temporary scope stores information during parsing, just like
(...skipping 2927 matching lines...) Expand 10 before | Expand all | Expand 10 after
3213 args->Add(new Literal(array)); 3226 args->Add(new Literal(array));
3214 return new Throw(new CallRuntime(constructor, NULL, args), 3227 return new Throw(new CallRuntime(constructor, NULL, args),
3215 scanner().location().beg_pos); 3228 scanner().location().beg_pos);
3216 } 3229 }
3217 3230
3218 3231
3219 // ---------------------------------------------------------------------------- 3232 // ----------------------------------------------------------------------------
3220 // Regular expressions 3233 // Regular expressions
3221 3234
3222 3235
3223 RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error) 3236 RegExpParser::RegExpParser(unibrow::CharacterStream* in,
3237 Handle<String>* error,
3238 bool multiline_mode)
3224 : current_(kEndMarker), 3239 : current_(kEndMarker),
3225 next_(kEndMarker), 3240 next_(kEndMarker),
3226 has_more_(true), 3241 has_more_(true),
3227 has_next_(true), 3242 has_next_(true),
3243 multiline_mode_(multiline_mode),
3228 captures_seen_(0), 3244 captures_seen_(0),
3229 in_(in), 3245 in_(in),
3230 error_(error), 3246 error_(error),
3231 pushback_count_(0) { 3247 pushback_count_(0) {
3232 Advance(2); 3248 Advance(2);
3233 } 3249 }
3234 3250
3235 3251
3236 void RegExpParser::Advance() { 3252 void RegExpParser::Advance() {
3237 current_ = next_; 3253 current_ = next_;
(...skipping 10 matching lines...) Expand all
3248 } 3264 }
3249 } 3265 }
3250 3266
3251 3267
3252 void RegExpParser::Advance(int dist) { 3268 void RegExpParser::Advance(int dist) {
3253 for (int i = 0; i < dist; i++) 3269 for (int i = 0; i < dist; i++)
3254 Advance(); 3270 Advance();
3255 } 3271 }
3256 3272
3257 3273
3274 void RegExpParser::PushBack(uc32 character) {
3275 if (has_next_) {
3276 ASSERT(pushback_count_ < kMaxPushback);
3277 pushback_buffer_[pushback_count_] = next_;
3278 pushback_count_++;
3279 }
3280 if (has_more_) {
3281 next_ = current_;
3282 has_next_ = true;
3283 }
3284 current_ = character;
3285 has_more_ = true;
3286 }
3287
3288
3289 bool RegExpParser::CanPushBack() {
3290 return (pushback_count_ < kMaxPushback);
3291 }
3292
3293
3258 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) { 3294 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {
3259 *ok = false; 3295 *ok = false;
3260 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); 3296 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
3261 return NULL; 3297 return NULL;
3262 } 3298 }
3263 3299
3264 3300
3265 // Pattern :: 3301 // Pattern ::
3266 // Disjunction 3302 // Disjunction
3267 RegExpTree* RegExpParser::ParsePattern(bool* ok) { 3303 RegExpTree* RegExpParser::ParsePattern(bool* ok) {
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after
3351 default: 3387 default:
3352 return false; 3388 return false;
3353 } 3389 }
3354 } 3390 }
3355 3391
3356 3392
3357 bool RegExpParser::ParseBackreferenceIndex(int* index_out) { 3393 bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
3358 ASSERT_EQ('\\', current()); 3394 ASSERT_EQ('\\', current());
3359 ASSERT('1' <= next() && next() <= '9'); 3395 ASSERT('1' <= next() && next() <= '9');
3360 ASSERT_EQ(0, pushback_count_); 3396 ASSERT_EQ(0, pushback_count_);
3397 // Try to parse a decimal literal that is no greater than the number
3398 // of previously encountered left capturing parentheses.
3399 // This is not according to the ECMAScript specification. According to
3400 // that, one must accept values up to the total number of left capturing
3401 // parentheses in the entire input, even if they are meaningless.
3361 if (captures_seen_ == 0) 3402 if (captures_seen_ == 0)
3362 return false; 3403 return false;
3363 int value = next() - '0'; 3404 int value = next() - '0';
3364 if (value > captures_seen_) 3405 if (value > captures_seen_)
3365 return false; 3406 return false;
3366 static const int kMaxChars = kMaxPushback - 2; 3407 static const int kMaxChars = kMaxPushback - 2;
3367 EmbeddedVector<uc32, kMaxChars> chars_seen; 3408 EmbeddedVector<uc32, kMaxChars> chars_seen;
3368 chars_seen[0] = next(); 3409 chars_seen[0] = next();
3369 int char_count = 1; 3410 int char_count = 1;
3370 Advance(2); 3411 Advance(2);
3371 while (true) { 3412 while (true) {
3372 uc32 c = current(); 3413 uc32 c = current();
3373 if (IsDecimalDigit(c)) { 3414 if (IsDecimalDigit(c)) {
3374 int next_value = 10 * value + (c - '0'); 3415 int next_value = 10 * value + (c - '0');
3375 // To avoid reading past the end of the stack-allocated pushback 3416 // To avoid reading past the end of the stack-allocated pushback
3376 // buffers we only read kMaxChars before giving up. 3417 // buffers we only read kMaxChars before giving up.
3377 if (next_value > captures_seen_ || char_count > kMaxChars) { 3418 if (next_value > captures_seen_ || char_count > kMaxChars) {
3378 // If we give up we have to push the characters we read back 3419 // If we give up we have to push the characters we read back
3379 // onto the pushback buffer in the reverse order. 3420 // onto the pushback buffer in the reverse order.
3380 pushback_buffer_[0] = current(); 3421 for (int i = 0; i < char_count; i++) {
3381 for (int i = 0; i < char_count; i++) 3422 PushBack(chars_seen[char_count - i - 1]);
3382 pushback_buffer_[i + 1] = chars_seen[char_count - i - 1]; 3423 }
3383 pushback_buffer_[char_count + 1] = '\\'; 3424 PushBack('\\');
3384 pushback_count_ = char_count + 2;
3385 // Then, once we've filled up the buffer, we read the two
3386 // first characters into the lookahead. This is a roundabout
3387 // way of doing it but makes the code simpler.
3388 Advance(2);
3389 return false; 3425 return false;
3390 } else {
3391 value = next_value;
3392 chars_seen[char_count++] = current();
3393 Advance();
3394 } 3426 }
3427 value = next_value;
3428 chars_seen[char_count++] = current();
3429 Advance();
3395 } else { 3430 } else {
3396 *index_out = value; 3431 *index_out = value;
3397 return true; 3432 return true;
3398 } 3433 }
3399 } 3434 }
3400 } 3435 }
3401 3436
3402 3437
3403 // Term :: 3438 // Term ::
3404 // Assertion 3439 // Assertion
3405 // Atom 3440 // Atom
3406 // Atom Quantifier 3441 // Atom Quantifier
3407 RegExpTree* RegExpParser::ParseTerm(bool* ok) { 3442 RegExpTree* RegExpParser::ParseTerm(bool* ok) {
3408 RegExpTree* atom = NULL; 3443 RegExpTree* atom = NULL;
3409 switch (current()) { 3444 switch (current()) {
3410 // Assertion :: 3445 // Assertion ::
3411 // ^ 3446 // ^
3412 // $ 3447 // $
3413 // \ b 3448 // \ b
3414 // \ B 3449 // \ B
3415 case '^': 3450 case '^':
3416 Advance(); 3451 Advance();
3417 // Make the type of assertion dependent on multi/nonmultiline. 3452 return new RegExpAssertion(
3418 return new RegExpAssertion(RegExpAssertion::START_OF_INPUT); 3453 multiline_mode_ ? RegExpAssertion::START_OF_LINE
3454 : RegExpAssertion::START_OF_INPUT);
3419 case '$': 3455 case '$':
3420 Advance(); 3456 Advance();
3421 // Make the type of assertion dependent on multi/nonmultiline. 3457 return new RegExpAssertion(
3422 return new RegExpAssertion(RegExpAssertion::END_OF_INPUT); 3458 multiline_mode_ ? RegExpAssertion::END_OF_LINE
3459 : RegExpAssertion::END_OF_INPUT);
3423 case '.': 3460 case '.':
3424 Advance(); 3461 Advance();
3425 atom = new RegExpCharacterClass(CharacterRange::CharacterClass('.')); 3462 atom = new RegExpCharacterClass(CharacterRange::CharacterClass('.'));
3426 break; 3463 break;
3427 case '(': 3464 case '(':
3428 atom = ParseGroup(CHECK_OK); 3465 atom = ParseGroup(CHECK_OK);
3429 break; 3466 break;
3430 case '[': 3467 case '[':
3431 atom = ParseCharacterClass(CHECK_OK); 3468 atom = ParseCharacterClass(CHECK_OK);
3432 break; 3469 break;
(...skipping 20 matching lines...) Expand all
3453 goto has_read_atom; 3490 goto has_read_atom;
3454 } 3491 }
3455 case '1': case '2': case '3': case '4': case '5': case '6': 3492 case '1': case '2': case '3': case '4': case '5': case '6':
3456 case '7': case '8': case '9': { 3493 case '7': case '8': case '9': {
3457 int index = 0; 3494 int index = 0;
3458 if (ParseBackreferenceIndex(&index)) { 3495 if (ParseBackreferenceIndex(&index)) {
3459 atom = new RegExpBackreference(index); 3496 atom = new RegExpBackreference(index);
3460 goto has_read_atom; 3497 goto has_read_atom;
3461 } else { 3498 } else {
3462 // If this is not a backreference we go to the atom parser 3499 // If this is not a backreference we go to the atom parser
3463 // which will read it as an octal escape. 3500 // which will read it as an octal escape or identity escape.
3464 goto parse_atom; 3501 goto parse_atom;
3465 } 3502 }
3466 } 3503 }
3467 default: 3504 default:
3468 goto parse_atom; 3505 goto parse_atom;
3469 } 3506 }
3470 } 3507 }
3471 // All other escapes fall through to the default case since 3508 // All other escapes fall through to the default case since
3472 // they correspond to single characters that can be 3509 // they correspond to single characters that can be
3473 // represented within atoms. 3510 // represented within atoms.
(...skipping 110 matching lines...) Expand 10 before | Expand all | Expand 10 after
3584 uc32 escape = ParseCharacterEscape(CHECK_OK); 3621 uc32 escape = ParseCharacterEscape(CHECK_OK);
3585 buf->Add(escape); 3622 buf->Add(escape);
3586 } 3623 }
3587 } else { 3624 } else {
3588 break; 3625 break;
3589 } 3626 }
3590 } 3627 }
3591 return new RegExpAtom(buf->ToConstVector()); 3628 return new RegExpAtom(buf->ToConstVector());
3592 } 3629 }
3593 3630
3631 // Upper and lower case letters differ by one bit.
3632 STATIC_CHECK('a'^'A' == 0x20);
3594 3633
3595 uc32 RegExpParser::ParseControlEscape(bool* ok) { 3634 uc32 RegExpParser::ParseControlEscape(bool* ok) {
3596 ASSERT(current() == 'c'); 3635 ASSERT(current() == 'c');
3597 Advance(); 3636 Advance();
3598 if (!has_more()) { 3637 if (!has_more()) {
3599 ReportError(CStrVector("\\c at end of pattern"), ok); 3638 ReportError(CStrVector("\\c at end of pattern"), ok);
3600 return '\0'; 3639 return '\0';
3601 } else {
3602 uc32 letter = current();
3603 if (!('a' <= letter && letter <= 'z') &&
3604 !('A' <= letter && letter <= 'Z')) {
3605 ReportError(CStrVector("Illegal control letter"), ok);
3606 return '\0';
3607 }
3608 Advance();
3609 return letter & ((1 << 5) - 1);
3610 } 3640 }
3641 uc32 letter = current() & ~(0x20); // Collapse upper and lower case letters.
3642 if (letter < 'A' || 'Z' < letter) {
3643 // Non-spec error-correction: "\c" followed by non-control letter is
3644 // interpreted as an IdentityEscape.
3645 return 'c';
3646 }
3647 Advance();
3648 return letter & 0x1f; // Remainder modulo 32, per specification.
3611 } 3649 }
3612 3650
3613 3651
3614 uc32 RegExpParser::ParseOctalLiteral(bool* ok) { 3652 uc32 RegExpParser::ParseOctalLiteral(bool* ok) {
3615 ASSERT('0' <= current() && current() <= '7'); 3653 ASSERT('0' <= current() && current() <= '7');
3616 // Here we're really supposed to break out after the first digit 3654 // For compatibility with some other browsers (not all), we parse
3617 // if it is '0' but the other implementations don't do that so 3655 // up to three octal digits with a value below 256.
3618 // neither do we. Is this deviation from the spec error prone? 3656 uc32 value = current() - '0';
3619 // Yes, it's probably as error prone as it's possible to get. Isn't 3657 Advance();
3620 // JavaScript wonderful? 3658 if ('0' <= current() && current() <= '7') {
3621 uc32 value = 0; 3659 value = value * 8 + current() - '0';
3622 while ('0' <= current() && current() <= '7') { 3660 Advance();
3623 int next = (8 * value) + (current() - '0'); 3661 if (value < 32 && '0' <= current() && current() <= '7') {
3624 if (next >= 256) { 3662 value = value * 8 + current() - '0';
3625 break;
3626 } else {
3627 value = next;
3628 Advance(); 3663 Advance();
3629 } 3664 }
3630 } 3665 }
3631 return value; 3666 return value;
3632 } 3667 }
3633 3668
3634 3669 bool RegExpParser::ParseHexEscape(int length, uc32 *value) {
3635 uc32 RegExpParser::ParseHexEscape(int length) { 3670 static const int kMaxChars = kMaxPushback;
3636 uc32 value = 0; 3671 EmbeddedVector<uc32, kMaxChars> chars_seen;
3637 for (int i = 0; i < length; i++) { 3672 ASSERT(length <= kMaxChars);
3638 int d = HexValue(current()); 3673 uc32 val = 0;
3639 if (d < 0) 3674 bool done = false;
3640 return value; 3675 for (int i = 0; !done; i++) {
3641 value = value * 16 + d; 3676 uc32 c = current();
3677 int d = HexValue(c);
3678 if (d < 0) {
3679 while (i > 0) {
3680 i--;
3681 PushBack(chars_seen[i]);
3682 }
3683 return false;
3684 }
3685 val = val * 16 + d;
3642 Advance(); 3686 Advance();
3687 if (i < length - 1) {
3688 chars_seen[i] = c;
3689 } else {
3690 done = true;
3691 }
3643 } 3692 }
3644 3693 *value = val;
3645 return value; 3694 return true;
3646 } 3695 }
3647 3696
3648 3697
3649 uc32 RegExpParser::ParseCharacterEscape(bool* ok) { 3698 uc32 RegExpParser::ParseCharacterEscape(bool* ok) {
3650 ASSERT(current() == '\\'); 3699 ASSERT(current() == '\\');
3651 ASSERT(has_next() && !IsSpecialEscape(next())); 3700 ASSERT(has_next() && !IsSpecialEscape(next()));
3652 Advance(); 3701 Advance();
3653 ASSERT(current() != 'b' && current() != 'B'); 3702 ASSERT(current() != 'b' && current() != 'B');
3654 switch (current()) { 3703 switch (current()) {
3655 // ControlEscape :: one of 3704 // ControlEscape :: one of
3656 // f n r t v 3705 // f n r t v
3657 case 'f': 3706 case 'f':
3658 Advance(); 3707 Advance();
3659 return '\f'; 3708 return '\f';
3660 case 'n': 3709 case 'n':
3661 Advance(); 3710 Advance();
3662 return '\n'; 3711 return '\n';
3663 case 'r': 3712 case 'r':
3664 Advance(); 3713 Advance();
3665 return '\r'; 3714 return '\r';
3666 case 't': 3715 case 't':
3667 Advance(); 3716 Advance();
3668 return '\t'; 3717 return '\t';
3669 case 'v': 3718 case 'v':
3670 Advance(); 3719 Advance();
3671 return '\v'; 3720 return '\v';
3672 case 'c': 3721 case 'c':
3722 // Spec mandates that next character is ASCII letter.
3723 // If not, we error-correct by interpreting "\c" as "c".
3673 return ParseControlEscape(ok); 3724 return ParseControlEscape(ok);
3674 case '0': case '1': case '2': case '3': case '4': case '5': 3725 case '0': case '1': case '2': case '3': case '4': case '5':
3675 case '6': case '7': 3726 case '6': case '7':
3676 // We're really supposed to read this as a decimal integer 3727 // For compatibility, we interpret a decimal escape that isn't
3677 // literal which is base 10 but for whatever reason the other 3728 // a back reference (and therefore either \0 or not valid according
3678 // implementations read base 8. It's hard to believe that the 3729 // to the specification) as a 1..3 digit octal character code.
3679 // spec was written by some of the same people that wrote the
3680 // other implementations...
3681 return ParseOctalLiteral(ok); 3730 return ParseOctalLiteral(ok);
3682 case 'x': 3731 case 'x': {
3683 Advance(); 3732 Advance();
3684 return ParseHexEscape(2); 3733 uc32 value;
3685 case 'A': case 'Z': { 3734 if (ParseHexEscape(2, &value)) {
3735 return value;
3736 }
3737 // If \x is not followed by a two-digit hexadecimal, treat it
3738 // as an identity escape.
3739 return 'x';
3740 }
3741 case 'u': {
3742 Advance();
3743 uc32 value;
3744 if (ParseHexEscape(4, &value)) {
3745 return value;
3746 }
3747 // If \u is not followed by a four-digit hexadecimal, treat it
3748 // as an identity escape.
3749 return 'u';
3750 }
3751 default: {
3752 // Extended identity escape. We accept any character that hasn't
3753 // been matched by a more specific case, not just the subset required
3754 // by the ECMAScript specification.
3686 uc32 result = current(); 3755 uc32 result = current();
3687 Advance(); 3756 Advance();
3688 return result; 3757 return result;
3689 }
3690 default: {
3691 ASSERT(!Scanner::kIsIdentifierPart.get(current()));
3692 uc32 result = current();
3693 Advance();
3694 return result;
3695 } 3758 }
3696 } 3759 }
3697 return 0; 3760 return 0;
3698 } 3761 }
3699 3762
3700 3763
3701 RegExpTree* RegExpParser::ParseGroup(bool* ok) { 3764 RegExpTree* RegExpParser::ParseGroup(bool* ok) {
3702 ASSERT_EQ(current(), '('); 3765 ASSERT_EQ(current(), '(');
3703 char type = '('; 3766 char type = '(';
3704 Advance(); 3767 Advance();
(...skipping 135 matching lines...) Expand 10 before | Expand all | Expand 10 after
3840 // That way, the result will be exactly the right size rather than 3903 // That way, the result will be exactly the right size rather than
3841 // the expected 50% too large. 3904 // the expected 50% too large.
3842 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone(); 3905 Vector<unsigned> store = parser.recorder()->store()->ToVector().Clone();
3843 return new ScriptDataImpl(store); 3906 return new ScriptDataImpl(store);
3844 } 3907 }
3845 3908
3846 3909
3847 RegExpTree* ParseRegExp(unibrow::CharacterStream* stream, 3910 RegExpTree* ParseRegExp(unibrow::CharacterStream* stream,
3848 Handle<String>* error) { 3911 Handle<String>* error) {
3849 ASSERT(error->is_null()); 3912 ASSERT(error->is_null());
3850 RegExpParser parser(stream, error); 3913 RegExpParser parser(stream, error, false); // Get multiline flag somehow
3851 bool ok = true; 3914 bool ok = true;
3852 RegExpTree* result = parser.ParsePattern(&ok); 3915 RegExpTree* result = parser.ParsePattern(&ok);
3853 if (!ok) { 3916 if (!ok) {
3854 ASSERT(result == NULL); 3917 ASSERT(result == NULL);
3855 ASSERT(!error->is_null()); 3918 ASSERT(!error->is_null());
3856 } else { 3919 } else {
3857 ASSERT(result != NULL); 3920 ASSERT(result != NULL);
3858 ASSERT(error->is_null()); 3921 ASSERT(error->is_null());
3859 } 3922 }
3860 return result; 3923 return result;
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
3906 start_position, 3969 start_position,
3907 is_expression); 3970 is_expression);
3908 return result; 3971 return result;
3909 } 3972 }
3910 3973
3911 3974
3912 #undef NEW 3975 #undef NEW
3913 3976
3914 3977
3915 } } // namespace v8::internal 3978 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « no previous file | regexp2000/test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698