src/parser.cc - Issue 8635: Backreferences

Side by Side Diff: src/parser.cc

Issue 8635: Backreferences (Closed)

Patch Set: Added test case Created 12 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 234 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
245	245

246 // Parses and returns a single escaped character. The character	246 // Parses and returns a single escaped character. The character

247 // must not be 'b' or 'B' since they are usually handle specially.	247 // must not be 'b' or 'B' since they are usually handle specially.

248 uc32 ParseCharacterEscape(bool* ok);	248 uc32 ParseCharacterEscape(bool* ok);

249	249

250 uc32 ParseHexEscape(int length);	250 uc32 ParseHexEscape(int length);

251	251

252 uc32 ParseControlEscape(bool* ok);	252 uc32 ParseControlEscape(bool* ok);

253 uc32 ParseOctalLiteral(bool* ok);	253 uc32 ParseOctalLiteral(bool* ok);

254	254

	255 // Tries to parse the input as a backreference. If successful it

	256 // stores the result in the output parameter and returns true. If

	257 // it fails it will push back the characters read so the same characters

	258 // can be reparsed.

	259 bool ParseBackreferenceIndex(int* index_out);

	260

255 CharacterRange ParseClassAtom(bool* ok);	261 CharacterRange ParseClassAtom(bool* ok);

256 RegExpTree* ReportError(Vector<const char> message, bool* ok);	262 RegExpTree* ReportError(Vector<const char> message, bool* ok);

257 void Advance();	263 void Advance();

258 void Advance(int dist);	264 void Advance(int dist);

259 static const uc32 kEndMarker = unibrow::Utf8::kBadChar;	265 static const uc32 kEndMarker = unibrow::Utf8::kBadChar;

260 private:	266 private:

261 uc32 current() { return current_; }	267 uc32 current() { return current_; }

262 uc32 next() { return next_; }	268 uc32 next() { return next_; }

263 bool has_more() { return has_more_; }	269 bool has_more() { return has_more_; }

264 bool has_next() { return has_next_; }	270 bool has_next() { return has_next_; }

265 unibrow::CharacterStream* in() { return in_; }	271 unibrow::CharacterStream* in() { return in_; }

266 uc32 current_;	272 uc32 current_;

267 uc32 next_;	273 uc32 next_;

268 bool has_more_;	274 bool has_more_;

269 bool has_next_;	275 bool has_next_;

270 int captures_seen_;	276 int captures_seen_;

271 unibrow::CharacterStream* in_;	277 unibrow::CharacterStream* in_;

272 Handle<String>* error_;	278 Handle<String>* error_;

	279 static const int kMaxPushback = 5;

	280 int pushback_count_;

	281 uc32 pushback_buffer_[kMaxPushback];

273 };	282 };

274	283

275	284

276 // A temporary scope stores information during parsing, just like	285 // A temporary scope stores information during parsing, just like

277 // a plain scope. However, temporary scopes are not kept around	286 // a plain scope. However, temporary scopes are not kept around

278 // after parsing or referenced by syntax trees so they can be stack-	287 // after parsing or referenced by syntax trees so they can be stack-

279 // allocated and hence used by the pre-parser.	288 // allocated and hence used by the pre-parser.

280 class TemporaryScope BASE_EMBEDDED {	289 class TemporaryScope BASE_EMBEDDED {

281 public:	290 public:

282 explicit TemporaryScope(Parser* parser);	291 explicit TemporaryScope(Parser* parser);

(...skipping 2918 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3201 }	3210 }

3202 ZoneList<Expression> args = new ZoneList<Expression*>(2);	3211 ZoneList<Expression> args = new ZoneList<Expression*>(2);

3203 args->Add(new Literal(type));	3212 args->Add(new Literal(type));

3204 args->Add(new Literal(array));	3213 args->Add(new Literal(array));

3205 return new Throw(new CallRuntime(constructor, NULL, args),	3214 return new Throw(new CallRuntime(constructor, NULL, args),

3206 scanner().location().beg_pos);	3215 scanner().location().beg_pos);

3207 }	3216 }

3208	3217

3209	3218

3210 // ----------------------------------------------------------------------------	3219 // ----------------------------------------------------------------------------

3211 // Regular expressions.	3220 // Regular expressions

	3221

3212	3222

3213 RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)	3223 RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)

3214 : current_(kEndMarker),	3224 : current_(kEndMarker),

3215 next_(kEndMarker),	3225 next_(kEndMarker),

3216 has_more_(true),	3226 has_more_(true),

3217 has_next_(true),	3227 has_next_(true),

3218 captures_seen_(0),	3228 captures_seen_(0),

3219 in_(in),	3229 in_(in),

3220 error_(error) {	3230 error_(error),

	3231 pushback_count_(0) {

3221 Advance(2);	3232 Advance(2);

3222 }	3233 }

3223	3234

	3235

3224 void RegExpParser::Advance() {	3236 void RegExpParser::Advance() {

3225 current_ = next_;	3237 current_ = next_;

3226 has_more_ = has_next_;	3238 has_more_ = has_next_;

3227 if (in()->has_more()) {	3239 if (pushback_count_ > 0) {

	3240 pushback_count_--;

	3241 next_ = pushback_buffer_[pushback_count_];

	3242 has_next_ = true;

	3243 } else if (in()->has_more()) {

3228 next_ = in()->GetNext();	3244 next_ = in()->GetNext();
	Lasse Reichstein 2008/10/28 10:09:52 the has_next variable is set to true in the push_b the has_next variable is set to true in the push_back case, but not in the in()->has_more() case. Is it known to be true in the latter case? If so, why isn't it known to be true in the former? Christian Plesner Hansen 2008/10/28 11:17:45 We know it will work because we advance twice afte We know it will work because we advance twice after setting up the pushback buffer.
3229 } else {	3245 } else {

3230 next_ = kEndMarker;	3246 next_ = kEndMarker;

3231 has_next_ = false;	3247 has_next_ = false;

3232 }	3248 }

3233 }	3249 }

3234	3250

	3251

3235 void RegExpParser::Advance(int dist) {	3252 void RegExpParser::Advance(int dist) {

3236 for (int i = 0; i < dist; i++)	3253 for (int i = 0; i < dist; i++)

3237 Advance();	3254 Advance();

3238 }	3255 }

3239	3256

	3257

3240 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {	3258 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {

3241 *ok = false;	3259 *ok = false;

3242 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);	3260 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);

3243 return NULL;	3261 return NULL;

3244 }	3262 }

3245	3263

	3264

3246 // Pattern ::	3265 // Pattern ::

3247 // Disjunction	3266 // Disjunction

3248 RegExpTree* RegExpParser::ParsePattern(bool* ok) {	3267 RegExpTree* RegExpParser::ParsePattern(bool* ok) {

3249 return ParseDisjunction(ok);	3268 return ParseDisjunction(ok);

3250 }	3269 }

3251	3270

	3271

3252 // Disjunction ::	3272 // Disjunction ::

3253 // Alternative	3273 // Alternative

3254 // Alternative \| Disjunction	3274 // Alternative \| Disjunction

3255 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {	3275 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {

3256 RegExpTree* first = ParseAlternative(CHECK_OK);	3276 RegExpTree* first = ParseAlternative(CHECK_OK);

3257 if (current() == '\|') {	3277 if (current() == '\|') {

3258 ZoneList<RegExpTree> nodes = new ZoneList<RegExpTree*>(2);	3278 ZoneList<RegExpTree> nodes = new ZoneList<RegExpTree*>(2);

3259 nodes->Add(first);	3279 nodes->Add(first);

3260 while (current() == '\|') {	3280 while (current() == '\|') {

3261 Advance();	3281 Advance();

3262 RegExpTree* next = ParseAlternative(CHECK_OK);	3282 RegExpTree* next = ParseAlternative(CHECK_OK);

3263 nodes->Add(next);	3283 nodes->Add(next);

3264 }	3284 }

3265 return new RegExpDisjunction(nodes);	3285 return new RegExpDisjunction(nodes);

3266 } else {	3286 } else {

3267 return first;	3287 return first;

3268 }	3288 }

3269 }	3289 }

3270	3290

	3291

3271 static bool IsAlternativeTerminator(uc32 c) {	3292 static bool IsAlternativeTerminator(uc32 c) {

3272 return c == '\|' \|\| c == ')' \|\| c == RegExpParser::kEndMarker;	3293 return c == '\|' \|\| c == ')' \|\| c == RegExpParser::kEndMarker;

3273 }	3294 }

3274	3295

	3296

3275 // Alternative ::	3297 // Alternative ::

3276 // [empty]	3298 // [empty]

3277 // Alternative Term	3299 // Alternative Term

3278 RegExpTree* RegExpParser::ParseAlternative(bool* ok) {	3300 RegExpTree* RegExpParser::ParseAlternative(bool* ok) {

3279 if (!IsAlternativeTerminator(current())) {	3301 if (!IsAlternativeTerminator(current())) {

3280 RegExpTree* first = ParseTerm(CHECK_OK);	3302 RegExpTree* first = ParseTerm(CHECK_OK);

3281 if (!IsAlternativeTerminator(current())) {	3303 if (!IsAlternativeTerminator(current())) {

3282 ZoneList<RegExpTree> nodes = new ZoneList<RegExpTree*>(2);	3304 ZoneList<RegExpTree> nodes = new ZoneList<RegExpTree*>(2);

3283 nodes->Add(first);	3305 nodes->Add(first);

3284 while (!IsAlternativeTerminator(current())) {	3306 while (!IsAlternativeTerminator(current())) {

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3325 switch (c) {	3347 switch (c) {

3326 case 'b': case 'B': case 'd': case 'D': case 's': case 'S':	3348 case 'b': case 'B': case 'd': case 'D': case 's': case 'S':

3327 case 'w': case 'W':	3349 case 'w': case 'W':

3328 return true;	3350 return true;

3329 default:	3351 default:

3330 return false;	3352 return false;

3331 }	3353 }

3332 }	3354 }

3333	3355

3334	3356

	3357 bool RegExpParser::ParseBackreferenceIndex(int* index_out) {

	3358 ASSERT_EQ('\\', current());

	3359 ASSERT('1' <= next() && next() <= '9');

	3360 ASSERT_EQ(0, pushback_count_);

	3361 if (captures_seen_ == 0)

	3362 return false;

	3363 int value = next() - '0';

	3364 if (value > captures_seen_)

	3365 return false;

	3366 static const int kMaxChars = kMaxPushback - 2;

	3367 EmbeddedVector<uc32, kMaxChars> chars_seen;
	Lasse Reichstein 2008/10/28 10:09:52 EmbeddedVector is used instead of array to have ra EmbeddedVector is used instead of array to have range checking in DEBUG mode? Christian Plesner Hansen 2008/10/28 11:17:45 Bingo Bingo
	3368 chars_seen[0] = next();

	3369 int char_count = 1;

	3370 Advance(2);

	3371 while (true) {

	3372 uc32 c = current();

	3373 if (IsDecimalDigit(c)) {

	3374 int next_value = 10 * value + (c - '0');

	3375 // To avoid reading past the end of thw stack-allocated pushback
	Lasse Reichstein 2008/10/28 10:09:52 "thw" -> "the" "thw" -> "the"
	3376 // buffers we only read kMaxChars before giving up.

	3377 if (next_value > captures_seen_ \|\| char_count > kMaxChars) {

	3378 // If we give up we have to push the characters we read back

	3379 // onto the pushback buffer in the reverse order.

	3380 pushback_buffer_[0] = current();

	3381 for (int i = 0; i < char_count; i++)

	3382 pushback_buffer_[i + 1] = chars_seen[char_count - i - 1];

	3383 pushback_buffer_[char_count + 1] = '\\';

	3384 pushback_count_ = char_count + 2;

	3385 // Then, once we've filled up the buffer, we read the two

	3386 // first characters into the lookahead. This is a roundabout

	3387 // way of doing it but makes the code simpler.

	3388 Advance(2);

	3389 return false;

	3390 } else {

	3391 value = next_value;

	3392 chars_seen[char_count++] = current();

	3393 Advance();

	3394 }

	3395 } else {

	3396 *index_out = value;

	3397 return true;

	3398 }

	3399 }

	3400 }

	3401

	3402

3335 // Term ::	3403 // Term ::

3336 // Assertion	3404 // Assertion

3337 // Atom	3405 // Atom

3338 // Atom Quantifier	3406 // Atom Quantifier

3339 RegExpTree* RegExpParser::ParseTerm(bool* ok) {	3407 RegExpTree* RegExpParser::ParseTerm(bool* ok) {

3340 RegExpTree* atom = NULL;	3408 RegExpTree* atom = NULL;

3341 switch (current()) {	3409 switch (current()) {

3342 // Assertion ::	3410 // Assertion ::

3343 // ^	3411 // ^

3344 // $	3412 // $

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3377 // CharacterClassEscape	3445 // CharacterClassEscape

3378 //	3446 //

3379 // CharacterClassEscape :: one of	3447 // CharacterClassEscape :: one of

3380 // d D s S w W	3448 // d D s S w W

3381 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {	3449 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

3382 uc32 c = next();	3450 uc32 c = next();

3383 Advance(2);	3451 Advance(2);

3384 atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c));	3452 atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c));

3385 goto has_read_atom;	3453 goto has_read_atom;

3386 }	3454 }

3387 // Todo backreferences	3455 case '1': case '2': case '3': case '4': case '5': case '6':

	3456 case '7': case '8': case '9': {

	3457 int index = 0;

	3458 if (ParseBackreferenceIndex(&index)) {

	3459 atom = new RegExpBackreference(index);

	3460 goto has_read_atom;

	3461 } else {

	3462 // If this is not a backreference we go to the atom parser

	3463 // which will read it as an octal escape.

	3464 goto parse_atom;

	3465 }

	3466 }

3388 default:	3467 default:

3389 break;	3468 goto parse_atom;

3390 }	3469 }

3391 }	3470 }

3392 // All other escapes fall through to the default case since	3471 // All other escapes fall through to the default case since

3393 // they correspond to single characters that can be	3472 // they correspond to single characters that can be

3394 // represented within atoms.	3473 // represented within atoms.

3395 default: {	3474 default: {

	3475 parse_atom:

3396 atom = ParseAtom(CHECK_OK);	3476 atom = ParseAtom(CHECK_OK);

3397 break;	3477 break;

3398 }	3478 }

3399 }	3479 }

3400 has_read_atom:	3480 has_read_atom:

3401 int min;	3481 int min;

3402 int max;	3482 int max;

3403 switch (current()) {	3483 switch (current()) {

3404 // QuantifierPrefix ::	3484 // QuantifierPrefix ::

3405 // *	3485 // *

3406 // +	3486 // +

3407 // ?	3487 // ?
	Lasse Reichstein 2008/10/28 10:09:52 and " // {" ? and " // {" ?
3408 case '*':	3488 case '*':

3409 min = 0;	3489 min = 0;

3410 max = RegExpQuantifier::kInfinity;	3490 max = RegExpQuantifier::kInfinity;

3411 Advance();	3491 Advance();

3412 break;	3492 break;

3413 case '+':	3493 case '+':

3414 min = 1;	3494 min = 1;

3415 max = RegExpQuantifier::kInfinity;	3495 max = RegExpQuantifier::kInfinity;

3416 Advance();	3496 Advance();

3417 break;	3497 break;

(...skipping 407 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3825 start_position,	3905 start_position,

3826 is_expression);	3906 is_expression);

3827 return result;	3907 return result;

3828 }	3908 }

3829	3909

3830	3910

3831 #undef NEW	3911 #undef NEW

3832	3912

3833	3913

3834 } } // namespace v8::internal	3914 } } // namespace v8::internal

OLD	NEW

« src/ast.h ('K') | « src/jsregexp.cc ('k') | test/cctest/test-regexp.cc » ('j') | test/cctest/test-regexp.cc » ('J')