Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(330)

Side by Side Diff: src/parser.cc

Issue 8635: Backreferences (Closed)
Patch Set: Added test case Created 12 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 234 matching lines...) Expand 10 before | Expand all | Expand 10 after
245 245
246 // Parses and returns a single escaped character. The character 246 // Parses and returns a single escaped character. The character
247 // must not be 'b' or 'B' since they are usually handle specially. 247 // must not be 'b' or 'B' since they are usually handle specially.
248 uc32 ParseCharacterEscape(bool* ok); 248 uc32 ParseCharacterEscape(bool* ok);
249 249
250 uc32 ParseHexEscape(int length); 250 uc32 ParseHexEscape(int length);
251 251
252 uc32 ParseControlEscape(bool* ok); 252 uc32 ParseControlEscape(bool* ok);
253 uc32 ParseOctalLiteral(bool* ok); 253 uc32 ParseOctalLiteral(bool* ok);
254 254
255 // Tries to parse the input as a backreference. If successful it
256 // stores the result in the output parameter and returns true. If
257 // it fails it will push back the characters read so the same characters
258 // can be reparsed.
259 bool ParseBackreferenceIndex(int* index_out);
260
255 CharacterRange ParseClassAtom(bool* ok); 261 CharacterRange ParseClassAtom(bool* ok);
256 RegExpTree* ReportError(Vector<const char> message, bool* ok); 262 RegExpTree* ReportError(Vector<const char> message, bool* ok);
257 void Advance(); 263 void Advance();
258 void Advance(int dist); 264 void Advance(int dist);
259 static const uc32 kEndMarker = unibrow::Utf8::kBadChar; 265 static const uc32 kEndMarker = unibrow::Utf8::kBadChar;
260 private: 266 private:
261 uc32 current() { return current_; } 267 uc32 current() { return current_; }
262 uc32 next() { return next_; } 268 uc32 next() { return next_; }
263 bool has_more() { return has_more_; } 269 bool has_more() { return has_more_; }
264 bool has_next() { return has_next_; } 270 bool has_next() { return has_next_; }
265 unibrow::CharacterStream* in() { return in_; } 271 unibrow::CharacterStream* in() { return in_; }
266 uc32 current_; 272 uc32 current_;
267 uc32 next_; 273 uc32 next_;
268 bool has_more_; 274 bool has_more_;
269 bool has_next_; 275 bool has_next_;
270 int captures_seen_; 276 int captures_seen_;
271 unibrow::CharacterStream* in_; 277 unibrow::CharacterStream* in_;
272 Handle<String>* error_; 278 Handle<String>* error_;
279 static const int kMaxPushback = 5;
280 int pushback_count_;
281 uc32 pushback_buffer_[kMaxPushback];
273 }; 282 };
274 283
275 284
276 // A temporary scope stores information during parsing, just like 285 // A temporary scope stores information during parsing, just like
277 // a plain scope. However, temporary scopes are not kept around 286 // a plain scope. However, temporary scopes are not kept around
278 // after parsing or referenced by syntax trees so they can be stack- 287 // after parsing or referenced by syntax trees so they can be stack-
279 // allocated and hence used by the pre-parser. 288 // allocated and hence used by the pre-parser.
280 class TemporaryScope BASE_EMBEDDED { 289 class TemporaryScope BASE_EMBEDDED {
281 public: 290 public:
282 explicit TemporaryScope(Parser* parser); 291 explicit TemporaryScope(Parser* parser);
(...skipping 2918 matching lines...) Expand 10 before | Expand all | Expand 10 after
3201 } 3210 }
3202 ZoneList<Expression*>* args = new ZoneList<Expression*>(2); 3211 ZoneList<Expression*>* args = new ZoneList<Expression*>(2);
3203 args->Add(new Literal(type)); 3212 args->Add(new Literal(type));
3204 args->Add(new Literal(array)); 3213 args->Add(new Literal(array));
3205 return new Throw(new CallRuntime(constructor, NULL, args), 3214 return new Throw(new CallRuntime(constructor, NULL, args),
3206 scanner().location().beg_pos); 3215 scanner().location().beg_pos);
3207 } 3216 }
3208 3217
3209 3218
3210 // ---------------------------------------------------------------------------- 3219 // ----------------------------------------------------------------------------
3211 // Regular expressions. 3220 // Regular expressions
3221
3212 3222
3213 RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error) 3223 RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error)
3214 : current_(kEndMarker), 3224 : current_(kEndMarker),
3215 next_(kEndMarker), 3225 next_(kEndMarker),
3216 has_more_(true), 3226 has_more_(true),
3217 has_next_(true), 3227 has_next_(true),
3218 captures_seen_(0), 3228 captures_seen_(0),
3219 in_(in), 3229 in_(in),
3220 error_(error) { 3230 error_(error),
3231 pushback_count_(0) {
3221 Advance(2); 3232 Advance(2);
3222 } 3233 }
3223 3234
3235
3224 void RegExpParser::Advance() { 3236 void RegExpParser::Advance() {
3225 current_ = next_; 3237 current_ = next_;
3226 has_more_ = has_next_; 3238 has_more_ = has_next_;
3227 if (in()->has_more()) { 3239 if (pushback_count_ > 0) {
3240 pushback_count_--;
3241 next_ = pushback_buffer_[pushback_count_];
3242 has_next_ = true;
3243 } else if (in()->has_more()) {
3228 next_ = in()->GetNext(); 3244 next_ = in()->GetNext();
Lasse Reichstein 2008/10/28 10:09:52 the has_next variable is set to true in the push_b
Christian Plesner Hansen 2008/10/28 11:17:45 We know it will work because we advance twice afte
3229 } else { 3245 } else {
3230 next_ = kEndMarker; 3246 next_ = kEndMarker;
3231 has_next_ = false; 3247 has_next_ = false;
3232 } 3248 }
3233 } 3249 }
3234 3250
3251
3235 void RegExpParser::Advance(int dist) { 3252 void RegExpParser::Advance(int dist) {
3236 for (int i = 0; i < dist; i++) 3253 for (int i = 0; i < dist; i++)
3237 Advance(); 3254 Advance();
3238 } 3255 }
3239 3256
3257
3240 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) { 3258 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) {
3241 *ok = false; 3259 *ok = false;
3242 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); 3260 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED);
3243 return NULL; 3261 return NULL;
3244 } 3262 }
3245 3263
3264
3246 // Pattern :: 3265 // Pattern ::
3247 // Disjunction 3266 // Disjunction
3248 RegExpTree* RegExpParser::ParsePattern(bool* ok) { 3267 RegExpTree* RegExpParser::ParsePattern(bool* ok) {
3249 return ParseDisjunction(ok); 3268 return ParseDisjunction(ok);
3250 } 3269 }
3251 3270
3271
3252 // Disjunction :: 3272 // Disjunction ::
3253 // Alternative 3273 // Alternative
3254 // Alternative | Disjunction 3274 // Alternative | Disjunction
3255 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) { 3275 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) {
3256 RegExpTree* first = ParseAlternative(CHECK_OK); 3276 RegExpTree* first = ParseAlternative(CHECK_OK);
3257 if (current() == '|') { 3277 if (current() == '|') {
3258 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2); 3278 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2);
3259 nodes->Add(first); 3279 nodes->Add(first);
3260 while (current() == '|') { 3280 while (current() == '|') {
3261 Advance(); 3281 Advance();
3262 RegExpTree* next = ParseAlternative(CHECK_OK); 3282 RegExpTree* next = ParseAlternative(CHECK_OK);
3263 nodes->Add(next); 3283 nodes->Add(next);
3264 } 3284 }
3265 return new RegExpDisjunction(nodes); 3285 return new RegExpDisjunction(nodes);
3266 } else { 3286 } else {
3267 return first; 3287 return first;
3268 } 3288 }
3269 } 3289 }
3270 3290
3291
3271 static bool IsAlternativeTerminator(uc32 c) { 3292 static bool IsAlternativeTerminator(uc32 c) {
3272 return c == '|' || c == ')' || c == RegExpParser::kEndMarker; 3293 return c == '|' || c == ')' || c == RegExpParser::kEndMarker;
3273 } 3294 }
3274 3295
3296
3275 // Alternative :: 3297 // Alternative ::
3276 // [empty] 3298 // [empty]
3277 // Alternative Term 3299 // Alternative Term
3278 RegExpTree* RegExpParser::ParseAlternative(bool* ok) { 3300 RegExpTree* RegExpParser::ParseAlternative(bool* ok) {
3279 if (!IsAlternativeTerminator(current())) { 3301 if (!IsAlternativeTerminator(current())) {
3280 RegExpTree* first = ParseTerm(CHECK_OK); 3302 RegExpTree* first = ParseTerm(CHECK_OK);
3281 if (!IsAlternativeTerminator(current())) { 3303 if (!IsAlternativeTerminator(current())) {
3282 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2); 3304 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2);
3283 nodes->Add(first); 3305 nodes->Add(first);
3284 while (!IsAlternativeTerminator(current())) { 3306 while (!IsAlternativeTerminator(current())) {
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
3325 switch (c) { 3347 switch (c) {
3326 case 'b': case 'B': case 'd': case 'D': case 's': case 'S': 3348 case 'b': case 'B': case 'd': case 'D': case 's': case 'S':
3327 case 'w': case 'W': 3349 case 'w': case 'W':
3328 return true; 3350 return true;
3329 default: 3351 default:
3330 return false; 3352 return false;
3331 } 3353 }
3332 } 3354 }
3333 3355
3334 3356
3357 bool RegExpParser::ParseBackreferenceIndex(int* index_out) {
3358 ASSERT_EQ('\\', current());
3359 ASSERT('1' <= next() && next() <= '9');
3360 ASSERT_EQ(0, pushback_count_);
3361 if (captures_seen_ == 0)
3362 return false;
3363 int value = next() - '0';
3364 if (value > captures_seen_)
3365 return false;
3366 static const int kMaxChars = kMaxPushback - 2;
3367 EmbeddedVector<uc32, kMaxChars> chars_seen;
Lasse Reichstein 2008/10/28 10:09:52 EmbeddedVector is used instead of array to have ra
Christian Plesner Hansen 2008/10/28 11:17:45 Bingo
3368 chars_seen[0] = next();
3369 int char_count = 1;
3370 Advance(2);
3371 while (true) {
3372 uc32 c = current();
3373 if (IsDecimalDigit(c)) {
3374 int next_value = 10 * value + (c - '0');
3375 // To avoid reading past the end of thw stack-allocated pushback
Lasse Reichstein 2008/10/28 10:09:52 "thw" -> "the"
3376 // buffers we only read kMaxChars before giving up.
3377 if (next_value > captures_seen_ || char_count > kMaxChars) {
3378 // If we give up we have to push the characters we read back
3379 // onto the pushback buffer in the reverse order.
3380 pushback_buffer_[0] = current();
3381 for (int i = 0; i < char_count; i++)
3382 pushback_buffer_[i + 1] = chars_seen[char_count - i - 1];
3383 pushback_buffer_[char_count + 1] = '\\';
3384 pushback_count_ = char_count + 2;
3385 // Then, once we've filled up the buffer, we read the two
3386 // first characters into the lookahead. This is a roundabout
3387 // way of doing it but makes the code simpler.
3388 Advance(2);
3389 return false;
3390 } else {
3391 value = next_value;
3392 chars_seen[char_count++] = current();
3393 Advance();
3394 }
3395 } else {
3396 *index_out = value;
3397 return true;
3398 }
3399 }
3400 }
3401
3402
3335 // Term :: 3403 // Term ::
3336 // Assertion 3404 // Assertion
3337 // Atom 3405 // Atom
3338 // Atom Quantifier 3406 // Atom Quantifier
3339 RegExpTree* RegExpParser::ParseTerm(bool* ok) { 3407 RegExpTree* RegExpParser::ParseTerm(bool* ok) {
3340 RegExpTree* atom = NULL; 3408 RegExpTree* atom = NULL;
3341 switch (current()) { 3409 switch (current()) {
3342 // Assertion :: 3410 // Assertion ::
3343 // ^ 3411 // ^
3344 // $ 3412 // $
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
3377 // CharacterClassEscape 3445 // CharacterClassEscape
3378 // 3446 //
3379 // CharacterClassEscape :: one of 3447 // CharacterClassEscape :: one of
3380 // d D s S w W 3448 // d D s S w W
3381 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { 3449 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
3382 uc32 c = next(); 3450 uc32 c = next();
3383 Advance(2); 3451 Advance(2);
3384 atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c)); 3452 atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c));
3385 goto has_read_atom; 3453 goto has_read_atom;
3386 } 3454 }
3387 // Todo backreferences 3455 case '1': case '2': case '3': case '4': case '5': case '6':
3456 case '7': case '8': case '9': {
3457 int index = 0;
3458 if (ParseBackreferenceIndex(&index)) {
3459 atom = new RegExpBackreference(index);
3460 goto has_read_atom;
3461 } else {
3462 // If this is not a backreference we go to the atom parser
3463 // which will read it as an octal escape.
3464 goto parse_atom;
3465 }
3466 }
3388 default: 3467 default:
3389 break; 3468 goto parse_atom;
3390 } 3469 }
3391 } 3470 }
3392 // All other escapes fall through to the default case since 3471 // All other escapes fall through to the default case since
3393 // they correspond to single characters that can be 3472 // they correspond to single characters that can be
3394 // represented within atoms. 3473 // represented within atoms.
3395 default: { 3474 default: {
3475 parse_atom:
3396 atom = ParseAtom(CHECK_OK); 3476 atom = ParseAtom(CHECK_OK);
3397 break; 3477 break;
3398 } 3478 }
3399 } 3479 }
3400 has_read_atom: 3480 has_read_atom:
3401 int min; 3481 int min;
3402 int max; 3482 int max;
3403 switch (current()) { 3483 switch (current()) {
3404 // QuantifierPrefix :: 3484 // QuantifierPrefix ::
3405 // * 3485 // *
3406 // + 3486 // +
3407 // ? 3487 // ?
Lasse Reichstein 2008/10/28 10:09:52 and " // {" ?
3408 case '*': 3488 case '*':
3409 min = 0; 3489 min = 0;
3410 max = RegExpQuantifier::kInfinity; 3490 max = RegExpQuantifier::kInfinity;
3411 Advance(); 3491 Advance();
3412 break; 3492 break;
3413 case '+': 3493 case '+':
3414 min = 1; 3494 min = 1;
3415 max = RegExpQuantifier::kInfinity; 3495 max = RegExpQuantifier::kInfinity;
3416 Advance(); 3496 Advance();
3417 break; 3497 break;
(...skipping 407 matching lines...) Expand 10 before | Expand all | Expand 10 after
3825 start_position, 3905 start_position,
3826 is_expression); 3906 is_expression);
3827 return result; 3907 return result;
3828 } 3908 }
3829 3909
3830 3910
3831 #undef NEW 3911 #undef NEW
3832 3912
3833 3913
3834 } } // namespace v8::internal 3914 } } // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698