OLD | NEW |
---|---|
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2008 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 234 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
245 | 245 |
246 // Parses and returns a single escaped character. The character | 246 // Parses and returns a single escaped character. The character |
247 // must not be 'b' or 'B' since they are usually handle specially. | 247 // must not be 'b' or 'B' since they are usually handle specially. |
248 uc32 ParseCharacterEscape(bool* ok); | 248 uc32 ParseCharacterEscape(bool* ok); |
249 | 249 |
250 uc32 ParseHexEscape(int length); | 250 uc32 ParseHexEscape(int length); |
251 | 251 |
252 uc32 ParseControlEscape(bool* ok); | 252 uc32 ParseControlEscape(bool* ok); |
253 uc32 ParseOctalLiteral(bool* ok); | 253 uc32 ParseOctalLiteral(bool* ok); |
254 | 254 |
255 // Tries to parse the input as a backreference. If successful it | |
256 // stores the result in the output parameter and returns true. If | |
257 // it fails it will push back the characters read so the same characters | |
258 // can be reparsed. | |
259 bool ParseBackreferenceIndex(int* index_out); | |
260 | |
255 CharacterRange ParseClassAtom(bool* ok); | 261 CharacterRange ParseClassAtom(bool* ok); |
256 RegExpTree* ReportError(Vector<const char> message, bool* ok); | 262 RegExpTree* ReportError(Vector<const char> message, bool* ok); |
257 void Advance(); | 263 void Advance(); |
258 void Advance(int dist); | 264 void Advance(int dist); |
259 static const uc32 kEndMarker = unibrow::Utf8::kBadChar; | 265 static const uc32 kEndMarker = unibrow::Utf8::kBadChar; |
260 private: | 266 private: |
261 uc32 current() { return current_; } | 267 uc32 current() { return current_; } |
262 uc32 next() { return next_; } | 268 uc32 next() { return next_; } |
263 bool has_more() { return has_more_; } | 269 bool has_more() { return has_more_; } |
264 bool has_next() { return has_next_; } | 270 bool has_next() { return has_next_; } |
265 unibrow::CharacterStream* in() { return in_; } | 271 unibrow::CharacterStream* in() { return in_; } |
266 uc32 current_; | 272 uc32 current_; |
267 uc32 next_; | 273 uc32 next_; |
268 bool has_more_; | 274 bool has_more_; |
269 bool has_next_; | 275 bool has_next_; |
270 int captures_seen_; | 276 int captures_seen_; |
271 unibrow::CharacterStream* in_; | 277 unibrow::CharacterStream* in_; |
272 Handle<String>* error_; | 278 Handle<String>* error_; |
279 static const int kMaxPushback = 5; | |
280 int pushback_count_; | |
281 uc32 pushback_buffer_[kMaxPushback]; | |
273 }; | 282 }; |
274 | 283 |
275 | 284 |
276 // A temporary scope stores information during parsing, just like | 285 // A temporary scope stores information during parsing, just like |
277 // a plain scope. However, temporary scopes are not kept around | 286 // a plain scope. However, temporary scopes are not kept around |
278 // after parsing or referenced by syntax trees so they can be stack- | 287 // after parsing or referenced by syntax trees so they can be stack- |
279 // allocated and hence used by the pre-parser. | 288 // allocated and hence used by the pre-parser. |
280 class TemporaryScope BASE_EMBEDDED { | 289 class TemporaryScope BASE_EMBEDDED { |
281 public: | 290 public: |
282 explicit TemporaryScope(Parser* parser); | 291 explicit TemporaryScope(Parser* parser); |
(...skipping 2918 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3201 } | 3210 } |
3202 ZoneList<Expression*>* args = new ZoneList<Expression*>(2); | 3211 ZoneList<Expression*>* args = new ZoneList<Expression*>(2); |
3203 args->Add(new Literal(type)); | 3212 args->Add(new Literal(type)); |
3204 args->Add(new Literal(array)); | 3213 args->Add(new Literal(array)); |
3205 return new Throw(new CallRuntime(constructor, NULL, args), | 3214 return new Throw(new CallRuntime(constructor, NULL, args), |
3206 scanner().location().beg_pos); | 3215 scanner().location().beg_pos); |
3207 } | 3216 } |
3208 | 3217 |
3209 | 3218 |
3210 // ---------------------------------------------------------------------------- | 3219 // ---------------------------------------------------------------------------- |
3211 // Regular expressions. | 3220 // Regular expressions |
3221 | |
3212 | 3222 |
3213 RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error) | 3223 RegExpParser::RegExpParser(unibrow::CharacterStream* in, Handle<String>* error) |
3214 : current_(kEndMarker), | 3224 : current_(kEndMarker), |
3215 next_(kEndMarker), | 3225 next_(kEndMarker), |
3216 has_more_(true), | 3226 has_more_(true), |
3217 has_next_(true), | 3227 has_next_(true), |
3218 captures_seen_(0), | 3228 captures_seen_(0), |
3219 in_(in), | 3229 in_(in), |
3220 error_(error) { | 3230 error_(error), |
3231 pushback_count_(0) { | |
3221 Advance(2); | 3232 Advance(2); |
3222 } | 3233 } |
3223 | 3234 |
3235 | |
3224 void RegExpParser::Advance() { | 3236 void RegExpParser::Advance() { |
3225 current_ = next_; | 3237 current_ = next_; |
3226 has_more_ = has_next_; | 3238 has_more_ = has_next_; |
3227 if (in()->has_more()) { | 3239 if (pushback_count_ > 0) { |
3240 pushback_count_--; | |
3241 next_ = pushback_buffer_[pushback_count_]; | |
3242 has_next_ = true; | |
3243 } else if (in()->has_more()) { | |
3228 next_ = in()->GetNext(); | 3244 next_ = in()->GetNext(); |
Lasse Reichstein
2008/10/28 10:09:52
the has_next variable is set to true in the push_b
Christian Plesner Hansen
2008/10/28 11:17:45
We know it will work because we advance twice afte
| |
3229 } else { | 3245 } else { |
3230 next_ = kEndMarker; | 3246 next_ = kEndMarker; |
3231 has_next_ = false; | 3247 has_next_ = false; |
3232 } | 3248 } |
3233 } | 3249 } |
3234 | 3250 |
3251 | |
3235 void RegExpParser::Advance(int dist) { | 3252 void RegExpParser::Advance(int dist) { |
3236 for (int i = 0; i < dist; i++) | 3253 for (int i = 0; i < dist; i++) |
3237 Advance(); | 3254 Advance(); |
3238 } | 3255 } |
3239 | 3256 |
3257 | |
3240 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) { | 3258 RegExpTree* RegExpParser::ReportError(Vector<const char> message, bool* ok) { |
3241 *ok = false; | 3259 *ok = false; |
3242 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); | 3260 *error_ = Factory::NewStringFromAscii(message, NOT_TENURED); |
3243 return NULL; | 3261 return NULL; |
3244 } | 3262 } |
3245 | 3263 |
3264 | |
3246 // Pattern :: | 3265 // Pattern :: |
3247 // Disjunction | 3266 // Disjunction |
3248 RegExpTree* RegExpParser::ParsePattern(bool* ok) { | 3267 RegExpTree* RegExpParser::ParsePattern(bool* ok) { |
3249 return ParseDisjunction(ok); | 3268 return ParseDisjunction(ok); |
3250 } | 3269 } |
3251 | 3270 |
3271 | |
3252 // Disjunction :: | 3272 // Disjunction :: |
3253 // Alternative | 3273 // Alternative |
3254 // Alternative | Disjunction | 3274 // Alternative | Disjunction |
3255 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) { | 3275 RegExpTree* RegExpParser::ParseDisjunction(bool* ok) { |
3256 RegExpTree* first = ParseAlternative(CHECK_OK); | 3276 RegExpTree* first = ParseAlternative(CHECK_OK); |
3257 if (current() == '|') { | 3277 if (current() == '|') { |
3258 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2); | 3278 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2); |
3259 nodes->Add(first); | 3279 nodes->Add(first); |
3260 while (current() == '|') { | 3280 while (current() == '|') { |
3261 Advance(); | 3281 Advance(); |
3262 RegExpTree* next = ParseAlternative(CHECK_OK); | 3282 RegExpTree* next = ParseAlternative(CHECK_OK); |
3263 nodes->Add(next); | 3283 nodes->Add(next); |
3264 } | 3284 } |
3265 return new RegExpDisjunction(nodes); | 3285 return new RegExpDisjunction(nodes); |
3266 } else { | 3286 } else { |
3267 return first; | 3287 return first; |
3268 } | 3288 } |
3269 } | 3289 } |
3270 | 3290 |
3291 | |
3271 static bool IsAlternativeTerminator(uc32 c) { | 3292 static bool IsAlternativeTerminator(uc32 c) { |
3272 return c == '|' || c == ')' || c == RegExpParser::kEndMarker; | 3293 return c == '|' || c == ')' || c == RegExpParser::kEndMarker; |
3273 } | 3294 } |
3274 | 3295 |
3296 | |
3275 // Alternative :: | 3297 // Alternative :: |
3276 // [empty] | 3298 // [empty] |
3277 // Alternative Term | 3299 // Alternative Term |
3278 RegExpTree* RegExpParser::ParseAlternative(bool* ok) { | 3300 RegExpTree* RegExpParser::ParseAlternative(bool* ok) { |
3279 if (!IsAlternativeTerminator(current())) { | 3301 if (!IsAlternativeTerminator(current())) { |
3280 RegExpTree* first = ParseTerm(CHECK_OK); | 3302 RegExpTree* first = ParseTerm(CHECK_OK); |
3281 if (!IsAlternativeTerminator(current())) { | 3303 if (!IsAlternativeTerminator(current())) { |
3282 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2); | 3304 ZoneList<RegExpTree*>* nodes = new ZoneList<RegExpTree*>(2); |
3283 nodes->Add(first); | 3305 nodes->Add(first); |
3284 while (!IsAlternativeTerminator(current())) { | 3306 while (!IsAlternativeTerminator(current())) { |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3325 switch (c) { | 3347 switch (c) { |
3326 case 'b': case 'B': case 'd': case 'D': case 's': case 'S': | 3348 case 'b': case 'B': case 'd': case 'D': case 's': case 'S': |
3327 case 'w': case 'W': | 3349 case 'w': case 'W': |
3328 return true; | 3350 return true; |
3329 default: | 3351 default: |
3330 return false; | 3352 return false; |
3331 } | 3353 } |
3332 } | 3354 } |
3333 | 3355 |
3334 | 3356 |
3357 bool RegExpParser::ParseBackreferenceIndex(int* index_out) { | |
3358 ASSERT_EQ('\\', current()); | |
3359 ASSERT('1' <= next() && next() <= '9'); | |
3360 ASSERT_EQ(0, pushback_count_); | |
3361 if (captures_seen_ == 0) | |
3362 return false; | |
3363 int value = next() - '0'; | |
3364 if (value > captures_seen_) | |
3365 return false; | |
3366 static const int kMaxChars = kMaxPushback - 2; | |
3367 EmbeddedVector<uc32, kMaxChars> chars_seen; | |
Lasse Reichstein
2008/10/28 10:09:52
EmbeddedVector is used instead of array to have ra
Christian Plesner Hansen
2008/10/28 11:17:45
Bingo
| |
3368 chars_seen[0] = next(); | |
3369 int char_count = 1; | |
3370 Advance(2); | |
3371 while (true) { | |
3372 uc32 c = current(); | |
3373 if (IsDecimalDigit(c)) { | |
3374 int next_value = 10 * value + (c - '0'); | |
3375 // To avoid reading past the end of thw stack-allocated pushback | |
Lasse Reichstein
2008/10/28 10:09:52
"thw" -> "the"
| |
3376 // buffers we only read kMaxChars before giving up. | |
3377 if (next_value > captures_seen_ || char_count > kMaxChars) { | |
3378 // If we give up we have to push the characters we read back | |
3379 // onto the pushback buffer in the reverse order. | |
3380 pushback_buffer_[0] = current(); | |
3381 for (int i = 0; i < char_count; i++) | |
3382 pushback_buffer_[i + 1] = chars_seen[char_count - i - 1]; | |
3383 pushback_buffer_[char_count + 1] = '\\'; | |
3384 pushback_count_ = char_count + 2; | |
3385 // Then, once we've filled up the buffer, we read the two | |
3386 // first characters into the lookahead. This is a roundabout | |
3387 // way of doing it but makes the code simpler. | |
3388 Advance(2); | |
3389 return false; | |
3390 } else { | |
3391 value = next_value; | |
3392 chars_seen[char_count++] = current(); | |
3393 Advance(); | |
3394 } | |
3395 } else { | |
3396 *index_out = value; | |
3397 return true; | |
3398 } | |
3399 } | |
3400 } | |
3401 | |
3402 | |
3335 // Term :: | 3403 // Term :: |
3336 // Assertion | 3404 // Assertion |
3337 // Atom | 3405 // Atom |
3338 // Atom Quantifier | 3406 // Atom Quantifier |
3339 RegExpTree* RegExpParser::ParseTerm(bool* ok) { | 3407 RegExpTree* RegExpParser::ParseTerm(bool* ok) { |
3340 RegExpTree* atom = NULL; | 3408 RegExpTree* atom = NULL; |
3341 switch (current()) { | 3409 switch (current()) { |
3342 // Assertion :: | 3410 // Assertion :: |
3343 // ^ | 3411 // ^ |
3344 // $ | 3412 // $ |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3377 // CharacterClassEscape | 3445 // CharacterClassEscape |
3378 // | 3446 // |
3379 // CharacterClassEscape :: one of | 3447 // CharacterClassEscape :: one of |
3380 // d D s S w W | 3448 // d D s S w W |
3381 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { | 3449 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': { |
3382 uc32 c = next(); | 3450 uc32 c = next(); |
3383 Advance(2); | 3451 Advance(2); |
3384 atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c)); | 3452 atom = new RegExpCharacterClass(CharacterRange::CharacterClass(c)); |
3385 goto has_read_atom; | 3453 goto has_read_atom; |
3386 } | 3454 } |
3387 // Todo backreferences | 3455 case '1': case '2': case '3': case '4': case '5': case '6': |
3456 case '7': case '8': case '9': { | |
3457 int index = 0; | |
3458 if (ParseBackreferenceIndex(&index)) { | |
3459 atom = new RegExpBackreference(index); | |
3460 goto has_read_atom; | |
3461 } else { | |
3462 // If this is not a backreference we go to the atom parser | |
3463 // which will read it as an octal escape. | |
3464 goto parse_atom; | |
3465 } | |
3466 } | |
3388 default: | 3467 default: |
3389 break; | 3468 goto parse_atom; |
3390 } | 3469 } |
3391 } | 3470 } |
3392 // All other escapes fall through to the default case since | 3471 // All other escapes fall through to the default case since |
3393 // they correspond to single characters that can be | 3472 // they correspond to single characters that can be |
3394 // represented within atoms. | 3473 // represented within atoms. |
3395 default: { | 3474 default: { |
3475 parse_atom: | |
3396 atom = ParseAtom(CHECK_OK); | 3476 atom = ParseAtom(CHECK_OK); |
3397 break; | 3477 break; |
3398 } | 3478 } |
3399 } | 3479 } |
3400 has_read_atom: | 3480 has_read_atom: |
3401 int min; | 3481 int min; |
3402 int max; | 3482 int max; |
3403 switch (current()) { | 3483 switch (current()) { |
3404 // QuantifierPrefix :: | 3484 // QuantifierPrefix :: |
3405 // * | 3485 // * |
3406 // + | 3486 // + |
3407 // ? | 3487 // ? |
Lasse Reichstein
2008/10/28 10:09:52
and " // {" ?
| |
3408 case '*': | 3488 case '*': |
3409 min = 0; | 3489 min = 0; |
3410 max = RegExpQuantifier::kInfinity; | 3490 max = RegExpQuantifier::kInfinity; |
3411 Advance(); | 3491 Advance(); |
3412 break; | 3492 break; |
3413 case '+': | 3493 case '+': |
3414 min = 1; | 3494 min = 1; |
3415 max = RegExpQuantifier::kInfinity; | 3495 max = RegExpQuantifier::kInfinity; |
3416 Advance(); | 3496 Advance(); |
3417 break; | 3497 break; |
(...skipping 407 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3825 start_position, | 3905 start_position, |
3826 is_expression); | 3906 is_expression); |
3827 return result; | 3907 return result; |
3828 } | 3908 } |
3829 | 3909 |
3830 | 3910 |
3831 #undef NEW | 3911 #undef NEW |
3832 | 3912 |
3833 | 3913 |
3834 } } // namespace v8::internal | 3914 } } // namespace v8::internal |
OLD | NEW |