src/parser.cc - Issue 10759: Introduce text nodes

Side by Side Diff: src/parser.cc

Issue 10759: Introduce text nodes (Closed)

Patch Set: Fixed repeated RemoveLast issue Created 12 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 238 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
249 }	249 }

250	250

251 T* last() {	251 T* last() {

252 ASSERT(last_ != NULL);	252 ASSERT(last_ != NULL);

253 return last_;	253 return last_;

254 }	254 }

255	255

256 T* RemoveLast() {	256 T* RemoveLast() {

257 ASSERT(last_ != NULL);	257 ASSERT(last_ != NULL);

258 T* result = last_;	258 T* result = last_;

259 last_ = NULL;	259 if (list_ != NULL && list_->length() > 0)

	260 last_ = list_->RemoveLast();
	Lasse Reichstein 2008/11/15 23:58:06 This allows removing more than the last added elem This allows removing more than the last added element from the buffer. Removing the last element is only a feature to support adding parsed quantifiers after adding the atom it quantifies. The method (or class for that matter) wasn't intended to be general purpose. Is there a current need for this feature? Christian Plesner Hansen 2008/11/16 09:05:06 Not as such, no, but it's not a good property that Not as such, no, but it's not a good property that using RemoveLast puts the data structure in a semi-inconsistent state. What I needed was actually to use last() after RemoveLast has been used and that didn't work. Lasse Reichstein 2008/11/16 10:53:15 Ah, so there was a need :) Ah, so there was a need :)
	261 else

	262 last_ = NULL;

260 return result;	263 return result;

261 }	264 }

262	265

	266 T* Get(int i) {

	267 ASSERT(0 <= i && i < length());

	268 if (list_ == NULL) {

	269 ASSERT_EQ(0, i);

	270 return last_;

	271 } else {

	272 if (i == list_->length()) {

	273 ASSERT(last_ != NULL);

	274 return last_;

	275 } else {

	276 return list_->at(i);

	277 }

	278 }

	279 }

	280

263 void Clear() {	281 void Clear() {

264 list_ = NULL;	282 list_ = NULL;

265 last_ = NULL;	283 last_ = NULL;

266 }	284 }

267	285

268 int length() {	286 int length() {

269 int length = (list_ == NULL) ? 0 : list_->length();	287 int length = (list_ == NULL) ? 0 : list_->length();

270 return length + ((last_ == NULL) ? 0 : 1);	288 return length + ((last_ == NULL) ? 0 : 1);

271 }	289 }

272	290

(...skipping 14 matching lines...) Expand all Loading...
287 };	305 };

288	306

289 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.	307 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.

290 class RegExpBuilder {	308 class RegExpBuilder {

291 public:	309 public:

292 RegExpBuilder();	310 RegExpBuilder();

293 void AddCharacter(uc16 character);	311 void AddCharacter(uc16 character);

294 // "Adds" an empty expression. Does nothing except consume a	312 // "Adds" an empty expression. Does nothing except consume a

295 // following quantifier	313 // following quantifier

296 void AddEmpty();	314 void AddEmpty();

297 void AddAtom(RegExpTree* tree);	315 void AddTerm(RegExpTree* tree);
	Lasse Reichstein 2008/11/15 23:58:06 I would prefer if this is still called AddAtom (se I would prefer if this is still called AddAtom (see below). Christian Plesner Hansen 2008/11/16 09:05:06 You're right -- I've been using the term 'atom' in You're right -- I've been using the term 'atom' incorrectly as meaning flat character sequences. I'll fix that.
298 void AddAssertion(RegExpTree* tree);	316 void AddAssertion(RegExpTree* tree);

299 void NewAlternative(); // '\|'	317 void NewAlternative(); // '\|'

300 void AddQuantifierToAtom(int min, int max, bool is_greedy);	318 void AddQuantifierToAtom(int min, int max, bool is_greedy);

301 RegExpTree* ToRegExp();	319 RegExpTree* ToRegExp();

302 private:	320 private:

303 void FlushCharacters();	321 void FlushCharacters();

	322 void FlushText();

304 bool FlushTerms();	323 bool FlushTerms();

305 bool pending_empty_;	324 bool pending_empty_;

306 ZoneList<uc16>* characters_;	325 ZoneList<uc16>* characters_;

307 BufferedZoneList<RegExpTree, 2> terms_;	326 BufferedZoneList<RegExpTree, 2> terms_;

	327 BufferedZoneList<RegExpTree, 2> text_;

308 BufferedZoneList<RegExpTree, 2> alternatives_;	328 BufferedZoneList<RegExpTree, 2> alternatives_;

309 #ifdef DEBUG	329 #ifdef DEBUG

310 enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;	330 enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;

311 #define LAST(x) last_added_ = x;	331 #define LAST(x) last_added_ = x;

312 #else	332 #else

313 #define LAST(x)	333 #define LAST(x)

314 #endif	334 #endif

315 };	335 };

316	336

317	337

318 RegExpBuilder::RegExpBuilder()	338 RegExpBuilder::RegExpBuilder()

319 : pending_empty_(false), characters_(NULL), terms_(), alternatives_()	339 : pending_empty_(false), characters_(NULL), terms_(), alternatives_()

320 #ifdef DEBUG	340 #ifdef DEBUG

321 , last_added_(ADD_NONE)	341 , last_added_(ADD_NONE)

322 #endif	342 #endif

323 {}	343 {}

324	344

325	345

326 void RegExpBuilder::FlushCharacters() {	346 void RegExpBuilder::FlushCharacters() {

327 pending_empty_ = false;	347 pending_empty_ = false;

328 if (characters_ != NULL) {	348 if (characters_ != NULL) {

329 RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());	349 RegExpTree* atom = new RegExpAtom(characters_->ToConstVector());

330 characters_ = NULL;	350 characters_ = NULL;

331 terms_.Add(atom);	351 text_.Add(atom);

332 LAST(ADD_ATOM);	352 LAST(ADD_ATOM);

333 }	353 }

334 }	354 }

335	355

336	356

	357 void RegExpBuilder::FlushText() {

	358 FlushCharacters();

	359 int num_text = text_.length();

	360 if (num_text == 0) {

	361 return;

	362 } else if (num_text == 1) {

	363 terms_.Add(text_.last());

	364 } else {

	365 RegExpText* text = new RegExpText();

	366 for (int i = 0; i < num_text; i++)

	367 text_.Get(i)->AppendToText(text);

	368 terms_.Add(text);
	Lasse Reichstein 2008/11/15 23:58:06 The other places where a buffer is used, the conte The other places where a buffer is used, the contents are extracted into a single list, using GetList(). Is there an advantage to building a RegExpText incrementally instead of passing it a single ZoneList (which already exists)? Christian Plesner Hansen 2008/11/16 09:05:06 If we're always using GetList anyway then why even If we're always using GetList anyway then why even use buffered lists? I'd rather not use GetList because in the (common) case where there is just one element if will allocate an unnecessary one-element list. We only ever need AppendToText for text nodes and passing the text node means that it doesn't have to expose that it happens to have been implemented using a list. I can easily imagine us changing the internal representation, for instance to avoid allocating a list when there is only one element. Lasse Reichstein 2008/11/16 10:53:15 The current uses all have special cases for one el The current uses all have special cases for one element, where they just use last(). If there are more than one element, they use GetList() to get all the elements as a list, and pass it to the constructors that already happened to expect a list (RegExpDisjunction, RegExpAlternatives). RegExpText seems to follow the same pattern: If there is only one element, just return that, otherwise create a wrapper RegExpText to contain the list of text elements. If RegExpText had a constructor taking a pointer to a list, it could choose to use that instead of creating a new list. However, I can see that adding one RegExpText node to another might want to flatten the resulting tree, which leads to different requirements.
	369 }

	370 text_.Clear();

	371 }

	372

	373

337 void RegExpBuilder::AddCharacter(uc16 c) {	374 void RegExpBuilder::AddCharacter(uc16 c) {

338 pending_empty_ = false;	375 pending_empty_ = false;

339 if (characters_ == NULL) {	376 if (characters_ == NULL) {

340 characters_ = new ZoneList<uc16>(4);	377 characters_ = new ZoneList<uc16>(4);

341 }	378 }

342 characters_->Add(c);	379 characters_->Add(c);

343 LAST(ADD_CHAR);	380 LAST(ADD_CHAR);

344 }	381 }

345	382

346	383

347 void RegExpBuilder::AddEmpty() {	384 void RegExpBuilder::AddEmpty() {

348 pending_empty_ = true;	385 pending_empty_ = true;

349 }	386 }

350	387

351	388

352 void RegExpBuilder::AddAtom(RegExpTree* atom) {	389 void RegExpBuilder::AddTerm(RegExpTree* term) {
	Lasse Reichstein 2008/11/15 23:58:06 The original method was called AddAtom to match th The original method was called AddAtom to match the grammar productions. An Atom is something that can be quantified. A Term cannot have a quantifier applied. I would keep this method called AddAtom, and make another called AddText (to match AddCharacter) or AddCharacterClass (because that's the only text content that is added directly and not using AddCharacter). Christian Plesner Hansen 2008/11/16 09:05:06 You don't always know that you're adding a charact You don't always know that you're adding a character class. Consider "foo (?:[bar]) baz". Also, text nodes can themselves be added to other text nodes. Lasse Reichstein 2008/11/16 10:53:15 I can see the point. If the result of a sub-parse I can see the point. If the result of a sub-parse is a single text node, then it might be recognized as such an added to text_ instead of terms_. So I would use AddText to add it, and any character classes, together with setting last_ to ADD_TEXT.
353 FlushCharacters();	390 if (term->IsTextElement()) {

354 terms_.Add(atom);	391 FlushCharacters();

	392 text_.Add(term);

	393 } else {

	394 FlushText();

	395 terms_.Add(term);

	396 }

355 LAST(ADD_ATOM);	397 LAST(ADD_ATOM);

356 }	398 }

357	399

358	400

359 void RegExpBuilder::AddAssertion(RegExpTree* assert) {	401 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

360 FlushCharacters();	402 FlushText();

361 terms_.Add(assert);	403 terms_.Add(assert);

362 LAST(ADD_ASSERT);	404 LAST(ADD_ASSERT);

363 }	405 }

364	406

365	407

366 void RegExpBuilder::NewAlternative() {	408 void RegExpBuilder::NewAlternative() {

367 if (!FlushTerms()) {	409 if (!FlushTerms()) {

368 alternatives_.Add(RegExpEmpty::GetInstance());	410 alternatives_.Add(RegExpEmpty::GetInstance());

369 }	411 }

370 }	412 }

371	413

372	414

373 bool RegExpBuilder::FlushTerms() {	415 bool RegExpBuilder::FlushTerms() {

374 FlushCharacters();	416 FlushText();

375 int num_terms = terms_.length();	417 int num_terms = terms_.length();

376 if (num_terms == 0) {	418 if (num_terms == 0) {

377 return false;	419 return false;

378 }	420 }

379 RegExpTree* alternative;	421 RegExpTree* alternative;

380 if (num_terms == 1) {	422 if (num_terms == 1) {

381 alternative = terms_.last();	423 alternative = terms_.last();

382 } else {	424 } else {

383 alternative = new RegExpAlternative(terms_.GetList());	425 alternative = new RegExpAlternative(terms_.GetList());

384 }	426 }

(...skipping 23 matching lines...) Expand all Loading...
408 return;	450 return;

409 }	451 }

410 RegExpTree* atom;	452 RegExpTree* atom;

411 if (characters_ != NULL) {	453 if (characters_ != NULL) {

412 ASSERT(last_added_ == ADD_CHAR);	454 ASSERT(last_added_ == ADD_CHAR);

413 // Last atom was character.	455 // Last atom was character.

414 Vector<const uc16> char_vector = characters_->ToConstVector();	456 Vector<const uc16> char_vector = characters_->ToConstVector();

415 int num_chars = char_vector.length();	457 int num_chars = char_vector.length();

416 if (num_chars > 1) {	458 if (num_chars > 1) {

417 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);	459 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);

418 terms_.Add(new RegExpAtom(prefix));	460 text_.Add(new RegExpAtom(prefix));

419 char_vector = char_vector.SubVector(num_chars - 1, num_chars);	461 char_vector = char_vector.SubVector(num_chars - 1, num_chars);

420 }	462 }

421 characters_ = NULL;	463 characters_ = NULL;

422 atom = new RegExpAtom(char_vector);	464 atom = new RegExpAtom(char_vector);

	465 FlushText();

	466 } else if (text_.length() > 0) {

	467 ASSERT(last_added_ == ADD_ATOM);
	Lasse Reichstein 2008/11/15 23:58:06 The last thing added was a character class. The l The last thing added was a character class. The last_added property is pure debugging, so it should be as specific as possible. I would add an ADD_CHAR_CLASS to represent it. Christian Plesner Hansen 2008/11/16 09:05:06 What about "foo (?:bar) baz"? What about "foo (?:bar) baz"? Lasse Reichstein 2008/11/16 10:53:15 Indeed. It would have to be ADD_TEXT to capture th Indeed. It would have to be ADD_TEXT to capture that.
	468 atom = text_.RemoveLast();

	469 FlushText();

423 } else if (terms_.length() > 0) {	470 } else if (terms_.length() > 0) {

424 ASSERT(last_added_ == ADD_ATOM);	471 ASSERT(last_added_ == ADD_ATOM);

425 atom = terms_.RemoveLast();	472 atom = terms_.RemoveLast();

426 } else {	473 } else {

427 // Only call immediately after adding an atom or character!	474 // Only call immediately after adding an atom or character!

428 UNREACHABLE();	475 UNREACHABLE();

429 return;	476 return;

430 }	477 }

431 terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));	478 terms_.Add(new RegExpQuantifier(min, max, is_greedy, atom));

432 LAST(ADD_TERM);	479 LAST(ADD_TERM);

(...skipping 3145 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3578 RegExpAssertion::END_OF_INPUT;	3625 RegExpAssertion::END_OF_INPUT;

3579 builder.AddAssertion(new RegExpAssertion(type));	3626 builder.AddAssertion(new RegExpAssertion(type));

3580 continue;	3627 continue;

3581 }	3628 }

3582 case '.': {	3629 case '.': {

3583 Advance();	3630 Advance();

3584 // everything except \x0a, \x0d, \u2028 and \u2029	3631 // everything except \x0a, \x0d, \u2028 and \u2029

3585 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);	3632 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

3586 CharacterRange::AddClassEscape('.', ranges);	3633 CharacterRange::AddClassEscape('.', ranges);

3587 RegExpTree* atom = new RegExpCharacterClass(ranges, false);	3634 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

3588 builder.AddAtom(atom);	3635 builder.AddTerm(atom);

3589 break;	3636 break;

3590 }	3637 }

3591 case '(': {	3638 case '(': {

3592 RegExpTree* atom = ParseGroup(CHECK_OK);	3639 RegExpTree* atom = ParseGroup(CHECK_OK);

3593 builder.AddAtom(atom);	3640 builder.AddTerm(atom);
	Lasse Reichstein 2008/11/16 10:53:15 At this point we could check whether the atom is a At this point we could check whether the atom is a text node, and use a specialized AddText instead if it is. We can't recognize assertions and use AddAssertion, because (?:\b) is an atom and can be quantified, even if it's ridiculous :(
3594 break;	3641 break;

3595 }	3642 }

3596 case '[': {	3643 case '[': {

3597 RegExpTree* atom = ParseCharacterClass(CHECK_OK);	3644 RegExpTree* atom = ParseCharacterClass(CHECK_OK);

3598 builder.AddAtom(atom);	3645 builder.AddTerm(atom);

3599 break;	3646 break;

3600 }	3647 }

3601 // Atom ::	3648 // Atom ::

3602 // \ AtomEscape	3649 // \ AtomEscape

3603 case '\\':	3650 case '\\':

3604 switch (next()) {	3651 switch (next()) {

3605 case kEndMarker:	3652 case kEndMarker:

3606 ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);	3653 ReportError(CStrVector("\\ at end of pattern"), CHECK_OK);

3607 case 'b':	3654 case 'b':

3608 Advance(2);	3655 Advance(2);

3609 builder.AddAssertion(	3656 builder.AddAssertion(

3610 new RegExpAssertion(RegExpAssertion::BOUNDARY));	3657 new RegExpAssertion(RegExpAssertion::BOUNDARY));

3611 continue;	3658 continue;

3612 case 'B':	3659 case 'B':

3613 Advance(2);	3660 Advance(2);

3614 builder.AddAssertion(	3661 builder.AddAssertion(

3615 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));	3662 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

3616 continue;	3663 continue;

3617 // AtomEscape ::	3664 // AtomEscape ::

3618 // CharacterClassEscape	3665 // CharacterClassEscape

3619 //	3666 //

3620 // CharacterClassEscape :: one of	3667 // CharacterClassEscape :: one of

3621 // d D s S w W	3668 // d D s S w W

3622 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {	3669 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

3623 uc32 c = next();	3670 uc32 c = next();

3624 Advance(2);	3671 Advance(2);

3625 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);	3672 ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);

3626 CharacterRange::AddClassEscape(c, ranges);	3673 CharacterRange::AddClassEscape(c, ranges);

3627 RegExpTree* atom = new RegExpCharacterClass(ranges, false);	3674 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

3628 builder.AddAtom(atom);	3675 builder.AddTerm(atom);

3629 goto has_read_atom; // Avoid setting has_character_escapes_.	3676 goto has_read_atom; // Avoid setting has_character_escapes_.

3630 }	3677 }

3631 case '1': case '2': case '3': case '4': case '5': case '6':	3678 case '1': case '2': case '3': case '4': case '5': case '6':

3632 case '7': case '8': case '9': {	3679 case '7': case '8': case '9': {

3633 int index = 0;	3680 int index = 0;

3634 if (ParseBackreferenceIndex(&index)) {	3681 if (ParseBackreferenceIndex(&index)) {

3635 RegExpCapture* capture = captures_->at(index - 1);	3682 RegExpCapture* capture = captures_->at(index - 1);

3636 if (capture == NULL \|\| capture->available() != CAPTURE_AVAILABLE) {	3683 if (capture == NULL \|\| capture->available() != CAPTURE_AVAILABLE) {

3637 // Prepare to ignore a following quantifier	3684 // Prepare to ignore a following quantifier

3638 builder.AddEmpty();	3685 builder.AddEmpty();

3639 goto has_read_atom;	3686 goto has_read_atom;

3640 }	3687 }

3641 RegExpTree* atom = new RegExpBackreference(capture);	3688 RegExpTree* atom = new RegExpBackreference(capture);

3642 builder.AddAtom(atom);	3689 builder.AddTerm(atom);

3643 goto has_read_atom; // Avoid setting has_character_escapes_.	3690 goto has_read_atom; // Avoid setting has_character_escapes_.

3644 }	3691 }

3645 uc32 first_digit = next();	3692 uc32 first_digit = next();

3646 if (first_digit == '8' \|\| first_digit == '9') {	3693 if (first_digit == '8' \|\| first_digit == '9') {

3647 // Treat as identity escape	3694 // Treat as identity escape

3648 builder.AddCharacter(first_digit);	3695 builder.AddCharacter(first_digit);

3649 Advance(2);	3696 Advance(2);

3650 break;	3697 break;

3651 }	3698 }

3652 }	3699 }

(...skipping 619 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4272 start_position,	4319 start_position,

4273 is_expression);	4320 is_expression);

4274 return result;	4321 return result;

4275 }	4322 }

4276	4323

4277	4324

4278 #undef NEW	4325 #undef NEW

4279	4326

4280	4327

4281 } } // namespace v8::internal	4328 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/jsregexp.cc ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »