Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/jsregexp.cc

Issue 16506: Recognize standard character classes and implement more efficient matchers. (Closed)
Patch Set: Now lints Created 11 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved. 1 // Copyright 2006-2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 249 matching lines...) Expand 10 before | Expand all | Expand 10 after
260 "malformed_regexp"); 260 "malformed_regexp");
261 return Handle<Object>::null(); 261 return Handle<Object>::null();
262 } 262 }
263 263
264 if (parse_result.simple && !flags.is_ignore_case()) { 264 if (parse_result.simple && !flags.is_ignore_case()) {
265 // Parse-tree is a single atom that is equal to the pattern. 265 // Parse-tree is a single atom that is equal to the pattern.
266 result = AtomCompile(re, pattern, flags, pattern); 266 result = AtomCompile(re, pattern, flags, pattern);
267 } else if (parse_result.tree->IsAtom() && 267 } else if (parse_result.tree->IsAtom() &&
268 !flags.is_ignore_case() && 268 !flags.is_ignore_case() &&
269 parse_result.capture_count == 0) { 269 parse_result.capture_count == 0) {
270 // TODO(lrn) Accept capture_count > 0 on atoms.
271 RegExpAtom* atom = parse_result.tree->AsAtom(); 270 RegExpAtom* atom = parse_result.tree->AsAtom();
272 Vector<const uc16> atom_pattern = atom->data(); 271 Vector<const uc16> atom_pattern = atom->data();
273 Handle<String> atom_string = 272 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
274 Factory::NewStringFromTwoByte(atom_pattern);
275 result = AtomCompile(re, pattern, flags, atom_string); 273 result = AtomCompile(re, pattern, flags, atom_string);
276 } else if (FLAG_irregexp) { 274 } else if (FLAG_irregexp) {
277 result = IrregexpPrepare(re, pattern, flags); 275 result = IrregexpPrepare(re, pattern, flags);
278 } else { 276 } else {
279 result = JscrePrepare(re, pattern, flags); 277 result = JscrePrepare(re, pattern, flags);
280 } 278 }
281 Object* data = re->data(); 279 Object* data = re->data();
282 if (data->IsFixedArray()) { 280 if (data->IsFixedArray()) {
283 // If compilation succeeded then the data is set on the regexp 281 // If compilation succeeded then the data is set on the regexp
284 // and we can store it in the cache. 282 // and we can store it in the cache.
(...skipping 220 matching lines...) Expand 10 before | Expand all | Expand 10 after
505 JscreCompileWithRetryAfterGC(two_byte_pattern, 503 JscreCompileWithRetryAfterGC(two_byte_pattern,
506 flags, 504 flags,
507 &number_of_captures, 505 &number_of_captures,
508 &error_message, 506 &error_message,
509 &code); 507 &code);
510 508
511 if (code == NULL) { 509 if (code == NULL) {
512 // Throw an exception. 510 // Throw an exception.
513 Handle<JSArray> array = Factory::NewJSArray(2); 511 Handle<JSArray> array = Factory::NewJSArray(2);
514 SetElement(array, 0, pattern); 512 SetElement(array, 0, pattern);
515 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector( 513 const char* message =
516 (error_message == NULL) ? "Unknown regexp error" : error_message))); 514 (error_message == NULL) ? "Unknown regexp error" : error_message;
515 SetElement(array, 1, Factory::NewStringFromUtf8(CStrVector(message)));
517 Handle<Object> regexp_err = 516 Handle<Object> regexp_err =
518 Factory::NewSyntaxError("malformed_regexp", array); 517 Factory::NewSyntaxError("malformed_regexp", array);
519 Top::Throw(*regexp_err); 518 Top::Throw(*regexp_err);
520 return Handle<Object>(); 519 return Handle<Object>();
521 } 520 }
522 521
523 // Convert the return address to a ByteArray pointer. 522 // Convert the return address to a ByteArray pointer.
524 Handle<ByteArray> internal( 523 Handle<ByteArray> internal(
525 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code))); 524 ByteArray::FromDataStartAddress(reinterpret_cast<Address>(code)));
526 525
(...skipping 1210 matching lines...) Expand 10 before | Expand all | Expand 10 after
1737 } 1736 }
1738 1737
1739 1738
1740 static void EmitCharClass(RegExpMacroAssembler* macro_assembler, 1739 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1741 RegExpCharacterClass* cc, 1740 RegExpCharacterClass* cc,
1742 int cp_offset, 1741 int cp_offset,
1743 Label* on_failure, 1742 Label* on_failure,
1744 bool check_offset, 1743 bool check_offset,
1745 bool ascii, 1744 bool ascii,
1746 bool preloaded) { 1745 bool preloaded) {
1746 if (cc->is_standard() &&
1747 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1748 cp_offset, check_offset,
Mads Ager (chromium) 2009/01/02 10:45:16 I would put one argument per line here.
1749 on_failure)) {
1750 return;
1751 }
1752
1747 ZoneList<CharacterRange>* ranges = cc->ranges(); 1753 ZoneList<CharacterRange>* ranges = cc->ranges();
1748 int max_char; 1754 int max_char;
1749 if (ascii) { 1755 if (ascii) {
1750 max_char = String::kMaxAsciiCharCode; 1756 max_char = String::kMaxAsciiCharCode;
1751 } else { 1757 } else {
1752 max_char = String::kMaxUC16CharCode; 1758 max_char = String::kMaxUC16CharCode;
1753 } 1759 }
1754 1760
1755 Label success; 1761 Label success;
1756 1762
(...skipping 1581 matching lines...) Expand 10 before | Expand all | Expand 10 after
3338 printer.PrintNode(label, node); 3344 printer.PrintNode(label, node);
3339 } 3345 }
3340 3346
3341 3347
3342 #endif // DEBUG 3348 #endif // DEBUG
3343 3349
3344 3350
3345 // ------------------------------------------------------------------- 3351 // -------------------------------------------------------------------
3346 // Tree to graph conversion 3352 // Tree to graph conversion
3347 3353
3354 static const int kSpaceRangeCount = 20;
3355 static const int kSpaceRangeAsciiCount = 4;
3356 static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3357 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3358 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3359
3360 static const int kWordRangeCount = 8;
3361 static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3362 '_', 'a', 'z' };
3363
3364 static const int kDigitRangeCount = 2;
3365 static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3366
3367 static const int kLineTerminatorRangeCount = 6;
3368 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3369 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
3348 3370
3349 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler, 3371 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3350 RegExpNode* on_success) { 3372 RegExpNode* on_success) {
3351 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1); 3373 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3352 elms->Add(TextElement::Atom(this)); 3374 elms->Add(TextElement::Atom(this));
3353 return new TextNode(elms, on_success); 3375 return new TextNode(elms, on_success);
3354 } 3376 }
3355 3377
3356 3378
3357 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, 3379 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3358 RegExpNode* on_success) { 3380 RegExpNode* on_success) {
3359 return new TextNode(elements(), on_success); 3381 return new TextNode(elements(), on_success);
3360 } 3382 }
3361 3383
3384 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3385 const uc16* special_class,
3386 int length) {
3387 ASSERT(ranges->length() != 0);
3388 ASSERT(length != 0);
3389 ASSERT(special_class[0] != 0);
3390 if (ranges->length() != (length>>1)+1) {
Mads Ager (chromium) 2009/01/02 10:45:16 Space around the binary operators?
3391 return false;
3392 }
3393 CharacterRange range = ranges->at(0);
3394 if (range.from() != 0) {
3395 return false;
3396 }
3397 for (int i = 0; i < length; i += 2) {
3398 if (special_class[i] != (range.to() + 1)) {
3399 return false;
3400 }
3401 range = ranges->at((i>>1)+1);
Mads Ager (chromium) 2009/01/02 10:45:16 Space around the binary operators?
3402 if (special_class[i+1] != range.from() - 1) {
Mads Ager (chromium) 2009/01/02 10:45:16 Space around the binary operator?
3403 return false;
3404 }
3405 }
3406 if (range.to() != 0xffff) {
3407 return false;
3408 }
3409 return true;
3410 }
3411
3412
3413 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3414 const uc16* special_class,
3415 int length) {
3416 if (ranges->length() * 2 != length) {
3417 return false;
3418 }
3419 for (int i = 0; i < length; i+=2) {
Mads Ager (chromium) 2009/01/02 10:45:16 Space around the '+=' for consistency?
3420 CharacterRange range = ranges->at(i >> 1);
3421 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3422 return false;
3423 }
3424 }
3425 return true;
3426 }
3427
3428
3429 bool RegExpCharacterClass::is_standard() {
3430 // TODO(lrn): Remove need for this function, by not throwing away information
3431 // along the way.
3432 if (is_negated_) {
3433 return false;
3434 }
3435 if (set_.is_standard()) {
3436 return true;
3437 }
3438 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3439 set_.set_standard_set_type('s');
3440 return true;
3441 }
3442 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3443 set_.set_standard_set_type('S');
3444 return true;
3445 }
3446 if (CompareInverseRanges(set_.ranges(),
3447 kLineTerminatorRanges,
3448 kLineTerminatorRangeCount)) {
3449 set_.set_standard_set_type('.');
3450 return true;
3451 }
3452 return false;
3453 }
3454
3362 3455
3363 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 3456 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
3364 RegExpNode* on_success) { 3457 RegExpNode* on_success) {
3365 return new TextNode(this, on_success); 3458 return new TextNode(this, on_success);
3366 } 3459 }
3367 3460
3368 3461
3369 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 3462 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
3370 RegExpNode* on_success) { 3463 RegExpNode* on_success) {
3371 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 3464 ZoneList<RegExpTree*>* alternatives = this->alternatives();
(...skipping 221 matching lines...) Expand 10 before | Expand all | Expand 10 after
3593 RegExpNode* on_success) { 3686 RegExpNode* on_success) {
3594 ZoneList<RegExpTree*>* children = nodes(); 3687 ZoneList<RegExpTree*>* children = nodes();
3595 RegExpNode* current = on_success; 3688 RegExpNode* current = on_success;
3596 for (int i = children->length() - 1; i >= 0; i--) { 3689 for (int i = children->length() - 1; i >= 0; i--) {
3597 current = children->at(i)->ToNode(compiler, current); 3690 current = children->at(i)->ToNode(compiler, current);
3598 } 3691 }
3599 return current; 3692 return current;
3600 } 3693 }
3601 3694
3602 3695
3603 static const int kSpaceRangeCount = 20;
3604 static const uc16 kSpaceRanges[kSpaceRangeCount] = {
3605 0x0009, 0x000D, 0x0020, 0x0020, 0x00A0, 0x00A0, 0x1680,
3606 0x1680, 0x180E, 0x180E, 0x2000, 0x200A, 0x2028, 0x2029,
3607 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000
3608 };
3609
3610
3611 static const int kWordRangeCount = 8;
3612 static const uc16 kWordRanges[kWordRangeCount] = {
3613 '0', '9', 'A', 'Z', '_', '_', 'a', 'z'
3614 };
3615
3616
3617 static const int kDigitRangeCount = 2;
3618 static const uc16 kDigitRanges[kDigitRangeCount] = {
3619 '0', '9'
3620 };
3621
3622
3623 static const int kLineTerminatorRangeCount = 6;
3624 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = {
3625 0x000A, 0x000A, 0x000D, 0x000D, 0x2028, 0x2029
3626 };
3627
3628
3629 static void AddClass(const uc16* elmv, 3696 static void AddClass(const uc16* elmv,
3630 int elmc, 3697 int elmc,
3631 ZoneList<CharacterRange>* ranges) { 3698 ZoneList<CharacterRange>* ranges) {
3632 for (int i = 0; i < elmc; i += 2) { 3699 for (int i = 0; i < elmc; i += 2) {
3633 ASSERT(elmv[i] <= elmv[i + 1]); 3700 ASSERT(elmv[i] <= elmv[i + 1]);
3634 ranges->Add(CharacterRange(elmv[i], elmv[i + 1])); 3701 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3635 } 3702 }
3636 } 3703 }
3637 3704
3638 3705
(...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after
3814 } 3881 }
3815 } 3882 }
3816 start = pos = block_end + 1; 3883 start = pos = block_end + 1;
3817 } 3884 }
3818 } else { 3885 } else {
3819 // TODO(plesner) when we've fixed the 2^11 bug in unibrow. 3886 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
3820 } 3887 }
3821 } 3888 }
3822 3889
3823 3890
3891 ZoneList<CharacterRange>* CharacterSet::ranges() {
3892 if (ranges_ == NULL) {
3893 ranges_ = new ZoneList<CharacterRange>(2);
3894 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
3895 }
3896 return ranges_;
3897 }
3898
3899
3900
3824 // ------------------------------------------------------------------- 3901 // -------------------------------------------------------------------
3825 // Interest propagation 3902 // Interest propagation
3826 3903
3827 3904
3828 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) { 3905 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
3829 for (int i = 0; i < siblings_.length(); i++) { 3906 for (int i = 0; i < siblings_.length(); i++) {
3830 RegExpNode* sibling = siblings_.Get(i); 3907 RegExpNode* sibling = siblings_.Get(i);
3831 if (sibling->info()->Matches(info)) 3908 if (sibling->info()->Matches(info))
3832 return sibling; 3909 return sibling;
3833 } 3910 }
(...skipping 481 matching lines...) Expand 10 before | Expand all | Expand 10 after
4315 EmbeddedVector<byte, 1024> codes; 4392 EmbeddedVector<byte, 1024> codes;
4316 RegExpMacroAssemblerIrregexp macro_assembler(codes); 4393 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4317 return compiler.Assemble(&macro_assembler, 4394 return compiler.Assemble(&macro_assembler,
4318 node, 4395 node,
4319 data->capture_count, 4396 data->capture_count,
4320 pattern); 4397 pattern);
4321 } 4398 }
4322 4399
4323 4400
4324 }} // namespace v8::internal 4401 }} // namespace v8::internal
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698