src/jsregexp.cc - Issue 378024: * Fix regexp benchmark regression where we were doing work to...

Side by Side Diff: src/jsregexp.cc

Issue 378024: * Fix regexp benchmark regression where we were doing work to... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: Created 11 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved.	1 // Copyright 2006-2009 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 2414 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2425 quick_check_performed_.Advance(by, compiler->ascii());	2425 quick_check_performed_.Advance(by, compiler->ascii());

2426 cp_offset_ += by;	2426 cp_offset_ += by;

2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {	2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {

2428 compiler->SetRegExpTooBig();	2428 compiler->SetRegExpTooBig();

2429 cp_offset_ = 0;	2429 cp_offset_ = 0;

2430 }	2430 }

2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);	2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);

2432 }	2432 }

2433	2433

2434	2434

2435 void TextNode::MakeCaseIndependent() {	2435 void TextNode::MakeCaseIndependent(bool is_ascii) {

2436 int element_count = elms_->length();	2436 int element_count = elms_->length();

2437 for (int i = 0; i < element_count; i++) {	2437 for (int i = 0; i < element_count; i++) {

2438 TextElement elm = elms_->at(i);	2438 TextElement elm = elms_->at(i);

2439 if (elm.type == TextElement::CHAR_CLASS) {	2439 if (elm.type == TextElement::CHAR_CLASS) {

2440 RegExpCharacterClass* cc = elm.data.u_char_class;	2440 RegExpCharacterClass* cc = elm.data.u_char_class;

	2441 // None of the standard regexps is different in the case independent case
	Christian Plesner Hansen 2009/11/09 09:56:52 s/regexps is/character classes are/ s/regexps is/character classes are/
	2442 // and it slows us down if we don't know that.

	2443 if (cc->is_standard()) continue;

2441 ZoneList<CharacterRange>* ranges = cc->ranges();	2444 ZoneList<CharacterRange>* ranges = cc->ranges();

2442 int range_count = ranges->length();	2445 int range_count = ranges->length();

2443 for (int j = 0; j < range_count; j++) {	2446 for (int j = 0; j < range_count; j++) {

2444 ranges->at(j).AddCaseEquivalents(ranges);	2447 ranges->at(j).AddCaseEquivalents(ranges, is_ascii);

2445 }	2448 }

2446 }	2449 }

2447 }	2450 }

2448 }	2451 }

2449	2452

2450	2453

2451 int TextNode::GreedyLoopTextLength() {	2454 int TextNode::GreedyLoopTextLength() {

2452 TextElement elm = elms_->at(elms_->length() - 1);	2455 TextElement elm = elms_->at(elms_->length() - 1);

2453 if (elm.type == TextElement::CHAR_CLASS) {	2456 if (elm.type == TextElement::CHAR_CLASS) {

2454 return elm.cp_offset + 1;	2457 return elm.cp_offset + 1;

(...skipping 1450 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
3905 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);	3908 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);

3906 for (int i = 0; i < overlay.length(); i += 2) {	3909 for (int i = 0; i < overlay.length(); i += 2) {

3907 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),	3910 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),

3908 CharacterRangeSplitter::kInOverlay);	3911 CharacterRangeSplitter::kInOverlay);

3909 }	3912 }

3910 CharacterRangeSplitter callback(included, excluded);	3913 CharacterRangeSplitter callback(included, excluded);

3911 table.ForEach(&callback);	3914 table.ForEach(&callback);

3912 }	3915 }

3913	3916

3914	3917

3915 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {	3918 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,

	3919 int bottom,

	3920 int top);

	3921

	3922

	3923 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,

	3924 bool is_ascii) {

	3925 uc16 bottom = from();

	3926 uc16 top = to();

	3927 if (is_ascii) {

	3928 if (bottom > String::kMaxAsciiCharCode) return;

	3929 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;

	3930 }

3916 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	3931 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

3917 if (IsSingleton()) {	3932 if (top == bottom) {

3918 // If this is a singleton we just expand the one character.	3933 // If this is a singleton we just expand the one character.

3919 int length = uncanonicalize.get(from(), '\0', chars);	3934 int length = uncanonicalize.get(bottom, '\0', chars);

3920 for (int i = 0; i < length; i++) {	3935 for (int i = 0; i < length; i++) {

3921 uc32 chr = chars[i];	3936 uc32 chr = chars[i];

3922 if (chr != from()) {	3937 if (chr != bottom) {

3923 ranges->Add(CharacterRange::Singleton(chars[i]));	3938 ranges->Add(CharacterRange::Singleton(chars[i]));

3924 }	3939 }

3925 }	3940 }

3926 } else if (from() <= kRangeCanonicalizeMax &&	3941 } else if (bottom <= kRangeCanonicalizeMax &&

3927 to() <= kRangeCanonicalizeMax) {	3942 top <= kRangeCanonicalizeMax) {

3928 // If this is a range we expand the characters block by block,	3943 // If this is a range we expand the characters block by block,

3929 // expanding contiguous subranges (blocks) one at a time.	3944 // expanding contiguous subranges (blocks) one at a time.

3930 // The approach is as follows. For a given start character we	3945 // The approach is as follows. For a given start character we

3931 // look up the block that contains it, for instance 'a' if the	3946 // look up the block that contains it, for instance 'a' if the

3932 // start character is 'c'. A block is characterized by the property	3947 // start character is 'c'. A block is characterized by the property

3933 // that all characters uncanonicalize in the same way as the first	3948 // that all characters uncanonicalize in the same way as the first

3934 // element, except that each entry in the result is incremented	3949 // element, except that each entry in the result is incremented

3935 // by the distance from the first element. So a-z is a block	3950 // by the distance from the first element. So a-z is a block

3936 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter	3951 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter

3937 // uncanonicalizes to ['a' + k, 'A' + k].	3952 // uncanonicalizes to ['a' + k, 'A' + k].

3938 // Once we've found the start point we look up its uncanonicalization	3953 // Once we've found the start point we look up its uncanonicalization

3939 // and produce a range for each element. For instance for [c-f]	3954 // and produce a range for each element. For instance for [c-f]

3940 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only	3955 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only

3941 // add a range if it is not already contained in the input, so [c-f]	3956 // add a range if it is not already contained in the input, so [c-f]

3942 // will be skipped but [C-F] will be added. If this range is not	3957 // will be skipped but [C-F] will be added. If this range is not

3943 // completely contained in a block we do this for all the blocks	3958 // completely contained in a block we do this for all the blocks

3944 // covered by the range.	3959 // covered by the range.

3945 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];	3960 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

3946 // First, look up the block that contains the 'from' character.	3961 // First, look up the block that contains the 'bottom' character.

3947 int length = canonrange.get(from(), '\0', range);	3962 int length = canonrange.get(bottom, '\0', range);

3948 if (length == 0) {	3963 if (length == 0) {

3949 range[0] = from();	3964 range[0] = bottom;

3950 } else {	3965 } else {

3951 ASSERT_EQ(1, length);	3966 ASSERT_EQ(1, length);

3952 }	3967 }

3953 int pos = from();	3968 int pos = bottom;

3954 // The start of the current block. Note that except for the first	3969 // The start of the current block. Note that except for the first

3955 // iteration 'start' is always equal to 'pos'.	3970 // iteration 'start' is always equal to 'pos'.

3956 int start;	3971 int start;

3957 // If it is not the start point of a block the entry contains the	3972 // If it is not the start point of a block the entry contains the

3958 // offset of the character from the start point.	3973 // offset of the character from the start point.

3959 if ((range[0] & kStartMarker) == 0) {	3974 if ((range[0] & kStartMarker) == 0) {

3960 start = pos - range[0];	3975 start = pos - range[0];

3961 } else {	3976 } else {

3962 start = pos;	3977 start = pos;

3963 }	3978 }

3964 // Then we add the ranges one at a time, incrementing the current	3979 // Then we add the ranges one at a time, incrementing the current

3965 // position to be after the last block each time. The position	3980 // position to be after the last block each time. The position

3966 // always points to the start of a block.	3981 // always points to the start of a block.

3967 while (pos < to()) {	3982 while (pos < top) {

3968 length = canonrange.get(start, '\0', range);	3983 length = canonrange.get(start, '\0', range);

3969 if (length == 0) {	3984 if (length == 0) {

3970 range[0] = start;	3985 range[0] = start;

3971 } else {	3986 } else {

3972 ASSERT_EQ(1, length);	3987 ASSERT_EQ(1, length);

3973 }	3988 }

3974 ASSERT((range[0] & kStartMarker) != 0);	3989 ASSERT((range[0] & kStartMarker) != 0);

3975 // The start point of a block contains the distance to the end	3990 // The start point of a block contains the distance to the end

3976 // of the range.	3991 // of the range.

3977 int block_end = start + (range[0] & kPayloadMask) - 1;	3992 int block_end = start + (range[0] & kPayloadMask) - 1;

3978 int end = (block_end > to()) ? to() : block_end;	3993 int end = (block_end > top) ? top : block_end;

3979 length = uncanonicalize.get(start, '\0', range);	3994 length = uncanonicalize.get(start, '\0', range);

3980 for (int i = 0; i < length; i++) {	3995 for (int i = 0; i < length; i++) {

3981 uc32 c = range[i];	3996 uc32 c = range[i];

3982 uc16 range_from = c + (pos - start);	3997 uc16 range_from = c + (pos - start);

3983 uc16 range_to = c + (end - start);	3998 uc16 range_to = c + (end - start);

3984 if (!(from() <= range_from && range_to <= to())) {	3999 if (!(bottom <= range_from && range_to <= top)) {

3985 ranges->Add(CharacterRange(range_from, range_to));	4000 ranges->Add(CharacterRange(range_from, range_to));

3986 }	4001 }

3987 }	4002 }

3988 start = pos = block_end + 1;	4003 start = pos = block_end + 1;

3989 }	4004 }

3990 } else if (from() > 0 \|\| to() < String::kMaxUC16CharCode) {	4005 } else {

3991 // Unibrow ranges don't work for high characters due to the "2^11 bug".	4006 // Unibrow ranges don't work for high characters due to the "2^11 bug".

3992 // Therefore we do something dumber for these ranges. We don't bother	4007 // Therefore we do something dumber for these ranges.

3993 // if the range is 0-max (as encountered at the start of an unanchored	4008 AddUncanonicals(ranges, bottom, top);

3994 // regexp).	4009 }

3995 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);	4010 }

3996 int bottom = from();	4011

3997 int top = to();	4012

3998 for (int i = bottom; i <= top; i++) {	4013 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,

3999 int length = uncanonicalize.get(i, '\0', chars);	4014 int bottom,

4000 for (int j = 0; j < length; j++) {	4015 int top) {

4001 uc32 chr = chars[j];	4016 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

4002 if (chr != i && chr < bottom \|\| chr > top) {	4017 // Zones with no case mappings:

4003 characters->Add(chr);	4018 // 0x0600 - 0x0fff

	4019 // 0x1100 - 0x1cff

	4020 // 0x2000 - 0x20ff

	4021 // 0x2200 - 0x23ff

	4022 // 0x2500 - 0x2bff

	4023 // 0x2e00 - 0xa5ff

	4024 // 0xa800 - 0xfaff

	4025 // 0xfc00 - 0xfeff

	4026 const int boundary_count = 18;

	4027 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this

	4028 // array. This is to split up big ranges and not because they actually denote

	4029 // a case-mapping-free-zone.

	4030 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);

	4031 const int kFirstRealCaselessZoneIndex = 2;

	4032 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,

	4033 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,

	4034 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};

	4035

	4036 // Special ASCII rule from spec can save us some work here.

	4037 if (bottom == 0x80 && top == 0xffff) return;

	4038

	4039 // We have optimized support for this range.

	4040 if (top <= CharacterRange::kRangeCanonicalizeMax) {

	4041 CharacterRange range(bottom, top);

	4042 range.AddCaseEquivalents(ranges, false);

	4043 return;

	4044 }

	4045

	4046 // Split up very large ranges.

	4047 for (int i = 0; i < boundary_count; i++) {

	4048 if (bottom < boundaries[i] && top >= boundaries[i]) {

	4049 AddUncanonicals(ranges, bottom, boundaries[i] - 1);

	4050 AddUncanonicals(ranges, boundaries[i], top);

	4051 return;

	4052 }

	4053 }

	4054

	4055 // If we are completely in a zone with no case mappings then we are done.

	4056 // We start at 2 so as not to except the ASCII range from mappings.

	4057 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {

	4058 if (bottom >= boundaries[i] && top < boundaries[i + 1]) {

	4059 #ifdef DEBUG

	4060 for (int j = bottom; j <= top; j++) {

	4061 unsigned current_char = j;

	4062 int length = uncanonicalize.get(current_char, '\0', chars);

	4063 for (int k = 0; k < length; k++) {

	4064 ASSERT(chars[k] == current_char);

4004 }	4065 }

4005 }	4066 }

	4067 #endif

	4068 return;

4006 }	4069 }

4007 if (characters->length() > 0) {	4070 }

4008 int new_from = characters->at(0);	4071

4009 int new_to = new_from;	4072 // Step through the range finding equivalent characters.

4010 for (int i = 1; i < characters->length(); i++) {	4073 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);

4011 int chr = characters->at(i);	4074 for (int i = bottom; i <= top; i++) {

4012 if (chr == new_to + 1) {	4075 int length = uncanonicalize.get(i, '\0', chars);

4013 new_to++;	4076 for (int j = 0; j < length; j++) {

	4077 uc32 chr = chars[j];

	4078 if (chr != i && chr < bottom \|\| chr > top) {

	4079 characters->Add(chr);

	4080 }

	4081 }

	4082 }

	4083

	4084 // Step through the equivalent characters finding simple ranges and

	4085 // adding ranges to the character class.

	4086 if (characters->length() > 0) {

	4087 int new_from = characters->at(0);

	4088 int new_to = new_from;

	4089 for (int i = 1; i < characters->length(); i++) {

	4090 int chr = characters->at(i);

	4091 if (chr == new_to + 1) {

	4092 new_to++;

	4093 } else {

	4094 if (new_to == new_from) {

	4095 ranges->Add(CharacterRange::Singleton(new_from));

4014 } else {	4096 } else {

4015 if (new_to == new_from) {	4097 ranges->Add(CharacterRange(new_from, new_to));

4016 ranges->Add(CharacterRange::Singleton(new_from));

4017 } else {

4018 ranges->Add(CharacterRange(new_from, new_to));

4019 }

4020 new_from = new_to = chr;

4021 }	4098 }

	4099 new_from = new_to = chr;

4022 }	4100 }

4023 if (new_to == new_from) {	4101 }

4024 ranges->Add(CharacterRange::Singleton(new_from));	4102 if (new_to == new_from) {

4025 } else {	4103 ranges->Add(CharacterRange::Singleton(new_from));

4026 ranges->Add(CharacterRange(new_from, new_to));	4104 } else {

4027 }	4105 ranges->Add(CharacterRange(new_from, new_to));

4028 }	4106 }

4029 }	4107 }

4030 }	4108 }

4031	4109

4032	4110

4033 ZoneList<CharacterRange>* CharacterSet::ranges() {	4111 ZoneList<CharacterRange>* CharacterSet::ranges() {

4034 if (ranges_ == NULL) {	4112 if (ranges_ == NULL) {

4035 ranges_ = new ZoneList<CharacterRange>(2);	4113 ranges_ = new ZoneList<CharacterRange>(2);

4036 CharacterRange::AddClassEscape(standard_set_type_, ranges_);	4114 CharacterRange::AddClassEscape(standard_set_type_, ranges_);

4037 }	4115 }

(...skipping 226 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4264 } else {	4342 } else {

4265 cp_offset++;	4343 cp_offset++;

4266 Vector<const uc16> quarks = elm.data.u_atom->data();	4344 Vector<const uc16> quarks = elm.data.u_atom->data();

4267 }	4345 }

4268 }	4346 }

4269 }	4347 }

4270	4348

4271	4349

4272 void Analysis::VisitText(TextNode* that) {	4350 void Analysis::VisitText(TextNode* that) {

4273 if (ignore_case_) {	4351 if (ignore_case_) {

4274 that->MakeCaseIndependent();	4352 that->MakeCaseIndependent(is_ascii_);

4275 }	4353 }

4276 EnsureAnalyzed(that->on_success());	4354 EnsureAnalyzed(that->on_success());

4277 if (!has_failed()) {	4355 if (!has_failed()) {

4278 that->CalculateOffsets();	4356 that->CalculateOffsets();

4279 }	4357 }

4280 }	4358 }

4281	4359

4282	4360

4283 void Analysis::VisitAction(ActionNode* that) {	4361 void Analysis::VisitAction(ActionNode* that) {

4284 RegExpNode* target = that->on_success();	4362 RegExpNode* target = that->on_success();

(...skipping 197 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4482 ChoiceNode* first_step_node = new ChoiceNode(2);	4560 ChoiceNode* first_step_node = new ChoiceNode(2);

4483 first_step_node->AddAlternative(GuardedAlternative(captured_body));	4561 first_step_node->AddAlternative(GuardedAlternative(captured_body));

4484 first_step_node->AddAlternative(GuardedAlternative(	4562 first_step_node->AddAlternative(GuardedAlternative(

4485 new TextNode(new RegExpCharacterClass('*'), loop_node)));	4563 new TextNode(new RegExpCharacterClass('*'), loop_node)));

4486 node = first_step_node;	4564 node = first_step_node;

4487 } else {	4565 } else {

4488 node = loop_node;	4566 node = loop_node;

4489 }	4567 }

4490 }	4568 }

4491 data->node = node;	4569 data->node = node;

4492 Analysis analysis(ignore_case);	4570 Analysis analysis(ignore_case, is_ascii);

4493 analysis.EnsureAnalyzed(node);	4571 analysis.EnsureAnalyzed(node);

4494 if (analysis.has_failed()) {	4572 if (analysis.has_failed()) {

4495 const char* error_message = analysis.error_message();	4573 const char* error_message = analysis.error_message();

4496 return CompilationResult(error_message);	4574 return CompilationResult(error_message);

4497 }	4575 }

4498	4576

4499 NodeInfo info = *node->info();	4577 NodeInfo info = *node->info();

4500	4578

4501 // Create the correct assembler for the architecture.	4579 // Create the correct assembler for the architecture.

4502 #ifdef V8_NATIVE_REGEXP	4580 #ifdef V8_NATIVE_REGEXP

(...skipping 17 matching lines...) Expand all Loading...
4520 RegExpMacroAssemblerIrregexp macro_assembler(codes);	4598 RegExpMacroAssemblerIrregexp macro_assembler(codes);

4521 #endif	4599 #endif

4522	4600

4523 return compiler.Assemble(&macro_assembler,	4601 return compiler.Assemble(&macro_assembler,

4524 node,	4602 node,

4525 data->capture_count,	4603 data->capture_count,

4526 pattern);	4604 pattern);

4527 }	4605 }

4528	4606

4529 }} // namespace v8::internal	4607 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »