| OLD | NEW |
| 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 2414 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2425 quick_check_performed_.Advance(by, compiler->ascii()); | 2425 quick_check_performed_.Advance(by, compiler->ascii()); |
| 2426 cp_offset_ += by; | 2426 cp_offset_ += by; |
| 2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { | 2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { |
| 2428 compiler->SetRegExpTooBig(); | 2428 compiler->SetRegExpTooBig(); |
| 2429 cp_offset_ = 0; | 2429 cp_offset_ = 0; |
| 2430 } | 2430 } |
| 2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); | 2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); |
| 2432 } | 2432 } |
| 2433 | 2433 |
| 2434 | 2434 |
| 2435 void TextNode::MakeCaseIndependent() { | 2435 void TextNode::MakeCaseIndependent(bool is_ascii) { |
| 2436 int element_count = elms_->length(); | 2436 int element_count = elms_->length(); |
| 2437 for (int i = 0; i < element_count; i++) { | 2437 for (int i = 0; i < element_count; i++) { |
| 2438 TextElement elm = elms_->at(i); | 2438 TextElement elm = elms_->at(i); |
| 2439 if (elm.type == TextElement::CHAR_CLASS) { | 2439 if (elm.type == TextElement::CHAR_CLASS) { |
| 2440 RegExpCharacterClass* cc = elm.data.u_char_class; | 2440 RegExpCharacterClass* cc = elm.data.u_char_class; |
| 2441 // None of the standard character classes is different in the case |
| 2442 // independent case and it slows us down if we don't know that. |
| 2443 if (cc->is_standard()) continue; |
| 2441 ZoneList<CharacterRange>* ranges = cc->ranges(); | 2444 ZoneList<CharacterRange>* ranges = cc->ranges(); |
| 2442 int range_count = ranges->length(); | 2445 int range_count = ranges->length(); |
| 2443 for (int i = 0; i < range_count; i++) { | 2446 for (int j = 0; j < range_count; j++) { |
| 2444 ranges->at(i).AddCaseEquivalents(ranges); | 2447 ranges->at(j).AddCaseEquivalents(ranges, is_ascii); |
| 2445 } | 2448 } |
| 2446 } | 2449 } |
| 2447 } | 2450 } |
| 2448 } | 2451 } |
| 2449 | 2452 |
| 2450 | 2453 |
| 2451 int TextNode::GreedyLoopTextLength() { | 2454 int TextNode::GreedyLoopTextLength() { |
| 2452 TextElement elm = elms_->at(elms_->length() - 1); | 2455 TextElement elm = elms_->at(elms_->length() - 1); |
| 2453 if (elm.type == TextElement::CHAR_CLASS) { | 2456 if (elm.type == TextElement::CHAR_CLASS) { |
| 2454 return elm.cp_offset + 1; | 2457 return elm.cp_offset + 1; |
| (...skipping 1450 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3905 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); | 3908 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); |
| 3906 for (int i = 0; i < overlay.length(); i += 2) { | 3909 for (int i = 0; i < overlay.length(); i += 2) { |
| 3907 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), | 3910 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), |
| 3908 CharacterRangeSplitter::kInOverlay); | 3911 CharacterRangeSplitter::kInOverlay); |
| 3909 } | 3912 } |
| 3910 CharacterRangeSplitter callback(included, excluded); | 3913 CharacterRangeSplitter callback(included, excluded); |
| 3911 table.ForEach(&callback); | 3914 table.ForEach(&callback); |
| 3912 } | 3915 } |
| 3913 | 3916 |
| 3914 | 3917 |
| 3915 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) { | 3918 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
| 3919 int bottom, |
| 3920 int top); |
| 3921 |
| 3922 |
| 3923 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
| 3924 bool is_ascii) { |
| 3925 uc16 bottom = from(); |
| 3926 uc16 top = to(); |
| 3927 if (is_ascii) { |
| 3928 if (bottom > String::kMaxAsciiCharCode) return; |
| 3929 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; |
| 3930 } |
| 3916 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 3931 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 3917 if (IsSingleton()) { | 3932 if (top == bottom) { |
| 3918 // If this is a singleton we just expand the one character. | 3933 // If this is a singleton we just expand the one character. |
| 3919 int length = uncanonicalize.get(from(), '\0', chars); | 3934 int length = uncanonicalize.get(bottom, '\0', chars); |
| 3920 for (int i = 0; i < length; i++) { | 3935 for (int i = 0; i < length; i++) { |
| 3921 uc32 chr = chars[i]; | 3936 uc32 chr = chars[i]; |
| 3922 if (chr != from()) { | 3937 if (chr != bottom) { |
| 3923 ranges->Add(CharacterRange::Singleton(chars[i])); | 3938 ranges->Add(CharacterRange::Singleton(chars[i])); |
| 3924 } | 3939 } |
| 3925 } | 3940 } |
| 3926 } else if (from() <= kRangeCanonicalizeMax && | 3941 } else if (bottom <= kRangeCanonicalizeMax && |
| 3927 to() <= kRangeCanonicalizeMax) { | 3942 top <= kRangeCanonicalizeMax) { |
| 3928 // If this is a range we expand the characters block by block, | 3943 // If this is a range we expand the characters block by block, |
| 3929 // expanding contiguous subranges (blocks) one at a time. | 3944 // expanding contiguous subranges (blocks) one at a time. |
| 3930 // The approach is as follows. For a given start character we | 3945 // The approach is as follows. For a given start character we |
| 3931 // look up the block that contains it, for instance 'a' if the | 3946 // look up the block that contains it, for instance 'a' if the |
| 3932 // start character is 'c'. A block is characterized by the property | 3947 // start character is 'c'. A block is characterized by the property |
| 3933 // that all characters uncanonicalize in the same way as the first | 3948 // that all characters uncanonicalize in the same way as the first |
| 3934 // element, except that each entry in the result is incremented | 3949 // element, except that each entry in the result is incremented |
| 3935 // by the distance from the first element. So a-z is a block | 3950 // by the distance from the first element. So a-z is a block |
| 3936 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter | 3951 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter |
| 3937 // uncanonicalizes to ['a' + k, 'A' + k]. | 3952 // uncanonicalizes to ['a' + k, 'A' + k]. |
| 3938 // Once we've found the start point we look up its uncanonicalization | 3953 // Once we've found the start point we look up its uncanonicalization |
| 3939 // and produce a range for each element. For instance for [c-f] | 3954 // and produce a range for each element. For instance for [c-f] |
| 3940 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only | 3955 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only |
| 3941 // add a range if it is not already contained in the input, so [c-f] | 3956 // add a range if it is not already contained in the input, so [c-f] |
| 3942 // will be skipped but [C-F] will be added. If this range is not | 3957 // will be skipped but [C-F] will be added. If this range is not |
| 3943 // completely contained in a block we do this for all the blocks | 3958 // completely contained in a block we do this for all the blocks |
| 3944 // covered by the range. | 3959 // covered by the range. |
| 3945 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 3960 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 3946 // First, look up the block that contains the 'from' character. | 3961 // First, look up the block that contains the 'bottom' character. |
| 3947 int length = canonrange.get(from(), '\0', range); | 3962 int length = canonrange.get(bottom, '\0', range); |
| 3948 if (length == 0) { | 3963 if (length == 0) { |
| 3949 range[0] = from(); | 3964 range[0] = bottom; |
| 3950 } else { | 3965 } else { |
| 3951 ASSERT_EQ(1, length); | 3966 ASSERT_EQ(1, length); |
| 3952 } | 3967 } |
| 3953 int pos = from(); | 3968 int pos = bottom; |
| 3954 // The start of the current block. Note that except for the first | 3969 // The start of the current block. Note that except for the first |
| 3955 // iteration 'start' is always equal to 'pos'. | 3970 // iteration 'start' is always equal to 'pos'. |
| 3956 int start; | 3971 int start; |
| 3957 // If it is not the start point of a block the entry contains the | 3972 // If it is not the start point of a block the entry contains the |
| 3958 // offset of the character from the start point. | 3973 // offset of the character from the start point. |
| 3959 if ((range[0] & kStartMarker) == 0) { | 3974 if ((range[0] & kStartMarker) == 0) { |
| 3960 start = pos - range[0]; | 3975 start = pos - range[0]; |
| 3961 } else { | 3976 } else { |
| 3962 start = pos; | 3977 start = pos; |
| 3963 } | 3978 } |
| 3964 // Then we add the ranges on at a time, incrementing the current | 3979 // Then we add the ranges one at a time, incrementing the current |
| 3965 // position to be after the last block each time. The position | 3980 // position to be after the last block each time. The position |
| 3966 // always points to the start of a block. | 3981 // always points to the start of a block. |
| 3967 while (pos < to()) { | 3982 while (pos < top) { |
| 3968 length = canonrange.get(start, '\0', range); | 3983 length = canonrange.get(start, '\0', range); |
| 3969 if (length == 0) { | 3984 if (length == 0) { |
| 3970 range[0] = start; | 3985 range[0] = start; |
| 3971 } else { | 3986 } else { |
| 3972 ASSERT_EQ(1, length); | 3987 ASSERT_EQ(1, length); |
| 3973 } | 3988 } |
| 3974 ASSERT((range[0] & kStartMarker) != 0); | 3989 ASSERT((range[0] & kStartMarker) != 0); |
| 3975 // The start point of a block contains the distance to the end | 3990 // The start point of a block contains the distance to the end |
| 3976 // of the range. | 3991 // of the range. |
| 3977 int block_end = start + (range[0] & kPayloadMask) - 1; | 3992 int block_end = start + (range[0] & kPayloadMask) - 1; |
| 3978 int end = (block_end > to()) ? to() : block_end; | 3993 int end = (block_end > top) ? top : block_end; |
| 3979 length = uncanonicalize.get(start, '\0', range); | 3994 length = uncanonicalize.get(start, '\0', range); |
| 3980 for (int i = 0; i < length; i++) { | 3995 for (int i = 0; i < length; i++) { |
| 3981 uc32 c = range[i]; | 3996 uc32 c = range[i]; |
| 3982 uc16 range_from = c + (pos - start); | 3997 uc16 range_from = c + (pos - start); |
| 3983 uc16 range_to = c + (end - start); | 3998 uc16 range_to = c + (end - start); |
| 3984 if (!(from() <= range_from && range_to <= to())) { | 3999 if (!(bottom <= range_from && range_to <= top)) { |
| 3985 ranges->Add(CharacterRange(range_from, range_to)); | 4000 ranges->Add(CharacterRange(range_from, range_to)); |
| 3986 } | 4001 } |
| 3987 } | 4002 } |
| 3988 start = pos = block_end + 1; | 4003 start = pos = block_end + 1; |
| 3989 } | 4004 } |
| 3990 } else { | 4005 } else { |
| 3991 // TODO(plesner) when we've fixed the 2^11 bug in unibrow. | 4006 // Unibrow ranges don't work for high characters due to the "2^11 bug". |
| 4007 // Therefore we do something dumber for these ranges. |
| 4008 AddUncanonicals(ranges, bottom, top); |
| 4009 } |
| 4010 } |
| 4011 |
| 4012 |
| 4013 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
| 4014 int bottom, |
| 4015 int top) { |
| 4016 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 4017 // Zones with no case mappings. There is a DEBUG-mode loop to assert that |
| 4018 // this table is correct. |
| 4019 // 0x0600 - 0x0fff |
| 4020 // 0x1100 - 0x1cff |
| 4021 // 0x2000 - 0x20ff |
| 4022 // 0x2200 - 0x23ff |
| 4023 // 0x2500 - 0x2bff |
| 4024 // 0x2e00 - 0xa5ff |
| 4025 // 0xa800 - 0xfaff |
| 4026 // 0xfc00 - 0xfeff |
| 4027 const int boundary_count = 18; |
| 4028 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this |
| 4029 // array. This is to split up big ranges and not because they actually denote |
| 4030 // a case-mapping-free-zone. |
| 4031 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); |
| 4032 const int kFirstRealCaselessZoneIndex = 2; |
| 4033 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, |
| 4034 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, |
| 4035 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; |
| 4036 |
| 4037 // Special ASCII rule from spec can save us some work here. |
| 4038 if (bottom == 0x80 && top == 0xffff) return; |
| 4039 |
| 4040 // We have optimized support for this range. |
| 4041 if (top <= CharacterRange::kRangeCanonicalizeMax) { |
| 4042 CharacterRange range(bottom, top); |
| 4043 range.AddCaseEquivalents(ranges, false); |
| 4044 return; |
| 4045 } |
| 4046 |
| 4047 // Split up very large ranges. This helps remove ranges where there are no |
| 4048 // case mappings. |
| 4049 for (int i = 0; i < boundary_count; i++) { |
| 4050 if (bottom < boundaries[i] && top >= boundaries[i]) { |
| 4051 AddUncanonicals(ranges, bottom, boundaries[i] - 1); |
| 4052 AddUncanonicals(ranges, boundaries[i], top); |
| 4053 return; |
| 4054 } |
| 4055 } |
| 4056 |
| 4057 // If we are completely in a zone with no case mappings then we are done. |
| 4058 // We start at 2 so as not to exclude the ASCII range from mappings. |
| 4059 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { |
| 4060 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { |
| 4061 #ifdef DEBUG |
| 4062 for (int j = bottom; j <= top; j++) { |
| 4063 unsigned current_char = j; |
| 4064 int length = uncanonicalize.get(current_char, '\0', chars); |
| 4065 for (int k = 0; k < length; k++) { |
| 4066 ASSERT(chars[k] == current_char); |
| 4067 } |
| 4068 } |
| 4069 #endif |
| 4070 return; |
| 4071 } |
| 4072 } |
| 4073 |
| 4074 // Step through the range finding equivalent characters. |
| 4075 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); |
| 4076 for (int i = bottom; i <= top; i++) { |
| 4077 int length = uncanonicalize.get(i, '\0', chars); |
| 4078 for (int j = 0; j < length; j++) { |
| 4079 uc32 chr = chars[j]; |
| 4080 if (chr != i && (chr < bottom || chr > top)) { |
| 4081 characters->Add(chr); |
| 4082 } |
| 4083 } |
| 4084 } |
| 4085 |
| 4086 // Step through the equivalent characters finding simple ranges and |
| 4087 // adding ranges to the character class. |
| 4088 if (characters->length() > 0) { |
| 4089 int new_from = characters->at(0); |
| 4090 int new_to = new_from; |
| 4091 for (int i = 1; i < characters->length(); i++) { |
| 4092 int chr = characters->at(i); |
| 4093 if (chr == new_to + 1) { |
| 4094 new_to++; |
| 4095 } else { |
| 4096 if (new_to == new_from) { |
| 4097 ranges->Add(CharacterRange::Singleton(new_from)); |
| 4098 } else { |
| 4099 ranges->Add(CharacterRange(new_from, new_to)); |
| 4100 } |
| 4101 new_from = new_to = chr; |
| 4102 } |
| 4103 } |
| 4104 if (new_to == new_from) { |
| 4105 ranges->Add(CharacterRange::Singleton(new_from)); |
| 4106 } else { |
| 4107 ranges->Add(CharacterRange(new_from, new_to)); |
| 4108 } |
| 3992 } | 4109 } |
| 3993 } | 4110 } |
| 3994 | 4111 |
| 3995 | 4112 |
| 3996 ZoneList<CharacterRange>* CharacterSet::ranges() { | 4113 ZoneList<CharacterRange>* CharacterSet::ranges() { |
| 3997 if (ranges_ == NULL) { | 4114 if (ranges_ == NULL) { |
| 3998 ranges_ = new ZoneList<CharacterRange>(2); | 4115 ranges_ = new ZoneList<CharacterRange>(2); |
| 3999 CharacterRange::AddClassEscape(standard_set_type_, ranges_); | 4116 CharacterRange::AddClassEscape(standard_set_type_, ranges_); |
| 4000 } | 4117 } |
| 4001 return ranges_; | 4118 return ranges_; |
| (...skipping 225 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4227 } else { | 4344 } else { |
| 4228 cp_offset++; | 4345 cp_offset++; |
| 4229 Vector<const uc16> quarks = elm.data.u_atom->data(); | 4346 Vector<const uc16> quarks = elm.data.u_atom->data(); |
| 4230 } | 4347 } |
| 4231 } | 4348 } |
| 4232 } | 4349 } |
| 4233 | 4350 |
| 4234 | 4351 |
| 4235 void Analysis::VisitText(TextNode* that) { | 4352 void Analysis::VisitText(TextNode* that) { |
| 4236 if (ignore_case_) { | 4353 if (ignore_case_) { |
| 4237 that->MakeCaseIndependent(); | 4354 that->MakeCaseIndependent(is_ascii_); |
| 4238 } | 4355 } |
| 4239 EnsureAnalyzed(that->on_success()); | 4356 EnsureAnalyzed(that->on_success()); |
| 4240 if (!has_failed()) { | 4357 if (!has_failed()) { |
| 4241 that->CalculateOffsets(); | 4358 that->CalculateOffsets(); |
| 4242 } | 4359 } |
| 4243 } | 4360 } |
| 4244 | 4361 |
| 4245 | 4362 |
| 4246 void Analysis::VisitAction(ActionNode* that) { | 4363 void Analysis::VisitAction(ActionNode* that) { |
| 4247 RegExpNode* target = that->on_success(); | 4364 RegExpNode* target = that->on_success(); |
| (...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4445 ChoiceNode* first_step_node = new ChoiceNode(2); | 4562 ChoiceNode* first_step_node = new ChoiceNode(2); |
| 4446 first_step_node->AddAlternative(GuardedAlternative(captured_body)); | 4563 first_step_node->AddAlternative(GuardedAlternative(captured_body)); |
| 4447 first_step_node->AddAlternative(GuardedAlternative( | 4564 first_step_node->AddAlternative(GuardedAlternative( |
| 4448 new TextNode(new RegExpCharacterClass('*'), loop_node))); | 4565 new TextNode(new RegExpCharacterClass('*'), loop_node))); |
| 4449 node = first_step_node; | 4566 node = first_step_node; |
| 4450 } else { | 4567 } else { |
| 4451 node = loop_node; | 4568 node = loop_node; |
| 4452 } | 4569 } |
| 4453 } | 4570 } |
| 4454 data->node = node; | 4571 data->node = node; |
| 4455 Analysis analysis(ignore_case); | 4572 Analysis analysis(ignore_case, is_ascii); |
| 4456 analysis.EnsureAnalyzed(node); | 4573 analysis.EnsureAnalyzed(node); |
| 4457 if (analysis.has_failed()) { | 4574 if (analysis.has_failed()) { |
| 4458 const char* error_message = analysis.error_message(); | 4575 const char* error_message = analysis.error_message(); |
| 4459 return CompilationResult(error_message); | 4576 return CompilationResult(error_message); |
| 4460 } | 4577 } |
| 4461 | 4578 |
| 4462 NodeInfo info = *node->info(); | 4579 NodeInfo info = *node->info(); |
| 4463 | 4580 |
| 4464 // Create the correct assembler for the architecture. | 4581 // Create the correct assembler for the architecture. |
| 4465 #ifdef V8_NATIVE_REGEXP | 4582 #ifdef V8_NATIVE_REGEXP |
| (...skipping 17 matching lines...) Expand all Loading... |
| 4483 RegExpMacroAssemblerIrregexp macro_assembler(codes); | 4600 RegExpMacroAssemblerIrregexp macro_assembler(codes); |
| 4484 #endif | 4601 #endif |
| 4485 | 4602 |
| 4486 return compiler.Assemble(¯o_assembler, | 4603 return compiler.Assemble(¯o_assembler, |
| 4487 node, | 4604 node, |
| 4488 data->capture_count, | 4605 data->capture_count, |
| 4489 pattern); | 4606 pattern); |
| 4490 } | 4607 } |
| 4491 | 4608 |
| 4492 }} // namespace v8::internal | 4609 }} // namespace v8::internal |
| OLD | NEW |