Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1153)

Side by Side Diff: src/jsregexp.cc

Issue 378024: * Fix regexp benchmark regression where we were doing work to... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: Created 11 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved. 1 // Copyright 2006-2009 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 2414 matching lines...) Expand 10 before | Expand all | Expand 10 after
2425 quick_check_performed_.Advance(by, compiler->ascii()); 2425 quick_check_performed_.Advance(by, compiler->ascii());
2426 cp_offset_ += by; 2426 cp_offset_ += by;
2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2428 compiler->SetRegExpTooBig(); 2428 compiler->SetRegExpTooBig();
2429 cp_offset_ = 0; 2429 cp_offset_ = 0;
2430 } 2430 }
2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2432 } 2432 }
2433 2433
2434 2434
2435 void TextNode::MakeCaseIndependent() { 2435 void TextNode::MakeCaseIndependent(bool is_ascii) {
2436 int element_count = elms_->length(); 2436 int element_count = elms_->length();
2437 for (int i = 0; i < element_count; i++) { 2437 for (int i = 0; i < element_count; i++) {
2438 TextElement elm = elms_->at(i); 2438 TextElement elm = elms_->at(i);
2439 if (elm.type == TextElement::CHAR_CLASS) { 2439 if (elm.type == TextElement::CHAR_CLASS) {
2440 RegExpCharacterClass* cc = elm.data.u_char_class; 2440 RegExpCharacterClass* cc = elm.data.u_char_class;
2441 // None of the standard regexps is different in the case independent case
Christian Plesner Hansen 2009/11/09 09:56:52 s/regexps is/character classes are/
2442 // and it slows us down if we don't know that.
2443 if (cc->is_standard()) continue;
2441 ZoneList<CharacterRange>* ranges = cc->ranges(); 2444 ZoneList<CharacterRange>* ranges = cc->ranges();
2442 int range_count = ranges->length(); 2445 int range_count = ranges->length();
2443 for (int j = 0; j < range_count; j++) { 2446 for (int j = 0; j < range_count; j++) {
2444 ranges->at(j).AddCaseEquivalents(ranges); 2447 ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
2445 } 2448 }
2446 } 2449 }
2447 } 2450 }
2448 } 2451 }
2449 2452
2450 2453
2451 int TextNode::GreedyLoopTextLength() { 2454 int TextNode::GreedyLoopTextLength() {
2452 TextElement elm = elms_->at(elms_->length() - 1); 2455 TextElement elm = elms_->at(elms_->length() - 1);
2453 if (elm.type == TextElement::CHAR_CLASS) { 2456 if (elm.type == TextElement::CHAR_CLASS) {
2454 return elm.cp_offset + 1; 2457 return elm.cp_offset + 1;
(...skipping 1450 matching lines...) Expand 10 before | Expand all | Expand 10 after
3905 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); 3908 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3906 for (int i = 0; i < overlay.length(); i += 2) { 3909 for (int i = 0; i < overlay.length(); i += 2) {
3907 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), 3910 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3908 CharacterRangeSplitter::kInOverlay); 3911 CharacterRangeSplitter::kInOverlay);
3909 } 3912 }
3910 CharacterRangeSplitter callback(included, excluded); 3913 CharacterRangeSplitter callback(included, excluded);
3911 table.ForEach(&callback); 3914 table.ForEach(&callback);
3912 } 3915 }
3913 3916
3914 3917
3915 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) { 3918 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
3919 int bottom,
3920 int top);
3921
3922
3923 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
3924 bool is_ascii) {
3925 uc16 bottom = from();
3926 uc16 top = to();
3927 if (is_ascii) {
3928 if (bottom > String::kMaxAsciiCharCode) return;
3929 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
3930 }
3916 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 3931 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3917 if (IsSingleton()) { 3932 if (top == bottom) {
3918 // If this is a singleton we just expand the one character. 3933 // If this is a singleton we just expand the one character.
3919 int length = uncanonicalize.get(from(), '\0', chars); 3934 int length = uncanonicalize.get(bottom, '\0', chars);
3920 for (int i = 0; i < length; i++) { 3935 for (int i = 0; i < length; i++) {
3921 uc32 chr = chars[i]; 3936 uc32 chr = chars[i];
3922 if (chr != from()) { 3937 if (chr != bottom) {
3923 ranges->Add(CharacterRange::Singleton(chars[i])); 3938 ranges->Add(CharacterRange::Singleton(chars[i]));
3924 } 3939 }
3925 } 3940 }
3926 } else if (from() <= kRangeCanonicalizeMax && 3941 } else if (bottom <= kRangeCanonicalizeMax &&
3927 to() <= kRangeCanonicalizeMax) { 3942 top <= kRangeCanonicalizeMax) {
3928 // If this is a range we expand the characters block by block, 3943 // If this is a range we expand the characters block by block,
3929 // expanding contiguous subranges (blocks) one at a time. 3944 // expanding contiguous subranges (blocks) one at a time.
3930 // The approach is as follows. For a given start character we 3945 // The approach is as follows. For a given start character we
3931 // look up the block that contains it, for instance 'a' if the 3946 // look up the block that contains it, for instance 'a' if the
3932 // start character is 'c'. A block is characterized by the property 3947 // start character is 'c'. A block is characterized by the property
3933 // that all characters uncanonicalize in the same way as the first 3948 // that all characters uncanonicalize in the same way as the first
3934 // element, except that each entry in the result is incremented 3949 // element, except that each entry in the result is incremented
3935 // by the distance from the first element. So a-z is a block 3950 // by the distance from the first element. So a-z is a block
3936 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter 3951 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3937 // uncanonicalizes to ['a' + k, 'A' + k]. 3952 // uncanonicalizes to ['a' + k, 'A' + k].
3938 // Once we've found the start point we look up its uncanonicalization 3953 // Once we've found the start point we look up its uncanonicalization
3939 // and produce a range for each element. For instance for [c-f] 3954 // and produce a range for each element. For instance for [c-f]
3940 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only 3955 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3941 // add a range if it is not already contained in the input, so [c-f] 3956 // add a range if it is not already contained in the input, so [c-f]
3942 // will be skipped but [C-F] will be added. If this range is not 3957 // will be skipped but [C-F] will be added. If this range is not
3943 // completely contained in a block we do this for all the blocks 3958 // completely contained in a block we do this for all the blocks
3944 // covered by the range. 3959 // covered by the range.
3945 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 3960 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3946 // First, look up the block that contains the 'from' character. 3961 // First, look up the block that contains the 'bottom' character.
3947 int length = canonrange.get(from(), '\0', range); 3962 int length = canonrange.get(bottom, '\0', range);
3948 if (length == 0) { 3963 if (length == 0) {
3949 range[0] = from(); 3964 range[0] = bottom;
3950 } else { 3965 } else {
3951 ASSERT_EQ(1, length); 3966 ASSERT_EQ(1, length);
3952 } 3967 }
3953 int pos = from(); 3968 int pos = bottom;
3954 // The start of the current block. Note that except for the first 3969 // The start of the current block. Note that except for the first
3955 // iteration 'start' is always equal to 'pos'. 3970 // iteration 'start' is always equal to 'pos'.
3956 int start; 3971 int start;
3957 // If it is not the start point of a block the entry contains the 3972 // If it is not the start point of a block the entry contains the
3958 // offset of the character from the start point. 3973 // offset of the character from the start point.
3959 if ((range[0] & kStartMarker) == 0) { 3974 if ((range[0] & kStartMarker) == 0) {
3960 start = pos - range[0]; 3975 start = pos - range[0];
3961 } else { 3976 } else {
3962 start = pos; 3977 start = pos;
3963 } 3978 }
3964 // Then we add the ranges one at a time, incrementing the current 3979 // Then we add the ranges one at a time, incrementing the current
3965 // position to be after the last block each time. The position 3980 // position to be after the last block each time. The position
3966 // always points to the start of a block. 3981 // always points to the start of a block.
3967 while (pos < to()) { 3982 while (pos < top) {
3968 length = canonrange.get(start, '\0', range); 3983 length = canonrange.get(start, '\0', range);
3969 if (length == 0) { 3984 if (length == 0) {
3970 range[0] = start; 3985 range[0] = start;
3971 } else { 3986 } else {
3972 ASSERT_EQ(1, length); 3987 ASSERT_EQ(1, length);
3973 } 3988 }
3974 ASSERT((range[0] & kStartMarker) != 0); 3989 ASSERT((range[0] & kStartMarker) != 0);
3975 // The start point of a block contains the distance to the end 3990 // The start point of a block contains the distance to the end
3976 // of the range. 3991 // of the range.
3977 int block_end = start + (range[0] & kPayloadMask) - 1; 3992 int block_end = start + (range[0] & kPayloadMask) - 1;
3978 int end = (block_end > to()) ? to() : block_end; 3993 int end = (block_end > top) ? top : block_end;
3979 length = uncanonicalize.get(start, '\0', range); 3994 length = uncanonicalize.get(start, '\0', range);
3980 for (int i = 0; i < length; i++) { 3995 for (int i = 0; i < length; i++) {
3981 uc32 c = range[i]; 3996 uc32 c = range[i];
3982 uc16 range_from = c + (pos - start); 3997 uc16 range_from = c + (pos - start);
3983 uc16 range_to = c + (end - start); 3998 uc16 range_to = c + (end - start);
3984 if (!(from() <= range_from && range_to <= to())) { 3999 if (!(bottom <= range_from && range_to <= top)) {
3985 ranges->Add(CharacterRange(range_from, range_to)); 4000 ranges->Add(CharacterRange(range_from, range_to));
3986 } 4001 }
3987 } 4002 }
3988 start = pos = block_end + 1; 4003 start = pos = block_end + 1;
3989 } 4004 }
3990 } else if (from() > 0 || to() < String::kMaxUC16CharCode) { 4005 } else {
3991 // Unibrow ranges don't work for high characters due to the "2^11 bug". 4006 // Unibrow ranges don't work for high characters due to the "2^11 bug".
3992 // Therefore we do something dumber for these ranges. We don't bother 4007 // Therefore we do something dumber for these ranges.
3993 // if the range is 0-max (as encountered at the start of an unanchored 4008 AddUncanonicals(ranges, bottom, top);
3994 // regexp). 4009 }
3995 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); 4010 }
3996 int bottom = from(); 4011
3997 int top = to(); 4012
3998 for (int i = bottom; i <= top; i++) { 4013 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
3999 int length = uncanonicalize.get(i, '\0', chars); 4014 int bottom,
4000 for (int j = 0; j < length; j++) { 4015 int top) {
4001 uc32 chr = chars[j]; 4016 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4002 if (chr != i && chr < bottom || chr > top) { 4017 // Zones with no case mappings:
4003 characters->Add(chr); 4018 // 0x0600 - 0x0fff
4019 // 0x1100 - 0x1cff
4020 // 0x2000 - 0x20ff
4021 // 0x2200 - 0x23ff
4022 // 0x2500 - 0x2bff
4023 // 0x2e00 - 0xa5ff
4024 // 0xa800 - 0xfaff
4025 // 0xfc00 - 0xfeff
4026 const int boundary_count = 18;
4027 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this
4028 // array. This is to split up big ranges and not because they actually denote
4029 // a case-mapping-free-zone.
4030 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
4031 const int kFirstRealCaselessZoneIndex = 2;
4032 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
4033 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
4034 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
4035
4036 // Special ASCII rule from spec can save us some work here.
4037 if (bottom == 0x80 && top == 0xffff) return;
4038
4039 // We have optimized support for this range.
4040 if (top <= CharacterRange::kRangeCanonicalizeMax) {
4041 CharacterRange range(bottom, top);
4042 range.AddCaseEquivalents(ranges, false);
4043 return;
4044 }
4045
4046 // Split up very large ranges.
4047 for (int i = 0; i < boundary_count; i++) {
4048 if (bottom < boundaries[i] && top >= boundaries[i]) {
4049 AddUncanonicals(ranges, bottom, boundaries[i] - 1);
4050 AddUncanonicals(ranges, boundaries[i], top);
4051 return;
4052 }
4053 }
4054
4055 // If we are completely in a zone with no case mappings then we are done.
4056 // We start at 2 so as not to except the ASCII range from mappings.
4057 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
4058 if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
4059 #ifdef DEBUG
4060 for (int j = bottom; j <= top; j++) {
4061 unsigned current_char = j;
4062 int length = uncanonicalize.get(current_char, '\0', chars);
4063 for (int k = 0; k < length; k++) {
4064 ASSERT(chars[k] == current_char);
4004 } 4065 }
4005 } 4066 }
4067 #endif
4068 return;
4006 } 4069 }
4007 if (characters->length() > 0) { 4070 }
4008 int new_from = characters->at(0); 4071
4009 int new_to = new_from; 4072 // Step through the range finding equivalent characters.
4010 for (int i = 1; i < characters->length(); i++) { 4073 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
4011 int chr = characters->at(i); 4074 for (int i = bottom; i <= top; i++) {
4012 if (chr == new_to + 1) { 4075 int length = uncanonicalize.get(i, '\0', chars);
4013 new_to++; 4076 for (int j = 0; j < length; j++) {
4077 uc32 chr = chars[j];
4078 if (chr != i && chr < bottom || chr > top) {
4079 characters->Add(chr);
4080 }
4081 }
4082 }
4083
4084 // Step through the equivalent characters finding simple ranges and
4085 // adding ranges to the character class.
4086 if (characters->length() > 0) {
4087 int new_from = characters->at(0);
4088 int new_to = new_from;
4089 for (int i = 1; i < characters->length(); i++) {
4090 int chr = characters->at(i);
4091 if (chr == new_to + 1) {
4092 new_to++;
4093 } else {
4094 if (new_to == new_from) {
4095 ranges->Add(CharacterRange::Singleton(new_from));
4014 } else { 4096 } else {
4015 if (new_to == new_from) { 4097 ranges->Add(CharacterRange(new_from, new_to));
4016 ranges->Add(CharacterRange::Singleton(new_from));
4017 } else {
4018 ranges->Add(CharacterRange(new_from, new_to));
4019 }
4020 new_from = new_to = chr;
4021 } 4098 }
4099 new_from = new_to = chr;
4022 } 4100 }
4023 if (new_to == new_from) { 4101 }
4024 ranges->Add(CharacterRange::Singleton(new_from)); 4102 if (new_to == new_from) {
4025 } else { 4103 ranges->Add(CharacterRange::Singleton(new_from));
4026 ranges->Add(CharacterRange(new_from, new_to)); 4104 } else {
4027 } 4105 ranges->Add(CharacterRange(new_from, new_to));
4028 } 4106 }
4029 } 4107 }
4030 } 4108 }
4031 4109
4032 4110
4033 ZoneList<CharacterRange>* CharacterSet::ranges() { 4111 ZoneList<CharacterRange>* CharacterSet::ranges() {
4034 if (ranges_ == NULL) { 4112 if (ranges_ == NULL) {
4035 ranges_ = new ZoneList<CharacterRange>(2); 4113 ranges_ = new ZoneList<CharacterRange>(2);
4036 CharacterRange::AddClassEscape(standard_set_type_, ranges_); 4114 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4037 } 4115 }
(...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after
4264 } else { 4342 } else {
4265 cp_offset++; 4343 cp_offset++;
4266 Vector<const uc16> quarks = elm.data.u_atom->data(); 4344 Vector<const uc16> quarks = elm.data.u_atom->data();
4267 } 4345 }
4268 } 4346 }
4269 } 4347 }
4270 4348
4271 4349
4272 void Analysis::VisitText(TextNode* that) { 4350 void Analysis::VisitText(TextNode* that) {
4273 if (ignore_case_) { 4351 if (ignore_case_) {
4274 that->MakeCaseIndependent(); 4352 that->MakeCaseIndependent(is_ascii_);
4275 } 4353 }
4276 EnsureAnalyzed(that->on_success()); 4354 EnsureAnalyzed(that->on_success());
4277 if (!has_failed()) { 4355 if (!has_failed()) {
4278 that->CalculateOffsets(); 4356 that->CalculateOffsets();
4279 } 4357 }
4280 } 4358 }
4281 4359
4282 4360
4283 void Analysis::VisitAction(ActionNode* that) { 4361 void Analysis::VisitAction(ActionNode* that) {
4284 RegExpNode* target = that->on_success(); 4362 RegExpNode* target = that->on_success();
(...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after
4482 ChoiceNode* first_step_node = new ChoiceNode(2); 4560 ChoiceNode* first_step_node = new ChoiceNode(2);
4483 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 4561 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4484 first_step_node->AddAlternative(GuardedAlternative( 4562 first_step_node->AddAlternative(GuardedAlternative(
4485 new TextNode(new RegExpCharacterClass('*'), loop_node))); 4563 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4486 node = first_step_node; 4564 node = first_step_node;
4487 } else { 4565 } else {
4488 node = loop_node; 4566 node = loop_node;
4489 } 4567 }
4490 } 4568 }
4491 data->node = node; 4569 data->node = node;
4492 Analysis analysis(ignore_case); 4570 Analysis analysis(ignore_case, is_ascii);
4493 analysis.EnsureAnalyzed(node); 4571 analysis.EnsureAnalyzed(node);
4494 if (analysis.has_failed()) { 4572 if (analysis.has_failed()) {
4495 const char* error_message = analysis.error_message(); 4573 const char* error_message = analysis.error_message();
4496 return CompilationResult(error_message); 4574 return CompilationResult(error_message);
4497 } 4575 }
4498 4576
4499 NodeInfo info = *node->info(); 4577 NodeInfo info = *node->info();
4500 4578
4501 // Create the correct assembler for the architecture. 4579 // Create the correct assembler for the architecture.
4502 #ifdef V8_NATIVE_REGEXP 4580 #ifdef V8_NATIVE_REGEXP
(...skipping 17 matching lines...) Expand all
4520 RegExpMacroAssemblerIrregexp macro_assembler(codes); 4598 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4521 #endif 4599 #endif
4522 4600
4523 return compiler.Assemble(&macro_assembler, 4601 return compiler.Assemble(&macro_assembler,
4524 node, 4602 node,
4525 data->capture_count, 4603 data->capture_count,
4526 pattern); 4604 pattern);
4527 } 4605 }
4528 4606
4529 }} // namespace v8::internal 4607 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698