Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(33)

Side by Side Diff: src/jsregexp.cc

Issue 389001: Merge r3236 and r3243 from bleeding_edge to trunk.... (Closed) Base URL: http://v8.googlecode.com/svn/trunk/
Patch Set: '' Created 11 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/jsregexp.h ('k') | src/version.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved. 1 // Copyright 2006-2009 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 2414 matching lines...) Expand 10 before | Expand all | Expand 10 after
2425 quick_check_performed_.Advance(by, compiler->ascii()); 2425 quick_check_performed_.Advance(by, compiler->ascii());
2426 cp_offset_ += by; 2426 cp_offset_ += by;
2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { 2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2428 compiler->SetRegExpTooBig(); 2428 compiler->SetRegExpTooBig();
2429 cp_offset_ = 0; 2429 cp_offset_ = 0;
2430 } 2430 }
2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); 2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2432 } 2432 }
2433 2433
2434 2434
2435 void TextNode::MakeCaseIndependent() { 2435 void TextNode::MakeCaseIndependent(bool is_ascii) {
2436 int element_count = elms_->length(); 2436 int element_count = elms_->length();
2437 for (int i = 0; i < element_count; i++) { 2437 for (int i = 0; i < element_count; i++) {
2438 TextElement elm = elms_->at(i); 2438 TextElement elm = elms_->at(i);
2439 if (elm.type == TextElement::CHAR_CLASS) { 2439 if (elm.type == TextElement::CHAR_CLASS) {
2440 RegExpCharacterClass* cc = elm.data.u_char_class; 2440 RegExpCharacterClass* cc = elm.data.u_char_class;
2441 // None of the standard character classses is different in the case
2442 // independent case and it slows us down if we don't know that.
2443 if (cc->is_standard()) continue;
2441 ZoneList<CharacterRange>* ranges = cc->ranges(); 2444 ZoneList<CharacterRange>* ranges = cc->ranges();
2442 int range_count = ranges->length(); 2445 int range_count = ranges->length();
2443 for (int i = 0; i < range_count; i++) { 2446 for (int j = 0; j < range_count; j++) {
2444 ranges->at(i).AddCaseEquivalents(ranges); 2447 ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
2445 } 2448 }
2446 } 2449 }
2447 } 2450 }
2448 } 2451 }
2449 2452
2450 2453
2451 int TextNode::GreedyLoopTextLength() { 2454 int TextNode::GreedyLoopTextLength() {
2452 TextElement elm = elms_->at(elms_->length() - 1); 2455 TextElement elm = elms_->at(elms_->length() - 1);
2453 if (elm.type == TextElement::CHAR_CLASS) { 2456 if (elm.type == TextElement::CHAR_CLASS) {
2454 return elm.cp_offset + 1; 2457 return elm.cp_offset + 1;
(...skipping 1450 matching lines...) Expand 10 before | Expand all | Expand 10 after
3905 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); 3908 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3906 for (int i = 0; i < overlay.length(); i += 2) { 3909 for (int i = 0; i < overlay.length(); i += 2) {
3907 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), 3910 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3908 CharacterRangeSplitter::kInOverlay); 3911 CharacterRangeSplitter::kInOverlay);
3909 } 3912 }
3910 CharacterRangeSplitter callback(included, excluded); 3913 CharacterRangeSplitter callback(included, excluded);
3911 table.ForEach(&callback); 3914 table.ForEach(&callback);
3912 } 3915 }
3913 3916
3914 3917
3915 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) { 3918 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
3919 int bottom,
3920 int top);
3921
3922
3923 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
3924 bool is_ascii) {
3925 uc16 bottom = from();
3926 uc16 top = to();
3927 if (is_ascii) {
3928 if (bottom > String::kMaxAsciiCharCode) return;
3929 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
3930 }
3916 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 3931 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3917 if (IsSingleton()) { 3932 if (top == bottom) {
3918 // If this is a singleton we just expand the one character. 3933 // If this is a singleton we just expand the one character.
3919 int length = uncanonicalize.get(from(), '\0', chars); 3934 int length = uncanonicalize.get(bottom, '\0', chars);
3920 for (int i = 0; i < length; i++) { 3935 for (int i = 0; i < length; i++) {
3921 uc32 chr = chars[i]; 3936 uc32 chr = chars[i];
3922 if (chr != from()) { 3937 if (chr != bottom) {
3923 ranges->Add(CharacterRange::Singleton(chars[i])); 3938 ranges->Add(CharacterRange::Singleton(chars[i]));
3924 } 3939 }
3925 } 3940 }
3926 } else if (from() <= kRangeCanonicalizeMax && 3941 } else if (bottom <= kRangeCanonicalizeMax &&
3927 to() <= kRangeCanonicalizeMax) { 3942 top <= kRangeCanonicalizeMax) {
3928 // If this is a range we expand the characters block by block, 3943 // If this is a range we expand the characters block by block,
3929 // expanding contiguous subranges (blocks) one at a time. 3944 // expanding contiguous subranges (blocks) one at a time.
3930 // The approach is as follows. For a given start character we 3945 // The approach is as follows. For a given start character we
3931 // look up the block that contains it, for instance 'a' if the 3946 // look up the block that contains it, for instance 'a' if the
3932 // start character is 'c'. A block is characterized by the property 3947 // start character is 'c'. A block is characterized by the property
3933 // that all characters uncanonicalize in the same way as the first 3948 // that all characters uncanonicalize in the same way as the first
3934 // element, except that each entry in the result is incremented 3949 // element, except that each entry in the result is incremented
3935 // by the distance from the first element. So a-z is a block 3950 // by the distance from the first element. So a-z is a block
3936 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter 3951 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3937 // uncanonicalizes to ['a' + k, 'A' + k]. 3952 // uncanonicalizes to ['a' + k, 'A' + k].
3938 // Once we've found the start point we look up its uncanonicalization 3953 // Once we've found the start point we look up its uncanonicalization
3939 // and produce a range for each element. For instance for [c-f] 3954 // and produce a range for each element. For instance for [c-f]
3940 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only 3955 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3941 // add a range if it is not already contained in the input, so [c-f] 3956 // add a range if it is not already contained in the input, so [c-f]
3942 // will be skipped but [C-F] will be added. If this range is not 3957 // will be skipped but [C-F] will be added. If this range is not
3943 // completely contained in a block we do this for all the blocks 3958 // completely contained in a block we do this for all the blocks
3944 // covered by the range. 3959 // covered by the range.
3945 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 3960 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3946 // First, look up the block that contains the 'from' character. 3961 // First, look up the block that contains the 'bottom' character.
3947 int length = canonrange.get(from(), '\0', range); 3962 int length = canonrange.get(bottom, '\0', range);
3948 if (length == 0) { 3963 if (length == 0) {
3949 range[0] = from(); 3964 range[0] = bottom;
3950 } else { 3965 } else {
3951 ASSERT_EQ(1, length); 3966 ASSERT_EQ(1, length);
3952 } 3967 }
3953 int pos = from(); 3968 int pos = bottom;
3954 // The start of the current block. Note that except for the first 3969 // The start of the current block. Note that except for the first
3955 // iteration 'start' is always equal to 'pos'. 3970 // iteration 'start' is always equal to 'pos'.
3956 int start; 3971 int start;
3957 // If it is not the start point of a block the entry contains the 3972 // If it is not the start point of a block the entry contains the
3958 // offset of the character from the start point. 3973 // offset of the character from the start point.
3959 if ((range[0] & kStartMarker) == 0) { 3974 if ((range[0] & kStartMarker) == 0) {
3960 start = pos - range[0]; 3975 start = pos - range[0];
3961 } else { 3976 } else {
3962 start = pos; 3977 start = pos;
3963 } 3978 }
3964 // Then we add the ranges on at a time, incrementing the current 3979 // Then we add the ranges one at a time, incrementing the current
3965 // position to be after the last block each time. The position 3980 // position to be after the last block each time. The position
3966 // always points to the start of a block. 3981 // always points to the start of a block.
3967 while (pos < to()) { 3982 while (pos < top) {
3968 length = canonrange.get(start, '\0', range); 3983 length = canonrange.get(start, '\0', range);
3969 if (length == 0) { 3984 if (length == 0) {
3970 range[0] = start; 3985 range[0] = start;
3971 } else { 3986 } else {
3972 ASSERT_EQ(1, length); 3987 ASSERT_EQ(1, length);
3973 } 3988 }
3974 ASSERT((range[0] & kStartMarker) != 0); 3989 ASSERT((range[0] & kStartMarker) != 0);
3975 // The start point of a block contains the distance to the end 3990 // The start point of a block contains the distance to the end
3976 // of the range. 3991 // of the range.
3977 int block_end = start + (range[0] & kPayloadMask) - 1; 3992 int block_end = start + (range[0] & kPayloadMask) - 1;
3978 int end = (block_end > to()) ? to() : block_end; 3993 int end = (block_end > top) ? top : block_end;
3979 length = uncanonicalize.get(start, '\0', range); 3994 length = uncanonicalize.get(start, '\0', range);
3980 for (int i = 0; i < length; i++) { 3995 for (int i = 0; i < length; i++) {
3981 uc32 c = range[i]; 3996 uc32 c = range[i];
3982 uc16 range_from = c + (pos - start); 3997 uc16 range_from = c + (pos - start);
3983 uc16 range_to = c + (end - start); 3998 uc16 range_to = c + (end - start);
3984 if (!(from() <= range_from && range_to <= to())) { 3999 if (!(bottom <= range_from && range_to <= top)) {
3985 ranges->Add(CharacterRange(range_from, range_to)); 4000 ranges->Add(CharacterRange(range_from, range_to));
3986 } 4001 }
3987 } 4002 }
3988 start = pos = block_end + 1; 4003 start = pos = block_end + 1;
3989 } 4004 }
3990 } else { 4005 } else {
3991 // TODO(plesner) when we've fixed the 2^11 bug in unibrow. 4006 // Unibrow ranges don't work for high characters due to the "2^11 bug".
4007 // Therefore we do something dumber for these ranges.
4008 AddUncanonicals(ranges, bottom, top);
4009 }
4010 }
4011
4012
4013 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
4014 int bottom,
4015 int top) {
4016 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4017 // Zones with no case mappings. There is a DEBUG-mode loop to assert that
4018 // this table is correct.
4019 // 0x0600 - 0x0fff
4020 // 0x1100 - 0x1cff
4021 // 0x2000 - 0x20ff
4022 // 0x2200 - 0x23ff
4023 // 0x2500 - 0x2bff
4024 // 0x2e00 - 0xa5ff
4025 // 0xa800 - 0xfaff
4026 // 0xfc00 - 0xfeff
4027 const int boundary_count = 18;
4028 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this
4029 // array. This is to split up big ranges and not because they actually denote
4030 // a case-mapping-free-zone.
4031 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
4032 const int kFirstRealCaselessZoneIndex = 2;
4033 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
4034 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
4035 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
4036
4037 // Special ASCII rule from spec can save us some work here.
4038 if (bottom == 0x80 && top == 0xffff) return;
4039
4040 // We have optimized support for this range.
4041 if (top <= CharacterRange::kRangeCanonicalizeMax) {
4042 CharacterRange range(bottom, top);
4043 range.AddCaseEquivalents(ranges, false);
4044 return;
4045 }
4046
4047 // Split up very large ranges. This helps remove ranges where there are no
4048 // case mappings.
4049 for (int i = 0; i < boundary_count; i++) {
4050 if (bottom < boundaries[i] && top >= boundaries[i]) {
4051 AddUncanonicals(ranges, bottom, boundaries[i] - 1);
4052 AddUncanonicals(ranges, boundaries[i], top);
4053 return;
4054 }
4055 }
4056
4057 // If we are completely in a zone with no case mappings then we are done.
4058 // We start at 2 so as not to except the ASCII range from mappings.
4059 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
4060 if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
4061 #ifdef DEBUG
4062 for (int j = bottom; j <= top; j++) {
4063 unsigned current_char = j;
4064 int length = uncanonicalize.get(current_char, '\0', chars);
4065 for (int k = 0; k < length; k++) {
4066 ASSERT(chars[k] == current_char);
4067 }
4068 }
4069 #endif
4070 return;
4071 }
4072 }
4073
4074 // Step through the range finding equivalent characters.
4075 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
4076 for (int i = bottom; i <= top; i++) {
4077 int length = uncanonicalize.get(i, '\0', chars);
4078 for (int j = 0; j < length; j++) {
4079 uc32 chr = chars[j];
4080 if (chr != i && (chr < bottom || chr > top)) {
4081 characters->Add(chr);
4082 }
4083 }
4084 }
4085
4086 // Step through the equivalent characters finding simple ranges and
4087 // adding ranges to the character class.
4088 if (characters->length() > 0) {
4089 int new_from = characters->at(0);
4090 int new_to = new_from;
4091 for (int i = 1; i < characters->length(); i++) {
4092 int chr = characters->at(i);
4093 if (chr == new_to + 1) {
4094 new_to++;
4095 } else {
4096 if (new_to == new_from) {
4097 ranges->Add(CharacterRange::Singleton(new_from));
4098 } else {
4099 ranges->Add(CharacterRange(new_from, new_to));
4100 }
4101 new_from = new_to = chr;
4102 }
4103 }
4104 if (new_to == new_from) {
4105 ranges->Add(CharacterRange::Singleton(new_from));
4106 } else {
4107 ranges->Add(CharacterRange(new_from, new_to));
4108 }
3992 } 4109 }
3993 } 4110 }
3994 4111
3995 4112
3996 ZoneList<CharacterRange>* CharacterSet::ranges() { 4113 ZoneList<CharacterRange>* CharacterSet::ranges() {
3997 if (ranges_ == NULL) { 4114 if (ranges_ == NULL) {
3998 ranges_ = new ZoneList<CharacterRange>(2); 4115 ranges_ = new ZoneList<CharacterRange>(2);
3999 CharacterRange::AddClassEscape(standard_set_type_, ranges_); 4116 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4000 } 4117 }
4001 return ranges_; 4118 return ranges_;
(...skipping 225 matching lines...) Expand 10 before | Expand all | Expand 10 after
4227 } else { 4344 } else {
4228 cp_offset++; 4345 cp_offset++;
4229 Vector<const uc16> quarks = elm.data.u_atom->data(); 4346 Vector<const uc16> quarks = elm.data.u_atom->data();
4230 } 4347 }
4231 } 4348 }
4232 } 4349 }
4233 4350
4234 4351
4235 void Analysis::VisitText(TextNode* that) { 4352 void Analysis::VisitText(TextNode* that) {
4236 if (ignore_case_) { 4353 if (ignore_case_) {
4237 that->MakeCaseIndependent(); 4354 that->MakeCaseIndependent(is_ascii_);
4238 } 4355 }
4239 EnsureAnalyzed(that->on_success()); 4356 EnsureAnalyzed(that->on_success());
4240 if (!has_failed()) { 4357 if (!has_failed()) {
4241 that->CalculateOffsets(); 4358 that->CalculateOffsets();
4242 } 4359 }
4243 } 4360 }
4244 4361
4245 4362
4246 void Analysis::VisitAction(ActionNode* that) { 4363 void Analysis::VisitAction(ActionNode* that) {
4247 RegExpNode* target = that->on_success(); 4364 RegExpNode* target = that->on_success();
(...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after
4445 ChoiceNode* first_step_node = new ChoiceNode(2); 4562 ChoiceNode* first_step_node = new ChoiceNode(2);
4446 first_step_node->AddAlternative(GuardedAlternative(captured_body)); 4563 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4447 first_step_node->AddAlternative(GuardedAlternative( 4564 first_step_node->AddAlternative(GuardedAlternative(
4448 new TextNode(new RegExpCharacterClass('*'), loop_node))); 4565 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4449 node = first_step_node; 4566 node = first_step_node;
4450 } else { 4567 } else {
4451 node = loop_node; 4568 node = loop_node;
4452 } 4569 }
4453 } 4570 }
4454 data->node = node; 4571 data->node = node;
4455 Analysis analysis(ignore_case); 4572 Analysis analysis(ignore_case, is_ascii);
4456 analysis.EnsureAnalyzed(node); 4573 analysis.EnsureAnalyzed(node);
4457 if (analysis.has_failed()) { 4574 if (analysis.has_failed()) {
4458 const char* error_message = analysis.error_message(); 4575 const char* error_message = analysis.error_message();
4459 return CompilationResult(error_message); 4576 return CompilationResult(error_message);
4460 } 4577 }
4461 4578
4462 NodeInfo info = *node->info(); 4579 NodeInfo info = *node->info();
4463 4580
4464 // Create the correct assembler for the architecture. 4581 // Create the correct assembler for the architecture.
4465 #ifdef V8_NATIVE_REGEXP 4582 #ifdef V8_NATIVE_REGEXP
(...skipping 17 matching lines...) Expand all
4483 RegExpMacroAssemblerIrregexp macro_assembler(codes); 4600 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4484 #endif 4601 #endif
4485 4602
4486 return compiler.Assemble(&macro_assembler, 4603 return compiler.Assemble(&macro_assembler,
4487 node, 4604 node,
4488 data->capture_count, 4605 data->capture_count,
4489 pattern); 4606 pattern);
4490 } 4607 }
4491 4608
4492 }} // namespace v8::internal 4609 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/version.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698