OLD | NEW |
---|---|
1 // Copyright 2006-2009 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 2414 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2425 quick_check_performed_.Advance(by, compiler->ascii()); | 2425 quick_check_performed_.Advance(by, compiler->ascii()); |
2426 cp_offset_ += by; | 2426 cp_offset_ += by; |
2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { | 2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { |
2428 compiler->SetRegExpTooBig(); | 2428 compiler->SetRegExpTooBig(); |
2429 cp_offset_ = 0; | 2429 cp_offset_ = 0; |
2430 } | 2430 } |
2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); | 2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); |
2432 } | 2432 } |
2433 | 2433 |
2434 | 2434 |
2435 void TextNode::MakeCaseIndependent() { | 2435 void TextNode::MakeCaseIndependent(bool is_ascii) { |
2436 int element_count = elms_->length(); | 2436 int element_count = elms_->length(); |
2437 for (int i = 0; i < element_count; i++) { | 2437 for (int i = 0; i < element_count; i++) { |
2438 TextElement elm = elms_->at(i); | 2438 TextElement elm = elms_->at(i); |
2439 if (elm.type == TextElement::CHAR_CLASS) { | 2439 if (elm.type == TextElement::CHAR_CLASS) { |
2440 RegExpCharacterClass* cc = elm.data.u_char_class; | 2440 RegExpCharacterClass* cc = elm.data.u_char_class; |
2441 // None of the standard regexps is different in the case independent case | |
Christian Plesner Hansen
2009/11/09 09:56:52
s/regexps is/character classes are/
| |
2442 // and it slows us down if we don't know that. | |
2443 if (cc->is_standard()) continue; | |
2441 ZoneList<CharacterRange>* ranges = cc->ranges(); | 2444 ZoneList<CharacterRange>* ranges = cc->ranges(); |
2442 int range_count = ranges->length(); | 2445 int range_count = ranges->length(); |
2443 for (int j = 0; j < range_count; j++) { | 2446 for (int j = 0; j < range_count; j++) { |
2444 ranges->at(j).AddCaseEquivalents(ranges); | 2447 ranges->at(j).AddCaseEquivalents(ranges, is_ascii); |
2445 } | 2448 } |
2446 } | 2449 } |
2447 } | 2450 } |
2448 } | 2451 } |
2449 | 2452 |
2450 | 2453 |
2451 int TextNode::GreedyLoopTextLength() { | 2454 int TextNode::GreedyLoopTextLength() { |
2452 TextElement elm = elms_->at(elms_->length() - 1); | 2455 TextElement elm = elms_->at(elms_->length() - 1); |
2453 if (elm.type == TextElement::CHAR_CLASS) { | 2456 if (elm.type == TextElement::CHAR_CLASS) { |
2454 return elm.cp_offset + 1; | 2457 return elm.cp_offset + 1; |
(...skipping 1450 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3905 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); | 3908 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); |
3906 for (int i = 0; i < overlay.length(); i += 2) { | 3909 for (int i = 0; i < overlay.length(); i += 2) { |
3907 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), | 3910 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), |
3908 CharacterRangeSplitter::kInOverlay); | 3911 CharacterRangeSplitter::kInOverlay); |
3909 } | 3912 } |
3910 CharacterRangeSplitter callback(included, excluded); | 3913 CharacterRangeSplitter callback(included, excluded); |
3911 table.ForEach(&callback); | 3914 table.ForEach(&callback); |
3912 } | 3915 } |
3913 | 3916 |
3914 | 3917 |
3915 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) { | 3918 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
3919 int bottom, | |
3920 int top); | |
3921 | |
3922 | |
3923 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, | |
3924 bool is_ascii) { | |
3925 uc16 bottom = from(); | |
3926 uc16 top = to(); | |
3927 if (is_ascii) { | |
3928 if (bottom > String::kMaxAsciiCharCode) return; | |
3929 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; | |
3930 } | |
3916 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 3931 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
3917 if (IsSingleton()) { | 3932 if (top == bottom) { |
3918 // If this is a singleton we just expand the one character. | 3933 // If this is a singleton we just expand the one character. |
3919 int length = uncanonicalize.get(from(), '\0', chars); | 3934 int length = uncanonicalize.get(bottom, '\0', chars); |
3920 for (int i = 0; i < length; i++) { | 3935 for (int i = 0; i < length; i++) { |
3921 uc32 chr = chars[i]; | 3936 uc32 chr = chars[i]; |
3922 if (chr != from()) { | 3937 if (chr != bottom) { |
3923 ranges->Add(CharacterRange::Singleton(chars[i])); | 3938 ranges->Add(CharacterRange::Singleton(chars[i])); |
3924 } | 3939 } |
3925 } | 3940 } |
3926 } else if (from() <= kRangeCanonicalizeMax && | 3941 } else if (bottom <= kRangeCanonicalizeMax && |
3927 to() <= kRangeCanonicalizeMax) { | 3942 top <= kRangeCanonicalizeMax) { |
3928 // If this is a range we expand the characters block by block, | 3943 // If this is a range we expand the characters block by block, |
3929 // expanding contiguous subranges (blocks) one at a time. | 3944 // expanding contiguous subranges (blocks) one at a time. |
3930 // The approach is as follows. For a given start character we | 3945 // The approach is as follows. For a given start character we |
3931 // look up the block that contains it, for instance 'a' if the | 3946 // look up the block that contains it, for instance 'a' if the |
3932 // start character is 'c'. A block is characterized by the property | 3947 // start character is 'c'. A block is characterized by the property |
3933 // that all characters uncanonicalize in the same way as the first | 3948 // that all characters uncanonicalize in the same way as the first |
3934 // element, except that each entry in the result is incremented | 3949 // element, except that each entry in the result is incremented |
3935 // by the distance from the first element. So a-z is a block | 3950 // by the distance from the first element. So a-z is a block |
3936 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter | 3951 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter |
3937 // uncanonicalizes to ['a' + k, 'A' + k]. | 3952 // uncanonicalizes to ['a' + k, 'A' + k]. |
3938 // Once we've found the start point we look up its uncanonicalization | 3953 // Once we've found the start point we look up its uncanonicalization |
3939 // and produce a range for each element. For instance for [c-f] | 3954 // and produce a range for each element. For instance for [c-f] |
3940 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only | 3955 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only |
3941 // add a range if it is not already contained in the input, so [c-f] | 3956 // add a range if it is not already contained in the input, so [c-f] |
3942 // will be skipped but [C-F] will be added. If this range is not | 3957 // will be skipped but [C-F] will be added. If this range is not |
3943 // completely contained in a block we do this for all the blocks | 3958 // completely contained in a block we do this for all the blocks |
3944 // covered by the range. | 3959 // covered by the range. |
3945 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 3960 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
3946 // First, look up the block that contains the 'from' character. | 3961 // First, look up the block that contains the 'bottom' character. |
3947 int length = canonrange.get(from(), '\0', range); | 3962 int length = canonrange.get(bottom, '\0', range); |
3948 if (length == 0) { | 3963 if (length == 0) { |
3949 range[0] = from(); | 3964 range[0] = bottom; |
3950 } else { | 3965 } else { |
3951 ASSERT_EQ(1, length); | 3966 ASSERT_EQ(1, length); |
3952 } | 3967 } |
3953 int pos = from(); | 3968 int pos = bottom; |
3954 // The start of the current block. Note that except for the first | 3969 // The start of the current block. Note that except for the first |
3955 // iteration 'start' is always equal to 'pos'. | 3970 // iteration 'start' is always equal to 'pos'. |
3956 int start; | 3971 int start; |
3957 // If it is not the start point of a block the entry contains the | 3972 // If it is not the start point of a block the entry contains the |
3958 // offset of the character from the start point. | 3973 // offset of the character from the start point. |
3959 if ((range[0] & kStartMarker) == 0) { | 3974 if ((range[0] & kStartMarker) == 0) { |
3960 start = pos - range[0]; | 3975 start = pos - range[0]; |
3961 } else { | 3976 } else { |
3962 start = pos; | 3977 start = pos; |
3963 } | 3978 } |
3964 // Then we add the ranges one at a time, incrementing the current | 3979 // Then we add the ranges one at a time, incrementing the current |
3965 // position to be after the last block each time. The position | 3980 // position to be after the last block each time. The position |
3966 // always points to the start of a block. | 3981 // always points to the start of a block. |
3967 while (pos < to()) { | 3982 while (pos < top) { |
3968 length = canonrange.get(start, '\0', range); | 3983 length = canonrange.get(start, '\0', range); |
3969 if (length == 0) { | 3984 if (length == 0) { |
3970 range[0] = start; | 3985 range[0] = start; |
3971 } else { | 3986 } else { |
3972 ASSERT_EQ(1, length); | 3987 ASSERT_EQ(1, length); |
3973 } | 3988 } |
3974 ASSERT((range[0] & kStartMarker) != 0); | 3989 ASSERT((range[0] & kStartMarker) != 0); |
3975 // The start point of a block contains the distance to the end | 3990 // The start point of a block contains the distance to the end |
3976 // of the range. | 3991 // of the range. |
3977 int block_end = start + (range[0] & kPayloadMask) - 1; | 3992 int block_end = start + (range[0] & kPayloadMask) - 1; |
3978 int end = (block_end > to()) ? to() : block_end; | 3993 int end = (block_end > top) ? top : block_end; |
3979 length = uncanonicalize.get(start, '\0', range); | 3994 length = uncanonicalize.get(start, '\0', range); |
3980 for (int i = 0; i < length; i++) { | 3995 for (int i = 0; i < length; i++) { |
3981 uc32 c = range[i]; | 3996 uc32 c = range[i]; |
3982 uc16 range_from = c + (pos - start); | 3997 uc16 range_from = c + (pos - start); |
3983 uc16 range_to = c + (end - start); | 3998 uc16 range_to = c + (end - start); |
3984 if (!(from() <= range_from && range_to <= to())) { | 3999 if (!(bottom <= range_from && range_to <= top)) { |
3985 ranges->Add(CharacterRange(range_from, range_to)); | 4000 ranges->Add(CharacterRange(range_from, range_to)); |
3986 } | 4001 } |
3987 } | 4002 } |
3988 start = pos = block_end + 1; | 4003 start = pos = block_end + 1; |
3989 } | 4004 } |
3990 } else if (from() > 0 || to() < String::kMaxUC16CharCode) { | 4005 } else { |
3991 // Unibrow ranges don't work for high characters due to the "2^11 bug". | 4006 // Unibrow ranges don't work for high characters due to the "2^11 bug". |
3992 // Therefore we do something dumber for these ranges. We don't bother | 4007 // Therefore we do something dumber for these ranges. |
3993 // if the range is 0-max (as encountered at the start of an unanchored | 4008 AddUncanonicals(ranges, bottom, top); |
3994 // regexp). | 4009 } |
3995 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); | 4010 } |
3996 int bottom = from(); | 4011 |
3997 int top = to(); | 4012 |
3998 for (int i = bottom; i <= top; i++) { | 4013 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
3999 int length = uncanonicalize.get(i, '\0', chars); | 4014 int bottom, |
4000 for (int j = 0; j < length; j++) { | 4015 int top) { |
4001 uc32 chr = chars[j]; | 4016 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
4002 if (chr != i && chr < bottom || chr > top) { | 4017 // Zones with no case mappings: |
4003 characters->Add(chr); | 4018 // 0x0600 - 0x0fff |
4019 // 0x1100 - 0x1cff | |
4020 // 0x2000 - 0x20ff | |
4021 // 0x2200 - 0x23ff | |
4022 // 0x2500 - 0x2bff | |
4023 // 0x2e00 - 0xa5ff | |
4024 // 0xa800 - 0xfaff | |
4025 // 0xfc00 - 0xfeff | |
4026 const int boundary_count = 18; | |
4027 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this | |
4028 // array. This is to split up big ranges and not because they actually denote | |
4029 // a case-mapping-free-zone. | |
4030 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); | |
4031 const int kFirstRealCaselessZoneIndex = 2; | |
4032 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, | |
4033 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, | |
4034 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; | |
4035 | |
4036 // Special ASCII rule from spec can save us some work here. | |
4037 if (bottom == 0x80 && top == 0xffff) return; | |
4038 | |
4039 // We have optimized support for this range. | |
4040 if (top <= CharacterRange::kRangeCanonicalizeMax) { | |
4041 CharacterRange range(bottom, top); | |
4042 range.AddCaseEquivalents(ranges, false); | |
4043 return; | |
4044 } | |
4045 | |
4046 // Split up very large ranges. | |
4047 for (int i = 0; i < boundary_count; i++) { | |
4048 if (bottom < boundaries[i] && top >= boundaries[i]) { | |
4049 AddUncanonicals(ranges, bottom, boundaries[i] - 1); | |
4050 AddUncanonicals(ranges, boundaries[i], top); | |
4051 return; | |
4052 } | |
4053 } | |
4054 | |
4055 // If we are completely in a zone with no case mappings then we are done. | |
4056 // We start at 2 so as not to except the ASCII range from mappings. | |
4057 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { | |
4058 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { | |
4059 #ifdef DEBUG | |
4060 for (int j = bottom; j <= top; j++) { | |
4061 unsigned current_char = j; | |
4062 int length = uncanonicalize.get(current_char, '\0', chars); | |
4063 for (int k = 0; k < length; k++) { | |
4064 ASSERT(chars[k] == current_char); | |
4004 } | 4065 } |
4005 } | 4066 } |
4067 #endif | |
4068 return; | |
4006 } | 4069 } |
4007 if (characters->length() > 0) { | 4070 } |
4008 int new_from = characters->at(0); | 4071 |
4009 int new_to = new_from; | 4072 // Step through the range finding equivalent characters. |
4010 for (int i = 1; i < characters->length(); i++) { | 4073 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); |
4011 int chr = characters->at(i); | 4074 for (int i = bottom; i <= top; i++) { |
4012 if (chr == new_to + 1) { | 4075 int length = uncanonicalize.get(i, '\0', chars); |
4013 new_to++; | 4076 for (int j = 0; j < length; j++) { |
4077 uc32 chr = chars[j]; | |
4078 if (chr != i && chr < bottom || chr > top) { | |
4079 characters->Add(chr); | |
4080 } | |
4081 } | |
4082 } | |
4083 | |
4084 // Step through the equivalent characters finding simple ranges and | |
4085 // adding ranges to the character class. | |
4086 if (characters->length() > 0) { | |
4087 int new_from = characters->at(0); | |
4088 int new_to = new_from; | |
4089 for (int i = 1; i < characters->length(); i++) { | |
4090 int chr = characters->at(i); | |
4091 if (chr == new_to + 1) { | |
4092 new_to++; | |
4093 } else { | |
4094 if (new_to == new_from) { | |
4095 ranges->Add(CharacterRange::Singleton(new_from)); | |
4014 } else { | 4096 } else { |
4015 if (new_to == new_from) { | 4097 ranges->Add(CharacterRange(new_from, new_to)); |
4016 ranges->Add(CharacterRange::Singleton(new_from)); | |
4017 } else { | |
4018 ranges->Add(CharacterRange(new_from, new_to)); | |
4019 } | |
4020 new_from = new_to = chr; | |
4021 } | 4098 } |
4099 new_from = new_to = chr; | |
4022 } | 4100 } |
4023 if (new_to == new_from) { | 4101 } |
4024 ranges->Add(CharacterRange::Singleton(new_from)); | 4102 if (new_to == new_from) { |
4025 } else { | 4103 ranges->Add(CharacterRange::Singleton(new_from)); |
4026 ranges->Add(CharacterRange(new_from, new_to)); | 4104 } else { |
4027 } | 4105 ranges->Add(CharacterRange(new_from, new_to)); |
4028 } | 4106 } |
4029 } | 4107 } |
4030 } | 4108 } |
4031 | 4109 |
4032 | 4110 |
4033 ZoneList<CharacterRange>* CharacterSet::ranges() { | 4111 ZoneList<CharacterRange>* CharacterSet::ranges() { |
4034 if (ranges_ == NULL) { | 4112 if (ranges_ == NULL) { |
4035 ranges_ = new ZoneList<CharacterRange>(2); | 4113 ranges_ = new ZoneList<CharacterRange>(2); |
4036 CharacterRange::AddClassEscape(standard_set_type_, ranges_); | 4114 CharacterRange::AddClassEscape(standard_set_type_, ranges_); |
4037 } | 4115 } |
(...skipping 226 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4264 } else { | 4342 } else { |
4265 cp_offset++; | 4343 cp_offset++; |
4266 Vector<const uc16> quarks = elm.data.u_atom->data(); | 4344 Vector<const uc16> quarks = elm.data.u_atom->data(); |
4267 } | 4345 } |
4268 } | 4346 } |
4269 } | 4347 } |
4270 | 4348 |
4271 | 4349 |
4272 void Analysis::VisitText(TextNode* that) { | 4350 void Analysis::VisitText(TextNode* that) { |
4273 if (ignore_case_) { | 4351 if (ignore_case_) { |
4274 that->MakeCaseIndependent(); | 4352 that->MakeCaseIndependent(is_ascii_); |
4275 } | 4353 } |
4276 EnsureAnalyzed(that->on_success()); | 4354 EnsureAnalyzed(that->on_success()); |
4277 if (!has_failed()) { | 4355 if (!has_failed()) { |
4278 that->CalculateOffsets(); | 4356 that->CalculateOffsets(); |
4279 } | 4357 } |
4280 } | 4358 } |
4281 | 4359 |
4282 | 4360 |
4283 void Analysis::VisitAction(ActionNode* that) { | 4361 void Analysis::VisitAction(ActionNode* that) { |
4284 RegExpNode* target = that->on_success(); | 4362 RegExpNode* target = that->on_success(); |
(...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
4482 ChoiceNode* first_step_node = new ChoiceNode(2); | 4560 ChoiceNode* first_step_node = new ChoiceNode(2); |
4483 first_step_node->AddAlternative(GuardedAlternative(captured_body)); | 4561 first_step_node->AddAlternative(GuardedAlternative(captured_body)); |
4484 first_step_node->AddAlternative(GuardedAlternative( | 4562 first_step_node->AddAlternative(GuardedAlternative( |
4485 new TextNode(new RegExpCharacterClass('*'), loop_node))); | 4563 new TextNode(new RegExpCharacterClass('*'), loop_node))); |
4486 node = first_step_node; | 4564 node = first_step_node; |
4487 } else { | 4565 } else { |
4488 node = loop_node; | 4566 node = loop_node; |
4489 } | 4567 } |
4490 } | 4568 } |
4491 data->node = node; | 4569 data->node = node; |
4492 Analysis analysis(ignore_case); | 4570 Analysis analysis(ignore_case, is_ascii); |
4493 analysis.EnsureAnalyzed(node); | 4571 analysis.EnsureAnalyzed(node); |
4494 if (analysis.has_failed()) { | 4572 if (analysis.has_failed()) { |
4495 const char* error_message = analysis.error_message(); | 4573 const char* error_message = analysis.error_message(); |
4496 return CompilationResult(error_message); | 4574 return CompilationResult(error_message); |
4497 } | 4575 } |
4498 | 4576 |
4499 NodeInfo info = *node->info(); | 4577 NodeInfo info = *node->info(); |
4500 | 4578 |
4501 // Create the correct assembler for the architecture. | 4579 // Create the correct assembler for the architecture. |
4502 #ifdef V8_NATIVE_REGEXP | 4580 #ifdef V8_NATIVE_REGEXP |
(...skipping 17 matching lines...) Expand all Loading... | |
4520 RegExpMacroAssemblerIrregexp macro_assembler(codes); | 4598 RegExpMacroAssemblerIrregexp macro_assembler(codes); |
4521 #endif | 4599 #endif |
4522 | 4600 |
4523 return compiler.Assemble(¯o_assembler, | 4601 return compiler.Assemble(¯o_assembler, |
4524 node, | 4602 node, |
4525 data->capture_count, | 4603 data->capture_count, |
4526 pattern); | 4604 pattern); |
4527 } | 4605 } |
4528 | 4606 |
4529 }} // namespace v8::internal | 4607 }} // namespace v8::internal |
OLD | NEW |