| OLD | NEW |
| 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 2414 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2425 quick_check_performed_.Advance(by, compiler->ascii()); | 2425 quick_check_performed_.Advance(by, compiler->ascii()); |
| 2426 cp_offset_ += by; | 2426 cp_offset_ += by; |
| 2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { | 2427 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) { |
| 2428 compiler->SetRegExpTooBig(); | 2428 compiler->SetRegExpTooBig(); |
| 2429 cp_offset_ = 0; | 2429 cp_offset_ = 0; |
| 2430 } | 2430 } |
| 2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); | 2431 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by); |
| 2432 } | 2432 } |
| 2433 | 2433 |
| 2434 | 2434 |
| 2435 void TextNode::MakeCaseIndependent() { | 2435 void TextNode::MakeCaseIndependent(bool is_ascii) { |
| 2436 int element_count = elms_->length(); | 2436 int element_count = elms_->length(); |
| 2437 for (int i = 0; i < element_count; i++) { | 2437 for (int i = 0; i < element_count; i++) { |
| 2438 TextElement elm = elms_->at(i); | 2438 TextElement elm = elms_->at(i); |
| 2439 if (elm.type == TextElement::CHAR_CLASS) { | 2439 if (elm.type == TextElement::CHAR_CLASS) { |
| 2440 RegExpCharacterClass* cc = elm.data.u_char_class; | 2440 RegExpCharacterClass* cc = elm.data.u_char_class; |
| 2441 // None of the standard character classes is different in the case |
| 2442 // independent case and it slows us down if we don't know that. |
| 2443 if (cc->is_standard()) continue; |
| 2441 ZoneList<CharacterRange>* ranges = cc->ranges(); | 2444 ZoneList<CharacterRange>* ranges = cc->ranges(); |
| 2442 int range_count = ranges->length(); | 2445 int range_count = ranges->length(); |
| 2443 for (int i = 0; i < range_count; i++) { | 2446 for (int j = 0; j < range_count; j++) { |
| 2444 ranges->at(i).AddCaseEquivalents(ranges); | 2447 ranges->at(j).AddCaseEquivalents(ranges, is_ascii); |
| 2445 } | 2448 } |
| 2446 } | 2449 } |
| 2447 } | 2450 } |
| 2448 } | 2451 } |
| 2449 | 2452 |
| 2450 | 2453 |
| 2451 int TextNode::GreedyLoopTextLength() { | 2454 int TextNode::GreedyLoopTextLength() { |
| 2452 TextElement elm = elms_->at(elms_->length() - 1); | 2455 TextElement elm = elms_->at(elms_->length() - 1); |
| 2453 if (elm.type == TextElement::CHAR_CLASS) { | 2456 if (elm.type == TextElement::CHAR_CLASS) { |
| 2454 return elm.cp_offset + 1; | 2457 return elm.cp_offset + 1; |
| (...skipping 1450 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3905 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); | 3908 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase); |
| 3906 for (int i = 0; i < overlay.length(); i += 2) { | 3909 for (int i = 0; i < overlay.length(); i += 2) { |
| 3907 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), | 3910 table.AddRange(CharacterRange(overlay[i], overlay[i+1]), |
| 3908 CharacterRangeSplitter::kInOverlay); | 3911 CharacterRangeSplitter::kInOverlay); |
| 3909 } | 3912 } |
| 3910 CharacterRangeSplitter callback(included, excluded); | 3913 CharacterRangeSplitter callback(included, excluded); |
| 3911 table.ForEach(&callback); | 3914 table.ForEach(&callback); |
| 3912 } | 3915 } |
| 3913 | 3916 |
| 3914 | 3917 |
| 3915 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) { | 3918 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
| 3919 int bottom, |
| 3920 int top); |
| 3921 |
| 3922 |
| 3923 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges, |
| 3924 bool is_ascii) { |
| 3925 uc16 bottom = from(); |
| 3926 uc16 top = to(); |
| 3927 if (is_ascii) { |
| 3928 if (bottom > String::kMaxAsciiCharCode) return; |
| 3929 if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode; |
| 3930 } |
| 3916 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 3931 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 3917 if (IsSingleton()) { | 3932 if (top == bottom) { |
| 3918 // If this is a singleton we just expand the one character. | 3933 // If this is a singleton we just expand the one character. |
| 3919 int length = uncanonicalize.get(from(), '\0', chars); | 3934 int length = uncanonicalize.get(bottom, '\0', chars); |
| 3920 for (int i = 0; i < length; i++) { | 3935 for (int i = 0; i < length; i++) { |
| 3921 uc32 chr = chars[i]; | 3936 uc32 chr = chars[i]; |
| 3922 if (chr != from()) { | 3937 if (chr != bottom) { |
| 3923 ranges->Add(CharacterRange::Singleton(chars[i])); | 3938 ranges->Add(CharacterRange::Singleton(chars[i])); |
| 3924 } | 3939 } |
| 3925 } | 3940 } |
| 3926 } else if (from() <= kRangeCanonicalizeMax && | 3941 } else if (bottom <= kRangeCanonicalizeMax && |
| 3927 to() <= kRangeCanonicalizeMax) { | 3942 top <= kRangeCanonicalizeMax) { |
| 3928 // If this is a range we expand the characters block by block, | 3943 // If this is a range we expand the characters block by block, |
| 3929 // expanding contiguous subranges (blocks) one at a time. | 3944 // expanding contiguous subranges (blocks) one at a time. |
| 3930 // The approach is as follows. For a given start character we | 3945 // The approach is as follows. For a given start character we |
| 3931 // look up the block that contains it, for instance 'a' if the | 3946 // look up the block that contains it, for instance 'a' if the |
| 3932 // start character is 'c'. A block is characterized by the property | 3947 // start character is 'c'. A block is characterized by the property |
| 3933 // that all characters uncanonicalize in the same way as the first | 3948 // that all characters uncanonicalize in the same way as the first |
| 3934 // element, except that each entry in the result is incremented | 3949 // element, except that each entry in the result is incremented |
| 3935 // by the distance from the first element. So a-z is a block | 3950 // by the distance from the first element. So a-z is a block |
| 3936 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter | 3951 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter |
| 3937 // uncanonicalizes to ['a' + k, 'A' + k]. | 3952 // uncanonicalizes to ['a' + k, 'A' + k]. |
| 3938 // Once we've found the start point we look up its uncanonicalization | 3953 // Once we've found the start point we look up its uncanonicalization |
| 3939 // and produce a range for each element. For instance for [c-f] | 3954 // and produce a range for each element. For instance for [c-f] |
| 3940 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only | 3955 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only |
| 3941 // add a range if it is not already contained in the input, so [c-f] | 3956 // add a range if it is not already contained in the input, so [c-f] |
| 3942 // will be skipped but [C-F] will be added. If this range is not | 3957 // will be skipped but [C-F] will be added. If this range is not |
| 3943 // completely contained in a block we do this for all the blocks | 3958 // completely contained in a block we do this for all the blocks |
| 3944 // covered by the range. | 3959 // covered by the range. |
| 3945 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 3960 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 3946 // First, look up the block that contains the 'from' character. | 3961 // First, look up the block that contains the 'bottom' character. |
| 3947 int length = canonrange.get(from(), '\0', range); | 3962 int length = canonrange.get(bottom, '\0', range); |
| 3948 if (length == 0) { | 3963 if (length == 0) { |
| 3949 range[0] = from(); | 3964 range[0] = bottom; |
| 3950 } else { | 3965 } else { |
| 3951 ASSERT_EQ(1, length); | 3966 ASSERT_EQ(1, length); |
| 3952 } | 3967 } |
| 3953 int pos = from(); | 3968 int pos = bottom; |
| 3954 // The start of the current block. Note that except for the first | 3969 // The start of the current block. Note that except for the first |
| 3955 // iteration 'start' is always equal to 'pos'. | 3970 // iteration 'start' is always equal to 'pos'. |
| 3956 int start; | 3971 int start; |
| 3957 // If it is not the start point of a block the entry contains the | 3972 // If it is not the start point of a block the entry contains the |
| 3958 // offset of the character from the start point. | 3973 // offset of the character from the start point. |
| 3959 if ((range[0] & kStartMarker) == 0) { | 3974 if ((range[0] & kStartMarker) == 0) { |
| 3960 start = pos - range[0]; | 3975 start = pos - range[0]; |
| 3961 } else { | 3976 } else { |
| 3962 start = pos; | 3977 start = pos; |
| 3963 } | 3978 } |
| 3964 // Then we add the ranges on at a time, incrementing the current | 3979 // Then we add the ranges one at a time, incrementing the current |
| 3965 // position to be after the last block each time. The position | 3980 // position to be after the last block each time. The position |
| 3966 // always points to the start of a block. | 3981 // always points to the start of a block. |
| 3967 while (pos < to()) { | 3982 while (pos < top) { |
| 3968 length = canonrange.get(start, '\0', range); | 3983 length = canonrange.get(start, '\0', range); |
| 3969 if (length == 0) { | 3984 if (length == 0) { |
| 3970 range[0] = start; | 3985 range[0] = start; |
| 3971 } else { | 3986 } else { |
| 3972 ASSERT_EQ(1, length); | 3987 ASSERT_EQ(1, length); |
| 3973 } | 3988 } |
| 3974 ASSERT((range[0] & kStartMarker) != 0); | 3989 ASSERT((range[0] & kStartMarker) != 0); |
| 3975 // The start point of a block contains the distance to the end | 3990 // The start point of a block contains the distance to the end |
| 3976 // of the range. | 3991 // of the range. |
| 3977 int block_end = start + (range[0] & kPayloadMask) - 1; | 3992 int block_end = start + (range[0] & kPayloadMask) - 1; |
| 3978 int end = (block_end > to()) ? to() : block_end; | 3993 int end = (block_end > top) ? top : block_end; |
| 3979 length = uncanonicalize.get(start, '\0', range); | 3994 length = uncanonicalize.get(start, '\0', range); |
| 3980 for (int i = 0; i < length; i++) { | 3995 for (int i = 0; i < length; i++) { |
| 3981 uc32 c = range[i]; | 3996 uc32 c = range[i]; |
| 3982 uc16 range_from = c + (pos - start); | 3997 uc16 range_from = c + (pos - start); |
| 3983 uc16 range_to = c + (end - start); | 3998 uc16 range_to = c + (end - start); |
| 3984 if (!(from() <= range_from && range_to <= to())) { | 3999 if (!(bottom <= range_from && range_to <= top)) { |
| 3985 ranges->Add(CharacterRange(range_from, range_to)); | 4000 ranges->Add(CharacterRange(range_from, range_to)); |
| 3986 } | 4001 } |
| 3987 } | 4002 } |
| 3988 start = pos = block_end + 1; | 4003 start = pos = block_end + 1; |
| 3989 } | 4004 } |
| 3990 } else { | 4005 } else { |
| 3991 // TODO(plesner) when we've fixed the 2^11 bug in unibrow. | 4006 // Unibrow ranges don't work for high characters due to the "2^11 bug". |
| 4007 // Therefore we do something dumber for these ranges. |
| 4008 AddUncanonicals(ranges, bottom, top); |
| 4009 } |
| 4010 } |
| 4011 |
| 4012 |
| 4013 static void AddUncanonicals(ZoneList<CharacterRange>* ranges, |
| 4014 int bottom, |
| 4015 int top) { |
| 4016 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 4017 // Zones with no case mappings. There is a DEBUG-mode loop to assert that |
| 4018 // this table is correct. |
| 4019 // 0x0600 - 0x0fff |
| 4020 // 0x1100 - 0x1cff |
| 4021 // 0x2000 - 0x20ff |
| 4022 // 0x2200 - 0x23ff |
| 4023 // 0x2500 - 0x2bff |
| 4024 // 0x2e00 - 0xa5ff |
| 4025 // 0xa800 - 0xfaff |
| 4026 // 0xfc00 - 0xfeff |
| 4027 const int boundary_count = 18; |
| 4028 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this |
| 4029 // array. This is to split up big ranges and not because they actually denote |
| 4030 // a case-mapping-free-zone. |
| 4031 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); |
| 4032 const int kFirstRealCaselessZoneIndex = 2; |
| 4033 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, |
| 4034 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, |
| 4035 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; |
| 4036 |
| 4037 // Special ASCII rule from spec can save us some work here. |
| 4038 if (bottom == 0x80 && top == 0xffff) return; |
| 4039 |
| 4040 // We have optimized support for this range. |
| 4041 if (top <= CharacterRange::kRangeCanonicalizeMax) { |
| 4042 CharacterRange range(bottom, top); |
| 4043 range.AddCaseEquivalents(ranges, false); |
| 4044 return; |
| 4045 } |
| 4046 |
| 4047 // Split up very large ranges. This helps remove ranges where there are no |
| 4048 // case mappings. |
| 4049 for (int i = 0; i < boundary_count; i++) { |
| 4050 if (bottom < boundaries[i] && top >= boundaries[i]) { |
| 4051 AddUncanonicals(ranges, bottom, boundaries[i] - 1); |
| 4052 AddUncanonicals(ranges, boundaries[i], top); |
| 4053 return; |
| 4054 } |
| 4055 } |
| 4056 |
| 4057 // If we are completely in a zone with no case mappings then we are done. |
| 4058 // We start at 2 so as not to exclude the ASCII range from mappings. |
| 4059 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { |
| 4060 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { |
| 4061 #ifdef DEBUG |
| 4062 for (int j = bottom; j <= top; j++) { |
| 4063 unsigned current_char = j; |
| 4064 int length = uncanonicalize.get(current_char, '\0', chars); |
| 4065 for (int k = 0; k < length; k++) { |
| 4066 ASSERT(chars[k] == current_char); |
| 4067 } |
| 4068 } |
| 4069 #endif |
| 4070 return; |
| 4071 } |
| 4072 } |
| 4073 |
| 4074 // Step through the range finding equivalent characters. |
| 4075 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100); |
| 4076 for (int i = bottom; i <= top; i++) { |
| 4077 int length = uncanonicalize.get(i, '\0', chars); |
| 4078 for (int j = 0; j < length; j++) { |
| 4079 uc32 chr = chars[j]; |
| 4080 if (chr != i && (chr < bottom || chr > top)) { |
| 4081 characters->Add(chr); |
| 4082 } |
| 4083 } |
| 4084 } |
| 4085 |
| 4086 // Step through the equivalent characters finding simple ranges and |
| 4087 // adding ranges to the character class. |
| 4088 if (characters->length() > 0) { |
| 4089 int new_from = characters->at(0); |
| 4090 int new_to = new_from; |
| 4091 for (int i = 1; i < characters->length(); i++) { |
| 4092 int chr = characters->at(i); |
| 4093 if (chr == new_to + 1) { |
| 4094 new_to++; |
| 4095 } else { |
| 4096 if (new_to == new_from) { |
| 4097 ranges->Add(CharacterRange::Singleton(new_from)); |
| 4098 } else { |
| 4099 ranges->Add(CharacterRange(new_from, new_to)); |
| 4100 } |
| 4101 new_from = new_to = chr; |
| 4102 } |
| 4103 } |
| 4104 if (new_to == new_from) { |
| 4105 ranges->Add(CharacterRange::Singleton(new_from)); |
| 4106 } else { |
| 4107 ranges->Add(CharacterRange(new_from, new_to)); |
| 4108 } |
| 3992 } | 4109 } |
| 3993 } | 4110 } |
| 3994 | 4111 |
| 3995 | 4112 |
| 3996 ZoneList<CharacterRange>* CharacterSet::ranges() { | 4113 ZoneList<CharacterRange>* CharacterSet::ranges() { |
| 3997 if (ranges_ == NULL) { | 4114 if (ranges_ == NULL) { |
| 3998 ranges_ = new ZoneList<CharacterRange>(2); | 4115 ranges_ = new ZoneList<CharacterRange>(2); |
| 3999 CharacterRange::AddClassEscape(standard_set_type_, ranges_); | 4116 CharacterRange::AddClassEscape(standard_set_type_, ranges_); |
| 4000 } | 4117 } |
| 4001 return ranges_; | 4118 return ranges_; |
| (...skipping 225 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4227 } else { | 4344 } else { |
| 4228 cp_offset++; | 4345 cp_offset++; |
| 4229 Vector<const uc16> quarks = elm.data.u_atom->data(); | 4346 Vector<const uc16> quarks = elm.data.u_atom->data(); |
| 4230 } | 4347 } |
| 4231 } | 4348 } |
| 4232 } | 4349 } |
| 4233 | 4350 |
| 4234 | 4351 |
| 4235 void Analysis::VisitText(TextNode* that) { | 4352 void Analysis::VisitText(TextNode* that) { |
| 4236 if (ignore_case_) { | 4353 if (ignore_case_) { |
| 4237 that->MakeCaseIndependent(); | 4354 that->MakeCaseIndependent(is_ascii_); |
| 4238 } | 4355 } |
| 4239 EnsureAnalyzed(that->on_success()); | 4356 EnsureAnalyzed(that->on_success()); |
| 4240 if (!has_failed()) { | 4357 if (!has_failed()) { |
| 4241 that->CalculateOffsets(); | 4358 that->CalculateOffsets(); |
| 4242 } | 4359 } |
| 4243 } | 4360 } |
| 4244 | 4361 |
| 4245 | 4362 |
| 4246 void Analysis::VisitAction(ActionNode* that) { | 4363 void Analysis::VisitAction(ActionNode* that) { |
| 4247 RegExpNode* target = that->on_success(); | 4364 RegExpNode* target = that->on_success(); |
| (...skipping 197 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4445 ChoiceNode* first_step_node = new ChoiceNode(2); | 4562 ChoiceNode* first_step_node = new ChoiceNode(2); |
| 4446 first_step_node->AddAlternative(GuardedAlternative(captured_body)); | 4563 first_step_node->AddAlternative(GuardedAlternative(captured_body)); |
| 4447 first_step_node->AddAlternative(GuardedAlternative( | 4564 first_step_node->AddAlternative(GuardedAlternative( |
| 4448 new TextNode(new RegExpCharacterClass('*'), loop_node))); | 4565 new TextNode(new RegExpCharacterClass('*'), loop_node))); |
| 4449 node = first_step_node; | 4566 node = first_step_node; |
| 4450 } else { | 4567 } else { |
| 4451 node = loop_node; | 4568 node = loop_node; |
| 4452 } | 4569 } |
| 4453 } | 4570 } |
| 4454 data->node = node; | 4571 data->node = node; |
| 4455 Analysis analysis(ignore_case); | 4572 Analysis analysis(ignore_case, is_ascii); |
| 4456 analysis.EnsureAnalyzed(node); | 4573 analysis.EnsureAnalyzed(node); |
| 4457 if (analysis.has_failed()) { | 4574 if (analysis.has_failed()) { |
| 4458 const char* error_message = analysis.error_message(); | 4575 const char* error_message = analysis.error_message(); |
| 4459 return CompilationResult(error_message); | 4576 return CompilationResult(error_message); |
| 4460 } | 4577 } |
| 4461 | 4578 |
| 4462 NodeInfo info = *node->info(); | 4579 NodeInfo info = *node->info(); |
| 4463 | 4580 |
| 4464 // Create the correct assembler for the architecture. | 4581 // Create the correct assembler for the architecture. |
| 4465 #ifdef V8_NATIVE_REGEXP | 4582 #ifdef V8_NATIVE_REGEXP |
| (...skipping 17 matching lines...) Expand all Loading... |
| 4483 RegExpMacroAssemblerIrregexp macro_assembler(codes); | 4600 RegExpMacroAssemblerIrregexp macro_assembler(codes); |
| 4484 #endif | 4601 #endif |
| 4485 | 4602 |
| 4486 return compiler.Assemble(¯o_assembler, | 4603 return compiler.Assemble(¯o_assembler, |
| 4487 node, | 4604 node, |
| 4488 data->capture_count, | 4605 data->capture_count, |
| 4489 pattern); | 4606 pattern); |
| 4490 } | 4607 } |
| 4491 | 4608 |
| 4492 }} // namespace v8::internal | 4609 }} // namespace v8::internal |
| OLD | NEW |