src/jsregexp.cc - Issue 361033: Fix bug 486, Cyrillic character ranges in case independent regexps....

Side by Side Diff: src/jsregexp.cc

Issue 361033: Fix bug 486, Cyrillic character ranges in case independent regexps.... (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/

Patch Set: '' Created 11 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved.	1 // Copyright 2006-2009 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 2422 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2433	2433

2434	2434

2435 void TextNode::MakeCaseIndependent() {	2435 void TextNode::MakeCaseIndependent() {

2436 int element_count = elms_->length();	2436 int element_count = elms_->length();

2437 for (int i = 0; i < element_count; i++) {	2437 for (int i = 0; i < element_count; i++) {

2438 TextElement elm = elms_->at(i);	2438 TextElement elm = elms_->at(i);

2439 if (elm.type == TextElement::CHAR_CLASS) {	2439 if (elm.type == TextElement::CHAR_CLASS) {

2440 RegExpCharacterClass* cc = elm.data.u_char_class;	2440 RegExpCharacterClass* cc = elm.data.u_char_class;

2441 ZoneList<CharacterRange>* ranges = cc->ranges();	2441 ZoneList<CharacterRange>* ranges = cc->ranges();

2442 int range_count = ranges->length();	2442 int range_count = ranges->length();

2443 for (int i = 0; i < range_count; i++) {	2443 for (int j = 0; j < range_count; j++) {

2444 ranges->at(i).AddCaseEquivalents(ranges);	2444 ranges->at(j).AddCaseEquivalents(ranges);

2445 }	2445 }

2446 }	2446 }

2447 }	2447 }

2448 }	2448 }

2449	2449

2450	2450

2451 int TextNode::GreedyLoopTextLength() {	2451 int TextNode::GreedyLoopTextLength() {

2452 TextElement elm = elms_->at(elms_->length() - 1);	2452 TextElement elm = elms_->at(elms_->length() - 1);

2453 if (elm.type == TextElement::CHAR_CLASS) {	2453 if (elm.type == TextElement::CHAR_CLASS) {

2454 return elm.cp_offset + 1;	2454 return elm.cp_offset + 1;

(...skipping 1499 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3954 // The start of the current block. Note that except for the first	3954 // The start of the current block. Note that except for the first

3955 // iteration 'start' is always equal to 'pos'.	3955 // iteration 'start' is always equal to 'pos'.

3956 int start;	3956 int start;

3957 // If it is not the start point of a block the entry contains the	3957 // If it is not the start point of a block the entry contains the

3958 // offset of the character from the start point.	3958 // offset of the character from the start point.

3959 if ((range[0] & kStartMarker) == 0) {	3959 if ((range[0] & kStartMarker) == 0) {

3960 start = pos - range[0];	3960 start = pos - range[0];

3961 } else {	3961 } else {

3962 start = pos;	3962 start = pos;

3963 }	3963 }

3964 // Then we add the ranges on at a time, incrementing the current	3964 // Then we add the ranges one at a time, incrementing the current

3965 // position to be after the last block each time. The position	3965 // position to be after the last block each time. The position

3966 // always points to the start of a block.	3966 // always points to the start of a block.

3967 while (pos < to()) {	3967 while (pos < to()) {

3968 length = canonrange.get(start, '\0', range);	3968 length = canonrange.get(start, '\0', range);

3969 if (length == 0) {	3969 if (length == 0) {

3970 range[0] = start;	3970 range[0] = start;

3971 } else {	3971 } else {

3972 ASSERT_EQ(1, length);	3972 ASSERT_EQ(1, length);

3973 }	3973 }

3974 ASSERT((range[0] & kStartMarker) != 0);	3974 ASSERT((range[0] & kStartMarker) != 0);

3975 // The start point of a block contains the distance to the end	3975 // The start point of a block contains the distance to the end

3976 // of the range.	3976 // of the range.

3977 int block_end = start + (range[0] & kPayloadMask) - 1;	3977 int block_end = start + (range[0] & kPayloadMask) - 1;

3978 int end = (block_end > to()) ? to() : block_end;	3978 int end = (block_end > to()) ? to() : block_end;

3979 length = uncanonicalize.get(start, '\0', range);	3979 length = uncanonicalize.get(start, '\0', range);

3980 for (int i = 0; i < length; i++) {	3980 for (int i = 0; i < length; i++) {

3981 uc32 c = range[i];	3981 uc32 c = range[i];

3982 uc16 range_from = c + (pos - start);	3982 uc16 range_from = c + (pos - start);

3983 uc16 range_to = c + (end - start);	3983 uc16 range_to = c + (end - start);

3984 if (!(from() <= range_from && range_to <= to())) {	3984 if (!(from() <= range_from && range_to <= to())) {

3985 ranges->Add(CharacterRange(range_from, range_to));	3985 ranges->Add(CharacterRange(range_from, range_to));

3986 }	3986 }

3987 }	3987 }

3988 start = pos = block_end + 1;	3988 start = pos = block_end + 1;

3989 }	3989 }

3990 } else {	3990 } else if (from() > 0 || to() < String::kMaxUC16CharCode) {

3991 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.	3991 // Unibrow ranges don't work for high characters due to the "2^11 bug".

	3992 // Therefore we do something dumber for these ranges. We don't bother

	3993 // if the range is 0-max (as encountered at the start of an unanchored

	3994 // regexp).

	3995 ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);

	3996 int bottom = from();

	3997 int top = to();

	3998 for (int i = bottom; i <= top; i++) {

	3999 int length = uncanonicalize.get(i, '\0', chars);

	4000 for (int j = 0; j < length; j++) {

	4001 uc32 chr = chars[j];

	4002 if (chr != i && chr < bottom || chr > top) {

	4003 characters->Add(chr);

	4004 }

	4005 }

	4006 }

	4007 if (characters->length() > 0) {

	4008 int new_from = characters->at(0);

	4009 int new_to = new_from;

	4010 for (int i = 1; i < characters->length(); i++) {

	4011 int chr = characters->at(i);

	4012 if (chr == new_to + 1) {

	4013 new_to++;

	4014 } else {

	4015 if (new_to == new_from) {

	4016 ranges->Add(CharacterRange::Singleton(new_from));

	4017 } else {

	4018 ranges->Add(CharacterRange(new_from, new_to));

	4019 }

	4020 new_from = new_to = chr;

	4021 }

	4022 }

	4023 if (new_to == new_from) {

	4024 ranges->Add(CharacterRange::Singleton(new_from));

	4025 } else {

	4026 ranges->Add(CharacterRange(new_from, new_to));

	4027 }

	4028 }

3992 }	4029 }

3993 }	4030 }

3994	4031

3995	4032

3996 ZoneList<CharacterRange>* CharacterSet::ranges() {	4033 ZoneList<CharacterRange>* CharacterSet::ranges() {

3997 if (ranges_ == NULL) {	4034 if (ranges_ == NULL) {

3998 ranges_ = new ZoneList<CharacterRange>(2);	4035 ranges_ = new ZoneList<CharacterRange>(2);

3999 CharacterRange::AddClassEscape(standard_set_type_, ranges_);	4036 CharacterRange::AddClassEscape(standard_set_type_, ranges_);

4000 }	4037 }

4001 return ranges_;	4038 return ranges_;

(...skipping 481 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
4483 RegExpMacroAssemblerIrregexp macro_assembler(codes);	4520 RegExpMacroAssemblerIrregexp macro_assembler(codes);

4484 #endif	4521 #endif

4485	4522

4486 return compiler.Assemble(&macro_assembler,	4523 return compiler.Assemble(&macro_assembler,

4487 node,	4524 node,

4488 data->capture_count,	4525 data->capture_count,

4489 pattern);	4526 pattern);

4490 }	4527 }

4491	4528

4492 }} // namespace v8::internal	4529 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « no previous file | test/mjsunit/cyrillic.js » ('j') | test/mjsunit/regress/regress-486.js » ('J')