src/jsregexp.cc - Issue 3030026: Updated unicode library.

Side by Side Diff: src/jsregexp.cc

Issue 3030026: Updated unicode library. (Closed)

Patch Set: Removed outdated comments. Created 10 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved.	1 // Copyright 2006-2009 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 1254 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1265 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;	1265 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;

1266 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;	1266 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;

1267	1267

1268	1268

1269 // Returns the number of characters in the equivalence class, omitting those	1269 // Returns the number of characters in the equivalence class, omitting those

1270 // that cannot occur in the source string because it is ASCII.	1270 // that cannot occur in the source string because it is ASCII.

1271 static int GetCaseIndependentLetters(uc16 character,	1271 static int GetCaseIndependentLetters(uc16 character,

1272 bool ascii_subject,	1272 bool ascii_subject,

1273 unibrow::uchar* letters) {	1273 unibrow::uchar* letters) {

1274 int length = uncanonicalize.get(character, '\0', letters);	1274 int length = uncanonicalize.get(character, '\0', letters);

1275 // Unibrow returns 0 or 1 for characters where case independependence is	1275 // Unibrow returns 0 or 1 for characters where case independence is

1276 // trivial.	1276 // trivial.

1277 if (length == 0) {	1277 if (length == 0) {

1278 letters[0] = character;	1278 letters[0] = character;

1279 length = 1;	1279 length = 1;

1280 }	1280 }

1281 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {	1281 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {

1282 return length;	1282 return length;

1283 }	1283 }

1284 // The standard requires that non-ASCII characters cannot have ASCII	1284 // The standard requires that non-ASCII characters cannot have ASCII

1285 // character codes in their equivalence class.	1285 // character codes in their equivalence class.

(...skipping 2733 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4019 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];	4019 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];

4020 if (top == bottom) {	4020 if (top == bottom) {

4021 // If this is a singleton we just expand the one character.	4021 // If this is a singleton we just expand the one character.

4022 int length = uncanonicalize.get(bottom, '\0', chars);	4022 int length = uncanonicalize.get(bottom, '\0', chars);

4023 for (int i = 0; i < length; i++) {	4023 for (int i = 0; i < length; i++) {

4024 uc32 chr = chars[i];	4024 uc32 chr = chars[i];

4025 if (chr != bottom) {	4025 if (chr != bottom) {

4026 ranges->Add(CharacterRange::Singleton(chars[i]));	4026 ranges->Add(CharacterRange::Singleton(chars[i]));

4027 }	4027 }

4028 }	4028 }

4029 } else if (bottom <= kRangeCanonicalizeMax &&	4029 } else {

4030 top <= kRangeCanonicalizeMax) {

4031 // If this is a range we expand the characters block by block,	4030 // If this is a range we expand the characters block by block,

4032 // expanding contiguous subranges (blocks) one at a time.	4031 // expanding contiguous subranges (blocks) one at a time.

4033 // The approach is as follows. For a given start character we	4032 // The approach is as follows. For a given start character we

4034 // look up the block that contains it, for instance 'a' if the	4033 // look up the remainder of the block that contains it (represented

4035 // start character is 'c'. A block is characterized by the property	4034 // by the end point), for instance we find 'z' if the character

4036 // that all characters uncanonicalize in the same way as the first	4035 // is 'c'. A block is characterized by the property

4037 // element, except that each entry in the result is incremented	4036 // that all characters uncanonicalize in the same way, except that

4038 // by the distance from the first element. So a-z is a block	4037 // each entry in the result is incremented by the distance from the first

4039 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter	4038 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and

4040 // uncanonicalizes to ['a' + k, 'A' + k].	4039 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].

4041 // Once we've found the start point we look up its uncanonicalization	4040 // Once we've found the end point we look up its uncanonicalization

4042 // and produce a range for each element. For instance for [c-f]	4041 // and produce a range for each element. For instance for [c-f]

4043 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only	4042 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only

4044 // add a range if it is not already contained in the input, so [c-f]	4043 // add a range if it is not already contained in the input, so [c-f]

4045 // will be skipped but [C-F] will be added. If this range is not	4044 // will be skipped but [C-F] will be added. If this range is not

4046 // completely contained in a block we do this for all the blocks	4045 // completely contained in a block we do this for all the blocks

4047 // covered by the range.	4046 // covered by the range (handling characters that are not in a block

	4047 // as a "singleton block").

4048 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];	4048 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];

4049 // First, look up the block that contains the 'bottom' character.

4050 int length = canonrange.get(bottom, '\0', range);

4051 if (length == 0) {

4052 range[0] = bottom;

4053 } else {

4054 ASSERT_EQ(1, length);

4055 }

4056 int pos = bottom;	4049 int pos = bottom;

4057 // The start of the current block. Note that except for the first

4058 // iteration 'start' is always equal to 'pos'.

4059 int start;

4060 // If it is not the start point of a block the entry contains the

4061 // offset of the character from the start point.

4062 if ((range[0] & kStartMarker) == 0) {

4063 start = pos - range[0];

4064 } else {

4065 start = pos;

4066 }

4067 // Then we add the ranges one at a time, incrementing the current

4068 // position to be after the last block each time. The position

4069 // always points to the start of a block.

4070 while (pos < top) {	4050 while (pos < top) {

4071 length = canonrange.get(start, '\0', range);	4051 int length = canonrange.get(pos, '\0', range);

	4052 uc16 block_end;

4072 if (length == 0) {	4053 if (length == 0) {

4073 range[0] = start;	4054 block_end = pos;

4074 } else {	4055 } else {

4075 ASSERT_EQ(1, length);	4056 ASSERT_EQ(1, length);

	4057 block_end = range[0];

4076 }	4058 }

4077 ASSERT((range[0] & kStartMarker) != 0);

4078 // The start point of a block contains the distance to the end

4079 // of the range.

4080 int block_end = start + (range[0] & kPayloadMask) - 1;

4081 int end = (block_end > top) ? top : block_end;	4059 int end = (block_end > top) ? top : block_end;

4082 length = uncanonicalize.get(start, '\0', range);	4060 length = uncanonicalize.get(block_end, '\0', range);

4083 for (int i = 0; i < length; i++) {	4061 for (int i = 0; i < length; i++) {

4084 uc32 c = range[i];	4062 uc32 c = range[i];

4085 uc16 range_from = c + (pos - start);	4063 uc16 range_from = c - (block_end - pos);

4086 uc16 range_to = c + (end - start);	4064 uc16 range_to = c - (block_end - end);

4087 if (!(bottom <= range_from && range_to <= top)) {	4065 if (!(bottom <= range_from && range_to <= top)) {

4088 ranges->Add(CharacterRange(range_from, range_to));	4066 ranges->Add(CharacterRange(range_from, range_to));

4089 }	4067 }

4090 }	4068 }

4091 start = pos = block_end + 1;	4069 pos = end + 1;

4092 }	4070 }

4093 } else {

4094 // Unibrow ranges don't work for high characters due to the "2^11 bug".

4095 // Therefore we do something dumber for these ranges.

4096 AddUncanonicals(ranges, bottom, top);

4097 }	4071 }

4098 }	4072 }

4099	4073

4100	4074

4101 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {	4075 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {

4102 ASSERT_NOT_NULL(ranges);	4076 ASSERT_NOT_NULL(ranges);

4103 int n = ranges->length();	4077 int n = ranges->length();

4104 if (n <= 1) return true;	4078 if (n <= 1) return true;

4105 int max = ranges->at(0).to();	4079 int max = ranges->at(0).to();

4106 for (int i = 1; i < n; i++) {	4080 for (int i = 1; i < n; i++) {

(...skipping 94 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4201 // this table is correct.	4175 // this table is correct.

4202 // 0x0600 - 0x0fff	4176 // 0x0600 - 0x0fff

4203 // 0x1100 - 0x1cff	4177 // 0x1100 - 0x1cff

4204 // 0x2000 - 0x20ff	4178 // 0x2000 - 0x20ff

4205 // 0x2200 - 0x23ff	4179 // 0x2200 - 0x23ff

4206 // 0x2500 - 0x2bff	4180 // 0x2500 - 0x2bff

4207 // 0x2e00 - 0xa5ff	4181 // 0x2e00 - 0xa5ff

4208 // 0xa800 - 0xfaff	4182 // 0xa800 - 0xfaff

4209 // 0xfc00 - 0xfeff	4183 // 0xfc00 - 0xfeff

4210 const int boundary_count = 18;	4184 const int boundary_count = 18;

4211 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this	4185 int boundaries[] = {

4212 // array. This is to split up big ranges and not because they actually denote

4213 // a case-mapping-free-zone.

4214 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);

4215 const int kFirstRealCaselessZoneIndex = 2;

4216 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,

4217 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,	4186 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,

4218 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};	4187 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};

4219	4188

4220 // Special ASCII rule from spec can save us some work here.	4189 // Special ASCII rule from spec can save us some work here.

4221 if (bottom == 0x80 && top == 0xffff) return;	4190 if (bottom == 0x80 && top == 0xffff) return;

4222	4191

4223 // We have optimized support for this range.	4192 if (top <= boundaries[0]) {

4224 if (top <= CharacterRange::kRangeCanonicalizeMax) {

4225 CharacterRange range(bottom, top);	4193 CharacterRange range(bottom, top);

4226 range.AddCaseEquivalents(ranges, false);	4194 range.AddCaseEquivalents(ranges, false);

4227 return;	4195 return;

4228 }	4196 }

4229	4197

4230 // Split up very large ranges. This helps remove ranges where there are no	4198 // Split up very large ranges. This helps remove ranges where there are no

4231 // case mappings.	4199 // case mappings.

4232 for (int i = 0; i < boundary_count; i++) {	4200 for (int i = 0; i < boundary_count; i++) {

4233 if (bottom < boundaries[i] && top >= boundaries[i]) {	4201 if (bottom < boundaries[i] && top >= boundaries[i]) {

4234 AddUncanonicals(ranges, bottom, boundaries[i] - 1);	4202 AddUncanonicals(ranges, bottom, boundaries[i] - 1);

4235 AddUncanonicals(ranges, boundaries[i], top);	4203 AddUncanonicals(ranges, boundaries[i], top);

4236 return;	4204 return;

4237 }	4205 }

4238 }	4206 }

4239	4207

4240 // If we are completely in a zone with no case mappings then we are done.	4208 // If we are completely in a zone with no case mappings then we are done.

4241 // We start at 2 so as not to except the ASCII range from mappings.	4209 for (int i = 0; i < boundary_count; i += 2) {

4242 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {

4243 if (bottom >= boundaries[i] && top < boundaries[i + 1]) {	4210 if (bottom >= boundaries[i] && top < boundaries[i + 1]) {

4244 #ifdef DEBUG	4211 #ifdef DEBUG

4245 for (int j = bottom; j <= top; j++) {	4212 for (int j = bottom; j <= top; j++) {

4246 unsigned current_char = j;	4213 unsigned current_char = j;

4247 int length = uncanonicalize.get(current_char, '\0', chars);	4214 int length = uncanonicalize.get(current_char, '\0', chars);

4248 for (int k = 0; k < length; k++) {	4215 for (int k = 0; k < length; k++) {

4249 ASSERT(chars[k] == current_char);	4216 ASSERT(chars[k] == current_char);

4250 }	4217 }

4251 }	4218 }

4252 #endif	4219 #endif

(...skipping 1021 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5274 node,	5241 node,

5275 data->capture_count,	5242 data->capture_count,

5276 pattern);	5243 pattern);

5277 }	5244 }

5278	5245

5279	5246

5280 int OffsetsVector::static_offsets_vector_[	5247 int OffsetsVector::static_offsets_vector_[

5281 OffsetsVector::kStaticOffsetsVectorSize];	5248 OffsetsVector::kStaticOffsetsVectorSize];

5282	5249

5283 }} // namespace v8::internal	5250 }} // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »