Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(595)

Side by Side Diff: src/jsregexp.cc

Issue 3030026: Updated unicode library. (Closed)
Patch Set: Removed outdated comments. Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved. 1 // Copyright 2006-2009 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 1254 matching lines...) Expand 10 before | Expand all | Expand 10 after
1265 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; 1265 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1266 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; 1266 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1267 1267
1268 1268
1269 // Returns the number of characters in the equivalence class, omitting those 1269 // Returns the number of characters in the equivalence class, omitting those
1270 // that cannot occur in the source string because it is ASCII. 1270 // that cannot occur in the source string because it is ASCII.
1271 static int GetCaseIndependentLetters(uc16 character, 1271 static int GetCaseIndependentLetters(uc16 character,
1272 bool ascii_subject, 1272 bool ascii_subject,
1273 unibrow::uchar* letters) { 1273 unibrow::uchar* letters) {
1274 int length = uncanonicalize.get(character, '\0', letters); 1274 int length = uncanonicalize.get(character, '\0', letters);
1275 // Unibrow returns 0 or 1 for characters where case independependence is 1275 // Unibrow returns 0 or 1 for characters where case independence is
1276 // trivial. 1276 // trivial.
1277 if (length == 0) { 1277 if (length == 0) {
1278 letters[0] = character; 1278 letters[0] = character;
1279 length = 1; 1279 length = 1;
1280 } 1280 }
1281 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { 1281 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1282 return length; 1282 return length;
1283 } 1283 }
1284 // The standard requires that non-ASCII characters cannot have ASCII 1284 // The standard requires that non-ASCII characters cannot have ASCII
1285 // character codes in their equivalence class. 1285 // character codes in their equivalence class.
(...skipping 2733 matching lines...) Expand 10 before | Expand all | Expand 10 after
4019 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4019 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4020 if (top == bottom) { 4020 if (top == bottom) {
4021 // If this is a singleton we just expand the one character. 4021 // If this is a singleton we just expand the one character.
4022 int length = uncanonicalize.get(bottom, '\0', chars); 4022 int length = uncanonicalize.get(bottom, '\0', chars);
4023 for (int i = 0; i < length; i++) { 4023 for (int i = 0; i < length; i++) {
4024 uc32 chr = chars[i]; 4024 uc32 chr = chars[i];
4025 if (chr != bottom) { 4025 if (chr != bottom) {
4026 ranges->Add(CharacterRange::Singleton(chars[i])); 4026 ranges->Add(CharacterRange::Singleton(chars[i]));
4027 } 4027 }
4028 } 4028 }
4029 } else if (bottom <= kRangeCanonicalizeMax && 4029 } else {
4030 top <= kRangeCanonicalizeMax) {
4031 // If this is a range we expand the characters block by block, 4030 // If this is a range we expand the characters block by block,
4032 // expanding contiguous subranges (blocks) one at a time. 4031 // expanding contiguous subranges (blocks) one at a time.
4033 // The approach is as follows. For a given start character we 4032 // The approach is as follows. For a given start character we
4034 // look up the block that contains it, for instance 'a' if the 4033 // look up the remainder of the block that contains it (represented
4035 // start character is 'c'. A block is characterized by the property 4034 // by the end point), for instance we find 'z' if the character
4036 // that all characters uncanonicalize in the same way as the first 4035 // is 'c'. A block is characterized by the property
4037 // element, except that each entry in the result is incremented 4036 // that all characters uncanonicalize in the same way, except that
4038 // by the distance from the first element. So a-z is a block 4037 // each entry in the result is incremented by the distance from the first
4039 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter 4038 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
4040 // uncanonicalizes to ['a' + k, 'A' + k]. 4039 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4041 // Once we've found the start point we look up its uncanonicalization 4040 // Once we've found the end point we look up its uncanonicalization
4042 // and produce a range for each element. For instance for [c-f] 4041 // and produce a range for each element. For instance for [c-f]
4043 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only 4042 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
4044 // add a range if it is not already contained in the input, so [c-f] 4043 // add a range if it is not already contained in the input, so [c-f]
4045 // will be skipped but [C-F] will be added. If this range is not 4044 // will be skipped but [C-F] will be added. If this range is not
4046 // completely contained in a block we do this for all the blocks 4045 // completely contained in a block we do this for all the blocks
4047 // covered by the range. 4046 // covered by the range (handling characters that are not in a block
4047 // as a "singleton block").
4048 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; 4048 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4049 // First, look up the block that contains the 'bottom' character.
4050 int length = canonrange.get(bottom, '\0', range);
4051 if (length == 0) {
4052 range[0] = bottom;
4053 } else {
4054 ASSERT_EQ(1, length);
4055 }
4056 int pos = bottom; 4049 int pos = bottom;
4057 // The start of the current block. Note that except for the first
4058 // iteration 'start' is always equal to 'pos'.
4059 int start;
4060 // If it is not the start point of a block the entry contains the
4061 // offset of the character from the start point.
4062 if ((range[0] & kStartMarker) == 0) {
4063 start = pos - range[0];
4064 } else {
4065 start = pos;
4066 }
4067 // Then we add the ranges one at a time, incrementing the current
4068 // position to be after the last block each time. The position
4069 // always points to the start of a block.
4070 while (pos < top) { 4050 while (pos < top) {
4071 length = canonrange.get(start, '\0', range); 4051 int length = canonrange.get(pos, '\0', range);
4052 uc16 block_end;
4072 if (length == 0) { 4053 if (length == 0) {
4073 range[0] = start; 4054 block_end = pos;
4074 } else { 4055 } else {
4075 ASSERT_EQ(1, length); 4056 ASSERT_EQ(1, length);
4057 block_end = range[0];
4076 } 4058 }
4077 ASSERT((range[0] & kStartMarker) != 0);
4078 // The start point of a block contains the distance to the end
4079 // of the range.
4080 int block_end = start + (range[0] & kPayloadMask) - 1;
4081 int end = (block_end > top) ? top : block_end; 4059 int end = (block_end > top) ? top : block_end;
4082 length = uncanonicalize.get(start, '\0', range); 4060 length = uncanonicalize.get(block_end, '\0', range);
4083 for (int i = 0; i < length; i++) { 4061 for (int i = 0; i < length; i++) {
4084 uc32 c = range[i]; 4062 uc32 c = range[i];
4085 uc16 range_from = c + (pos - start); 4063 uc16 range_from = c - (block_end - pos);
4086 uc16 range_to = c + (end - start); 4064 uc16 range_to = c - (block_end - end);
4087 if (!(bottom <= range_from && range_to <= top)) { 4065 if (!(bottom <= range_from && range_to <= top)) {
4088 ranges->Add(CharacterRange(range_from, range_to)); 4066 ranges->Add(CharacterRange(range_from, range_to));
4089 } 4067 }
4090 } 4068 }
4091 start = pos = block_end + 1; 4069 pos = end + 1;
4092 } 4070 }
4093 } else {
4094 // Unibrow ranges don't work for high characters due to the "2^11 bug".
4095 // Therefore we do something dumber for these ranges.
4096 AddUncanonicals(ranges, bottom, top);
4097 } 4071 }
4098 } 4072 }
4099 4073
4100 4074
4101 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { 4075 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
4102 ASSERT_NOT_NULL(ranges); 4076 ASSERT_NOT_NULL(ranges);
4103 int n = ranges->length(); 4077 int n = ranges->length();
4104 if (n <= 1) return true; 4078 if (n <= 1) return true;
4105 int max = ranges->at(0).to(); 4079 int max = ranges->at(0).to();
4106 for (int i = 1; i < n; i++) { 4080 for (int i = 1; i < n; i++) {
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
4201 // this table is correct. 4175 // this table is correct.
4202 // 0x0600 - 0x0fff 4176 // 0x0600 - 0x0fff
4203 // 0x1100 - 0x1cff 4177 // 0x1100 - 0x1cff
4204 // 0x2000 - 0x20ff 4178 // 0x2000 - 0x20ff
4205 // 0x2200 - 0x23ff 4179 // 0x2200 - 0x23ff
4206 // 0x2500 - 0x2bff 4180 // 0x2500 - 0x2bff
4207 // 0x2e00 - 0xa5ff 4181 // 0x2e00 - 0xa5ff
4208 // 0xa800 - 0xfaff 4182 // 0xa800 - 0xfaff
4209 // 0xfc00 - 0xfeff 4183 // 0xfc00 - 0xfeff
4210 const int boundary_count = 18; 4184 const int boundary_count = 18;
4211 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this 4185 int boundaries[] = {
4212 // array. This is to split up big ranges and not because they actually denote
4213 // a case-mapping-free-zone.
4214 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
4215 const int kFirstRealCaselessZoneIndex = 2;
4216 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
4217 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, 4186 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
4218 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; 4187 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
4219 4188
4220 // Special ASCII rule from spec can save us some work here. 4189 // Special ASCII rule from spec can save us some work here.
4221 if (bottom == 0x80 && top == 0xffff) return; 4190 if (bottom == 0x80 && top == 0xffff) return;
4222 4191
4223 // We have optimized support for this range. 4192 if (top <= boundaries[0]) {
4224 if (top <= CharacterRange::kRangeCanonicalizeMax) {
4225 CharacterRange range(bottom, top); 4193 CharacterRange range(bottom, top);
4226 range.AddCaseEquivalents(ranges, false); 4194 range.AddCaseEquivalents(ranges, false);
4227 return; 4195 return;
4228 } 4196 }
4229 4197
4230 // Split up very large ranges. This helps remove ranges where there are no 4198 // Split up very large ranges. This helps remove ranges where there are no
4231 // case mappings. 4199 // case mappings.
4232 for (int i = 0; i < boundary_count; i++) { 4200 for (int i = 0; i < boundary_count; i++) {
4233 if (bottom < boundaries[i] && top >= boundaries[i]) { 4201 if (bottom < boundaries[i] && top >= boundaries[i]) {
4234 AddUncanonicals(ranges, bottom, boundaries[i] - 1); 4202 AddUncanonicals(ranges, bottom, boundaries[i] - 1);
4235 AddUncanonicals(ranges, boundaries[i], top); 4203 AddUncanonicals(ranges, boundaries[i], top);
4236 return; 4204 return;
4237 } 4205 }
4238 } 4206 }
4239 4207
4240 // If we are completely in a zone with no case mappings then we are done. 4208 // If we are completely in a zone with no case mappings then we are done.
4241 // We start at 2 so as not to except the ASCII range from mappings. 4209 for (int i = 0; i < boundary_count; i += 2) {
4242 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
4243 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { 4210 if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
4244 #ifdef DEBUG 4211 #ifdef DEBUG
4245 for (int j = bottom; j <= top; j++) { 4212 for (int j = bottom; j <= top; j++) {
4246 unsigned current_char = j; 4213 unsigned current_char = j;
4247 int length = uncanonicalize.get(current_char, '\0', chars); 4214 int length = uncanonicalize.get(current_char, '\0', chars);
4248 for (int k = 0; k < length; k++) { 4215 for (int k = 0; k < length; k++) {
4249 ASSERT(chars[k] == current_char); 4216 ASSERT(chars[k] == current_char);
4250 } 4217 }
4251 } 4218 }
4252 #endif 4219 #endif
(...skipping 1021 matching lines...) Expand 10 before | Expand all | Expand 10 after
5274 node, 5241 node,
5275 data->capture_count, 5242 data->capture_count,
5276 pattern); 5243 pattern);
5277 } 5244 }
5278 5245
5279 5246
5280 int OffsetsVector::static_offsets_vector_[ 5247 int OffsetsVector::static_offsets_vector_[
5281 OffsetsVector::kStaticOffsetsVectorSize]; 5248 OffsetsVector::kStaticOffsetsVectorSize];
5282 5249
5283 }} // namespace v8::internal 5250 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/unicode.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698