OLD | NEW |
1 // Copyright 2006-2009 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 1254 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1265 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; | 1265 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; |
1266 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; | 1266 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; |
1267 | 1267 |
1268 | 1268 |
1269 // Returns the number of characters in the equivalence class, omitting those | 1269 // Returns the number of characters in the equivalence class, omitting those |
1270 // that cannot occur in the source string because it is ASCII. | 1270 // that cannot occur in the source string because it is ASCII. |
1271 static int GetCaseIndependentLetters(uc16 character, | 1271 static int GetCaseIndependentLetters(uc16 character, |
1272 bool ascii_subject, | 1272 bool ascii_subject, |
1273 unibrow::uchar* letters) { | 1273 unibrow::uchar* letters) { |
1274 int length = uncanonicalize.get(character, '\0', letters); | 1274 int length = uncanonicalize.get(character, '\0', letters); |
1275 // Unibrow returns 0 or 1 for characters where case independependence is | 1275 // Unibrow returns 0 or 1 for characters where case independence is |
1276 // trivial. | 1276 // trivial. |
1277 if (length == 0) { | 1277 if (length == 0) { |
1278 letters[0] = character; | 1278 letters[0] = character; |
1279 length = 1; | 1279 length = 1; |
1280 } | 1280 } |
1281 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { | 1281 if (!ascii_subject || character <= String::kMaxAsciiCharCode) { |
1282 return length; | 1282 return length; |
1283 } | 1283 } |
1284 // The standard requires that non-ASCII characters cannot have ASCII | 1284 // The standard requires that non-ASCII characters cannot have ASCII |
1285 // character codes in their equivalence class. | 1285 // character codes in their equivalence class. |
(...skipping 2733 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4019 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 4019 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
4020 if (top == bottom) { | 4020 if (top == bottom) { |
4021 // If this is a singleton we just expand the one character. | 4021 // If this is a singleton we just expand the one character. |
4022 int length = uncanonicalize.get(bottom, '\0', chars); | 4022 int length = uncanonicalize.get(bottom, '\0', chars); |
4023 for (int i = 0; i < length; i++) { | 4023 for (int i = 0; i < length; i++) { |
4024 uc32 chr = chars[i]; | 4024 uc32 chr = chars[i]; |
4025 if (chr != bottom) { | 4025 if (chr != bottom) { |
4026 ranges->Add(CharacterRange::Singleton(chars[i])); | 4026 ranges->Add(CharacterRange::Singleton(chars[i])); |
4027 } | 4027 } |
4028 } | 4028 } |
4029 } else if (bottom <= kRangeCanonicalizeMax && | 4029 } else { |
4030 top <= kRangeCanonicalizeMax) { | |
4031 // If this is a range we expand the characters block by block, | 4030 // If this is a range we expand the characters block by block, |
4032 // expanding contiguous subranges (blocks) one at a time. | 4031 // expanding contiguous subranges (blocks) one at a time. |
4033 // The approach is as follows. For a given start character we | 4032 // The approach is as follows. For a given start character we |
4034 // look up the block that contains it, for instance 'a' if the | 4033 // look up the remainder of the block that contains it (represented |
4035 // start character is 'c'. A block is characterized by the property | 4034 // by the end point), for instance we find 'z' if the character |
4036 // that all characters uncanonicalize in the same way as the first | 4035 // is 'c'. A block is characterized by the property |
4037 // element, except that each entry in the result is incremented | 4036 // that all characters uncanonicalize in the same way, except that |
4038 // by the distance from the first element. So a-z is a block | 4037 // each entry in the result is incremented by the distance from the first |
4039 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter | 4038 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and |
4040 // uncanonicalizes to ['a' + k, 'A' + k]. | 4039 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. |
4041 // Once we've found the start point we look up its uncanonicalization | 4040 // Once we've found the end point we look up its uncanonicalization |
4042 // and produce a range for each element. For instance for [c-f] | 4041 // and produce a range for each element. For instance for [c-f] |
4043 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only | 4042 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only |
4044 // add a range if it is not already contained in the input, so [c-f] | 4043 // add a range if it is not already contained in the input, so [c-f] |
4045 // will be skipped but [C-F] will be added. If this range is not | 4044 // will be skipped but [C-F] will be added. If this range is not |
4046 // completely contained in a block we do this for all the blocks | 4045 // completely contained in a block we do this for all the blocks |
4047 // covered by the range. | 4046 // covered by the range (handling characters that are not in a block |
| 4047 // as a "singleton block"). |
4048 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 4048 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
4049 // First, look up the block that contains the 'bottom' character. | |
4050 int length = canonrange.get(bottom, '\0', range); | |
4051 if (length == 0) { | |
4052 range[0] = bottom; | |
4053 } else { | |
4054 ASSERT_EQ(1, length); | |
4055 } | |
4056 int pos = bottom; | 4049 int pos = bottom; |
4057 // The start of the current block. Note that except for the first | |
4058 // iteration 'start' is always equal to 'pos'. | |
4059 int start; | |
4060 // If it is not the start point of a block the entry contains the | |
4061 // offset of the character from the start point. | |
4062 if ((range[0] & kStartMarker) == 0) { | |
4063 start = pos - range[0]; | |
4064 } else { | |
4065 start = pos; | |
4066 } | |
4067 // Then we add the ranges one at a time, incrementing the current | |
4068 // position to be after the last block each time. The position | |
4069 // always points to the start of a block. | |
4070 while (pos < top) { | 4050 while (pos < top) { |
4071 length = canonrange.get(start, '\0', range); | 4051 int length = canonrange.get(pos, '\0', range); |
| 4052 uc16 block_end; |
4072 if (length == 0) { | 4053 if (length == 0) { |
4073 range[0] = start; | 4054 block_end = pos; |
4074 } else { | 4055 } else { |
4075 ASSERT_EQ(1, length); | 4056 ASSERT_EQ(1, length); |
| 4057 block_end = range[0]; |
4076 } | 4058 } |
4077 ASSERT((range[0] & kStartMarker) != 0); | |
4078 // The start point of a block contains the distance to the end | |
4079 // of the range. | |
4080 int block_end = start + (range[0] & kPayloadMask) - 1; | |
4081 int end = (block_end > top) ? top : block_end; | 4059 int end = (block_end > top) ? top : block_end; |
4082 length = uncanonicalize.get(start, '\0', range); | 4060 length = uncanonicalize.get(block_end, '\0', range); |
4083 for (int i = 0; i < length; i++) { | 4061 for (int i = 0; i < length; i++) { |
4084 uc32 c = range[i]; | 4062 uc32 c = range[i]; |
4085 uc16 range_from = c + (pos - start); | 4063 uc16 range_from = c - (block_end - pos); |
4086 uc16 range_to = c + (end - start); | 4064 uc16 range_to = c - (block_end - end); |
4087 if (!(bottom <= range_from && range_to <= top)) { | 4065 if (!(bottom <= range_from && range_to <= top)) { |
4088 ranges->Add(CharacterRange(range_from, range_to)); | 4066 ranges->Add(CharacterRange(range_from, range_to)); |
4089 } | 4067 } |
4090 } | 4068 } |
4091 start = pos = block_end + 1; | 4069 pos = end + 1; |
4092 } | 4070 } |
4093 } else { | |
4094 // Unibrow ranges don't work for high characters due to the "2^11 bug". | |
4095 // Therefore we do something dumber for these ranges. | |
4096 AddUncanonicals(ranges, bottom, top); | |
4097 } | 4071 } |
4098 } | 4072 } |
4099 | 4073 |
4100 | 4074 |
4101 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { | 4075 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { |
4102 ASSERT_NOT_NULL(ranges); | 4076 ASSERT_NOT_NULL(ranges); |
4103 int n = ranges->length(); | 4077 int n = ranges->length(); |
4104 if (n <= 1) return true; | 4078 if (n <= 1) return true; |
4105 int max = ranges->at(0).to(); | 4079 int max = ranges->at(0).to(); |
4106 for (int i = 1; i < n; i++) { | 4080 for (int i = 1; i < n; i++) { |
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4201 // this table is correct. | 4175 // this table is correct. |
4202 // 0x0600 - 0x0fff | 4176 // 0x0600 - 0x0fff |
4203 // 0x1100 - 0x1cff | 4177 // 0x1100 - 0x1cff |
4204 // 0x2000 - 0x20ff | 4178 // 0x2000 - 0x20ff |
4205 // 0x2200 - 0x23ff | 4179 // 0x2200 - 0x23ff |
4206 // 0x2500 - 0x2bff | 4180 // 0x2500 - 0x2bff |
4207 // 0x2e00 - 0xa5ff | 4181 // 0x2e00 - 0xa5ff |
4208 // 0xa800 - 0xfaff | 4182 // 0xa800 - 0xfaff |
4209 // 0xfc00 - 0xfeff | 4183 // 0xfc00 - 0xfeff |
4210 const int boundary_count = 18; | 4184 const int boundary_count = 18; |
4211 // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this | 4185 int boundaries[] = { |
4212 // array. This is to split up big ranges and not because they actually denote | |
4213 // a case-mapping-free-zone. | |
4214 ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); | |
4215 const int kFirstRealCaselessZoneIndex = 2; | |
4216 int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, | |
4217 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, | 4186 0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, |
4218 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; | 4187 0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; |
4219 | 4188 |
4220 // Special ASCII rule from spec can save us some work here. | 4189 // Special ASCII rule from spec can save us some work here. |
4221 if (bottom == 0x80 && top == 0xffff) return; | 4190 if (bottom == 0x80 && top == 0xffff) return; |
4222 | 4191 |
4223 // We have optimized support for this range. | 4192 if (top <= boundaries[0]) { |
4224 if (top <= CharacterRange::kRangeCanonicalizeMax) { | |
4225 CharacterRange range(bottom, top); | 4193 CharacterRange range(bottom, top); |
4226 range.AddCaseEquivalents(ranges, false); | 4194 range.AddCaseEquivalents(ranges, false); |
4227 return; | 4195 return; |
4228 } | 4196 } |
4229 | 4197 |
4230 // Split up very large ranges. This helps remove ranges where there are no | 4198 // Split up very large ranges. This helps remove ranges where there are no |
4231 // case mappings. | 4199 // case mappings. |
4232 for (int i = 0; i < boundary_count; i++) { | 4200 for (int i = 0; i < boundary_count; i++) { |
4233 if (bottom < boundaries[i] && top >= boundaries[i]) { | 4201 if (bottom < boundaries[i] && top >= boundaries[i]) { |
4234 AddUncanonicals(ranges, bottom, boundaries[i] - 1); | 4202 AddUncanonicals(ranges, bottom, boundaries[i] - 1); |
4235 AddUncanonicals(ranges, boundaries[i], top); | 4203 AddUncanonicals(ranges, boundaries[i], top); |
4236 return; | 4204 return; |
4237 } | 4205 } |
4238 } | 4206 } |
4239 | 4207 |
4240 // If we are completely in a zone with no case mappings then we are done. | 4208 // If we are completely in a zone with no case mappings then we are done. |
4241 // We start at 2 so as not to except the ASCII range from mappings. | 4209 for (int i = 0; i < boundary_count; i += 2) { |
4242 for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { | |
4243 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { | 4210 if (bottom >= boundaries[i] && top < boundaries[i + 1]) { |
4244 #ifdef DEBUG | 4211 #ifdef DEBUG |
4245 for (int j = bottom; j <= top; j++) { | 4212 for (int j = bottom; j <= top; j++) { |
4246 unsigned current_char = j; | 4213 unsigned current_char = j; |
4247 int length = uncanonicalize.get(current_char, '\0', chars); | 4214 int length = uncanonicalize.get(current_char, '\0', chars); |
4248 for (int k = 0; k < length; k++) { | 4215 for (int k = 0; k < length; k++) { |
4249 ASSERT(chars[k] == current_char); | 4216 ASSERT(chars[k] == current_char); |
4250 } | 4217 } |
4251 } | 4218 } |
4252 #endif | 4219 #endif |
(...skipping 1021 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5274 node, | 5241 node, |
5275 data->capture_count, | 5242 data->capture_count, |
5276 pattern); | 5243 pattern); |
5277 } | 5244 } |
5278 | 5245 |
5279 | 5246 |
5280 int OffsetsVector::static_offsets_vector_[ | 5247 int OffsetsVector::static_offsets_vector_[ |
5281 OffsetsVector::kStaticOffsetsVectorSize]; | 5248 OffsetsVector::kStaticOffsetsVectorSize]; |
5282 | 5249 |
5283 }} // namespace v8::internal | 5250 }} // namespace v8::internal |
OLD | NEW |