| OLD | NEW | 
|---|
| 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. | 1 // Copyright 2006-2009 the V8 project authors. All rights reserved. | 
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without | 
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are | 
| 4 // met: | 4 // met: | 
| 5 // | 5 // | 
| 6 //     * Redistributions of source code must retain the above copyright | 6 //     * Redistributions of source code must retain the above copyright | 
| 7 //       notice, this list of conditions and the following disclaimer. | 7 //       notice, this list of conditions and the following disclaimer. | 
| 8 //     * Redistributions in binary form must reproduce the above | 8 //     * Redistributions in binary form must reproduce the above | 
| 9 //       copyright notice, this list of conditions and the following | 9 //       copyright notice, this list of conditions and the following | 
| 10 //       disclaimer in the documentation and/or other materials provided | 10 //       disclaimer in the documentation and/or other materials provided | 
| (...skipping 1254 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 1265 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; | 1265 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize; | 
| 1266 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; | 1266 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange; | 
| 1267 | 1267 | 
| 1268 | 1268 | 
| 1269 // Returns the number of characters in the equivalence class, omitting those | 1269 // Returns the number of characters in the equivalence class, omitting those | 
| 1270 // that cannot occur in the source string because it is ASCII. | 1270 // that cannot occur in the source string because it is ASCII. | 
| 1271 static int GetCaseIndependentLetters(uc16 character, | 1271 static int GetCaseIndependentLetters(uc16 character, | 
| 1272                                      bool ascii_subject, | 1272                                      bool ascii_subject, | 
| 1273                                      unibrow::uchar* letters) { | 1273                                      unibrow::uchar* letters) { | 
| 1274   int length = uncanonicalize.get(character, '\0', letters); | 1274   int length = uncanonicalize.get(character, '\0', letters); | 
| 1275   // Unibrow returns 0 or 1 for characters where case independependence is | 1275   // Unibrow returns 0 or 1 for characters where case independence is | 
| 1276   // trivial. | 1276   // trivial. | 
| 1277   if (length == 0) { | 1277   if (length == 0) { | 
| 1278     letters[0] = character; | 1278     letters[0] = character; | 
| 1279     length = 1; | 1279     length = 1; | 
| 1280   } | 1280   } | 
| 1281   if (!ascii_subject || character <= String::kMaxAsciiCharCode) { | 1281   if (!ascii_subject || character <= String::kMaxAsciiCharCode) { | 
| 1282     return length; | 1282     return length; | 
| 1283   } | 1283   } | 
| 1284   // The standard requires that non-ASCII characters cannot have ASCII | 1284   // The standard requires that non-ASCII characters cannot have ASCII | 
| 1285   // character codes in their equivalence class. | 1285   // character codes in their equivalence class. | 
| (...skipping 2733 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 4019   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 4019   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 
| 4020   if (top == bottom) { | 4020   if (top == bottom) { | 
| 4021     // If this is a singleton we just expand the one character. | 4021     // If this is a singleton we just expand the one character. | 
| 4022     int length = uncanonicalize.get(bottom, '\0', chars); | 4022     int length = uncanonicalize.get(bottom, '\0', chars); | 
| 4023     for (int i = 0; i < length; i++) { | 4023     for (int i = 0; i < length; i++) { | 
| 4024       uc32 chr = chars[i]; | 4024       uc32 chr = chars[i]; | 
| 4025       if (chr != bottom) { | 4025       if (chr != bottom) { | 
| 4026         ranges->Add(CharacterRange::Singleton(chars[i])); | 4026         ranges->Add(CharacterRange::Singleton(chars[i])); | 
| 4027       } | 4027       } | 
| 4028     } | 4028     } | 
| 4029   } else if (bottom <= kRangeCanonicalizeMax && | 4029   } else { | 
| 4030              top <= kRangeCanonicalizeMax) { |  | 
| 4031     // If this is a range we expand the characters block by block, | 4030     // If this is a range we expand the characters block by block, | 
| 4032     // expanding contiguous subranges (blocks) one at a time. | 4031     // expanding contiguous subranges (blocks) one at a time. | 
| 4033     // The approach is as follows.  For a given start character we | 4032     // The approach is as follows.  For a given start character we | 
| 4034     // look up the block that contains it, for instance 'a' if the | 4033     // look up the remainder of the block that contains it (represented | 
| 4035     // start character is 'c'.  A block is characterized by the property | 4034     // by the end point), for instance we find 'z' if the character | 
| 4036     // that all characters uncanonicalize in the same way as the first | 4035     // is 'c'.  A block is characterized by the property | 
| 4037     // element, except that each entry in the result is incremented | 4036     // that all characters uncanonicalize in the same way, except that | 
| 4038     // by the distance from the first element.  So a-z is a block | 4037     // each entry in the result is incremented by the distance from the first | 
| 4039     // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter | 4038     // element.  So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and | 
| 4040     // uncanonicalizes to ['a' + k, 'A' + k]. | 4039     // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. | 
| 4041     // Once we've found the start point we look up its uncanonicalization | 4040     // Once we've found the end point we look up its uncanonicalization | 
| 4042     // and produce a range for each element.  For instance for [c-f] | 4041     // and produce a range for each element.  For instance for [c-f] | 
| 4043     // we look up ['a', 'A'] and produce [c-f] and [C-F].  We then only | 4042     // we look up ['z', 'Z'] and produce [c-f] and [C-F].  We then only | 
| 4044     // add a range if it is not already contained in the input, so [c-f] | 4043     // add a range if it is not already contained in the input, so [c-f] | 
| 4045     // will be skipped but [C-F] will be added.  If this range is not | 4044     // will be skipped but [C-F] will be added.  If this range is not | 
| 4046     // completely contained in a block we do this for all the blocks | 4045     // completely contained in a block we do this for all the blocks | 
| 4047     // covered by the range. | 4046     // covered by the range (handling characters that are not in a block | 
|  | 4047     // as a "singleton block"). | 
| 4048     unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 4048     unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 
| 4049     // First, look up the block that contains the 'bottom' character. |  | 
| 4050     int length = canonrange.get(bottom, '\0', range); |  | 
| 4051     if (length == 0) { |  | 
| 4052       range[0] = bottom; |  | 
| 4053     } else { |  | 
| 4054       ASSERT_EQ(1, length); |  | 
| 4055     } |  | 
| 4056     int pos = bottom; | 4049     int pos = bottom; | 
| 4057     // The start of the current block.  Note that except for the first |  | 
| 4058     // iteration 'start' is always equal to 'pos'. |  | 
| 4059     int start; |  | 
| 4060     // If it is not the start point of a block the entry contains the |  | 
| 4061     // offset of the character from the start point. |  | 
| 4062     if ((range[0] & kStartMarker) == 0) { |  | 
| 4063       start = pos - range[0]; |  | 
| 4064     } else { |  | 
| 4065       start = pos; |  | 
| 4066     } |  | 
| 4067     // Then we add the ranges one at a time, incrementing the current |  | 
| 4068     // position to be after the last block each time.  The position |  | 
| 4069     // always points to the start of a block. |  | 
| 4070     while (pos < top) { | 4050     while (pos < top) { | 
| 4071       length = canonrange.get(start, '\0', range); | 4051       int length = canonrange.get(pos, '\0', range); | 
|  | 4052       uc16 block_end; | 
| 4072       if (length == 0) { | 4053       if (length == 0) { | 
| 4073         range[0] = start; | 4054         block_end = pos; | 
| 4074       } else { | 4055       } else { | 
| 4075         ASSERT_EQ(1, length); | 4056         ASSERT_EQ(1, length); | 
|  | 4057         block_end = range[0]; | 
| 4076       } | 4058       } | 
| 4077       ASSERT((range[0] & kStartMarker) != 0); |  | 
| 4078       // The start point of a block contains the distance to the end |  | 
| 4079       // of the range. |  | 
| 4080       int block_end = start + (range[0] & kPayloadMask) - 1; |  | 
| 4081       int end = (block_end > top) ? top : block_end; | 4059       int end = (block_end > top) ? top : block_end; | 
| 4082       length = uncanonicalize.get(start, '\0', range); | 4060       length = uncanonicalize.get(block_end, '\0', range); | 
| 4083       for (int i = 0; i < length; i++) { | 4061       for (int i = 0; i < length; i++) { | 
| 4084         uc32 c = range[i]; | 4062         uc32 c = range[i]; | 
| 4085         uc16 range_from = c + (pos - start); | 4063         uc16 range_from = c - (block_end - pos); | 
| 4086         uc16 range_to = c + (end - start); | 4064         uc16 range_to = c - (block_end - end); | 
| 4087         if (!(bottom <= range_from && range_to <= top)) { | 4065         if (!(bottom <= range_from && range_to <= top)) { | 
| 4088           ranges->Add(CharacterRange(range_from, range_to)); | 4066           ranges->Add(CharacterRange(range_from, range_to)); | 
| 4089         } | 4067         } | 
| 4090       } | 4068       } | 
| 4091       start = pos = block_end + 1; | 4069       pos = end + 1; | 
| 4092     } | 4070     } | 
| 4093   } else { |  | 
| 4094     // Unibrow ranges don't work for high characters due to the "2^11 bug". |  | 
| 4095     // Therefore we do something dumber for these ranges. |  | 
| 4096     AddUncanonicals(ranges, bottom, top); |  | 
| 4097   } | 4071   } | 
| 4098 } | 4072 } | 
| 4099 | 4073 | 
| 4100 | 4074 | 
| 4101 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { | 4075 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { | 
| 4102   ASSERT_NOT_NULL(ranges); | 4076   ASSERT_NOT_NULL(ranges); | 
| 4103   int n = ranges->length(); | 4077   int n = ranges->length(); | 
| 4104   if (n <= 1) return true; | 4078   if (n <= 1) return true; | 
| 4105   int max = ranges->at(0).to(); | 4079   int max = ranges->at(0).to(); | 
| 4106   for (int i = 1; i < n; i++) { | 4080   for (int i = 1; i < n; i++) { | 
| (...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 4201   // this table is correct. | 4175   // this table is correct. | 
| 4202   // 0x0600 - 0x0fff | 4176   // 0x0600 - 0x0fff | 
| 4203   // 0x1100 - 0x1cff | 4177   // 0x1100 - 0x1cff | 
| 4204   // 0x2000 - 0x20ff | 4178   // 0x2000 - 0x20ff | 
| 4205   // 0x2200 - 0x23ff | 4179   // 0x2200 - 0x23ff | 
| 4206   // 0x2500 - 0x2bff | 4180   // 0x2500 - 0x2bff | 
| 4207   // 0x2e00 - 0xa5ff | 4181   // 0x2e00 - 0xa5ff | 
| 4208   // 0xa800 - 0xfaff | 4182   // 0xa800 - 0xfaff | 
| 4209   // 0xfc00 - 0xfeff | 4183   // 0xfc00 - 0xfeff | 
| 4210   const int boundary_count = 18; | 4184   const int boundary_count = 18; | 
| 4211   // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this | 4185   int boundaries[] = { | 
| 4212   // array.  This is to split up big ranges and not because they actually denote |  | 
| 4213   // a case-mapping-free-zone. |  | 
| 4214   ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600); |  | 
| 4215   const int kFirstRealCaselessZoneIndex = 2; |  | 
| 4216   int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax, |  | 
| 4217       0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, | 4186       0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500, | 
| 4218       0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; | 4187       0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00}; | 
| 4219 | 4188 | 
| 4220   // Special ASCII rule from spec can save us some work here. | 4189   // Special ASCII rule from spec can save us some work here. | 
| 4221   if (bottom == 0x80 && top == 0xffff) return; | 4190   if (bottom == 0x80 && top == 0xffff) return; | 
| 4222 | 4191 | 
| 4223   // We have optimized support for this range. | 4192   if (top <= boundaries[0]) { | 
| 4224   if (top <= CharacterRange::kRangeCanonicalizeMax) { |  | 
| 4225     CharacterRange range(bottom, top); | 4193     CharacterRange range(bottom, top); | 
| 4226     range.AddCaseEquivalents(ranges, false); | 4194     range.AddCaseEquivalents(ranges, false); | 
| 4227     return; | 4195     return; | 
| 4228   } | 4196   } | 
| 4229 | 4197 | 
| 4230   // Split up very large ranges.  This helps remove ranges where there are no | 4198   // Split up very large ranges.  This helps remove ranges where there are no | 
| 4231   // case mappings. | 4199   // case mappings. | 
| 4232   for (int i = 0; i < boundary_count; i++) { | 4200   for (int i = 0; i < boundary_count; i++) { | 
| 4233     if (bottom < boundaries[i] && top >= boundaries[i]) { | 4201     if (bottom < boundaries[i] && top >= boundaries[i]) { | 
| 4234       AddUncanonicals(ranges, bottom, boundaries[i] - 1); | 4202       AddUncanonicals(ranges, bottom, boundaries[i] - 1); | 
| 4235       AddUncanonicals(ranges, boundaries[i], top); | 4203       AddUncanonicals(ranges, boundaries[i], top); | 
| 4236       return; | 4204       return; | 
| 4237     } | 4205     } | 
| 4238   } | 4206   } | 
| 4239 | 4207 | 
| 4240   // If we are completely in a zone with no case mappings then we are done. | 4208   // If we are completely in a zone with no case mappings then we are done. | 
| 4241   // We start at 2 so as not to except the ASCII range from mappings. | 4209   for (int i = 0; i < boundary_count; i += 2) { | 
| 4242   for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) { |  | 
| 4243     if (bottom >= boundaries[i] && top < boundaries[i + 1]) { | 4210     if (bottom >= boundaries[i] && top < boundaries[i + 1]) { | 
| 4244 #ifdef DEBUG | 4211 #ifdef DEBUG | 
| 4245       for (int j = bottom; j <= top; j++) { | 4212       for (int j = bottom; j <= top; j++) { | 
| 4246         unsigned current_char = j; | 4213         unsigned current_char = j; | 
| 4247         int length = uncanonicalize.get(current_char, '\0', chars); | 4214         int length = uncanonicalize.get(current_char, '\0', chars); | 
| 4248         for (int k = 0; k < length; k++) { | 4215         for (int k = 0; k < length; k++) { | 
| 4249           ASSERT(chars[k] == current_char); | 4216           ASSERT(chars[k] == current_char); | 
| 4250         } | 4217         } | 
| 4251       } | 4218       } | 
| 4252 #endif | 4219 #endif | 
| (...skipping 1021 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 5274                            node, | 5241                            node, | 
| 5275                            data->capture_count, | 5242                            data->capture_count, | 
| 5276                            pattern); | 5243                            pattern); | 
| 5277 } | 5244 } | 
| 5278 | 5245 | 
| 5279 | 5246 | 
| 5280 int OffsetsVector::static_offsets_vector_[ | 5247 int OffsetsVector::static_offsets_vector_[ | 
| 5281     OffsetsVector::kStaticOffsetsVectorSize]; | 5248     OffsetsVector::kStaticOffsetsVectorSize]; | 
| 5282 | 5249 | 
| 5283 }}  // namespace v8::internal | 5250 }}  // namespace v8::internal | 
| OLD | NEW | 
|---|