OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/jsregexp.h" | 5 #include "src/regexp/jsregexp.h" |
6 | 6 |
7 #include "src/ast/ast.h" | 7 #include "src/ast/ast.h" |
8 #include "src/base/platform/platform.h" | 8 #include "src/base/platform/platform.h" |
9 #include "src/compilation-cache.h" | 9 #include "src/compilation-cache.h" |
10 #include "src/compiler.h" | 10 #include "src/compiler.h" |
(...skipping 5886 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5897 uc32 bottom = range.from(); | 5897 uc32 bottom = range.from(); |
5898 uc32 top = range.to(); | 5898 uc32 top = range.to(); |
5899 // Nothing to be done for surrogates. | 5899 // Nothing to be done for surrogates. |
5900 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return; | 5900 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return; |
5901 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { | 5901 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { |
5902 if (bottom > String::kMaxOneByteCharCode) return; | 5902 if (bottom > String::kMaxOneByteCharCode) return; |
5903 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; | 5903 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; |
5904 } | 5904 } |
5905 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | 5905 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
5906 if (top == bottom) { | 5906 if (top == bottom) { |
5907 // If this is a singleton we just expand the one character. | 5907 // If this is a singleton we just expand the one character. |
5908 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); | 5908 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); |
5909 for (int i = 0; i < length; i++) { | 5909 for (int i = 0; i < length; i++) { |
5910 uc32 chr = chars[i]; | 5910 uc32 chr = chars[i]; |
5911 if (chr != bottom) { | 5911 if (chr != bottom) { |
5912 ranges->Add(CharacterRange::Singleton(chars[i]), zone); | 5912 ranges->Add(CharacterRange::Singleton(chars[i]), zone); |
| 5913 } |
| 5914 } |
| 5915 } else { |
| 5916 // If this is a range we expand the characters block by block, expanding |
| 5917 // contiguous subranges (blocks) one at a time. The approach is as |
| 5918 // follows. For a given start character we look up the remainder of the |
| 5919 // block that contains it (represented by the end point), for instance we |
| 5920 // find 'z' if the character is 'c'. A block is characterized by the |
| 5921 // property that all characters uncanonicalize in the same way, except |
| 5922 // that each entry in the result is incremented by the distance from the |
| 5923 // first element. So a-z is a block because 'a' uncanonicalizes to ['a', |
| 5924 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once |
| 5925 // we've found the end point we look up its uncanonicalization and |
| 5926 // produce a range for each element. For instance for [c-f] we look up |
| 5927 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if |
| 5928 // it is not already contained in the input, so [c-f] will be skipped but |
| 5929 // [C-F] will be added. If this range is not completely contained in a |
| 5930 // block we do this for all the blocks covered by the range (handling |
 | 5931 // characters that are not in a block as a "singleton block"). |
| 5932 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| 5933 int pos = bottom; |
| 5934 while (pos <= top) { |
| 5935 int length = |
| 5936 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); |
| 5937 uc32 block_end; |
| 5938 if (length == 0) { |
| 5939 block_end = pos; |
| 5940 } else { |
| 5941 DCHECK_EQ(1, length); |
| 5942 block_end = equivalents[0]; |
| 5943 } |
| 5944 int end = (block_end > top) ? top : block_end; |
| 5945 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', |
| 5946 equivalents); |
| 5947 for (int i = 0; i < length; i++) { |
| 5948 uc32 c = equivalents[i]; |
| 5949 uc32 range_from = c - (block_end - pos); |
| 5950 uc32 range_to = c - (block_end - end); |
| 5951 if (!(bottom <= range_from && range_to <= top)) { |
| 5952 ranges->Add(CharacterRange(range_from, range_to), zone); |
| 5953 } |
| 5954 } |
| 5955 pos = end + 1; |
5913 } | 5956 } |
5914 } | 5957 } |
5915 } else { | |
5916 // If this is a range we expand the characters block by block, | |
5917 // expanding contiguous subranges (blocks) one at a time. | |
5918 // The approach is as follows. For a given start character we | |
5919 // look up the remainder of the block that contains it (represented | |
5920 // by the end point), for instance we find 'z' if the character | |
5921 // is 'c'. A block is characterized by the property | |
5922 // that all characters uncanonicalize in the same way, except that | |
5923 // each entry in the result is incremented by the distance from the first | |
5924 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and | |
5925 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. | |
5926 // Once we've found the end point we look up its uncanonicalization | |
5927 // and produce a range for each element. For instance for [c-f] | |
5928 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only | |
5929 // add a range if it is not already contained in the input, so [c-f] | |
5930 // will be skipped but [C-F] will be added. If this range is not | |
5931 // completely contained in a block we do this for all the blocks | |
5932 // covered by the range (handling characters that are not in a block | |
5933 // as a "singleton block"). | |
5934 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
5935 int pos = bottom; | |
5936 while (pos <= top) { | |
5937 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range); | |
5938 uc32 block_end; | |
5939 if (length == 0) { | |
5940 block_end = pos; | |
5941 } else { | |
5942 DCHECK_EQ(1, length); | |
5943 block_end = range[0]; | |
5944 } | |
5945 int end = (block_end > top) ? top : block_end; | |
5946 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range); | |
5947 for (int i = 0; i < length; i++) { | |
5948 uc32 c = range[i]; | |
5949 uc32 range_from = c - (block_end - pos); | |
5950 uc32 range_to = c - (block_end - end); | |
5951 if (!(bottom <= range_from && range_to <= top)) { | |
5952 ranges->Add(CharacterRange(range_from, range_to), zone); | |
5953 } | |
5954 } | |
5955 pos = end + 1; | |
5956 } | |
5957 } | |
5958 } | 5958 } |
5959 } | 5959 } |
5960 | 5960 |
5961 | 5961 |
5962 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { | 5962 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { |
5963 DCHECK_NOT_NULL(ranges); | 5963 DCHECK_NOT_NULL(ranges); |
5964 int n = ranges->length(); | 5964 int n = ranges->length(); |
5965 if (n <= 1) return true; | 5965 if (n <= 1) return true; |
5966 int max = ranges->at(0).to(); | 5966 int max = ranges->at(0).to(); |
5967 for (int i = 1; i < n; i++) { | 5967 for (int i = 1; i < n; i++) { |
(...skipping 907 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6875 | 6875 |
6876 | 6876 |
6877 void RegExpResultsCache::Clear(FixedArray* cache) { | 6877 void RegExpResultsCache::Clear(FixedArray* cache) { |
6878 for (int i = 0; i < kRegExpResultsCacheSize; i++) { | 6878 for (int i = 0; i < kRegExpResultsCacheSize; i++) { |
6879 cache->set(i, Smi::FromInt(0)); | 6879 cache->set(i, Smi::FromInt(0)); |
6880 } | 6880 } |
6881 } | 6881 } |
6882 | 6882 |
6883 } // namespace internal | 6883 } // namespace internal |
6884 } // namespace v8 | 6884 } // namespace v8 |
OLD | NEW |