 Chromium Code Reviews
 Chromium Code Reviews Issue 10831126:
  Take advantage of batched results when matching global regexp.  (Closed) 
  Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge
    
  
    Issue 10831126:
  Take advantage of batched results when matching global regexp.  (Closed) 
  Base URL: https://v8.googlecode.com/svn/branches/bleeding_edge| OLD | NEW | 
|---|---|
| 1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. | 
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without | 
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are | 
| 4 // met: | 4 // met: | 
| 5 // | 5 // | 
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright | 
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. | 
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above | 
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following | 
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided | 
| (...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 86 Handle<String> pattern, | 86 Handle<String> pattern, | 
| 87 JSRegExp::Flags flags, | 87 JSRegExp::Flags flags, | 
| 88 int capture_register_count); | 88 int capture_register_count); | 
| 89 | 89 | 
| 90 | 90 | 
| 91 static void AtomCompile(Handle<JSRegExp> re, | 91 static void AtomCompile(Handle<JSRegExp> re, | 
| 92 Handle<String> pattern, | 92 Handle<String> pattern, | 
| 93 JSRegExp::Flags flags, | 93 JSRegExp::Flags flags, | 
| 94 Handle<String> match_pattern); | 94 Handle<String> match_pattern); | 
| 95 | 95 | 
| 96 | |
| 97 static int AtomExecRaw(Handle<JSRegExp> regexp, | |
| 98 Handle<String> subject, | |
| 99 int index, | |
| 100 int32_t* output, | |
| 101 int output_size); | |
| 102 | |
| 103 | |
| 96 static Handle<Object> AtomExec(Handle<JSRegExp> regexp, | 104 static Handle<Object> AtomExec(Handle<JSRegExp> regexp, | 
| 97 Handle<String> subject, | 105 Handle<String> subject, | 
| 98 int index, | 106 int index, | 
| 99 Handle<JSArray> lastMatchInfo); | 107 Handle<JSArray> lastMatchInfo); | 
| 100 | 108 | 
| 101 enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 }; | 109 enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 }; | 
| 102 | 110 | 
| 103 // Prepare a RegExp for being executed one or more times (using | 111 // Prepare a RegExp for being executed one or more times (using | 
| 104 // IrregexpExecOnce) on the subject. | 112 // IrregexpExecOnce) on the subject. | 
| 105 // This ensures that the regexp is compiled for the subject, and that | 113 // This ensures that the regexp is compiled for the subject, and that | 
| 106 // the subject is flat. | 114 // the subject is flat. | 
| 107 // Returns the number of integer spaces required by IrregexpExecOnce | 115 // Returns the number of integer spaces required by IrregexpExecOnce | 
| 108 // as its "registers" argument. If the regexp cannot be compiled, | 116 // as its "registers" argument. If the regexp cannot be compiled, | 
| 109 // an exception is set as pending, and this function returns negative. | 117 // an exception is set as pending, and this function returns negative. | 
| 110 static int IrregexpPrepare(Handle<JSRegExp> regexp, | 118 static int IrregexpPrepare(Handle<JSRegExp> regexp, | 
| 111 Handle<String> subject); | 119 Handle<String> subject); | 
| 112 | 120 | 
| 113 // Calculate the size of offsets vector for the case of global regexp | 121 // Calculate the size of offsets vector for the case of global regexp | 
| 114 // and the number of matches this vector is able to store. | 122 // and the number of matches this vector is able to store. | 
| 115 static int GlobalOffsetsVectorSize(Handle<JSRegExp> regexp, | 123 static int GlobalOffsetsVectorSize(Handle<JSRegExp> regexp, | 
| 
ulan
2012/08/03 07:44:27
Looks like this function is not used anymore.
 
Yang
2012/08/03 11:37:42
Done.
 | |
| 116 int registers_per_match, | 124 int registers_per_match, | 
| 117 int* max_matches); | 125 int* max_matches); | 
| 118 | 126 | 
| 119 // Execute a regular expression on the subject, starting from index. | 127 // Execute a regular expression on the subject, starting from index. | 
| 120 // If matching succeeds, return the number of matches. This can be larger | 128 // If matching succeeds, return the number of matches. This can be larger | 
| 121 // than one in the case of global regular expressions. | 129 // than one in the case of global regular expressions. | 
| 122 // The captures and subcaptures are stored into the registers vector. | 130 // The captures and subcaptures are stored into the registers vector. | 
| 123 // If matching fails, returns RE_FAILURE. | 131 // If matching fails, returns RE_FAILURE. | 
| 124 // If execution fails, sets a pending exception and returns RE_EXCEPTION. | 132 // If execution fails, sets a pending exception and returns RE_EXCEPTION. | 
| 125 static int IrregexpExecRaw(Handle<JSRegExp> regexp, | 133 static int IrregexpExecRaw(Handle<JSRegExp> regexp, | 
| 126 Handle<String> subject, | 134 Handle<String> subject, | 
| 127 int index, | 135 int index, | 
| 128 Vector<int> registers); | 136 int32_t* output, | 
| 137 int output_size); | |
| 129 | 138 | 
| 130 // Execute an Irregexp bytecode pattern. | 139 // Execute an Irregexp bytecode pattern. | 
| 131 // On a successful match, the result is a JSArray containing | 140 // On a successful match, the result is a JSArray containing | 
| 132 // captured positions. On a failure, the result is the null value. | 141 // captured positions. On a failure, the result is the null value. | 
| 133 // Returns an empty handle in case of an exception. | 142 // Returns an empty handle in case of an exception. | 
| 134 static Handle<Object> IrregexpExec(Handle<JSRegExp> regexp, | 143 static Handle<Object> IrregexpExec(Handle<JSRegExp> regexp, | 
| 135 Handle<String> subject, | 144 Handle<String> subject, | 
| 136 int index, | 145 int index, | 
| 137 Handle<JSArray> lastMatchInfo); | 146 Handle<JSArray> lastMatchInfo); | 
| 138 | 147 | 
| 148 // Set last match info. If match is NULL, then setting captures is omitted. | |
| 
ulan
2012/08/03 07:44:27
Double spaces in the comments for the functions be
 
Yang
2012/08/03 11:37:42
Done.
 | |
| 149 static Handle<JSArray> SetLastMatchInfo(Handle<JSArray> last_match_info, | |
| 150 Handle<String> subject, | |
| 151 int capture_count, | |
| 152 int32_t* match); | |
| 153 | |
| 154 // Initialize a run of global regexp matching. Return either | |
| 155 // RE_SUCCESS or RE_EXCEPTION. | |
| 156 static int GlobalCacheInitialize(Handle<JSRegExp> regexp, | |
| 157 Handle<String> subject, | |
| 158 bool is_global, | |
| 159 Isolate* isolate); | |
| 160 | |
| 161 // Fetch the next entry in the cache for global regexp match results. | |
| 162 // This does not set the last match info. Upon failure, NULL is returned. | |
| 163 // The cause can be checked with GlobalCacheResult(). The previous | |
| 164 // result is still in available in memory when a failure happens. | |
| 165 INLINE(static int32_t* GlobalCacheFetchNext(Handle<JSRegExp> regexp, | |
| 
ulan
2012/08/03 07:44:27
Can we put this function in .cc file, it seems to
 
Yang
2012/08/03 11:37:42
Done.
 | |
| 166 Handle<String> subject)) { | |
| 167 global_cache_current_match_index_++; | |
| 168 // The cache must not be clobbered. | |
| 
ulan
2012/08/03 07:44:27
Stale comment?
 
Yang
2012/08/03 11:37:42
Done.
 | |
| 169 ASSERT(global_cache_register_array_ != NULL); | |
| 170 if (global_cache_current_match_index_ >= global_cache_num_matches_) { | |
| 171 // Current batch of results exhausted. | |
| 172 // Fail if last batch was not even fully filled. | |
| 173 if (global_cache_num_matches_ < global_cache_max_matches_) { | |
| 174 global_cache_num_matches_ = 0; | |
| 175 return NULL; | |
| 176 } | |
| 177 | |
| 178 int32_t* last_match = | |
| 179 &global_cache_register_array_[global_cache_register_array_size_ - | |
| 180 global_cache_registers_per_match_]; | |
| 181 int last_end_index = last_match[1]; | |
| 182 | |
| 183 if (global_cache_current_type_ == JSRegExp::ATOM) { | |
| 184 global_cache_num_matches_ = | |
| 185 RegExpImpl::AtomExecRaw(regexp, | |
| 186 subject, | |
| 187 last_end_index, | |
| 188 global_cache_register_array_, | |
| 189 global_cache_register_array_size_); | |
| 190 } else { | |
| 191 int last_start_index = last_match[0]; | |
| 192 if (last_start_index == last_end_index) last_end_index++; | |
| 193 if (last_end_index > subject->length()) { | |
| 194 global_cache_num_matches_ = 0; | |
| 195 return NULL; | |
| 196 } | |
| 197 global_cache_num_matches_ = | |
| 198 RegExpImpl::IrregexpExecRaw(regexp, | |
| 199 subject, | |
| 200 last_end_index, | |
| 201 global_cache_register_array_, | |
| 202 global_cache_register_array_size_); | |
| 203 } | |
| 204 | |
| 205 if (global_cache_num_matches_ <= 0) return NULL; | |
| 206 global_cache_current_match_index_ = 0; | |
| 207 return global_cache_register_array_; | |
| 208 } else { | |
| 209 return &global_cache_register_array_[ | |
| 210 global_cache_current_match_index_ * | |
| 211 global_cache_registers_per_match_]; | |
| 212 } | |
| 213 } | |
| 214 | |
| 215 INLINE(static IrregexpResult GlobalCacheResult()) { | |
| 216 if (global_cache_num_matches_ > 0) return RE_SUCCESS; | |
| 217 if (global_cache_num_matches_ == 0) return RE_FAILURE; | |
| 218 return RE_EXCEPTION; | |
| 219 } | |
| 220 | |
| 221 | |
| 139 // Array index in the lastMatchInfo array. | 222 // Array index in the lastMatchInfo array. | 
| 140 static const int kLastCaptureCount = 0; | 223 static const int kLastCaptureCount = 0; | 
| 141 static const int kLastSubject = 1; | 224 static const int kLastSubject = 1; | 
| 142 static const int kLastInput = 2; | 225 static const int kLastInput = 2; | 
| 143 static const int kFirstCapture = 3; | 226 static const int kFirstCapture = 3; | 
| 144 static const int kLastMatchOverhead = 3; | 227 static const int kLastMatchOverhead = 3; | 
| 145 | 228 | 
| 146 // Direct offset into the lastMatchInfo array. | 229 // Direct offset into the lastMatchInfo array. | 
| 147 static const int kLastCaptureCountOffset = | 230 static const int kLastCaptureCountOffset = | 
| 148 FixedArray::kHeaderSize + kLastCaptureCount * kPointerSize; | 231 FixedArray::kHeaderSize + kLastCaptureCount * kPointerSize; | 
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 188 | 271 | 
| 189 // Limit the space regexps take up on the heap. In order to limit this we | 272 // Limit the space regexps take up on the heap. In order to limit this we | 
| 190 // would like to keep track of the amount of regexp code on the heap. This | 273 // would like to keep track of the amount of regexp code on the heap. This | 
| 191 // is not tracked, however. As a conservative approximation we track the | 274 // is not tracked, however. As a conservative approximation we track the | 
| 192 // total regexp code compiled including code that has subsequently been freed | 275 // total regexp code compiled including code that has subsequently been freed | 
| 193 // and the total executable memory at any point. | 276 // and the total executable memory at any point. | 
| 194 static const int kRegExpExecutableMemoryLimit = 16 * MB; | 277 static const int kRegExpExecutableMemoryLimit = 16 * MB; | 
| 195 static const int kRegWxpCompiledLimit = 1 * MB; | 278 static const int kRegWxpCompiledLimit = 1 * MB; | 
| 196 | 279 | 
| 197 private: | 280 private: | 
| 198 static String* last_ascii_string_; | |
| 199 static String* two_byte_cached_string_; | |
| 200 | |
| 201 static bool CompileIrregexp( | 281 static bool CompileIrregexp( | 
| 202 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii); | 282 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii); | 
| 203 static inline bool EnsureCompiledIrregexp( | 283 static inline bool EnsureCompiledIrregexp( | 
| 204 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii); | 284 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii); | 
| 205 | 285 | 
| 206 | 286 // State of the global cache. | 
| 207 // Set the subject cache. The previous string buffer is not deleted, so the | 287 static int global_cache_num_matches_; | 
| 208 // caller should ensure that it doesn't leak. | 288 static int global_cache_max_matches_; | 
| 209 static void SetSubjectCache(String* subject, | 289 static int global_cache_current_match_index_; | 
| 210 char* utf8_subject, | 290 static int global_cache_registers_per_match_; | 
| 211 int uft8_length, | 291 // Pointer to the last set of captures. | 
| 212 int character_position, | 292 static JSRegExp::Type global_cache_current_type_; | 
| 213 int utf8_position); | 293 static int32_t* global_cache_register_array_; | 
| 214 | 294 static int global_cache_register_array_size_; | 
| 215 // A one element cache of the last utf8_subject string and its length. The | |
| 216 // subject JS String object is cached in the heap. We also cache a | |
| 217 // translation between position and utf8 position. | |
| 218 static char* utf8_subject_cache_; | |
| 219 static int utf8_length_cache_; | |
| 220 static int utf8_position_; | |
| 221 static int character_position_; | |
| 222 }; | 295 }; | 
| 223 | 296 | 
| 224 | 297 | 
| 225 // Represents the location of one element relative to the intersection of | 298 // Represents the location of one element relative to the intersection of | 
| 226 // two sets. Corresponds to the four areas of a Venn diagram. | 299 // two sets. Corresponds to the four areas of a Venn diagram. | 
| 227 enum ElementInSetsRelation { | 300 enum ElementInSetsRelation { | 
| 228 kInsideNone = 0, | 301 kInsideNone = 0, | 
| 229 kInsideFirst = 1, | 302 kInsideFirst = 1, | 
| 230 kInsideSecond = 2, | 303 kInsideSecond = 2, | 
| 231 kInsideBoth = 3 | 304 kInsideBoth = 3 | 
| (...skipping 1383 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1615 bool global, | 1688 bool global, | 
| 1616 bool multiline, | 1689 bool multiline, | 
| 1617 Handle<String> pattern, | 1690 Handle<String> pattern, | 
| 1618 Handle<String> sample_subject, | 1691 Handle<String> sample_subject, | 
| 1619 bool is_ascii, Zone* zone); | 1692 bool is_ascii, Zone* zone); | 
| 1620 | 1693 | 
| 1621 static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); | 1694 static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); | 
| 1622 }; | 1695 }; | 
| 1623 | 1696 | 
| 1624 | 1697 | 
| 1625 class OffsetsVector { | |
| 1626 public: | |
| 1627 inline OffsetsVector(int num_registers, Isolate* isolate) | |
| 1628 : offsets_vector_length_(num_registers) { | |
| 1629 if (offsets_vector_length_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { | |
| 1630 vector_ = NewArray<int>(offsets_vector_length_); | |
| 1631 } else { | |
| 1632 vector_ = isolate->jsregexp_static_offsets_vector(); | |
| 1633 } | |
| 1634 } | |
| 1635 inline ~OffsetsVector() { | |
| 1636 if (offsets_vector_length_ > Isolate::kJSRegexpStaticOffsetsVectorSize) { | |
| 1637 DeleteArray(vector_); | |
| 1638 vector_ = NULL; | |
| 1639 } | |
| 1640 } | |
| 1641 inline int* vector() { return vector_; } | |
| 1642 inline int length() { return offsets_vector_length_; } | |
| 1643 | |
| 1644 static const int kStaticOffsetsVectorSize = | |
| 1645 Isolate::kJSRegexpStaticOffsetsVectorSize; | |
| 1646 | |
| 1647 private: | |
| 1648 static Address static_offsets_vector_address(Isolate* isolate) { | |
| 1649 return reinterpret_cast<Address>(isolate->jsregexp_static_offsets_vector()); | |
| 1650 } | |
| 1651 | |
| 1652 int* vector_; | |
| 1653 int offsets_vector_length_; | |
| 1654 | |
| 1655 friend class ExternalReference; | |
| 1656 }; | |
| 1657 | |
| 1658 | |
| 1659 } } // namespace v8::internal | 1698 } } // namespace v8::internal | 
| 1660 | 1699 | 
| 1661 #endif // V8_JSREGEXP_H_ | 1700 #endif // V8_JSREGEXP_H_ | 
| OLD | NEW |