OLD | NEW |
---|---|
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 262 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
273 | 273 |
274 | 274 |
275 // Irregexp implementation. | 275 // Irregexp implementation. |
276 | 276 |
277 // Ensures that the regexp object contains a compiled version of the | 277 // Ensures that the regexp object contains a compiled version of the |
278 // source for either ASCII or non-ASCII strings. | 278 // source for either ASCII or non-ASCII strings. |
279 // If the compiled version doesn't already exist, it is compiled | 279 // If the compiled version doesn't already exist, it is compiled |
280 // from the source pattern. | 280 // from the source pattern. |
281 // If compilation fails, an exception is thrown and this function | 281 // If compilation fails, an exception is thrown and this function |
282 // returns false. | 282 // returns false. |
283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 283 bool RegExpImpl::EnsureCompiledIrregexp( |
284 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { | |
284 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); | 285 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); |
285 #ifdef V8_INTERPRETED_REGEXP | 286 #ifdef V8_INTERPRETED_REGEXP |
286 if (compiled_code->IsByteArray()) return true; | 287 if (compiled_code->IsByteArray()) return true; |
287 #else // V8_INTERPRETED_REGEXP (RegExp native code) | 288 #else // V8_INTERPRETED_REGEXP (RegExp native code) |
288 if (compiled_code->IsCode()) return true; | 289 if (compiled_code->IsCode()) return true; |
289 #endif | 290 #endif |
290 // We could potentially have marked this as flushable, but have kept | 291 // We could potentially have marked this as flushable, but have kept |
291 // a saved version if we did not flush it yet. | 292 // a saved version if we did not flush it yet. |
292 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); | 293 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); |
293 if (saved_code->IsCode()) { | 294 if (saved_code->IsCode()) { |
294 // Reinstate the code in the original place. | 295 // Reinstate the code in the original place. |
295 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); | 296 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); |
296 ASSERT(compiled_code->IsSmi()); | 297 ASSERT(compiled_code->IsSmi()); |
297 return true; | 298 return true; |
298 } | 299 } |
299 return CompileIrregexp(re, is_ascii); | 300 return CompileIrregexp(re, sample_subject, is_ascii); |
300 } | 301 } |
301 | 302 |
302 | 303 |
303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, | 304 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, |
304 bool is_ascii, | 305 bool is_ascii, |
305 Handle<String> error_message, | 306 Handle<String> error_message, |
306 Isolate* isolate) { | 307 Isolate* isolate) { |
307 Factory* factory = isolate->factory(); | 308 Factory* factory = isolate->factory(); |
308 Handle<FixedArray> elements = factory->NewFixedArray(2); | 309 Handle<FixedArray> elements = factory->NewFixedArray(2); |
309 elements->set(0, re->Pattern()); | 310 elements->set(0, re->Pattern()); |
310 elements->set(1, *error_message); | 311 elements->set(1, *error_message); |
311 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 312 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
312 Handle<Object> regexp_err = | 313 Handle<Object> regexp_err = |
313 factory->NewSyntaxError("malformed_regexp", array); | 314 factory->NewSyntaxError("malformed_regexp", array); |
314 isolate->Throw(*regexp_err); | 315 isolate->Throw(*regexp_err); |
315 return false; | 316 return false; |
316 } | 317 } |
317 | 318 |
318 | 319 |
319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 320 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
321 Handle<String> sample_subject, | |
322 bool is_ascii) { | |
320 // Compile the RegExp. | 323 // Compile the RegExp. |
321 Isolate* isolate = re->GetIsolate(); | 324 Isolate* isolate = re->GetIsolate(); |
322 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); | 325 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); |
323 PostponeInterruptsScope postpone(isolate); | 326 PostponeInterruptsScope postpone(isolate); |
324 // If we had a compilation error the last time this is saved at the | 327 // If we had a compilation error the last time this is saved at the |
325 // saved code index. | 328 // saved code index. |
326 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); | 329 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); |
327 // When arriving here entry can only be a smi, either representing an | 330 // When arriving here entry can only be a smi, either representing an |
328 // uncompiled regexp, a previous compilation error, or code that has | 331 // uncompiled regexp, a previous compilation error, or code that has |
329 // been flushed. | 332 // been flushed. |
(...skipping 28 matching lines...) Expand all Loading... | |
358 pattern, | 361 pattern, |
359 compile_data.error, | 362 compile_data.error, |
360 "malformed_regexp"); | 363 "malformed_regexp"); |
361 return false; | 364 return false; |
362 } | 365 } |
363 RegExpEngine::CompilationResult result = | 366 RegExpEngine::CompilationResult result = |
364 RegExpEngine::Compile(&compile_data, | 367 RegExpEngine::Compile(&compile_data, |
365 flags.is_ignore_case(), | 368 flags.is_ignore_case(), |
366 flags.is_multiline(), | 369 flags.is_multiline(), |
367 pattern, | 370 pattern, |
371 sample_subject, | |
368 is_ascii); | 372 is_ascii); |
369 if (result.error_message != NULL) { | 373 if (result.error_message != NULL) { |
370 // Unable to compile regexp. | 374 // Unable to compile regexp. |
371 Handle<String> error_message = | 375 Handle<String> error_message = |
372 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); | 376 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); |
373 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); | 377 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); |
374 return false; | 378 return false; |
375 } | 379 } |
376 | 380 |
377 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); | 381 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
428 capture_count); | 432 capture_count); |
429 } | 433 } |
430 | 434 |
431 | 435 |
432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, | 436 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, |
433 Handle<String> subject) { | 437 Handle<String> subject) { |
434 if (!subject->IsFlat()) FlattenString(subject); | 438 if (!subject->IsFlat()) FlattenString(subject); |
435 | 439 |
436 // Check the asciiness of the underlying storage. | 440 // Check the asciiness of the underlying storage. |
437 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 441 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
438 if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; | 442 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; |
439 | 443 |
440 #ifdef V8_INTERPRETED_REGEXP | 444 #ifdef V8_INTERPRETED_REGEXP |
441 // Byte-code regexp needs space allocated for all its registers. | 445 // Byte-code regexp needs space allocated for all its registers. |
442 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); | 446 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); |
443 #else // V8_INTERPRETED_REGEXP | 447 #else // V8_INTERPRETED_REGEXP |
444 // Native regexp only needs room to output captures. Registers are handled | 448 // Native regexp only needs room to output captures. Registers are handled |
445 // internally. | 449 // internally. |
446 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; | 450 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; |
447 #endif // V8_INTERPRETED_REGEXP | 451 #endif // V8_INTERPRETED_REGEXP |
448 } | 452 } |
(...skipping 10 matching lines...) Expand all Loading... | |
459 | 463 |
460 ASSERT(index >= 0); | 464 ASSERT(index >= 0); |
461 ASSERT(index <= subject->length()); | 465 ASSERT(index <= subject->length()); |
462 ASSERT(subject->IsFlat()); | 466 ASSERT(subject->IsFlat()); |
463 | 467 |
464 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 468 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
465 | 469 |
466 #ifndef V8_INTERPRETED_REGEXP | 470 #ifndef V8_INTERPRETED_REGEXP |
467 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); | 471 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
468 do { | 472 do { |
469 EnsureCompiledIrregexp(regexp, is_ascii); | 473 EnsureCompiledIrregexp(regexp, subject, is_ascii); |
470 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); | 474 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); |
471 NativeRegExpMacroAssembler::Result res = | 475 NativeRegExpMacroAssembler::Result res = |
472 NativeRegExpMacroAssembler::Match(code, | 476 NativeRegExpMacroAssembler::Match(code, |
473 subject, | 477 subject, |
474 output.start(), | 478 output.start(), |
475 output.length(), | 479 output.length(), |
476 index, | 480 index, |
477 isolate); | 481 isolate); |
478 if (res != NativeRegExpMacroAssembler::RETRY) { | 482 if (res != NativeRegExpMacroAssembler::RETRY) { |
479 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || | 483 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || |
(...skipping 297 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { | 781 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { |
778 if (table_ == NULL) { | 782 if (table_ == NULL) { |
779 table_ = new DispatchTable(); | 783 table_ = new DispatchTable(); |
780 DispatchTableConstructor cons(table_, ignore_case); | 784 DispatchTableConstructor cons(table_, ignore_case); |
781 cons.BuildTable(this); | 785 cons.BuildTable(this); |
782 } | 786 } |
783 return table_; | 787 return table_; |
784 } | 788 } |
785 | 789 |
786 | 790 |
791 class FrequencyCollator { | |
792 public: | |
793 FrequencyCollator() : total_frequencies_(0) { | |
794 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { | |
795 frequencies_[i] = CharacterFrequency(i); | |
796 } | |
797 } | |
798 | |
799 void CountCharacter(int character) { | |
800 int index = (character & RegExpMacroAssembler::kTableMask); | |
801 frequencies_[index].Increment(); | |
802 total_frequencies_++; | |
803 } | |
804 | |
805 void SortFrequencies() { | |
806 typedef int (*RawComparer)(const void*, const void*); | |
807 memcpy( | |
808 &sorted_frequencies_[0], &frequencies_[0], sizeof(sorted_frequencies_)); | |
ulan
2012/03/30 13:04:48
A nit: for consistency I would prefer breaking mem
Erik Corry
2012/04/01 00:48:46
Done.
| |
809 qsort(&sorted_frequencies_[0], | |
810 RegExpMacroAssembler::kTableSize, | |
811 sizeof(sorted_frequencies_[0]), | |
812 reinterpret_cast<RawComparer>(&CharacterFrequency::Compare)); | |
813 } | |
814 | |
815 bool IsFrequent(int in_character) { | |
816 if (total_frequencies_ < 10) return false; // Too little statistical basis. | |
817 int freq = frequencies_[in_character].counter(); | |
ulan
2012/03/30 13:04:48
Either mask the in_character or ASSERT(in_characte
Erik Corry
2012/04/01 00:48:46
Done.
| |
818 bool popular = ((freq * 100) / total_frequencies_ > 20); | |
819 return popular; | |
ulan
2012/03/30 13:04:48
Named magic numbers become less magic :)
int freq
Erik Corry
2012/04/01 00:48:46
Done.
| |
820 } | |
821 | |
822 private: | |
823 class CharacterFrequency { | |
824 public: | |
825 CharacterFrequency() : counter_(0), character_(-1) { } | |
826 explicit CharacterFrequency(int character) : counter_(0), character_(charact er) { } | |
ulan
2012/03/30 13:04:48
Long line.
Erik Corry
2012/04/01 00:48:46
Done.
| |
827 | |
828 static int Compare(const CharacterFrequency* a, | |
829 const CharacterFrequency* b) { | |
830 return b->counter_ - a->counter_; | |
831 } | |
832 | |
833 void Increment() { counter_++; } | |
834 int counter() { return counter_; } | |
835 int character() { return character_; } | |
836 | |
837 private: | |
838 int counter_; | |
839 int character_; | |
840 }; | |
841 | |
842 | |
843 private: | |
844 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; | |
845 CharacterFrequency sorted_frequencies_[RegExpMacroAssembler::kTableSize]; | |
846 int total_frequencies_; | |
847 }; | |
848 | |
849 | |
787 class RegExpCompiler { | 850 class RegExpCompiler { |
788 public: | 851 public: |
789 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); | 852 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); |
790 | 853 |
791 int AllocateRegister() { | 854 int AllocateRegister() { |
792 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { | 855 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { |
793 reg_exp_too_big_ = true; | 856 reg_exp_too_big_ = true; |
794 return next_register_; | 857 return next_register_; |
795 } | 858 } |
796 return next_register_++; | 859 return next_register_++; |
(...skipping 15 matching lines...) Expand all Loading... | |
812 | 875 |
813 static const int kMaxRecursion = 100; | 876 static const int kMaxRecursion = 100; |
814 inline int recursion_depth() { return recursion_depth_; } | 877 inline int recursion_depth() { return recursion_depth_; } |
815 inline void IncrementRecursionDepth() { recursion_depth_++; } | 878 inline void IncrementRecursionDepth() { recursion_depth_++; } |
816 inline void DecrementRecursionDepth() { recursion_depth_--; } | 879 inline void DecrementRecursionDepth() { recursion_depth_--; } |
817 | 880 |
818 void SetRegExpTooBig() { reg_exp_too_big_ = true; } | 881 void SetRegExpTooBig() { reg_exp_too_big_ = true; } |
819 | 882 |
820 inline bool ignore_case() { return ignore_case_; } | 883 inline bool ignore_case() { return ignore_case_; } |
821 inline bool ascii() { return ascii_; } | 884 inline bool ascii() { return ascii_; } |
885 FrequencyCollator* frequency_collator() { return &frequency_collator_; } | |
822 | 886 |
823 int current_expansion_factor() { return current_expansion_factor_; } | 887 int current_expansion_factor() { return current_expansion_factor_; } |
824 void set_current_expansion_factor(int value) { | 888 void set_current_expansion_factor(int value) { |
825 current_expansion_factor_ = value; | 889 current_expansion_factor_ = value; |
826 } | 890 } |
827 | 891 |
828 static const int kNoRegister = -1; | 892 static const int kNoRegister = -1; |
829 | 893 |
830 private: | 894 private: |
831 EndNode* accept_; | 895 EndNode* accept_; |
832 int next_register_; | 896 int next_register_; |
833 List<RegExpNode*>* work_list_; | 897 List<RegExpNode*>* work_list_; |
834 int recursion_depth_; | 898 int recursion_depth_; |
835 RegExpMacroAssembler* macro_assembler_; | 899 RegExpMacroAssembler* macro_assembler_; |
836 bool ignore_case_; | 900 bool ignore_case_; |
837 bool ascii_; | 901 bool ascii_; |
838 bool reg_exp_too_big_; | 902 bool reg_exp_too_big_; |
839 int current_expansion_factor_; | 903 int current_expansion_factor_; |
904 FrequencyCollator frequency_collator_; | |
840 }; | 905 }; |
841 | 906 |
842 | 907 |
843 class RecursionCheck { | 908 class RecursionCheck { |
844 public: | 909 public: |
845 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { | 910 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { |
846 compiler->IncrementRecursionDepth(); | 911 compiler->IncrementRecursionDepth(); |
847 } | 912 } |
848 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } | 913 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } |
849 private: | 914 private: |
850 RegExpCompiler* compiler_; | 915 RegExpCompiler* compiler_; |
851 }; | 916 }; |
852 | 917 |
853 | 918 |
854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { | 919 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { |
855 return RegExpEngine::CompilationResult("RegExp too big"); | 920 return RegExpEngine::CompilationResult("RegExp too big"); |
856 } | 921 } |
857 | 922 |
858 | 923 |
859 // Attempts to compile the regexp using an Irregexp code generator. Returns | 924 // Attempts to compile the regexp using an Irregexp code generator. Returns |
860 // a fixed array or a null handle depending on whether it succeeded. | 925 // a fixed array or a null handle depending on whether it succeeded. |
861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) | 926 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) |
862 : next_register_(2 * (capture_count + 1)), | 927 : next_register_(2 * (capture_count + 1)), |
863 work_list_(NULL), | 928 work_list_(NULL), |
864 recursion_depth_(0), | 929 recursion_depth_(0), |
865 ignore_case_(ignore_case), | 930 ignore_case_(ignore_case), |
866 ascii_(ascii), | 931 ascii_(ascii), |
867 reg_exp_too_big_(false), | 932 reg_exp_too_big_(false), |
868 current_expansion_factor_(1) { | 933 current_expansion_factor_(1), |
934 frequency_collator_() { | |
869 accept_ = new EndNode(EndNode::ACCEPT); | 935 accept_ = new EndNode(EndNode::ACCEPT); |
870 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); | 936 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); |
871 } | 937 } |
872 | 938 |
873 | 939 |
874 RegExpEngine::CompilationResult RegExpCompiler::Assemble( | 940 RegExpEngine::CompilationResult RegExpCompiler::Assemble( |
875 RegExpMacroAssembler* macro_assembler, | 941 RegExpMacroAssembler* macro_assembler, |
876 RegExpNode* start, | 942 RegExpNode* start, |
877 int capture_count, | 943 int capture_count, |
878 Handle<String> pattern) { | 944 Handle<String> pattern) { |
(...skipping 1170 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2049 int recursion_depth, | 2115 int recursion_depth, |
2050 bool not_at_start) { | 2116 bool not_at_start) { |
2051 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2117 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2052 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! | 2118 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! |
2053 return on_success()->EatsAtLeast(still_to_find, | 2119 return on_success()->EatsAtLeast(still_to_find, |
2054 recursion_depth + 1, | 2120 recursion_depth + 1, |
2055 not_at_start); | 2121 not_at_start); |
2056 } | 2122 } |
2057 | 2123 |
2058 | 2124 |
2125 void ActionNode::FillInBMInfo( | |
2126 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
ulan
2012/03/30 13:04:48
For consistency with the surrounding code consider
Erik Corry
2012/04/01 00:48:46
Done.
| |
2127 if (type_ == BEGIN_SUBMATCH) { | |
2128 bm->SetRest(offset); | |
2129 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { | |
2130 on_success()->FillInBMInfo(offset, bm, not_at_start); | |
2131 } | |
2132 } | |
2133 | |
2134 | |
2059 int AssertionNode::EatsAtLeast(int still_to_find, | 2135 int AssertionNode::EatsAtLeast(int still_to_find, |
2060 int recursion_depth, | 2136 int recursion_depth, |
2061 bool not_at_start) { | 2137 bool not_at_start) { |
2062 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2138 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2063 // If we know we are not at the start and we are asked "how many characters | 2139 // If we know we are not at the start and we are asked "how many characters |
2064 // will you match if you succeed?" then we can answer anything since false | 2140 // will you match if you succeed?" then we can answer anything since false |
2065 // implies false. So lets just return the max answer (still_to_find) since | 2141 // implies false. So lets just return the max answer (still_to_find) since |
2066 // that won't prevent us from preloading a lot of characters for the other | 2142 // that won't prevent us from preloading a lot of characters for the other |
2067 // branches in the node graph. | 2143 // branches in the node graph. |
2068 if (type() == AT_START && not_at_start) return still_to_find; | 2144 if (type() == AT_START && not_at_start) return still_to_find; |
2069 return on_success()->EatsAtLeast(still_to_find, | 2145 return on_success()->EatsAtLeast(still_to_find, |
2070 recursion_depth + 1, | 2146 recursion_depth + 1, |
2071 not_at_start); | 2147 not_at_start); |
2072 } | 2148 } |
2073 | 2149 |
2074 | 2150 |
2151 void AssertionNode::FillInBMInfo( | |
2152 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
2153 // Match the behaviour of EatsAtLeast on this node. | |
2154 if (type() == AT_START && not_at_start) return; | |
2155 on_success()->FillInBMInfo(offset, bm, not_at_start); | |
2156 } | |
2157 | |
2158 | |
2075 int BackReferenceNode::EatsAtLeast(int still_to_find, | 2159 int BackReferenceNode::EatsAtLeast(int still_to_find, |
2076 int recursion_depth, | 2160 int recursion_depth, |
2077 bool not_at_start) { | 2161 bool not_at_start) { |
2078 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2162 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2079 return on_success()->EatsAtLeast(still_to_find, | 2163 return on_success()->EatsAtLeast(still_to_find, |
2080 recursion_depth + 1, | 2164 recursion_depth + 1, |
2081 not_at_start); | 2165 not_at_start); |
2082 } | 2166 } |
2083 | 2167 |
2084 | 2168 |
(...skipping 412 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2497 bool not_at_start) { | 2581 bool not_at_start) { |
2498 if (body_can_be_zero_length_ || info()->visited) return; | 2582 if (body_can_be_zero_length_ || info()->visited) return; |
2499 VisitMarker marker(info()); | 2583 VisitMarker marker(info()); |
2500 return ChoiceNode::GetQuickCheckDetails(details, | 2584 return ChoiceNode::GetQuickCheckDetails(details, |
2501 compiler, | 2585 compiler, |
2502 characters_filled_in, | 2586 characters_filled_in, |
2503 not_at_start); | 2587 not_at_start); |
2504 } | 2588 } |
2505 | 2589 |
2506 | 2590 |
2591 void LoopChoiceNode::FillInBMInfo( | |
2592 int offset, BoyerMooreLookahead* bm, bool nas) { | |
2593 if (body_can_be_zero_length_) { // || info()->visited) { | |
ulan
2012/03/30 13:04:48
Forgotten comment.
Erik Corry
2012/04/01 00:48:46
Done.
| |
2594 bm->SetRest(offset); | |
2595 return; | |
2596 } | |
2597 // VisitMarker marker(info()); | |
ulan
2012/03/30 13:04:48
Comment.
Erik Corry
2012/04/01 00:48:46
Done.
| |
2598 ChoiceNode::FillInBMInfo(offset, bm, nas); | |
2599 } | |
2600 | |
2601 | |
2507 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2602 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
2508 RegExpCompiler* compiler, | 2603 RegExpCompiler* compiler, |
2509 int characters_filled_in, | 2604 int characters_filled_in, |
2510 bool not_at_start) { | 2605 bool not_at_start) { |
2511 not_at_start = (not_at_start || not_at_start_); | 2606 not_at_start = (not_at_start || not_at_start_); |
2512 int choice_count = alternatives_->length(); | 2607 int choice_count = alternatives_->length(); |
2513 ASSERT(choice_count > 0); | 2608 ASSERT(choice_count > 0); |
2514 alternatives_->at(0).node()->GetQuickCheckDetails(details, | 2609 alternatives_->at(0).node()->GetQuickCheckDetails(details, |
2515 compiler, | 2610 compiler, |
2516 characters_filled_in, | 2611 characters_filled_in, |
(...skipping 476 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2993 int TextNode::GreedyLoopTextLength() { | 3088 int TextNode::GreedyLoopTextLength() { |
2994 TextElement elm = elms_->at(elms_->length() - 1); | 3089 TextElement elm = elms_->at(elms_->length() - 1); |
2995 if (elm.type == TextElement::CHAR_CLASS) { | 3090 if (elm.type == TextElement::CHAR_CLASS) { |
2996 return elm.cp_offset + 1; | 3091 return elm.cp_offset + 1; |
2997 } else { | 3092 } else { |
2998 return elm.cp_offset + elm.data.u_atom->data().length(); | 3093 return elm.cp_offset + elm.data.u_atom->data().length(); |
2999 } | 3094 } |
3000 } | 3095 } |
3001 | 3096 |
3002 | 3097 |
3098 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( | |
3099 RegExpCompiler* compiler) { | |
3100 if (elms_->length() != 1) return NULL; | |
3101 TextElement elm = elms_->at(0); | |
3102 if (elm.type != TextElement::CHAR_CLASS) return NULL; | |
3103 RegExpCharacterClass* node = elm.data.u_char_class; | |
3104 ZoneList<CharacterRange>* ranges = node->ranges(); | |
3105 if (!CharacterRange::IsCanonical(ranges)) { | |
3106 CharacterRange::Canonicalize(ranges); | |
3107 } | |
3108 if (node->is_negated() && ranges->length() == 0) return on_success(); | |
3109 if (ranges->length() != 1) return NULL; | |
3110 if (ranges->at(0).from() != 0) return NULL; | |
3111 int max_char; | |
3112 if (compiler->ascii()) { | |
ulan
2012/03/30 13:04:48
A nit, I like
max_char = compiler->ascii() ? Stri
Erik Corry
2012/04/01 00:48:46
Yes, but a bug in gcc means you get a linker error
| |
3113 max_char = String::kMaxAsciiCharCode; | |
3114 } else { | |
3115 max_char = String::kMaxUtf16CodeUnit; | |
3116 } | |
3117 if (ranges->at(0).to() < max_char) return NULL; | |
ulan
2012/03/30 13:04:48
Consider using ranges(0)->IsEverything()
Erik Corry
2012/04/01 00:48:46
Done.
| |
3118 return on_success(); | |
3119 } | |
3120 | |
3121 | |
3003 // Finds the fixed match length of a sequence of nodes that goes from | 3122 // Finds the fixed match length of a sequence of nodes that goes from |
3004 // this alternative and back to this choice node. If there are variable | 3123 // this alternative and back to this choice node. If there are variable |
3005 // length nodes or other complications in the way then return a sentinel | 3124 // length nodes or other complications in the way then return a sentinel |
3006 // value indicating that a greedy loop cannot be constructed. | 3125 // value indicating that a greedy loop cannot be constructed. |
3007 int ChoiceNode::GreedyLoopTextLengthForAlternative( | 3126 int ChoiceNode::GreedyLoopTextLengthForAlternative( |
3008 GuardedAlternative* alternative) { | 3127 GuardedAlternative* alternative) { |
3009 int length = 0; | 3128 int length = 0; |
3010 RegExpNode* node = alternative->node(); | 3129 RegExpNode* node = alternative->node(); |
3011 // Later we will generate code for all these text nodes using recursion | 3130 // Later we will generate code for all these text nodes using recursion |
3012 // so we have to limit the max number. | 3131 // so we have to limit the max number. |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3057 ASSERT(trace->stop_node() == NULL); | 3176 ASSERT(trace->stop_node() == NULL); |
3058 if (!trace->is_trivial()) { | 3177 if (!trace->is_trivial()) { |
3059 trace->Flush(compiler, this); | 3178 trace->Flush(compiler, this); |
3060 return; | 3179 return; |
3061 } | 3180 } |
3062 ChoiceNode::Emit(compiler, trace); | 3181 ChoiceNode::Emit(compiler, trace); |
3063 } | 3182 } |
3064 | 3183 |
3065 | 3184 |
3066 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, | 3185 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
3067 bool not_at_start) { | 3186 int eats_at_least) { |
3068 int preload_characters = EatsAtLeast(4, 0, not_at_start); | 3187 int preload_characters = eats_at_least > 4 ? 4 : eats_at_least; |
ulan
2012/03/30 13:04:48
Min(4, eats_at_least);
Erik Corry
2012/04/01 00:48:46
Done.
| |
3069 if (compiler->macro_assembler()->CanReadUnaligned()) { | 3188 if (compiler->macro_assembler()->CanReadUnaligned()) { |
3070 bool ascii = compiler->ascii(); | 3189 bool ascii = compiler->ascii(); |
3071 if (ascii) { | 3190 if (ascii) { |
3072 if (preload_characters > 4) preload_characters = 4; | 3191 if (preload_characters > 4) preload_characters = 4; |
3073 // We can't preload 3 characters because there is no machine instruction | 3192 // We can't preload 3 characters because there is no machine instruction |
3074 // to do that. We can't just load 4 because we could be reading | 3193 // to do that. We can't just load 4 because we could be reading |
3075 // beyond the end of the string, which could cause a memory fault. | 3194 // beyond the end of the string, which could cause a memory fault. |
3076 if (preload_characters == 3) preload_characters = 2; | 3195 if (preload_characters == 3) preload_characters = 2; |
3077 } else { | 3196 } else { |
3078 if (preload_characters > 2) preload_characters = 2; | 3197 if (preload_characters > 2) preload_characters = 2; |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3124 return alt_gens_[i]; | 3243 return alt_gens_[i]; |
3125 } | 3244 } |
3126 | 3245 |
3127 private: | 3246 private: |
3128 static const int kAFew = 10; | 3247 static const int kAFew = 10; |
3129 ZoneList<AlternativeGeneration*> alt_gens_; | 3248 ZoneList<AlternativeGeneration*> alt_gens_; |
3130 AlternativeGeneration a_few_alt_gens_[kAFew]; | 3249 AlternativeGeneration a_few_alt_gens_[kAFew]; |
3131 }; | 3250 }; |
3132 | 3251 |
3133 | 3252 |
3253 BoyerMooreLookahead::BoyerMooreLookahead( | |
3254 int length, int map_length, RegExpCompiler* compiler) | |
3255 : length_(length), | |
3256 map_length_(map_length), | |
ulan
2012/03/30 13:04:48
the Set() method assumes that map_length is a powe
Erik Corry
2012/04/01 00:48:46
Done.
| |
3257 compiler_(compiler) { | |
3258 if (compiler->ascii()) { | |
3259 max_char_ = String::kMaxAsciiCharCode; | |
3260 } else { | |
3261 max_char_ = String::kMaxUtf16CodeUnit; | |
3262 } | |
3263 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); | |
ulan
2012/03/30 13:04:48
I wonder if bit-packing (ZoneList<bool>) would imp
Erik Corry
2012/04/01 00:48:46
We normally only do this once per regexp, so I don
| |
3264 for (int i = 0; i < length; i++) { | |
3265 bitmaps_->Add(new ZoneList<bool>(map_length)); | |
3266 ZoneList<bool>* map = bitmaps_->at(i); | |
3267 for (int i = 0; i < map_length; i++) { | |
3268 map->Add(false); | |
3269 } | |
3270 } | |
3271 } | |
3272 | |
3273 | |
3274 bool BoyerMooreLookahead::TooVague(int offset, int limit) { | |
3275 if (bitmaps_->at(offset) == NULL) return true; | |
3276 if (Count(offset) > limit) return true; | |
3277 return false; | |
3278 } | |
3279 | |
3280 | |
3281 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { | |
3282 // The characters that are very frequent in the sample subject string should | |
3283 // not be used for Boyer-Moore operations, so lets mark those positions as | |
3284 // being too vague. | |
3285 for (int i = 0; i < length_; i++) { | |
3286 if (Count(i) >= kTooManyCharacters) { | |
3287 SetAll(i); | |
3288 } else { | |
3289 ZoneList<bool>* map = bitmaps_->at(i); | |
3290 for (int ch = 0; ch < map_length_; ch++) { | |
3291 if (map->at(ch) && compiler_->frequency_collator()->IsFrequent(ch)) { | |
3292 SetAll(i); | |
3293 break; | |
3294 } | |
3295 } | |
3296 } | |
3297 } | |
3298 const int kNoPoints = -1; | |
3299 int biggest_points = kNoPoints; | |
3300 int biggest_interval_max = 0; | |
3301 int biggest_interval = 0; | |
3302 int vagueness_factor = 10; | |
ulan
2012/03/30 13:04:48
Could you please write a comment describing the ma
Erik Corry
2012/04/01 00:48:46
I refactored this function because it reflected fa
| |
3303 for (int initial_max_vagueness = 4; | |
3304 initial_max_vagueness <= kTooManyCharacters; | |
3305 initial_max_vagueness *= 2, vagueness_factor--) { | |
3306 int max = length_ - 1; | |
ulan
2012/03/30 13:04:48
This 'max' can just be called 'i'. I think there a
Erik Corry
2012/04/01 00:48:46
Done.
| |
3307 while (true) { | |
3308 while (max >= 0 && TooVague(max, initial_max_vagueness)) max--; | |
3309 if (max < 0) break; | |
3310 int remembered_max = max; | |
3311 int max_vagueness = initial_max_vagueness; | |
3312 while (max >= 0 && !TooVague(max, max_vagueness)) { | |
3313 max--; | |
3314 if (max_vagueness < kTooManyCharacters) max_vagueness *= 2; | |
ulan
2012/03/30 13:04:48
What is the rationale behind this?
Nit: max_vague
Erik Corry
2012/04/01 00:48:46
All refactored away.
| |
3315 } | |
3316 int points = | |
3317 ((remembered_max - max) * kTooManyCharacters) * vagueness_factor; | |
3318 if (points > biggest_points) { | |
3319 biggest_interval_max = remembered_max; | |
3320 biggest_interval = remembered_max - max; | |
ulan
2012/03/30 13:04:48
Consider maintaining (left, right) instead of (big
Erik Corry
2012/04/01 00:48:46
Done.
| |
3321 biggest_points = points; | |
3322 } | |
3323 if (max < 0) break; | |
3324 } | |
3325 } | |
3326 if (biggest_points == kNoPoints) return false; | |
3327 if (biggest_interval < 1) return false; | |
3328 *to = biggest_interval_max; | |
3329 *from = 1 + biggest_interval_max - biggest_interval; | |
3330 return true; | |
3331 } | |
3332 | |
3333 | |
3334 static int SortChar(const char* a, const char* b) { | |
3335 return static_cast<int>(*a) - *b; | |
3336 } | |
3337 | |
3338 | |
3339 int BoyerMooreLookahead::GetSkipTable(int max_lookahead, | |
3340 Handle<ByteArray> boolean_skip_table) { | |
3341 const int kSize = RegExpMacroAssembler::kTableSize; | |
3342 char table[kSize]; | |
3343 | |
3344 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { | |
3345 table[i] = max_lookahead + 1; | |
3346 } | |
3347 | |
3348 for (int i = max_lookahead; i >= 0; i--) { | |
3349 int skip = max_lookahead - i; | |
ulan
2012/03/30 16:54:51
This is the crucial part. A comment explaining why
Erik Corry
2012/04/01 00:48:46
Done.
| |
3350 ZoneList<bool>* map = bitmaps_->at(i); | |
3351 if (map == NULL) { | |
3352 for (int j = 0; j < map_length_; j++) { | |
3353 if (table[j] > skip) table[j] = skip; | |
3354 } | |
3355 break; | |
3356 } | |
3357 for (int j = 0; j < map_length_; j++) { | |
3358 if (map->at(j)) { | |
3359 if (table[j] > skip) table[j] = skip; | |
3360 } | |
3361 } | |
3362 } | |
3363 | |
3364 char sorted_table[kSize]; | |
3365 memcpy(sorted_table, table, kSize); | |
3366 typedef int (*RawComparer)(const void*, const void*); | |
3367 qsort(sorted_table, kSize, 1, reinterpret_cast<RawComparer>(&SortChar)); | |
3368 | |
3369 int boolean_skip_distance = sorted_table[kSize / 2]; | |
ulan
2012/03/30 16:54:51
Instead of kSize / 2 we can take any i in [0..kSiz
Erik Corry
2012/04/01 00:48:46
Done.
| |
3370 for (int i = 0; i < kSize; i++) { | |
3371 boolean_skip_table->set(i, table[i] >= boolean_skip_distance ? 0 : 1); | |
ulan
2012/03/30 13:04:48
Can (x >= y ? 0 : 1) be replaced with (x < y) ?
Erik Corry
2012/04/01 00:48:46
We don't normally allow implicit bool->int and int
| |
3372 } | |
3373 return boolean_skip_distance; | |
3374 } | |
3375 | |
3376 | |
3377 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | |
3378 int min_lookahead = 0; | |
3379 int max_lookahead = 0; | |
3380 | |
3381 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | |
3382 | |
3383 bool found_single_character = false; | |
ulan
2012/03/30 16:54:51
Consider using a counter:
character_count = 0;
.
Erik Corry
2012/04/01 00:48:46
I want to bail out when I find the second characte
| |
3384 bool abandoned_search_for_single_character = false; | |
3385 int single_character = 0; | |
3386 for (int i = max_lookahead; i >= min_lookahead; i--) { | |
3387 ZoneList<bool>* map = bitmaps_->at(i); | |
3388 for (int j = 0; j < map_length_; j++) { | |
3389 if (map->at(j)) { | |
3390 if (found_single_character) { | |
3391 found_single_character = false; // Found two. | |
3392 abandoned_search_for_single_character = true; | |
3393 break; | |
3394 } else { | |
3395 found_single_character = true; | |
3396 single_character = j; | |
3397 } | |
3398 } | |
3399 } | |
3400 if (abandoned_search_for_single_character) break; | |
3401 } | |
3402 | |
3403 int lookahead_width = max_lookahead + 1 - min_lookahead; | |
3404 | |
3405 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | |
3406 // The mask-compare can probably handle this better. | |
3407 return false; | |
3408 } | |
3409 | |
3410 if (found_single_character) { | |
3411 Label cont, again; | |
3412 masm->Bind(&again); | |
3413 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
3414 if (max_char_ > String::kMaxAsciiCharCode || | |
3415 map_length_ < RegExpMacroAssembler::kTableSize) { | |
3416 masm->CheckCharacterAfterAnd(single_character, | |
3417 RegExpMacroAssembler::kTableMask, | |
3418 &cont); | |
3419 } else { | |
3420 masm->CheckCharacter(single_character, &cont); | |
ulan
2012/03/30 16:54:51
I think this implicitly assumes that
String::kMaxA
Erik Corry
2012/04/01 00:48:46
Fixed.
| |
3421 } | |
3422 masm->AdvanceCurrentPosition(lookahead_width); | |
3423 masm->GoTo(&again); | |
3424 masm->Bind(&cont); | |
3425 return true; | |
3426 } | |
3427 | |
3428 Handle<ByteArray> boolean_skip_table = | |
3429 FACTORY->NewByteArray(map_length_, TENURED); | |
3430 int boolean_skip_distance = GetSkipTable(max_lookahead, boolean_skip_table); | |
3431 if (boolean_skip_distance == 0) return false; | |
3432 | |
3433 Label cont, again; | |
3434 masm->Bind(&again); | |
3435 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
3436 masm->CheckBitInTable(boolean_skip_table, &cont); | |
3437 masm->AdvanceCurrentPosition(boolean_skip_distance); | |
3438 masm->GoTo(&again); | |
3439 masm->Bind(&cont); | |
3440 | |
3441 return true; | |
3442 } | |
3443 | |
3134 /* Code generation for choice nodes. | 3444 /* Code generation for choice nodes. |
3135 * | 3445 * |
3136 * We generate quick checks that do a mask and compare to eliminate a | 3446 * We generate quick checks that do a mask and compare to eliminate a |
3137 * choice. If the quick check succeeds then it jumps to the continuation to | 3447 * choice. If the quick check succeeds then it jumps to the continuation to |
3138 * do slow checks and check subsequent nodes. If it fails (the common case) | 3448 * do slow checks and check subsequent nodes. If it fails (the common case) |
3139 * it falls through to the next choice. | 3449 * it falls through to the next choice. |
3140 * | 3450 * |
3141 * Here is the desired flow graph. Nodes directly below each other imply | 3451 * Here is the desired flow graph. Nodes directly below each other imply |
3142 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative | 3452 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative |
3143 * 3 doesn't have a quick check so we have to call the slow check. | 3453 * 3 doesn't have a quick check so we have to call the slow check. |
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3267 greedy_match_trace.set_loop_label(&loop_label); | 3577 greedy_match_trace.set_loop_label(&loop_label); |
3268 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); | 3578 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); |
3269 macro_assembler->Bind(&greedy_match_failed); | 3579 macro_assembler->Bind(&greedy_match_failed); |
3270 } | 3580 } |
3271 | 3581 |
3272 Label second_choice; // For use in greedy matches. | 3582 Label second_choice; // For use in greedy matches. |
3273 macro_assembler->Bind(&second_choice); | 3583 macro_assembler->Bind(&second_choice); |
3274 | 3584 |
3275 int first_normal_choice = greedy_loop ? 1 : 0; | 3585 int first_normal_choice = greedy_loop ? 1 : 0; |
3276 | 3586 |
3277 int preload_characters = | 3587 bool not_at_start = current_trace->at_start() == Trace::FALSE; |
3278 CalculatePreloadCharacters(compiler, | 3588 const int kEatsAtLeastNotYetInitialized = -1; |
3279 current_trace->at_start() == Trace::FALSE); | 3589 int eats_at_least = kEatsAtLeastNotYetInitialized; |
3280 bool preload_is_current = | 3590 |
3591 bool skip_was_emitted = false; | |
3592 | |
3593 // More makes code generation slower, less makes V8 benchmark score lower. | |
3594 const int kMaxLookaheadForBoyerMoore = 8; | |
3595 | |
3596 if (!greedy_loop && choice_count == 2) { | |
3597 GuardedAlternative alt1 = alternatives_->at(1); | |
3598 if (alt1.guards() == NULL || | |
ulan
2012/03/30 16:54:51
This fits in one line.
Erik Corry
2012/04/01 00:48:46
Done.
Erik Corry
2012/04/01 00:48:46
Done.
| |
3599 alt1.guards()->length() == 0) { | |
3600 RegExpNode* eats_anything_node = alt1.node(); | |
3601 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | |
3602 this) { | |
3603 // At this point we know that we are at a non-greedy loop that will eat | |
3604 // any character one at a time. Any non-anchored regexp has such a | |
3605 // loop prepended to it in order to find where it starts. We look for | |
3606 // a pattern of the form ...abc... where we can look 6 characters ahead | |
ulan
2012/03/30 16:54:51
Can we put a similar comment in description of Emi
Erik Corry
2012/04/01 00:48:46
Done.
| |
3607 // and step forwards 3 if the character is not one of abc. Abc need | |
3608 // not be atoms, they can be any reasonably limited character class or | |
3609 // small alternation. | |
3610 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | |
3611 eats_at_least = | |
3612 Min(kMaxLookaheadForBoyerMoore, | |
3613 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | |
3614 if (eats_at_least >= 1) { | |
3615 BoyerMooreLookahead bm(eats_at_least, | |
3616 RegExpMacroAssembler::kTableSize, | |
3617 compiler); | |
3618 GuardedAlternative alt0 = alternatives_->at(0); | |
3619 alt0.node()->FillInBMInfo(0, &bm, not_at_start); | |
3620 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); | |
3621 } | |
3622 } | |
3623 } | |
3624 } | |
3625 | |
3626 if (eats_at_least == kEatsAtLeastNotYetInitialized) { | |
3627 // Save some time by looking at most one machine word ahead. | |
3628 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); | |
3629 } | |
3630 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); | |
3631 | |
3632 bool preload_is_current = !skip_was_emitted && | |
3281 (current_trace->characters_preloaded() == preload_characters); | 3633 (current_trace->characters_preloaded() == preload_characters); |
3282 bool preload_has_checked_bounds = preload_is_current; | 3634 bool preload_has_checked_bounds = preload_is_current; |
3283 | 3635 |
3284 AlternativeGenerationList alt_gens(choice_count); | 3636 AlternativeGenerationList alt_gens(choice_count); |
3285 | 3637 |
3286 // For now we just call all choices one after the other. The idea ultimately | 3638 // For now we just call all choices one after the other. The idea ultimately |
3287 // is to use the Dispatch table to try only the relevant ones. | 3639 // is to use the Dispatch table to try only the relevant ones. |
3288 for (int i = first_normal_choice; i < choice_count; i++) { | 3640 for (int i = first_normal_choice; i < choice_count; i++) { |
3289 GuardedAlternative alternative = alternatives_->at(i); | 3641 GuardedAlternative alternative = alternatives_->at(i); |
3290 AlternativeGeneration* alt_gen = alt_gens.at(i); | 3642 AlternativeGeneration* alt_gen = alt_gens.at(i); |
(...skipping 2134 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5425 // We don't know anything about the first character of a backreference | 5777 // We don't know anything about the first character of a backreference |
5426 // at this point. | 5778 // at this point. |
5427 // The potential first characters are the first characters of the capture, | 5779 // The potential first characters are the first characters of the capture, |
5428 // and the first characters of the on_success node, depending on whether the | 5780 // and the first characters of the on_success node, depending on whether the |
5429 // capture can be empty and whether it is known to be participating or known | 5781 // capture can be empty and whether it is known to be participating or known |
5430 // not to be. | 5782 // not to be. |
5431 return kComputeFirstCharacterSetFail; | 5783 return kComputeFirstCharacterSetFail; |
5432 } | 5784 } |
5433 | 5785 |
5434 | 5786 |
5787 void ChoiceNode::FillInBMInfo( | |
5788 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
5789 ZoneList<GuardedAlternative>* alts = alternatives(); | |
5790 for (int i = 0; i < alts->length(); i++) { | |
5791 GuardedAlternative& alt = alts->at(i); | |
5792 if (alt.guards() != NULL && | |
ulan
2012/03/30 13:04:48
This fits in one line.
Erik Corry
2012/04/01 00:48:46
Done.
| |
5793 alt.guards()->length() != 0) { | |
5794 bm->SetRest(offset); // Give up trying to fill in info. | |
5795 return; | |
5796 } | |
5797 alt.node()->FillInBMInfo(offset, bm, not_at_start); | |
5798 } | |
5799 } | |
5800 | |
5801 | |
5802 void TextNode::FillInBMInfo( | |
5803 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
5804 if (offset >= bm->length()) return; | |
5805 int max_char = bm->max_char(); | |
5806 for (int i = 0; i < elements()->length(); i++) { | |
5807 if (offset >= bm->length()) return; | |
5808 TextElement text = elements()->at(i); | |
5809 if (text.type == TextElement::ATOM) { | |
5810 RegExpAtom* atom = text.data.u_atom; | |
5811 for (int j = 0; j < atom->length(); j++, offset++) { | |
5812 if (offset >= bm->length()) return; | |
5813 uc16 character = atom->data()[j]; | |
5814 if (bm->compiler()->ignore_case()) { | |
5815 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
5816 int length = GetCaseIndependentLetters( | |
5817 ISOLATE, | |
5818 character, | |
5819 bm->max_char() == String::kMaxAsciiCharCode, | |
5820 chars); | |
5821 for (int j = 0; j < length; j++) { | |
5822 bm->Set(offset, chars[j]); | |
5823 } | |
5824 } else { | |
5825 if (character <= max_char) bm->Set(offset, character); | |
5826 } | |
5827 } | |
5828 } else { | |
5829 ASSERT(text.type == TextElement::CHAR_CLASS); | |
5830 RegExpCharacterClass* char_class = text.data.u_char_class; | |
5831 ZoneList<CharacterRange>* ranges = char_class->ranges(); | |
5832 if (char_class->is_negated()) { | |
5833 bm->SetAll(offset); | |
5834 } else { | |
5835 for (int k = 0; k < ranges->length(); k++) { | |
5836 CharacterRange& range = ranges->at(k); | |
5837 if (range.from() > max_char) continue; | |
5838 int to = Min(max_char, static_cast<int>(range.to())); | |
5839 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { | |
5840 bm->SetAll(offset); | |
5841 break; | |
5842 } | |
5843 for (int m = range.from(); m <= to; m++) { | |
ulan
2012/03/30 16:54:51
Why don't we handle ignore_case() here like in the
Erik Corry
2012/04/01 00:48:46
The character classes have had case-independence r
| |
5844 bm->Set(offset, m); | |
5845 } | |
5846 } | |
5847 } | |
5848 offset++; | |
5849 } | |
5850 } | |
5851 if (offset >= bm->length()) return; | |
5852 on_success()->FillInBMInfo(offset, | |
ulan
2012/03/30 13:04:48
This fits in line if you put the comment in anothe
Erik Corry
2012/04/01 00:48:46
But the comment only applies to the line it is on,
| |
5853 bm, | |
5854 true); // Not at start after a text node. | |
5855 } | |
5856 | |
5857 | |
5435 int TextNode::ComputeFirstCharacterSet(int budget) { | 5858 int TextNode::ComputeFirstCharacterSet(int budget) { |
5436 budget--; | 5859 budget--; |
5437 if (budget >= 0) { | 5860 if (budget >= 0) { |
5438 ASSERT_NE(0, elements()->length()); | 5861 ASSERT_NE(0, elements()->length()); |
5439 TextElement text = elements()->at(0); | 5862 TextElement text = elements()->at(0); |
5440 if (text.type == TextElement::ATOM) { | 5863 if (text.type == TextElement::ATOM) { |
5441 RegExpAtom* atom = text.data.u_atom; | 5864 RegExpAtom* atom = text.data.u_atom; |
5442 ASSERT_NE(0, atom->length()); | 5865 ASSERT_NE(0, atom->length()); |
5443 uc16 first_char = atom->data()[0]; | 5866 uc16 first_char = atom->data()[0]; |
5444 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); | 5867 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); |
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5582 } | 6005 } |
5583 } | 6006 } |
5584 | 6007 |
5585 | 6008 |
5586 void DispatchTableConstructor::VisitAction(ActionNode* that) { | 6009 void DispatchTableConstructor::VisitAction(ActionNode* that) { |
5587 RegExpNode* target = that->on_success(); | 6010 RegExpNode* target = that->on_success(); |
5588 target->Accept(this); | 6011 target->Accept(this); |
5589 } | 6012 } |
5590 | 6013 |
5591 | 6014 |
5592 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, | 6015 RegExpEngine::CompilationResult RegExpEngine::Compile( |
5593 bool ignore_case, | 6016 RegExpCompileData* data, |
5594 bool is_multiline, | 6017 bool ignore_case, |
5595 Handle<String> pattern, | 6018 bool is_multiline, |
5596 bool is_ascii) { | 6019 Handle<String> pattern, |
6020 Handle<String> sample_subject, | |
6021 bool is_ascii) { | |
5597 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { | 6022 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
5598 return IrregexpRegExpTooBig(); | 6023 return IrregexpRegExpTooBig(); |
5599 } | 6024 } |
5600 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); | 6025 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); |
6026 | |
6027 FlattenString(sample_subject); | |
6028 int chars_sampled = 0; | |
6029 int half_way = sample_subject->length() / 2; | |
ulan
2012/03/30 13:04:48
Why do we consider only the upper half of the samp
Erik Corry
2012/04/01 00:48:46
We consider a 100-character chunk in the middle of
| |
6030 for (int i = half_way; i < sample_subject->length(); i++) { | |
6031 if (chars_sampled++ > 100) break; | |
ulan
2012/03/30 13:04:48
A nit: consider moving chars_sampled++ either to t
Erik Corry
2012/04/01 00:48:46
Done.
| |
6032 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); | |
6033 } | |
6034 compiler.frequency_collator()->SortFrequencies(); | |
6035 | |
5601 // Wrap the body of the regexp in capture #0. | 6036 // Wrap the body of the regexp in capture #0. |
5602 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, | 6037 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, |
5603 0, | 6038 0, |
5604 &compiler, | 6039 &compiler, |
5605 compiler.accept()); | 6040 compiler.accept()); |
5606 RegExpNode* node = captured_body; | 6041 RegExpNode* node = captured_body; |
5607 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); | 6042 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); |
5608 bool is_start_anchored = data->tree->IsAnchoredAtStart(); | 6043 bool is_start_anchored = data->tree->IsAnchoredAtStart(); |
5609 int max_length = data->tree->max_match(); | 6044 int max_length = data->tree->max_match(); |
5610 if (!is_start_anchored) { | 6045 if (!is_start_anchored) { |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5673 } | 6108 } |
5674 | 6109 |
5675 return compiler.Assemble(¯o_assembler, | 6110 return compiler.Assemble(¯o_assembler, |
5676 node, | 6111 node, |
5677 data->capture_count, | 6112 data->capture_count, |
5678 pattern); | 6113 pattern); |
5679 } | 6114 } |
5680 | 6115 |
5681 | 6116 |
5682 }} // namespace v8::internal | 6117 }} // namespace v8::internal |
OLD | NEW |