OLD | NEW |
---|---|
1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
4 // met: | 4 // met: |
5 // | 5 // |
6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
(...skipping 262 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
273 | 273 |
274 | 274 |
275 // Irregexp implementation. | 275 // Irregexp implementation. |
276 | 276 |
277 // Ensures that the regexp object contains a compiled version of the | 277 // Ensures that the regexp object contains a compiled version of the |
278 // source for either ASCII or non-ASCII strings. | 278 // source for either ASCII or non-ASCII strings. |
279 // If the compiled version doesn't already exist, it is compiled | 279 // If the compiled version doesn't already exist, it is compiled |
280 // from the source pattern. | 280 // from the source pattern. |
281 // If compilation fails, an exception is thrown and this function | 281 // If compilation fails, an exception is thrown and this function |
282 // returns false. | 282 // returns false. |
283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 283 bool RegExpImpl::EnsureCompiledIrregexp( |
284 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { | |
284 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); | 285 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); |
285 #ifdef V8_INTERPRETED_REGEXP | 286 #ifdef V8_INTERPRETED_REGEXP |
286 if (compiled_code->IsByteArray()) return true; | 287 if (compiled_code->IsByteArray()) return true; |
287 #else // V8_INTERPRETED_REGEXP (RegExp native code) | 288 #else // V8_INTERPRETED_REGEXP (RegExp native code) |
288 if (compiled_code->IsCode()) return true; | 289 if (compiled_code->IsCode()) return true; |
289 #endif | 290 #endif |
290 // We could potentially have marked this as flushable, but have kept | 291 // We could potentially have marked this as flushable, but have kept |
291 // a saved version if we did not flush it yet. | 292 // a saved version if we did not flush it yet. |
292 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); | 293 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); |
293 if (saved_code->IsCode()) { | 294 if (saved_code->IsCode()) { |
294 // Reinstate the code in the original place. | 295 // Reinstate the code in the original place. |
295 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); | 296 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); |
296 ASSERT(compiled_code->IsSmi()); | 297 ASSERT(compiled_code->IsSmi()); |
297 return true; | 298 return true; |
298 } | 299 } |
299 return CompileIrregexp(re, is_ascii); | 300 return CompileIrregexp(re, sample_subject, is_ascii); |
300 } | 301 } |
301 | 302 |
302 | 303 |
303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, | 304 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, |
304 bool is_ascii, | 305 bool is_ascii, |
305 Handle<String> error_message, | 306 Handle<String> error_message, |
306 Isolate* isolate) { | 307 Isolate* isolate) { |
307 Factory* factory = isolate->factory(); | 308 Factory* factory = isolate->factory(); |
308 Handle<FixedArray> elements = factory->NewFixedArray(2); | 309 Handle<FixedArray> elements = factory->NewFixedArray(2); |
309 elements->set(0, re->Pattern()); | 310 elements->set(0, re->Pattern()); |
310 elements->set(1, *error_message); | 311 elements->set(1, *error_message); |
311 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 312 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
312 Handle<Object> regexp_err = | 313 Handle<Object> regexp_err = |
313 factory->NewSyntaxError("malformed_regexp", array); | 314 factory->NewSyntaxError("malformed_regexp", array); |
314 isolate->Throw(*regexp_err); | 315 isolate->Throw(*regexp_err); |
315 return false; | 316 return false; |
316 } | 317 } |
317 | 318 |
318 | 319 |
319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 320 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
321 Handle<String> sample_subject, | |
322 bool is_ascii) { | |
320 // Compile the RegExp. | 323 // Compile the RegExp. |
321 Isolate* isolate = re->GetIsolate(); | 324 Isolate* isolate = re->GetIsolate(); |
322 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); | 325 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); |
323 PostponeInterruptsScope postpone(isolate); | 326 PostponeInterruptsScope postpone(isolate); |
324 // If we had a compilation error the last time this is saved at the | 327 // If we had a compilation error the last time this is saved at the |
325 // saved code index. | 328 // saved code index. |
326 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); | 329 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); |
327 // When arriving here entry can only be a smi, either representing an | 330 // When arriving here entry can only be a smi, either representing an |
328 // uncompiled regexp, a previous compilation error, or code that has | 331 // uncompiled regexp, a previous compilation error, or code that has |
329 // been flushed. | 332 // been flushed. |
(...skipping 28 matching lines...) Expand all Loading... | |
358 pattern, | 361 pattern, |
359 compile_data.error, | 362 compile_data.error, |
360 "malformed_regexp"); | 363 "malformed_regexp"); |
361 return false; | 364 return false; |
362 } | 365 } |
363 RegExpEngine::CompilationResult result = | 366 RegExpEngine::CompilationResult result = |
364 RegExpEngine::Compile(&compile_data, | 367 RegExpEngine::Compile(&compile_data, |
365 flags.is_ignore_case(), | 368 flags.is_ignore_case(), |
366 flags.is_multiline(), | 369 flags.is_multiline(), |
367 pattern, | 370 pattern, |
371 sample_subject, | |
368 is_ascii); | 372 is_ascii); |
369 if (result.error_message != NULL) { | 373 if (result.error_message != NULL) { |
370 // Unable to compile regexp. | 374 // Unable to compile regexp. |
371 Handle<String> error_message = | 375 Handle<String> error_message = |
372 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); | 376 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); |
373 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); | 377 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); |
374 return false; | 378 return false; |
375 } | 379 } |
376 | 380 |
377 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); | 381 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); |
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
428 capture_count); | 432 capture_count); |
429 } | 433 } |
430 | 434 |
431 | 435 |
432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, | 436 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, |
433 Handle<String> subject) { | 437 Handle<String> subject) { |
434 if (!subject->IsFlat()) FlattenString(subject); | 438 if (!subject->IsFlat()) FlattenString(subject); |
435 | 439 |
436 // Check the asciiness of the underlying storage. | 440 // Check the asciiness of the underlying storage. |
437 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 441 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
438 if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; | 442 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; |
439 | 443 |
440 #ifdef V8_INTERPRETED_REGEXP | 444 #ifdef V8_INTERPRETED_REGEXP |
441 // Byte-code regexp needs space allocated for all its registers. | 445 // Byte-code regexp needs space allocated for all its registers. |
442 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); | 446 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); |
443 #else // V8_INTERPRETED_REGEXP | 447 #else // V8_INTERPRETED_REGEXP |
444 // Native regexp only needs room to output captures. Registers are handled | 448 // Native regexp only needs room to output captures. Registers are handled |
445 // internally. | 449 // internally. |
446 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; | 450 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; |
447 #endif // V8_INTERPRETED_REGEXP | 451 #endif // V8_INTERPRETED_REGEXP |
448 } | 452 } |
(...skipping 10 matching lines...) Expand all Loading... | |
459 | 463 |
460 ASSERT(index >= 0); | 464 ASSERT(index >= 0); |
461 ASSERT(index <= subject->length()); | 465 ASSERT(index <= subject->length()); |
462 ASSERT(subject->IsFlat()); | 466 ASSERT(subject->IsFlat()); |
463 | 467 |
464 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 468 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
465 | 469 |
466 #ifndef V8_INTERPRETED_REGEXP | 470 #ifndef V8_INTERPRETED_REGEXP |
467 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); | 471 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
468 do { | 472 do { |
469 EnsureCompiledIrregexp(regexp, is_ascii); | 473 EnsureCompiledIrregexp(regexp, subject, is_ascii); |
470 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); | 474 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); |
471 NativeRegExpMacroAssembler::Result res = | 475 NativeRegExpMacroAssembler::Result res = |
472 NativeRegExpMacroAssembler::Match(code, | 476 NativeRegExpMacroAssembler::Match(code, |
473 subject, | 477 subject, |
474 output.start(), | 478 output.start(), |
475 output.length(), | 479 output.length(), |
476 index, | 480 index, |
477 isolate); | 481 isolate); |
478 if (res != NativeRegExpMacroAssembler::RETRY) { | 482 if (res != NativeRegExpMacroAssembler::RETRY) { |
479 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || | 483 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || |
(...skipping 297 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { | 781 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { |
778 if (table_ == NULL) { | 782 if (table_ == NULL) { |
779 table_ = new DispatchTable(); | 783 table_ = new DispatchTable(); |
780 DispatchTableConstructor cons(table_, ignore_case); | 784 DispatchTableConstructor cons(table_, ignore_case); |
781 cons.BuildTable(this); | 785 cons.BuildTable(this); |
782 } | 786 } |
783 return table_; | 787 return table_; |
784 } | 788 } |
785 | 789 |
786 | 790 |
791 class FrequencyCollator { | |
792 public: | |
793 FrequencyCollator() : total_samples_(0) { | |
794 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { | |
795 frequencies_[i] = CharacterFrequency(i); | |
796 } | |
797 } | |
798 | |
799 void CountCharacter(int character) { | |
800 int index = (character & RegExpMacroAssembler::kTableMask); | |
801 frequencies_[index].Increment(); | |
802 total_samples_++; | |
803 } | |
804 | |
805 // Does not measure in percent, but rather per-128 (the table size from the | |
806 // regexp macro assembler). | |
807 int Frequency(int in_character) { | |
808 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character); | |
809 if (total_samples_ < 1) return 1; // Division by zero. | |
810 int freq_in_per128 = | |
811 (frequencies_[in_character].counter() * 128) / total_samples_; | |
812 return freq_in_per128; | |
813 } | |
814 | |
815 private: | |
816 class CharacterFrequency { | |
817 public: | |
818 CharacterFrequency() : counter_(0), character_(-1) { } | |
819 explicit CharacterFrequency(int character) | |
820 : counter_(0), character_(character) { } | |
821 | |
822 void Increment() { counter_++; } | |
823 int counter() { return counter_; } | |
824 int character() { return character_; } | |
825 | |
826 private: | |
827 int counter_; | |
828 int character_; | |
829 }; | |
830 | |
831 | |
832 private: | |
833 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; | |
834 int total_samples_; | |
835 }; | |
836 | |
837 | |
787 class RegExpCompiler { | 838 class RegExpCompiler { |
788 public: | 839 public: |
789 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); | 840 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); |
790 | 841 |
791 int AllocateRegister() { | 842 int AllocateRegister() { |
792 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { | 843 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { |
793 reg_exp_too_big_ = true; | 844 reg_exp_too_big_ = true; |
794 return next_register_; | 845 return next_register_; |
795 } | 846 } |
796 return next_register_++; | 847 return next_register_++; |
(...skipping 15 matching lines...) Expand all Loading... | |
812 | 863 |
813 static const int kMaxRecursion = 100; | 864 static const int kMaxRecursion = 100; |
814 inline int recursion_depth() { return recursion_depth_; } | 865 inline int recursion_depth() { return recursion_depth_; } |
815 inline void IncrementRecursionDepth() { recursion_depth_++; } | 866 inline void IncrementRecursionDepth() { recursion_depth_++; } |
816 inline void DecrementRecursionDepth() { recursion_depth_--; } | 867 inline void DecrementRecursionDepth() { recursion_depth_--; } |
817 | 868 |
818 void SetRegExpTooBig() { reg_exp_too_big_ = true; } | 869 void SetRegExpTooBig() { reg_exp_too_big_ = true; } |
819 | 870 |
820 inline bool ignore_case() { return ignore_case_; } | 871 inline bool ignore_case() { return ignore_case_; } |
821 inline bool ascii() { return ascii_; } | 872 inline bool ascii() { return ascii_; } |
873 FrequencyCollator* frequency_collator() { return &frequency_collator_; } | |
822 | 874 |
823 int current_expansion_factor() { return current_expansion_factor_; } | 875 int current_expansion_factor() { return current_expansion_factor_; } |
824 void set_current_expansion_factor(int value) { | 876 void set_current_expansion_factor(int value) { |
825 current_expansion_factor_ = value; | 877 current_expansion_factor_ = value; |
826 } | 878 } |
827 | 879 |
828 static const int kNoRegister = -1; | 880 static const int kNoRegister = -1; |
829 | 881 |
830 private: | 882 private: |
831 EndNode* accept_; | 883 EndNode* accept_; |
832 int next_register_; | 884 int next_register_; |
833 List<RegExpNode*>* work_list_; | 885 List<RegExpNode*>* work_list_; |
834 int recursion_depth_; | 886 int recursion_depth_; |
835 RegExpMacroAssembler* macro_assembler_; | 887 RegExpMacroAssembler* macro_assembler_; |
836 bool ignore_case_; | 888 bool ignore_case_; |
837 bool ascii_; | 889 bool ascii_; |
838 bool reg_exp_too_big_; | 890 bool reg_exp_too_big_; |
839 int current_expansion_factor_; | 891 int current_expansion_factor_; |
892 FrequencyCollator frequency_collator_; | |
840 }; | 893 }; |
841 | 894 |
842 | 895 |
843 class RecursionCheck { | 896 class RecursionCheck { |
844 public: | 897 public: |
845 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { | 898 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { |
846 compiler->IncrementRecursionDepth(); | 899 compiler->IncrementRecursionDepth(); |
847 } | 900 } |
848 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } | 901 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } |
849 private: | 902 private: |
850 RegExpCompiler* compiler_; | 903 RegExpCompiler* compiler_; |
851 }; | 904 }; |
852 | 905 |
853 | 906 |
854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { | 907 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { |
855 return RegExpEngine::CompilationResult("RegExp too big"); | 908 return RegExpEngine::CompilationResult("RegExp too big"); |
856 } | 909 } |
857 | 910 |
858 | 911 |
859 // Attempts to compile the regexp using an Irregexp code generator. Returns | 912 // Attempts to compile the regexp using an Irregexp code generator. Returns |
860 // a fixed array or a null handle depending on whether it succeeded. | 913 // a fixed array or a null handle depending on whether it succeeded. |
861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) | 914 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) |
862 : next_register_(2 * (capture_count + 1)), | 915 : next_register_(2 * (capture_count + 1)), |
863 work_list_(NULL), | 916 work_list_(NULL), |
864 recursion_depth_(0), | 917 recursion_depth_(0), |
865 ignore_case_(ignore_case), | 918 ignore_case_(ignore_case), |
866 ascii_(ascii), | 919 ascii_(ascii), |
867 reg_exp_too_big_(false), | 920 reg_exp_too_big_(false), |
868 current_expansion_factor_(1) { | 921 current_expansion_factor_(1), |
922 frequency_collator_() { | |
869 accept_ = new EndNode(EndNode::ACCEPT); | 923 accept_ = new EndNode(EndNode::ACCEPT); |
870 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); | 924 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); |
871 } | 925 } |
872 | 926 |
873 | 927 |
874 RegExpEngine::CompilationResult RegExpCompiler::Assemble( | 928 RegExpEngine::CompilationResult RegExpCompiler::Assemble( |
875 RegExpMacroAssembler* macro_assembler, | 929 RegExpMacroAssembler* macro_assembler, |
876 RegExpNode* start, | 930 RegExpNode* start, |
877 int capture_count, | 931 int capture_count, |
878 Handle<String> pattern) { | 932 Handle<String> pattern) { |
(...skipping 1170 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2049 int recursion_depth, | 2103 int recursion_depth, |
2050 bool not_at_start) { | 2104 bool not_at_start) { |
2051 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2105 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2052 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! | 2106 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! |
2053 return on_success()->EatsAtLeast(still_to_find, | 2107 return on_success()->EatsAtLeast(still_to_find, |
2054 recursion_depth + 1, | 2108 recursion_depth + 1, |
2055 not_at_start); | 2109 not_at_start); |
2056 } | 2110 } |
2057 | 2111 |
2058 | 2112 |
2113 void ActionNode::FillInBMInfo(int offset, | |
2114 BoyerMooreLookahead* bm, | |
2115 bool not_at_start) { | |
2116 if (type_ == BEGIN_SUBMATCH) { | |
2117 bm->SetRest(offset); | |
2118 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { | |
2119 on_success()->FillInBMInfo(offset, bm, not_at_start); | |
2120 } | |
2121 } | |
2122 | |
2123 | |
2059 int AssertionNode::EatsAtLeast(int still_to_find, | 2124 int AssertionNode::EatsAtLeast(int still_to_find, |
2060 int recursion_depth, | 2125 int recursion_depth, |
2061 bool not_at_start) { | 2126 bool not_at_start) { |
2062 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2127 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2063 // If we know we are not at the start and we are asked "how many characters | 2128 // If we know we are not at the start and we are asked "how many characters |
2064 // will you match if you succeed?" then we can answer anything since false | 2129 // will you match if you succeed?" then we can answer anything since false |
2065 // implies false. So lets just return the max answer (still_to_find) since | 2130 // implies false. So lets just return the max answer (still_to_find) since |
2066 // that won't prevent us from preloading a lot of characters for the other | 2131 // that won't prevent us from preloading a lot of characters for the other |
2067 // branches in the node graph. | 2132 // branches in the node graph. |
2068 if (type() == AT_START && not_at_start) return still_to_find; | 2133 if (type() == AT_START && not_at_start) return still_to_find; |
2069 return on_success()->EatsAtLeast(still_to_find, | 2134 return on_success()->EatsAtLeast(still_to_find, |
2070 recursion_depth + 1, | 2135 recursion_depth + 1, |
2071 not_at_start); | 2136 not_at_start); |
2072 } | 2137 } |
2073 | 2138 |
2074 | 2139 |
2140 void AssertionNode::FillInBMInfo( | |
2141 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
2142 // Match the behaviour of EatsAtLeast on this node. | |
2143 if (type() == AT_START && not_at_start) return; | |
2144 on_success()->FillInBMInfo(offset, bm, not_at_start); | |
2145 } | |
2146 | |
2147 | |
2075 int BackReferenceNode::EatsAtLeast(int still_to_find, | 2148 int BackReferenceNode::EatsAtLeast(int still_to_find, |
2076 int recursion_depth, | 2149 int recursion_depth, |
2077 bool not_at_start) { | 2150 bool not_at_start) { |
2078 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2151 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
2079 return on_success()->EatsAtLeast(still_to_find, | 2152 return on_success()->EatsAtLeast(still_to_find, |
2080 recursion_depth + 1, | 2153 recursion_depth + 1, |
2081 not_at_start); | 2154 not_at_start); |
2082 } | 2155 } |
2083 | 2156 |
2084 | 2157 |
(...skipping 412 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2497 bool not_at_start) { | 2570 bool not_at_start) { |
2498 if (body_can_be_zero_length_ || info()->visited) return; | 2571 if (body_can_be_zero_length_ || info()->visited) return; |
2499 VisitMarker marker(info()); | 2572 VisitMarker marker(info()); |
2500 return ChoiceNode::GetQuickCheckDetails(details, | 2573 return ChoiceNode::GetQuickCheckDetails(details, |
2501 compiler, | 2574 compiler, |
2502 characters_filled_in, | 2575 characters_filled_in, |
2503 not_at_start); | 2576 not_at_start); |
2504 } | 2577 } |
2505 | 2578 |
2506 | 2579 |
2580 void LoopChoiceNode::FillInBMInfo( | |
2581 int offset, BoyerMooreLookahead* bm, bool nas) { | |
2582 if (body_can_be_zero_length_) { | |
2583 bm->SetRest(offset); | |
2584 return; | |
2585 } | |
2586 ChoiceNode::FillInBMInfo(offset, bm, nas); | |
2587 } | |
2588 | |
2589 | |
2507 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2590 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
2508 RegExpCompiler* compiler, | 2591 RegExpCompiler* compiler, |
2509 int characters_filled_in, | 2592 int characters_filled_in, |
2510 bool not_at_start) { | 2593 bool not_at_start) { |
2511 not_at_start = (not_at_start || not_at_start_); | 2594 not_at_start = (not_at_start || not_at_start_); |
2512 int choice_count = alternatives_->length(); | 2595 int choice_count = alternatives_->length(); |
2513 ASSERT(choice_count > 0); | 2596 ASSERT(choice_count > 0); |
2514 alternatives_->at(0).node()->GetQuickCheckDetails(details, | 2597 alternatives_->at(0).node()->GetQuickCheckDetails(details, |
2515 compiler, | 2598 compiler, |
2516 characters_filled_in, | 2599 characters_filled_in, |
(...skipping 476 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
2993 int TextNode::GreedyLoopTextLength() { | 3076 int TextNode::GreedyLoopTextLength() { |
2994 TextElement elm = elms_->at(elms_->length() - 1); | 3077 TextElement elm = elms_->at(elms_->length() - 1); |
2995 if (elm.type == TextElement::CHAR_CLASS) { | 3078 if (elm.type == TextElement::CHAR_CLASS) { |
2996 return elm.cp_offset + 1; | 3079 return elm.cp_offset + 1; |
2997 } else { | 3080 } else { |
2998 return elm.cp_offset + elm.data.u_atom->data().length(); | 3081 return elm.cp_offset + elm.data.u_atom->data().length(); |
2999 } | 3082 } |
3000 } | 3083 } |
3001 | 3084 |
3002 | 3085 |
3086 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( | |
3087 RegExpCompiler* compiler) { | |
3088 if (elms_->length() != 1) return NULL; | |
3089 TextElement elm = elms_->at(0); | |
3090 if (elm.type != TextElement::CHAR_CLASS) return NULL; | |
3091 RegExpCharacterClass* node = elm.data.u_char_class; | |
3092 ZoneList<CharacterRange>* ranges = node->ranges(); | |
3093 if (!CharacterRange::IsCanonical(ranges)) { | |
3094 CharacterRange::Canonicalize(ranges); | |
3095 } | |
3096 if (node->is_negated()) { | |
3097 return ranges->length() == 0 ? on_success() : NULL; | |
3098 } | |
3099 if (ranges->length() != 1) return NULL; | |
3100 uint32_t max_char; | |
3101 if (compiler->ascii()) { | |
3102 max_char = String::kMaxAsciiCharCode; | |
3103 } else { | |
3104 max_char = String::kMaxUtf16CodeUnit; | |
3105 } | |
3106 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; | |
3107 } | |
3108 | |
3109 | |
3003 // Finds the fixed match length of a sequence of nodes that goes from | 3110 // Finds the fixed match length of a sequence of nodes that goes from |
3004 // this alternative and back to this choice node. If there are variable | 3111 // this alternative and back to this choice node. If there are variable |
3005 // length nodes or other complications in the way then return a sentinel | 3112 // length nodes or other complications in the way then return a sentinel |
3006 // value indicating that a greedy loop cannot be constructed. | 3113 // value indicating that a greedy loop cannot be constructed. |
3007 int ChoiceNode::GreedyLoopTextLengthForAlternative( | 3114 int ChoiceNode::GreedyLoopTextLengthForAlternative( |
3008 GuardedAlternative* alternative) { | 3115 GuardedAlternative* alternative) { |
3009 int length = 0; | 3116 int length = 0; |
3010 RegExpNode* node = alternative->node(); | 3117 RegExpNode* node = alternative->node(); |
3011 // Later we will generate code for all these text nodes using recursion | 3118 // Later we will generate code for all these text nodes using recursion |
3012 // so we have to limit the max number. | 3119 // so we have to limit the max number. |
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3057 ASSERT(trace->stop_node() == NULL); | 3164 ASSERT(trace->stop_node() == NULL); |
3058 if (!trace->is_trivial()) { | 3165 if (!trace->is_trivial()) { |
3059 trace->Flush(compiler, this); | 3166 trace->Flush(compiler, this); |
3060 return; | 3167 return; |
3061 } | 3168 } |
3062 ChoiceNode::Emit(compiler, trace); | 3169 ChoiceNode::Emit(compiler, trace); |
3063 } | 3170 } |
3064 | 3171 |
3065 | 3172 |
3066 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, | 3173 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
3067 bool not_at_start) { | 3174 int eats_at_least) { |
3068 int preload_characters = EatsAtLeast(4, 0, not_at_start); | 3175 int preload_characters = Min(4, eats_at_least); |
3069 if (compiler->macro_assembler()->CanReadUnaligned()) { | 3176 if (compiler->macro_assembler()->CanReadUnaligned()) { |
3070 bool ascii = compiler->ascii(); | 3177 bool ascii = compiler->ascii(); |
3071 if (ascii) { | 3178 if (ascii) { |
3072 if (preload_characters > 4) preload_characters = 4; | 3179 if (preload_characters > 4) preload_characters = 4; |
3073 // We can't preload 3 characters because there is no machine instruction | 3180 // We can't preload 3 characters because there is no machine instruction |
3074 // to do that. We can't just load 4 because we could be reading | 3181 // to do that. We can't just load 4 because we could be reading |
3075 // beyond the end of the string, which could cause a memory fault. | 3182 // beyond the end of the string, which could cause a memory fault. |
3076 if (preload_characters == 3) preload_characters = 2; | 3183 if (preload_characters == 3) preload_characters = 2; |
3077 } else { | 3184 } else { |
3078 if (preload_characters > 2) preload_characters = 2; | 3185 if (preload_characters > 2) preload_characters = 2; |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3124 return alt_gens_[i]; | 3231 return alt_gens_[i]; |
3125 } | 3232 } |
3126 | 3233 |
3127 private: | 3234 private: |
3128 static const int kAFew = 10; | 3235 static const int kAFew = 10; |
3129 ZoneList<AlternativeGeneration*> alt_gens_; | 3236 ZoneList<AlternativeGeneration*> alt_gens_; |
3130 AlternativeGeneration a_few_alt_gens_[kAFew]; | 3237 AlternativeGeneration a_few_alt_gens_[kAFew]; |
3131 }; | 3238 }; |
3132 | 3239 |
3133 | 3240 |
3241 BoyerMooreLookahead::BoyerMooreLookahead( | |
3242 int length, int map_length, RegExpCompiler* compiler) | |
3243 : length_(length), | |
3244 map_length_(map_length), | |
3245 compiler_(compiler) { | |
3246 ASSERT(IsPowerOf2(map_length)); | |
3247 if (compiler->ascii()) { | |
3248 max_char_ = String::kMaxAsciiCharCode; | |
3249 } else { | |
3250 max_char_ = String::kMaxUtf16CodeUnit; | |
3251 } | |
3252 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); | |
3253 for (int i = 0; i < length; i++) { | |
3254 bitmaps_->Add(new ZoneList<bool>(map_length)); | |
3255 ZoneList<bool>* map = bitmaps_->at(i); | |
3256 for (int i = 0; i < map_length; i++) { | |
3257 map->Add(false); | |
3258 } | |
3259 } | |
3260 } | |
3261 | |
3262 | |
3263 // Find the longest range of lookahead that has the fewest number of different | |
3264 // characters that can occur at a given position. Since we are optimizing two | |
3265 // different parameters at once this is a tradeoff. | |
3266 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { | |
3267 int biggest_points = 0; | |
3268 for (int max_number_of_chars = 4; | |
3269 max_number_of_chars < kTooManyCharacters; | |
3270 max_number_of_chars *= 2) { | |
3271 biggest_points = | |
3272 FindBestInterval(max_number_of_chars, biggest_points, from, to); | |
3273 } | |
3274 if (biggest_points == 0) return false; | |
3275 return true; | |
3276 } | |
3277 | |
3278 | |
3279 // Find the highest-points range between 0 and length_ where the character | |
3280 // information is not too vague. 'Too vague' means that there are more than | |
3281 // max_number_of_chars that can occur at this position. Calculates the number | |
3282 // of points as the product of width-of-the-range and | |
3283 // probability-of-finding-one-of-the-characters, where the probability is | |
3284 // calculated using the frequency distribution of the sample subject string. | |
3285 int BoyerMooreLookahead::FindBestInterval( | |
3286 int max_number_of_chars, int old_biggest_points, int* from, int* to) { | |
3287 int biggest_points = old_biggest_points; | |
3288 static const int kSize = RegExpMacroAssembler::kTableSize; | |
3289 for (int i = 0; i < length_; ) { | |
3290 while (i < length_ && Count(i) > max_number_of_chars) i++; | |
3291 if (i == length_) break; | |
3292 int remembered_from = i; | |
3293 bool union_map[kSize]; | |
3294 for (int j = 0; j < kSize; j++) union_map[j] = false; | |
3295 while (i < length_ && Count(i) <= max_number_of_chars) { | |
3296 ZoneList<bool>* map = bitmaps_->at(i); | |
3297 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); | |
3298 i++; | |
3299 } | |
3300 int frequency = 0; | |
3301 for (int j = 0; j < kSize; j++) { | |
3302 if (union_map[j]) { | |
3303 // Add 1 to the frequency to give a small per-character boost for | |
3304 // the cases where our sampling is not good enough and many | |
3305 // characters have a frequency of zero. | |
3306 frequency += compiler_->frequency_collator()->Frequency(j) + 1; | |
3307 } | |
3308 } | |
3309 // We use the probability of skipping times the distance we are skipping to | |
3310 // judge the effectiveness of this. Actually we have a cut-off: By | |
3311 // dividing by 2 we switch off the skipping if the probability of skipping | |
3312 // is less than 50%. This is because the multibyte mask-and-compare | |
3313 // skipping in quickcheck is more likely to do well on this case. | |
3314 bool in_quickcheck_range = ((i - remembered_from < 4) || | |
3315 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); | |
3316 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; | |
ulan
2012/04/02 08:12:24
This shouldn't affect the correctness but:
if I r
Erik Corry
2012/04/02 09:36:22
This code is as intended. I added a few comments.
| |
3317 int points = (i - remembered_from) * probability; | |
3318 if (points > biggest_points) { | |
3319 *from = remembered_from; | |
3320 *to = i - 1; | |
3321 biggest_points = points; | |
3322 } | |
3323 } | |
3324 return biggest_points; | |
3325 } | |
3326 | |
3327 | |
3328 // Take all the characters that will not prevent a successful match if they | |
3329 // occur occur in the subject string in the range between min_lookahead and | |
ulan
2012/04/02 08:12:24
'occur' twice.
Erik Corry
2012/04/02 09:36:22
Done.
| |
3330 // max_lookahead (inclusive) measured from the current position. If the | |
3331 // character at max_lookahead offset is not one of these characters, then we | |
3332 // can safely skip forwards by the number of characters in the range. | |
3333 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, | |
3334 int max_lookahead, | |
3335 Handle<ByteArray> boolean_skip_table) { | |
3336 const int kSize = RegExpMacroAssembler::kTableSize; | |
3337 | |
3338 const int kSkipArrayEntry = 0; | |
3339 const int kDontSkipArrayEntry = 1; | |
3340 | |
3341 for (int i = 0; i < kSize; i++) { | |
3342 boolean_skip_table->set(i, kSkipArrayEntry); | |
3343 } | |
3344 int skip = max_lookahead + 1 - min_lookahead; | |
3345 | |
3346 for (int i = max_lookahead; i >= min_lookahead; i--) { | |
3347 ZoneList<bool>* map = bitmaps_->at(i); | |
3348 for (int j = 0; j < map_length_; j++) { | |
3349 if (map->at(j)) { | |
3350 boolean_skip_table->set(j, kDontSkipArrayEntry); | |
3351 } | |
3352 } | |
3353 } | |
3354 | |
3355 return skip; | |
3356 } | |
3357 | |
3358 | |
3359 // See comment above on the implementation of GetSkipTable. | |
3360 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | |
3361 int min_lookahead = 0; | |
3362 int max_lookahead = 0; | |
3363 | |
3364 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | |
3365 | |
3366 bool found_single_character = false; | |
3367 bool abandoned_search_for_single_character = false; | |
3368 int single_character = 0; | |
3369 for (int i = max_lookahead; i >= min_lookahead; i--) { | |
3370 ZoneList<bool>* map = bitmaps_->at(i); | |
3371 for (int j = 0; j < map_length_; j++) { | |
3372 if (map->at(j)) { | |
3373 if (found_single_character) { | |
3374 found_single_character = false; // Found two. | |
3375 abandoned_search_for_single_character = true; | |
3376 break; | |
3377 } else { | |
3378 found_single_character = true; | |
3379 single_character = j; | |
3380 } | |
3381 } | |
3382 } | |
3383 if (abandoned_search_for_single_character) break; | |
3384 } | |
3385 | |
3386 int lookahead_width = max_lookahead + 1 - min_lookahead; | |
3387 | |
3388 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | |
3389 // The mask-compare can probably handle this better. | |
3390 return false; | |
3391 } | |
3392 | |
3393 if (found_single_character) { | |
3394 Label cont, again; | |
3395 masm->Bind(&again); | |
3396 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
3397 if (max_char_ > map_length_) { | |
3398 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize); | |
3399 masm->CheckCharacterAfterAnd(single_character, | |
3400 RegExpMacroAssembler::kTableMask, | |
3401 &cont); | |
3402 } else { | |
3403 masm->CheckCharacter(single_character, &cont); | |
3404 } | |
3405 masm->AdvanceCurrentPosition(lookahead_width); | |
3406 masm->GoTo(&again); | |
3407 masm->Bind(&cont); | |
3408 return true; | |
3409 } | |
3410 | |
3411 Handle<ByteArray> boolean_skip_table = | |
3412 FACTORY->NewByteArray(map_length_, TENURED); | |
3413 int skip_distance = GetSkipTable( | |
3414 min_lookahead, max_lookahead, boolean_skip_table); | |
3415 ASSERT(skip_distance != 0); | |
3416 | |
3417 Label cont, again; | |
3418 masm->Bind(&again); | |
3419 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
3420 masm->CheckBitInTable(boolean_skip_table, &cont); | |
3421 masm->AdvanceCurrentPosition(skip_distance); | |
3422 masm->GoTo(&again); | |
3423 masm->Bind(&cont); | |
3424 | |
3425 return true; | |
3426 } | |
3427 | |
3134 /* Code generation for choice nodes. | 3428 /* Code generation for choice nodes. |
3135 * | 3429 * |
3136 * We generate quick checks that do a mask and compare to eliminate a | 3430 * We generate quick checks that do a mask and compare to eliminate a |
3137 * choice. If the quick check succeeds then it jumps to the continuation to | 3431 * choice. If the quick check succeeds then it jumps to the continuation to |
3138 * do slow checks and check subsequent nodes. If it fails (the common case) | 3432 * do slow checks and check subsequent nodes. If it fails (the common case) |
3139 * it falls through to the next choice. | 3433 * it falls through to the next choice. |
3140 * | 3434 * |
3141 * Here is the desired flow graph. Nodes directly below each other imply | 3435 * Here is the desired flow graph. Nodes directly below each other imply |
3142 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative | 3436 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative |
3143 * 3 doesn't have a quick check so we have to call the slow check. | 3437 * 3 doesn't have a quick check so we have to call the slow check. |
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
3267 greedy_match_trace.set_loop_label(&loop_label); | 3561 greedy_match_trace.set_loop_label(&loop_label); |
3268 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); | 3562 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); |
3269 macro_assembler->Bind(&greedy_match_failed); | 3563 macro_assembler->Bind(&greedy_match_failed); |
3270 } | 3564 } |
3271 | 3565 |
3272 Label second_choice; // For use in greedy matches. | 3566 Label second_choice; // For use in greedy matches. |
3273 macro_assembler->Bind(&second_choice); | 3567 macro_assembler->Bind(&second_choice); |
3274 | 3568 |
3275 int first_normal_choice = greedy_loop ? 1 : 0; | 3569 int first_normal_choice = greedy_loop ? 1 : 0; |
3276 | 3570 |
3277 int preload_characters = | 3571 bool not_at_start = current_trace->at_start() == Trace::FALSE; |
3278 CalculatePreloadCharacters(compiler, | 3572 const int kEatsAtLeastNotYetInitialized = -1; |
3279 current_trace->at_start() == Trace::FALSE); | 3573 int eats_at_least = kEatsAtLeastNotYetInitialized; |
3280 bool preload_is_current = | 3574 |
3575 bool skip_was_emitted = false; | |
3576 | |
3577 // More makes code generation slower, less makes V8 benchmark score lower. | |
3578 const int kMaxLookaheadForBoyerMoore = 8; | |
3579 | |
3580 if (!greedy_loop && choice_count == 2) { | |
3581 GuardedAlternative alt1 = alternatives_->at(1); | |
3582 if (alt1.guards() == NULL || alt1.guards()->length() == 0) { | |
3583 RegExpNode* eats_anything_node = alt1.node(); | |
3584 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | |
3585 this) { | |
3586 // At this point we know that we are at a non-greedy loop that will eat | |
3587 // any character one at a time. Any non-anchored regexp has such a | |
3588 // loop prepended to it in order to find where it starts. We look for | |
3589 // a pattern of the form ...abc... where we can look 6 characters ahead | |
3590 // and step forwards 3 if the character is not one of abc. Abc need | |
3591 // not be atoms, they can be any reasonably limited character class or | |
3592 // small alternation. | |
3593 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | |
3594 eats_at_least = | |
3595 Min(kMaxLookaheadForBoyerMoore, | |
3596 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | |
3597 if (eats_at_least >= 1) { | |
3598 BoyerMooreLookahead bm(eats_at_least, | |
3599 RegExpMacroAssembler::kTableSize, | |
3600 compiler); | |
3601 GuardedAlternative alt0 = alternatives_->at(0); | |
3602 alt0.node()->FillInBMInfo(0, &bm, not_at_start); | |
3603 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); | |
3604 } | |
3605 } | |
3606 } | |
3607 } | |
3608 | |
3609 if (eats_at_least == kEatsAtLeastNotYetInitialized) { | |
3610 // Save some time by looking at most one machine word ahead. | |
3611 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); | |
3612 } | |
3613 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); | |
3614 | |
3615 bool preload_is_current = !skip_was_emitted && | |
3281 (current_trace->characters_preloaded() == preload_characters); | 3616 (current_trace->characters_preloaded() == preload_characters); |
3282 bool preload_has_checked_bounds = preload_is_current; | 3617 bool preload_has_checked_bounds = preload_is_current; |
3283 | 3618 |
3284 AlternativeGenerationList alt_gens(choice_count); | 3619 AlternativeGenerationList alt_gens(choice_count); |
3285 | 3620 |
3286 // For now we just call all choices one after the other. The idea ultimately | 3621 // For now we just call all choices one after the other. The idea ultimately |
3287 // is to use the Dispatch table to try only the relevant ones. | 3622 // is to use the Dispatch table to try only the relevant ones. |
3288 for (int i = first_normal_choice; i < choice_count; i++) { | 3623 for (int i = first_normal_choice; i < choice_count; i++) { |
3289 GuardedAlternative alternative = alternatives_->at(i); | 3624 GuardedAlternative alternative = alternatives_->at(i); |
3290 AlternativeGeneration* alt_gen = alt_gens.at(i); | 3625 AlternativeGeneration* alt_gen = alt_gens.at(i); |
(...skipping 2134 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5425 // We don't know anything about the first character of a backreference | 5760 // We don't know anything about the first character of a backreference |
5426 // at this point. | 5761 // at this point. |
5427 // The potential first characters are the first characters of the capture, | 5762 // The potential first characters are the first characters of the capture, |
5428 // and the first characters of the on_success node, depending on whether the | 5763 // and the first characters of the on_success node, depending on whether the |
5429 // capture can be empty and whether it is known to be participating or known | 5764 // capture can be empty and whether it is known to be participating or known |
5430 // not to be. | 5765 // not to be. |
5431 return kComputeFirstCharacterSetFail; | 5766 return kComputeFirstCharacterSetFail; |
5432 } | 5767 } |
5433 | 5768 |
5434 | 5769 |
5770 void ChoiceNode::FillInBMInfo( | |
5771 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
5772 ZoneList<GuardedAlternative>* alts = alternatives(); | |
5773 for (int i = 0; i < alts->length(); i++) { | |
5774 GuardedAlternative& alt = alts->at(i); | |
5775 if (alt.guards() != NULL && alt.guards()->length() != 0) { | |
5776 bm->SetRest(offset); // Give up trying to fill in info. | |
5777 return; | |
5778 } | |
5779 alt.node()->FillInBMInfo(offset, bm, not_at_start); | |
5780 } | |
5781 } | |
5782 | |
5783 | |
5784 void TextNode::FillInBMInfo( | |
5785 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
5786 if (offset >= bm->length()) return; | |
5787 int max_char = bm->max_char(); | |
5788 for (int i = 0; i < elements()->length(); i++) { | |
5789 if (offset >= bm->length()) return; | |
5790 TextElement text = elements()->at(i); | |
5791 if (text.type == TextElement::ATOM) { | |
5792 RegExpAtom* atom = text.data.u_atom; | |
5793 for (int j = 0; j < atom->length(); j++, offset++) { | |
5794 if (offset >= bm->length()) return; | |
5795 uc16 character = atom->data()[j]; | |
5796 if (bm->compiler()->ignore_case()) { | |
5797 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
5798 int length = GetCaseIndependentLetters( | |
5799 ISOLATE, | |
5800 character, | |
5801 bm->max_char() == String::kMaxAsciiCharCode, | |
5802 chars); | |
5803 for (int j = 0; j < length; j++) { | |
5804 bm->Set(offset, chars[j]); | |
5805 } | |
5806 } else { | |
5807 if (character <= max_char) bm->Set(offset, character); | |
5808 } | |
5809 } | |
5810 } else { | |
5811 ASSERT(text.type == TextElement::CHAR_CLASS); | |
5812 RegExpCharacterClass* char_class = text.data.u_char_class; | |
5813 ZoneList<CharacterRange>* ranges = char_class->ranges(); | |
5814 if (char_class->is_negated()) { | |
5815 bm->SetAll(offset); | |
5816 } else { | |
5817 for (int k = 0; k < ranges->length(); k++) { | |
5818 CharacterRange& range = ranges->at(k); | |
5819 if (range.from() > max_char) continue; | |
5820 int to = Min(max_char, static_cast<int>(range.to())); | |
5821 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { | |
5822 bm->SetAll(offset); | |
5823 break; | |
5824 } | |
5825 for (int m = range.from(); m <= to; m++) { | |
5826 bm->Set(offset, m); | |
5827 } | |
5828 } | |
5829 } | |
5830 offset++; | |
5831 } | |
5832 } | |
5833 if (offset >= bm->length()) return; | |
5834 on_success()->FillInBMInfo(offset, | |
5835 bm, | |
5836 true); // Not at start after a text node. | |
5837 } | |
5838 | |
5839 | |
5435 int TextNode::ComputeFirstCharacterSet(int budget) { | 5840 int TextNode::ComputeFirstCharacterSet(int budget) { |
5436 budget--; | 5841 budget--; |
5437 if (budget >= 0) { | 5842 if (budget >= 0) { |
5438 ASSERT_NE(0, elements()->length()); | 5843 ASSERT_NE(0, elements()->length()); |
5439 TextElement text = elements()->at(0); | 5844 TextElement text = elements()->at(0); |
5440 if (text.type == TextElement::ATOM) { | 5845 if (text.type == TextElement::ATOM) { |
5441 RegExpAtom* atom = text.data.u_atom; | 5846 RegExpAtom* atom = text.data.u_atom; |
5442 ASSERT_NE(0, atom->length()); | 5847 ASSERT_NE(0, atom->length()); |
5443 uc16 first_char = atom->data()[0]; | 5848 uc16 first_char = atom->data()[0]; |
5444 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); | 5849 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); |
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5582 } | 5987 } |
5583 } | 5988 } |
5584 | 5989 |
5585 | 5990 |
5586 void DispatchTableConstructor::VisitAction(ActionNode* that) { | 5991 void DispatchTableConstructor::VisitAction(ActionNode* that) { |
5587 RegExpNode* target = that->on_success(); | 5992 RegExpNode* target = that->on_success(); |
5588 target->Accept(this); | 5993 target->Accept(this); |
5589 } | 5994 } |
5590 | 5995 |
5591 | 5996 |
5592 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, | 5997 RegExpEngine::CompilationResult RegExpEngine::Compile( |
5593 bool ignore_case, | 5998 RegExpCompileData* data, |
5594 bool is_multiline, | 5999 bool ignore_case, |
5595 Handle<String> pattern, | 6000 bool is_multiline, |
5596 bool is_ascii) { | 6001 Handle<String> pattern, |
6002 Handle<String> sample_subject, | |
6003 bool is_ascii) { | |
5597 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { | 6004 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
5598 return IrregexpRegExpTooBig(); | 6005 return IrregexpRegExpTooBig(); |
5599 } | 6006 } |
5600 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); | 6007 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); |
6008 | |
6009 // Sample some characters from the middle of the string. | |
6010 static const int kSampleSize = 128; | |
6011 | |
6012 FlattenString(sample_subject); | |
6013 int chars_sampled = 0; | |
6014 int half_way = (sample_subject->length() - kSampleSize) / 2; | |
6015 for (int i = Max(0, half_way); | |
6016 i < sample_subject->length(); | |
6017 i++, chars_sampled++) { | |
6018 if (chars_sampled > kSampleSize) break; | |
ulan
2012/04/02 08:12:24
Now we can move this into the for loop condition.
Erik Corry
2012/04/02 09:36:22
Done.
| |
6019 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); | |
6020 } | |
6021 | |
5601 // Wrap the body of the regexp in capture #0. | 6022 // Wrap the body of the regexp in capture #0. |
5602 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, | 6023 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, |
5603 0, | 6024 0, |
5604 &compiler, | 6025 &compiler, |
5605 compiler.accept()); | 6026 compiler.accept()); |
5606 RegExpNode* node = captured_body; | 6027 RegExpNode* node = captured_body; |
5607 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); | 6028 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); |
5608 bool is_start_anchored = data->tree->IsAnchoredAtStart(); | 6029 bool is_start_anchored = data->tree->IsAnchoredAtStart(); |
5609 int max_length = data->tree->max_match(); | 6030 int max_length = data->tree->max_match(); |
5610 if (!is_start_anchored) { | 6031 if (!is_start_anchored) { |
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
5673 } | 6094 } |
5674 | 6095 |
5675 return compiler.Assemble(¯o_assembler, | 6096 return compiler.Assemble(¯o_assembler, |
5676 node, | 6097 node, |
5677 data->capture_count, | 6098 data->capture_count, |
5678 pattern); | 6099 pattern); |
5679 } | 6100 } |
5680 | 6101 |
5681 | 6102 |
5682 }} // namespace v8::internal | 6103 }} // namespace v8::internal |
OLD | NEW |