Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(395)

Side by Side Diff: src/jsregexp.cc

Issue 9965010: Regexp: Improve the speed that we scan for an initial point where a non-anchored (Closed) Base URL: http://v8.googlecode.com/svn/branches/bleeding_edge/
Patch Set: '' Created 8 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/jsregexp.h ('k') | src/x64/regexp-macro-assembler-x64.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2011 the V8 project authors. All rights reserved. 1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 262 matching lines...) Expand 10 before | Expand all | Expand 10 after
273 273
274 274
275 // Irregexp implementation. 275 // Irregexp implementation.
276 276
277 // Ensures that the regexp object contains a compiled version of the 277 // Ensures that the regexp object contains a compiled version of the
278 // source for either ASCII or non-ASCII strings. 278 // source for either ASCII or non-ASCII strings.
279 // If the compiled version doesn't already exist, it is compiled 279 // If the compiled version doesn't already exist, it is compiled
280 // from the source pattern. 280 // from the source pattern.
281 // If compilation fails, an exception is thrown and this function 281 // If compilation fails, an exception is thrown and this function
282 // returns false. 282 // returns false.
283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { 283 bool RegExpImpl::EnsureCompiledIrregexp(
284 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
284 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); 285 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
285 #ifdef V8_INTERPRETED_REGEXP 286 #ifdef V8_INTERPRETED_REGEXP
286 if (compiled_code->IsByteArray()) return true; 287 if (compiled_code->IsByteArray()) return true;
287 #else // V8_INTERPRETED_REGEXP (RegExp native code) 288 #else // V8_INTERPRETED_REGEXP (RegExp native code)
288 if (compiled_code->IsCode()) return true; 289 if (compiled_code->IsCode()) return true;
289 #endif 290 #endif
290 // We could potentially have marked this as flushable, but have kept 291 // We could potentially have marked this as flushable, but have kept
291 // a saved version if we did not flush it yet. 292 // a saved version if we did not flush it yet.
292 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); 293 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
293 if (saved_code->IsCode()) { 294 if (saved_code->IsCode()) {
294 // Reinstate the code in the original place. 295 // Reinstate the code in the original place.
295 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); 296 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
296 ASSERT(compiled_code->IsSmi()); 297 ASSERT(compiled_code->IsSmi());
297 return true; 298 return true;
298 } 299 }
299 return CompileIrregexp(re, is_ascii); 300 return CompileIrregexp(re, sample_subject, is_ascii);
300 } 301 }
301 302
302 303
303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, 304 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
304 bool is_ascii, 305 bool is_ascii,
305 Handle<String> error_message, 306 Handle<String> error_message,
306 Isolate* isolate) { 307 Isolate* isolate) {
307 Factory* factory = isolate->factory(); 308 Factory* factory = isolate->factory();
308 Handle<FixedArray> elements = factory->NewFixedArray(2); 309 Handle<FixedArray> elements = factory->NewFixedArray(2);
309 elements->set(0, re->Pattern()); 310 elements->set(0, re->Pattern());
310 elements->set(1, *error_message); 311 elements->set(1, *error_message);
311 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); 312 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
312 Handle<Object> regexp_err = 313 Handle<Object> regexp_err =
313 factory->NewSyntaxError("malformed_regexp", array); 314 factory->NewSyntaxError("malformed_regexp", array);
314 isolate->Throw(*regexp_err); 315 isolate->Throw(*regexp_err);
315 return false; 316 return false;
316 } 317 }
317 318
318 319
319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { 320 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
321 Handle<String> sample_subject,
322 bool is_ascii) {
320 // Compile the RegExp. 323 // Compile the RegExp.
321 Isolate* isolate = re->GetIsolate(); 324 Isolate* isolate = re->GetIsolate();
322 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); 325 ZoneScope zone_scope(isolate, DELETE_ON_EXIT);
323 PostponeInterruptsScope postpone(isolate); 326 PostponeInterruptsScope postpone(isolate);
324 // If we had a compilation error the last time this is saved at the 327 // If we had a compilation error the last time this is saved at the
325 // saved code index. 328 // saved code index.
326 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); 329 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
327 // When arriving here entry can only be a smi, either representing an 330 // When arriving here entry can only be a smi, either representing an
328 // uncompiled regexp, a previous compilation error, or code that has 331 // uncompiled regexp, a previous compilation error, or code that has
329 // been flushed. 332 // been flushed.
(...skipping 28 matching lines...) Expand all
358 pattern, 361 pattern,
359 compile_data.error, 362 compile_data.error,
360 "malformed_regexp"); 363 "malformed_regexp");
361 return false; 364 return false;
362 } 365 }
363 RegExpEngine::CompilationResult result = 366 RegExpEngine::CompilationResult result =
364 RegExpEngine::Compile(&compile_data, 367 RegExpEngine::Compile(&compile_data,
365 flags.is_ignore_case(), 368 flags.is_ignore_case(),
366 flags.is_multiline(), 369 flags.is_multiline(),
367 pattern, 370 pattern,
371 sample_subject,
368 is_ascii); 372 is_ascii);
369 if (result.error_message != NULL) { 373 if (result.error_message != NULL) {
370 // Unable to compile regexp. 374 // Unable to compile regexp.
371 Handle<String> error_message = 375 Handle<String> error_message =
372 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); 376 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message));
373 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); 377 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
374 return false; 378 return false;
375 } 379 }
376 380
377 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); 381 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
(...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after
428 capture_count); 432 capture_count);
429 } 433 }
430 434
431 435
432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, 436 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
433 Handle<String> subject) { 437 Handle<String> subject) {
434 if (!subject->IsFlat()) FlattenString(subject); 438 if (!subject->IsFlat()) FlattenString(subject);
435 439
436 // Check the asciiness of the underlying storage. 440 // Check the asciiness of the underlying storage.
437 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); 441 bool is_ascii = subject->IsAsciiRepresentationUnderneath();
438 if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; 442 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
439 443
440 #ifdef V8_INTERPRETED_REGEXP 444 #ifdef V8_INTERPRETED_REGEXP
441 // Byte-code regexp needs space allocated for all its registers. 445 // Byte-code regexp needs space allocated for all its registers.
442 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); 446 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data()));
443 #else // V8_INTERPRETED_REGEXP 447 #else // V8_INTERPRETED_REGEXP
444 // Native regexp only needs room to output captures. Registers are handled 448 // Native regexp only needs room to output captures. Registers are handled
445 // internally. 449 // internally.
446 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; 450 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
447 #endif // V8_INTERPRETED_REGEXP 451 #endif // V8_INTERPRETED_REGEXP
448 } 452 }
(...skipping 10 matching lines...) Expand all
459 463
460 ASSERT(index >= 0); 464 ASSERT(index >= 0);
461 ASSERT(index <= subject->length()); 465 ASSERT(index <= subject->length());
462 ASSERT(subject->IsFlat()); 466 ASSERT(subject->IsFlat());
463 467
464 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); 468 bool is_ascii = subject->IsAsciiRepresentationUnderneath();
465 469
466 #ifndef V8_INTERPRETED_REGEXP 470 #ifndef V8_INTERPRETED_REGEXP
467 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); 471 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
468 do { 472 do {
469 EnsureCompiledIrregexp(regexp, is_ascii); 473 EnsureCompiledIrregexp(regexp, subject, is_ascii);
470 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); 474 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
471 NativeRegExpMacroAssembler::Result res = 475 NativeRegExpMacroAssembler::Result res =
472 NativeRegExpMacroAssembler::Match(code, 476 NativeRegExpMacroAssembler::Match(code,
473 subject, 477 subject,
474 output.start(), 478 output.start(),
475 output.length(), 479 output.length(),
476 index, 480 index,
477 isolate); 481 isolate);
478 if (res != NativeRegExpMacroAssembler::RETRY) { 482 if (res != NativeRegExpMacroAssembler::RETRY) {
479 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || 483 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
(...skipping 297 matching lines...) Expand 10 before | Expand all | Expand 10 after
777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { 781 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
778 if (table_ == NULL) { 782 if (table_ == NULL) {
779 table_ = new DispatchTable(); 783 table_ = new DispatchTable();
780 DispatchTableConstructor cons(table_, ignore_case); 784 DispatchTableConstructor cons(table_, ignore_case);
781 cons.BuildTable(this); 785 cons.BuildTable(this);
782 } 786 }
783 return table_; 787 return table_;
784 } 788 }
785 789
786 790
791 class FrequencyCollator {
792 public:
793 FrequencyCollator() : total_samples_(0) {
794 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
795 frequencies_[i] = CharacterFrequency(i);
796 }
797 }
798
799 void CountCharacter(int character) {
800 int index = (character & RegExpMacroAssembler::kTableMask);
801 frequencies_[index].Increment();
802 total_samples_++;
803 }
804
805 // Does not measure in percent, but rather per-128 (the table size from the
806 // regexp macro assembler).
807 int Frequency(int in_character) {
808 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
809 if (total_samples_ < 1) return 1; // Division by zero.
810 int freq_in_per128 =
811 (frequencies_[in_character].counter() * 128) / total_samples_;
812 return freq_in_per128;
813 }
814
815 private:
816 class CharacterFrequency {
817 public:
818 CharacterFrequency() : counter_(0), character_(-1) { }
819 explicit CharacterFrequency(int character)
820 : counter_(0), character_(character) { }
821
822 void Increment() { counter_++; }
823 int counter() { return counter_; }
824 int character() { return character_; }
825
826 private:
827 int counter_;
828 int character_;
829 };
830
831
832 private:
833 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
834 int total_samples_;
835 };
836
837
787 class RegExpCompiler { 838 class RegExpCompiler {
788 public: 839 public:
789 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); 840 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
790 841
791 int AllocateRegister() { 842 int AllocateRegister() {
792 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { 843 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
793 reg_exp_too_big_ = true; 844 reg_exp_too_big_ = true;
794 return next_register_; 845 return next_register_;
795 } 846 }
796 return next_register_++; 847 return next_register_++;
(...skipping 15 matching lines...) Expand all
812 863
813 static const int kMaxRecursion = 100; 864 static const int kMaxRecursion = 100;
814 inline int recursion_depth() { return recursion_depth_; } 865 inline int recursion_depth() { return recursion_depth_; }
815 inline void IncrementRecursionDepth() { recursion_depth_++; } 866 inline void IncrementRecursionDepth() { recursion_depth_++; }
816 inline void DecrementRecursionDepth() { recursion_depth_--; } 867 inline void DecrementRecursionDepth() { recursion_depth_--; }
817 868
818 void SetRegExpTooBig() { reg_exp_too_big_ = true; } 869 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
819 870
820 inline bool ignore_case() { return ignore_case_; } 871 inline bool ignore_case() { return ignore_case_; }
821 inline bool ascii() { return ascii_; } 872 inline bool ascii() { return ascii_; }
873 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
822 874
823 int current_expansion_factor() { return current_expansion_factor_; } 875 int current_expansion_factor() { return current_expansion_factor_; }
824 void set_current_expansion_factor(int value) { 876 void set_current_expansion_factor(int value) {
825 current_expansion_factor_ = value; 877 current_expansion_factor_ = value;
826 } 878 }
827 879
828 static const int kNoRegister = -1; 880 static const int kNoRegister = -1;
829 881
830 private: 882 private:
831 EndNode* accept_; 883 EndNode* accept_;
832 int next_register_; 884 int next_register_;
833 List<RegExpNode*>* work_list_; 885 List<RegExpNode*>* work_list_;
834 int recursion_depth_; 886 int recursion_depth_;
835 RegExpMacroAssembler* macro_assembler_; 887 RegExpMacroAssembler* macro_assembler_;
836 bool ignore_case_; 888 bool ignore_case_;
837 bool ascii_; 889 bool ascii_;
838 bool reg_exp_too_big_; 890 bool reg_exp_too_big_;
839 int current_expansion_factor_; 891 int current_expansion_factor_;
892 FrequencyCollator frequency_collator_;
840 }; 893 };
841 894
842 895
843 class RecursionCheck { 896 class RecursionCheck {
844 public: 897 public:
845 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { 898 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
846 compiler->IncrementRecursionDepth(); 899 compiler->IncrementRecursionDepth();
847 } 900 }
848 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } 901 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
849 private: 902 private:
850 RegExpCompiler* compiler_; 903 RegExpCompiler* compiler_;
851 }; 904 };
852 905
853 906
854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { 907 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
855 return RegExpEngine::CompilationResult("RegExp too big"); 908 return RegExpEngine::CompilationResult("RegExp too big");
856 } 909 }
857 910
858 911
859 // Attempts to compile the regexp using an Irregexp code generator. Returns 912 // Attempts to compile the regexp using an Irregexp code generator. Returns
860 // a fixed array or a null handle depending on whether it succeeded. 913 // a fixed array or a null handle depending on whether it succeeded.
861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) 914 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
862 : next_register_(2 * (capture_count + 1)), 915 : next_register_(2 * (capture_count + 1)),
863 work_list_(NULL), 916 work_list_(NULL),
864 recursion_depth_(0), 917 recursion_depth_(0),
865 ignore_case_(ignore_case), 918 ignore_case_(ignore_case),
866 ascii_(ascii), 919 ascii_(ascii),
867 reg_exp_too_big_(false), 920 reg_exp_too_big_(false),
868 current_expansion_factor_(1) { 921 current_expansion_factor_(1),
922 frequency_collator_() {
869 accept_ = new EndNode(EndNode::ACCEPT); 923 accept_ = new EndNode(EndNode::ACCEPT);
870 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); 924 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
871 } 925 }
872 926
873 927
874 RegExpEngine::CompilationResult RegExpCompiler::Assemble( 928 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
875 RegExpMacroAssembler* macro_assembler, 929 RegExpMacroAssembler* macro_assembler,
876 RegExpNode* start, 930 RegExpNode* start,
877 int capture_count, 931 int capture_count,
878 Handle<String> pattern) { 932 Handle<String> pattern) {
(...skipping 1170 matching lines...) Expand 10 before | Expand all | Expand 10 after
2049 int recursion_depth, 2103 int recursion_depth,
2050 bool not_at_start) { 2104 bool not_at_start) {
2051 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 2105 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2052 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! 2106 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2053 return on_success()->EatsAtLeast(still_to_find, 2107 return on_success()->EatsAtLeast(still_to_find,
2054 recursion_depth + 1, 2108 recursion_depth + 1,
2055 not_at_start); 2109 not_at_start);
2056 } 2110 }
2057 2111
2058 2112
2113 void ActionNode::FillInBMInfo(int offset,
2114 BoyerMooreLookahead* bm,
2115 bool not_at_start) {
2116 if (type_ == BEGIN_SUBMATCH) {
2117 bm->SetRest(offset);
2118 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) {
2119 on_success()->FillInBMInfo(offset, bm, not_at_start);
2120 }
2121 }
2122
2123
2059 int AssertionNode::EatsAtLeast(int still_to_find, 2124 int AssertionNode::EatsAtLeast(int still_to_find,
2060 int recursion_depth, 2125 int recursion_depth,
2061 bool not_at_start) { 2126 bool not_at_start) {
2062 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 2127 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2063 // If we know we are not at the start and we are asked "how many characters 2128 // If we know we are not at the start and we are asked "how many characters
2064 // will you match if you succeed?" then we can answer anything since false 2129 // will you match if you succeed?" then we can answer anything since false
2065 // implies false. So lets just return the max answer (still_to_find) since 2130 // implies false. So lets just return the max answer (still_to_find) since
2066 // that won't prevent us from preloading a lot of characters for the other 2131 // that won't prevent us from preloading a lot of characters for the other
2067 // branches in the node graph. 2132 // branches in the node graph.
2068 if (type() == AT_START && not_at_start) return still_to_find; 2133 if (type() == AT_START && not_at_start) return still_to_find;
2069 return on_success()->EatsAtLeast(still_to_find, 2134 return on_success()->EatsAtLeast(still_to_find,
2070 recursion_depth + 1, 2135 recursion_depth + 1,
2071 not_at_start); 2136 not_at_start);
2072 } 2137 }
2073 2138
2074 2139
2140 void AssertionNode::FillInBMInfo(
2141 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
2142 // Match the behaviour of EatsAtLeast on this node.
2143 if (type() == AT_START && not_at_start) return;
2144 on_success()->FillInBMInfo(offset, bm, not_at_start);
2145 }
2146
2147
2075 int BackReferenceNode::EatsAtLeast(int still_to_find, 2148 int BackReferenceNode::EatsAtLeast(int still_to_find,
2076 int recursion_depth, 2149 int recursion_depth,
2077 bool not_at_start) { 2150 bool not_at_start) {
2078 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; 2151 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
2079 return on_success()->EatsAtLeast(still_to_find, 2152 return on_success()->EatsAtLeast(still_to_find,
2080 recursion_depth + 1, 2153 recursion_depth + 1,
2081 not_at_start); 2154 not_at_start);
2082 } 2155 }
2083 2156
2084 2157
(...skipping 412 matching lines...) Expand 10 before | Expand all | Expand 10 after
2497 bool not_at_start) { 2570 bool not_at_start) {
2498 if (body_can_be_zero_length_ || info()->visited) return; 2571 if (body_can_be_zero_length_ || info()->visited) return;
2499 VisitMarker marker(info()); 2572 VisitMarker marker(info());
2500 return ChoiceNode::GetQuickCheckDetails(details, 2573 return ChoiceNode::GetQuickCheckDetails(details,
2501 compiler, 2574 compiler,
2502 characters_filled_in, 2575 characters_filled_in,
2503 not_at_start); 2576 not_at_start);
2504 } 2577 }
2505 2578
2506 2579
2580 void LoopChoiceNode::FillInBMInfo(
2581 int offset, BoyerMooreLookahead* bm, bool nas) {
2582 if (body_can_be_zero_length_) {
2583 bm->SetRest(offset);
2584 return;
2585 }
2586 ChoiceNode::FillInBMInfo(offset, bm, nas);
2587 }
2588
2589
2507 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, 2590 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2508 RegExpCompiler* compiler, 2591 RegExpCompiler* compiler,
2509 int characters_filled_in, 2592 int characters_filled_in,
2510 bool not_at_start) { 2593 bool not_at_start) {
2511 not_at_start = (not_at_start || not_at_start_); 2594 not_at_start = (not_at_start || not_at_start_);
2512 int choice_count = alternatives_->length(); 2595 int choice_count = alternatives_->length();
2513 ASSERT(choice_count > 0); 2596 ASSERT(choice_count > 0);
2514 alternatives_->at(0).node()->GetQuickCheckDetails(details, 2597 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2515 compiler, 2598 compiler,
2516 characters_filled_in, 2599 characters_filled_in,
(...skipping 476 matching lines...) Expand 10 before | Expand all | Expand 10 after
2993 int TextNode::GreedyLoopTextLength() { 3076 int TextNode::GreedyLoopTextLength() {
2994 TextElement elm = elms_->at(elms_->length() - 1); 3077 TextElement elm = elms_->at(elms_->length() - 1);
2995 if (elm.type == TextElement::CHAR_CLASS) { 3078 if (elm.type == TextElement::CHAR_CLASS) {
2996 return elm.cp_offset + 1; 3079 return elm.cp_offset + 1;
2997 } else { 3080 } else {
2998 return elm.cp_offset + elm.data.u_atom->data().length(); 3081 return elm.cp_offset + elm.data.u_atom->data().length();
2999 } 3082 }
3000 } 3083 }
3001 3084
3002 3085
3086 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3087 RegExpCompiler* compiler) {
3088 if (elms_->length() != 1) return NULL;
3089 TextElement elm = elms_->at(0);
3090 if (elm.type != TextElement::CHAR_CLASS) return NULL;
3091 RegExpCharacterClass* node = elm.data.u_char_class;
3092 ZoneList<CharacterRange>* ranges = node->ranges();
3093 if (!CharacterRange::IsCanonical(ranges)) {
3094 CharacterRange::Canonicalize(ranges);
3095 }
3096 if (node->is_negated()) {
3097 return ranges->length() == 0 ? on_success() : NULL;
3098 }
3099 if (ranges->length() != 1) return NULL;
3100 uint32_t max_char;
3101 if (compiler->ascii()) {
3102 max_char = String::kMaxAsciiCharCode;
3103 } else {
3104 max_char = String::kMaxUtf16CodeUnit;
3105 }
3106 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3107 }
3108
3109
3003 // Finds the fixed match length of a sequence of nodes that goes from 3110 // Finds the fixed match length of a sequence of nodes that goes from
3004 // this alternative and back to this choice node. If there are variable 3111 // this alternative and back to this choice node. If there are variable
3005 // length nodes or other complications in the way then return a sentinel 3112 // length nodes or other complications in the way then return a sentinel
3006 // value indicating that a greedy loop cannot be constructed. 3113 // value indicating that a greedy loop cannot be constructed.
3007 int ChoiceNode::GreedyLoopTextLengthForAlternative( 3114 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3008 GuardedAlternative* alternative) { 3115 GuardedAlternative* alternative) {
3009 int length = 0; 3116 int length = 0;
3010 RegExpNode* node = alternative->node(); 3117 RegExpNode* node = alternative->node();
3011 // Later we will generate code for all these text nodes using recursion 3118 // Later we will generate code for all these text nodes using recursion
3012 // so we have to limit the max number. 3119 // so we have to limit the max number.
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
3057 ASSERT(trace->stop_node() == NULL); 3164 ASSERT(trace->stop_node() == NULL);
3058 if (!trace->is_trivial()) { 3165 if (!trace->is_trivial()) {
3059 trace->Flush(compiler, this); 3166 trace->Flush(compiler, this);
3060 return; 3167 return;
3061 } 3168 }
3062 ChoiceNode::Emit(compiler, trace); 3169 ChoiceNode::Emit(compiler, trace);
3063 } 3170 }
3064 3171
3065 3172
3066 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, 3173 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3067 bool not_at_start) { 3174 int eats_at_least) {
3068 int preload_characters = EatsAtLeast(4, 0, not_at_start); 3175 int preload_characters = Min(4, eats_at_least);
3069 if (compiler->macro_assembler()->CanReadUnaligned()) { 3176 if (compiler->macro_assembler()->CanReadUnaligned()) {
3070 bool ascii = compiler->ascii(); 3177 bool ascii = compiler->ascii();
3071 if (ascii) { 3178 if (ascii) {
3072 if (preload_characters > 4) preload_characters = 4; 3179 if (preload_characters > 4) preload_characters = 4;
3073 // We can't preload 3 characters because there is no machine instruction 3180 // We can't preload 3 characters because there is no machine instruction
3074 // to do that. We can't just load 4 because we could be reading 3181 // to do that. We can't just load 4 because we could be reading
3075 // beyond the end of the string, which could cause a memory fault. 3182 // beyond the end of the string, which could cause a memory fault.
3076 if (preload_characters == 3) preload_characters = 2; 3183 if (preload_characters == 3) preload_characters = 2;
3077 } else { 3184 } else {
3078 if (preload_characters > 2) preload_characters = 2; 3185 if (preload_characters > 2) preload_characters = 2;
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
3124 return alt_gens_[i]; 3231 return alt_gens_[i];
3125 } 3232 }
3126 3233
3127 private: 3234 private:
3128 static const int kAFew = 10; 3235 static const int kAFew = 10;
3129 ZoneList<AlternativeGeneration*> alt_gens_; 3236 ZoneList<AlternativeGeneration*> alt_gens_;
3130 AlternativeGeneration a_few_alt_gens_[kAFew]; 3237 AlternativeGeneration a_few_alt_gens_[kAFew];
3131 }; 3238 };
3132 3239
3133 3240
3241 BoyerMooreLookahead::BoyerMooreLookahead(
3242 int length, int map_length, RegExpCompiler* compiler)
3243 : length_(length),
3244 map_length_(map_length),
3245 compiler_(compiler) {
3246 ASSERT(IsPowerOf2(map_length));
3247 if (compiler->ascii()) {
3248 max_char_ = String::kMaxAsciiCharCode;
3249 } else {
3250 max_char_ = String::kMaxUtf16CodeUnit;
3251 }
3252 bitmaps_ = new ZoneList<ZoneList<bool>*>(length);
3253 for (int i = 0; i < length; i++) {
3254 bitmaps_->Add(new ZoneList<bool>(map_length));
3255 ZoneList<bool>* map = bitmaps_->at(i);
3256 for (int i = 0; i < map_length; i++) {
3257 map->Add(false);
3258 }
3259 }
3260 }
3261
3262
3263 // Find the longest range of lookahead that has the fewest number of different
3264 // characters that can occur at a given position. Since we are optimizing two
3265 // different parameters at once this is a tradeoff.
3266 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3267 int biggest_points = 0;
3268 for (int max_number_of_chars = 4;
3269 max_number_of_chars < kTooManyCharacters;
3270 max_number_of_chars *= 2) {
3271 biggest_points =
3272 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3273 }
3274 if (biggest_points == 0) return false;
3275 return true;
3276 }
3277
3278
3279 // Find the highest-points range between 0 and length_ where the character
3280 // information is not too vague. 'Too vague' means that there are more than
3281 // max_number_of_chars that can occur at this position. Calculates the number
3282 // of points as the product of width-of-the-range and
3283 // probability-of-finding-one-of-the-characters, where the probability is
3284 // calculated using the frequency distribution of the sample subject string.
3285 int BoyerMooreLookahead::FindBestInterval(
3286 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3287 int biggest_points = old_biggest_points;
3288 static const int kSize = RegExpMacroAssembler::kTableSize;
3289 for (int i = 0; i < length_; ) {
3290 while (i < length_ && Count(i) > max_number_of_chars) i++;
3291 if (i == length_) break;
3292 int remembered_from = i;
3293 bool union_map[kSize];
3294 for (int j = 0; j < kSize; j++) union_map[j] = false;
3295 while (i < length_ && Count(i) <= max_number_of_chars) {
3296 ZoneList<bool>* map = bitmaps_->at(i);
3297 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3298 i++;
3299 }
3300 int frequency = 0;
3301 for (int j = 0; j < kSize; j++) {
3302 if (union_map[j]) {
3303 // Add 1 to the frequency to give a small per-character boost for
3304 // the cases where our sampling is not good enough and many
3305 // characters have a frequency of zero.
3306 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3307 }
3308 }
3309 // We use the probability of skipping times the distance we are skipping to
3310 // judge the effectiveness of this. Actually we have a cut-off: By
3311 // dividing by 2 we switch off the skipping if the probability of skipping
3312 // is less than 50%. This is because the multibyte mask-and-compare
3313 // skipping in quickcheck is more likely to do well on this case.
3314 bool in_quickcheck_range = ((i - remembered_from < 4) ||
3315 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
3316 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
ulan 2012/04/02 08:12:24 This shouldn't affect the correctness but: if I r
Erik Corry 2012/04/02 09:36:22 This code is as intended. I added a few comments.
3317 int points = (i - remembered_from) * probability;
3318 if (points > biggest_points) {
3319 *from = remembered_from;
3320 *to = i - 1;
3321 biggest_points = points;
3322 }
3323 }
3324 return biggest_points;
3325 }
3326
3327
3328 // Take all the characters that will not prevent a successful match if they
3329 // occur occur in the subject string in the range between min_lookahead and
ulan 2012/04/02 08:12:24 'occur' twice.
Erik Corry 2012/04/02 09:36:22 Done.
3330 // max_lookahead (inclusive) measured from the current position. If the
3331 // character at max_lookahead offset is not one of these characters, then we
3332 // can safely skip forwards by the number of characters in the range.
3333 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3334 int max_lookahead,
3335 Handle<ByteArray> boolean_skip_table) {
3336 const int kSize = RegExpMacroAssembler::kTableSize;
3337
3338 const int kSkipArrayEntry = 0;
3339 const int kDontSkipArrayEntry = 1;
3340
3341 for (int i = 0; i < kSize; i++) {
3342 boolean_skip_table->set(i, kSkipArrayEntry);
3343 }
3344 int skip = max_lookahead + 1 - min_lookahead;
3345
3346 for (int i = max_lookahead; i >= min_lookahead; i--) {
3347 ZoneList<bool>* map = bitmaps_->at(i);
3348 for (int j = 0; j < map_length_; j++) {
3349 if (map->at(j)) {
3350 boolean_skip_table->set(j, kDontSkipArrayEntry);
3351 }
3352 }
3353 }
3354
3355 return skip;
3356 }
3357
3358
3359 // See comment above on the implementation of GetSkipTable.
3360 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3361 int min_lookahead = 0;
3362 int max_lookahead = 0;
3363
3364 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
3365
3366 bool found_single_character = false;
3367 bool abandoned_search_for_single_character = false;
3368 int single_character = 0;
3369 for (int i = max_lookahead; i >= min_lookahead; i--) {
3370 ZoneList<bool>* map = bitmaps_->at(i);
3371 for (int j = 0; j < map_length_; j++) {
3372 if (map->at(j)) {
3373 if (found_single_character) {
3374 found_single_character = false; // Found two.
3375 abandoned_search_for_single_character = true;
3376 break;
3377 } else {
3378 found_single_character = true;
3379 single_character = j;
3380 }
3381 }
3382 }
3383 if (abandoned_search_for_single_character) break;
3384 }
3385
3386 int lookahead_width = max_lookahead + 1 - min_lookahead;
3387
3388 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3389 // The mask-compare can probably handle this better.
3390 return false;
3391 }
3392
3393 if (found_single_character) {
3394 Label cont, again;
3395 masm->Bind(&again);
3396 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3397 if (max_char_ > map_length_) {
3398 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize);
3399 masm->CheckCharacterAfterAnd(single_character,
3400 RegExpMacroAssembler::kTableMask,
3401 &cont);
3402 } else {
3403 masm->CheckCharacter(single_character, &cont);
3404 }
3405 masm->AdvanceCurrentPosition(lookahead_width);
3406 masm->GoTo(&again);
3407 masm->Bind(&cont);
3408 return true;
3409 }
3410
3411 Handle<ByteArray> boolean_skip_table =
3412 FACTORY->NewByteArray(map_length_, TENURED);
3413 int skip_distance = GetSkipTable(
3414 min_lookahead, max_lookahead, boolean_skip_table);
3415 ASSERT(skip_distance != 0);
3416
3417 Label cont, again;
3418 masm->Bind(&again);
3419 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3420 masm->CheckBitInTable(boolean_skip_table, &cont);
3421 masm->AdvanceCurrentPosition(skip_distance);
3422 masm->GoTo(&again);
3423 masm->Bind(&cont);
3424
3425 return true;
3426 }
3427
3134 /* Code generation for choice nodes. 3428 /* Code generation for choice nodes.
3135 * 3429 *
3136 * We generate quick checks that do a mask and compare to eliminate a 3430 * We generate quick checks that do a mask and compare to eliminate a
3137 * choice. If the quick check succeeds then it jumps to the continuation to 3431 * choice. If the quick check succeeds then it jumps to the continuation to
3138 * do slow checks and check subsequent nodes. If it fails (the common case) 3432 * do slow checks and check subsequent nodes. If it fails (the common case)
3139 * it falls through to the next choice. 3433 * it falls through to the next choice.
3140 * 3434 *
3141 * Here is the desired flow graph. Nodes directly below each other imply 3435 * Here is the desired flow graph. Nodes directly below each other imply
3142 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative 3436 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3143 * 3 doesn't have a quick check so we have to call the slow check. 3437 * 3 doesn't have a quick check so we have to call the slow check.
(...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after
3267 greedy_match_trace.set_loop_label(&loop_label); 3561 greedy_match_trace.set_loop_label(&loop_label);
3268 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); 3562 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3269 macro_assembler->Bind(&greedy_match_failed); 3563 macro_assembler->Bind(&greedy_match_failed);
3270 } 3564 }
3271 3565
3272 Label second_choice; // For use in greedy matches. 3566 Label second_choice; // For use in greedy matches.
3273 macro_assembler->Bind(&second_choice); 3567 macro_assembler->Bind(&second_choice);
3274 3568
3275 int first_normal_choice = greedy_loop ? 1 : 0; 3569 int first_normal_choice = greedy_loop ? 1 : 0;
3276 3570
3277 int preload_characters = 3571 bool not_at_start = current_trace->at_start() == Trace::FALSE;
3278 CalculatePreloadCharacters(compiler, 3572 const int kEatsAtLeastNotYetInitialized = -1;
3279 current_trace->at_start() == Trace::FALSE); 3573 int eats_at_least = kEatsAtLeastNotYetInitialized;
3280 bool preload_is_current = 3574
3575 bool skip_was_emitted = false;
3576
3577 // More makes code generation slower, less makes V8 benchmark score lower.
3578 const int kMaxLookaheadForBoyerMoore = 8;
3579
3580 if (!greedy_loop && choice_count == 2) {
3581 GuardedAlternative alt1 = alternatives_->at(1);
3582 if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
3583 RegExpNode* eats_anything_node = alt1.node();
3584 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
3585 this) {
3586 // At this point we know that we are at a non-greedy loop that will eat
3587 // any character one at a time. Any non-anchored regexp has such a
3588 // loop prepended to it in order to find where it starts. We look for
3589 // a pattern of the form ...abc... where we can look 6 characters ahead
3590 // and step forwards 3 if the character is not one of abc. Abc need
3591 // not be atoms, they can be any reasonably limited character class or
3592 // small alternation.
3593 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
3594 eats_at_least =
3595 Min(kMaxLookaheadForBoyerMoore,
3596 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start));
3597 if (eats_at_least >= 1) {
3598 BoyerMooreLookahead bm(eats_at_least,
3599 RegExpMacroAssembler::kTableSize,
3600 compiler);
3601 GuardedAlternative alt0 = alternatives_->at(0);
3602 alt0.node()->FillInBMInfo(0, &bm, not_at_start);
3603 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler);
3604 }
3605 }
3606 }
3607 }
3608
3609 if (eats_at_least == kEatsAtLeastNotYetInitialized) {
3610 // Save some time by looking at most one machine word ahead.
3611 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start);
3612 }
3613 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
3614
3615 bool preload_is_current = !skip_was_emitted &&
3281 (current_trace->characters_preloaded() == preload_characters); 3616 (current_trace->characters_preloaded() == preload_characters);
3282 bool preload_has_checked_bounds = preload_is_current; 3617 bool preload_has_checked_bounds = preload_is_current;
3283 3618
3284 AlternativeGenerationList alt_gens(choice_count); 3619 AlternativeGenerationList alt_gens(choice_count);
3285 3620
3286 // For now we just call all choices one after the other. The idea ultimately 3621 // For now we just call all choices one after the other. The idea ultimately
3287 // is to use the Dispatch table to try only the relevant ones. 3622 // is to use the Dispatch table to try only the relevant ones.
3288 for (int i = first_normal_choice; i < choice_count; i++) { 3623 for (int i = first_normal_choice; i < choice_count; i++) {
3289 GuardedAlternative alternative = alternatives_->at(i); 3624 GuardedAlternative alternative = alternatives_->at(i);
3290 AlternativeGeneration* alt_gen = alt_gens.at(i); 3625 AlternativeGeneration* alt_gen = alt_gens.at(i);
(...skipping 2134 matching lines...) Expand 10 before | Expand all | Expand 10 after
5425 // We don't know anything about the first character of a backreference 5760 // We don't know anything about the first character of a backreference
5426 // at this point. 5761 // at this point.
5427 // The potential first characters are the first characters of the capture, 5762 // The potential first characters are the first characters of the capture,
5428 // and the first characters of the on_success node, depending on whether the 5763 // and the first characters of the on_success node, depending on whether the
5429 // capture can be empty and whether it is known to be participating or known 5764 // capture can be empty and whether it is known to be participating or known
5430 // not to be. 5765 // not to be.
5431 return kComputeFirstCharacterSetFail; 5766 return kComputeFirstCharacterSetFail;
5432 } 5767 }
5433 5768
5434 5769
5770 void ChoiceNode::FillInBMInfo(
5771 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
5772 ZoneList<GuardedAlternative>* alts = alternatives();
5773 for (int i = 0; i < alts->length(); i++) {
5774 GuardedAlternative& alt = alts->at(i);
5775 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5776 bm->SetRest(offset); // Give up trying to fill in info.
5777 return;
5778 }
5779 alt.node()->FillInBMInfo(offset, bm, not_at_start);
5780 }
5781 }
5782
5783
5784 void TextNode::FillInBMInfo(
5785 int offset, BoyerMooreLookahead* bm, bool not_at_start) {
5786 if (offset >= bm->length()) return;
5787 int max_char = bm->max_char();
5788 for (int i = 0; i < elements()->length(); i++) {
5789 if (offset >= bm->length()) return;
5790 TextElement text = elements()->at(i);
5791 if (text.type == TextElement::ATOM) {
5792 RegExpAtom* atom = text.data.u_atom;
5793 for (int j = 0; j < atom->length(); j++, offset++) {
5794 if (offset >= bm->length()) return;
5795 uc16 character = atom->data()[j];
5796 if (bm->compiler()->ignore_case()) {
5797 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5798 int length = GetCaseIndependentLetters(
5799 ISOLATE,
5800 character,
5801 bm->max_char() == String::kMaxAsciiCharCode,
5802 chars);
5803 for (int j = 0; j < length; j++) {
5804 bm->Set(offset, chars[j]);
5805 }
5806 } else {
5807 if (character <= max_char) bm->Set(offset, character);
5808 }
5809 }
5810 } else {
5811 ASSERT(text.type == TextElement::CHAR_CLASS);
5812 RegExpCharacterClass* char_class = text.data.u_char_class;
5813 ZoneList<CharacterRange>* ranges = char_class->ranges();
5814 if (char_class->is_negated()) {
5815 bm->SetAll(offset);
5816 } else {
5817 for (int k = 0; k < ranges->length(); k++) {
5818 CharacterRange& range = ranges->at(k);
5819 if (range.from() > max_char) continue;
5820 int to = Min(max_char, static_cast<int>(range.to()));
5821 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) {
5822 bm->SetAll(offset);
5823 break;
5824 }
5825 for (int m = range.from(); m <= to; m++) {
5826 bm->Set(offset, m);
5827 }
5828 }
5829 }
5830 offset++;
5831 }
5832 }
5833 if (offset >= bm->length()) return;
5834 on_success()->FillInBMInfo(offset,
5835 bm,
5836 true); // Not at start after a text node.
5837 }
5838
5839
5435 int TextNode::ComputeFirstCharacterSet(int budget) { 5840 int TextNode::ComputeFirstCharacterSet(int budget) {
5436 budget--; 5841 budget--;
5437 if (budget >= 0) { 5842 if (budget >= 0) {
5438 ASSERT_NE(0, elements()->length()); 5843 ASSERT_NE(0, elements()->length());
5439 TextElement text = elements()->at(0); 5844 TextElement text = elements()->at(0);
5440 if (text.type == TextElement::ATOM) { 5845 if (text.type == TextElement::ATOM) {
5441 RegExpAtom* atom = text.data.u_atom; 5846 RegExpAtom* atom = text.data.u_atom;
5442 ASSERT_NE(0, atom->length()); 5847 ASSERT_NE(0, atom->length());
5443 uc16 first_char = atom->data()[0]; 5848 uc16 first_char = atom->data()[0];
5444 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); 5849 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
(...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after
5582 } 5987 }
5583 } 5988 }
5584 5989
5585 5990
5586 void DispatchTableConstructor::VisitAction(ActionNode* that) { 5991 void DispatchTableConstructor::VisitAction(ActionNode* that) {
5587 RegExpNode* target = that->on_success(); 5992 RegExpNode* target = that->on_success();
5588 target->Accept(this); 5993 target->Accept(this);
5589 } 5994 }
5590 5995
5591 5996
5592 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, 5997 RegExpEngine::CompilationResult RegExpEngine::Compile(
5593 bool ignore_case, 5998 RegExpCompileData* data,
5594 bool is_multiline, 5999 bool ignore_case,
5595 Handle<String> pattern, 6000 bool is_multiline,
5596 bool is_ascii) { 6001 Handle<String> pattern,
6002 Handle<String> sample_subject,
6003 bool is_ascii) {
5597 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6004 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
5598 return IrregexpRegExpTooBig(); 6005 return IrregexpRegExpTooBig();
5599 } 6006 }
5600 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); 6007 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
6008
6009 // Sample some characters from the middle of the string.
6010 static const int kSampleSize = 128;
6011
6012 FlattenString(sample_subject);
6013 int chars_sampled = 0;
6014 int half_way = (sample_subject->length() - kSampleSize) / 2;
6015 for (int i = Max(0, half_way);
6016 i < sample_subject->length();
6017 i++, chars_sampled++) {
6018 if (chars_sampled > kSampleSize) break;
ulan 2012/04/02 08:12:24 Now we can move this into the for loop condition.
Erik Corry 2012/04/02 09:36:22 Done.
6019 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6020 }
6021
5601 // Wrap the body of the regexp in capture #0. 6022 // Wrap the body of the regexp in capture #0.
5602 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, 6023 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
5603 0, 6024 0,
5604 &compiler, 6025 &compiler,
5605 compiler.accept()); 6026 compiler.accept());
5606 RegExpNode* node = captured_body; 6027 RegExpNode* node = captured_body;
5607 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); 6028 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5608 bool is_start_anchored = data->tree->IsAnchoredAtStart(); 6029 bool is_start_anchored = data->tree->IsAnchoredAtStart();
5609 int max_length = data->tree->max_match(); 6030 int max_length = data->tree->max_match();
5610 if (!is_start_anchored) { 6031 if (!is_start_anchored) {
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
5673 } 6094 }
5674 6095
5675 return compiler.Assemble(&macro_assembler, 6096 return compiler.Assemble(&macro_assembler,
5676 node, 6097 node,
5677 data->capture_count, 6098 data->capture_count,
5678 pattern); 6099 pattern);
5679 } 6100 }
5680 6101
5681 6102
5682 }} // namespace v8::internal 6103 }} // namespace v8::internal
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/x64/regexp-macro-assembler-x64.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698