Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 262 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 273 | 273 |
| 274 | 274 |
| 275 // Irregexp implementation. | 275 // Irregexp implementation. |
| 276 | 276 |
| 277 // Ensures that the regexp object contains a compiled version of the | 277 // Ensures that the regexp object contains a compiled version of the |
| 278 // source for either ASCII or non-ASCII strings. | 278 // source for either ASCII or non-ASCII strings. |
| 279 // If the compiled version doesn't already exist, it is compiled | 279 // If the compiled version doesn't already exist, it is compiled |
| 280 // from the source pattern. | 280 // from the source pattern. |
| 281 // If compilation fails, an exception is thrown and this function | 281 // If compilation fails, an exception is thrown and this function |
| 282 // returns false. | 282 // returns false. |
| 283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 283 bool RegExpImpl::EnsureCompiledIrregexp( |
| 284 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { | |
| 284 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); | 285 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); |
| 285 #ifdef V8_INTERPRETED_REGEXP | 286 #ifdef V8_INTERPRETED_REGEXP |
| 286 if (compiled_code->IsByteArray()) return true; | 287 if (compiled_code->IsByteArray()) return true; |
| 287 #else // V8_INTERPRETED_REGEXP (RegExp native code) | 288 #else // V8_INTERPRETED_REGEXP (RegExp native code) |
| 288 if (compiled_code->IsCode()) return true; | 289 if (compiled_code->IsCode()) return true; |
| 289 #endif | 290 #endif |
| 290 // We could potentially have marked this as flushable, but have kept | 291 // We could potentially have marked this as flushable, but have kept |
| 291 // a saved version if we did not flush it yet. | 292 // a saved version if we did not flush it yet. |
| 292 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); | 293 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); |
| 293 if (saved_code->IsCode()) { | 294 if (saved_code->IsCode()) { |
| 294 // Reinstate the code in the original place. | 295 // Reinstate the code in the original place. |
| 295 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); | 296 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); |
| 296 ASSERT(compiled_code->IsSmi()); | 297 ASSERT(compiled_code->IsSmi()); |
| 297 return true; | 298 return true; |
| 298 } | 299 } |
| 299 return CompileIrregexp(re, is_ascii); | 300 return CompileIrregexp(re, sample_subject, is_ascii); |
| 300 } | 301 } |
| 301 | 302 |
| 302 | 303 |
| 303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, | 304 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, |
| 304 bool is_ascii, | 305 bool is_ascii, |
| 305 Handle<String> error_message, | 306 Handle<String> error_message, |
| 306 Isolate* isolate) { | 307 Isolate* isolate) { |
| 307 Factory* factory = isolate->factory(); | 308 Factory* factory = isolate->factory(); |
| 308 Handle<FixedArray> elements = factory->NewFixedArray(2); | 309 Handle<FixedArray> elements = factory->NewFixedArray(2); |
| 309 elements->set(0, re->Pattern()); | 310 elements->set(0, re->Pattern()); |
| 310 elements->set(1, *error_message); | 311 elements->set(1, *error_message); |
| 311 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 312 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
| 312 Handle<Object> regexp_err = | 313 Handle<Object> regexp_err = |
| 313 factory->NewSyntaxError("malformed_regexp", array); | 314 factory->NewSyntaxError("malformed_regexp", array); |
| 314 isolate->Throw(*regexp_err); | 315 isolate->Throw(*regexp_err); |
| 315 return false; | 316 return false; |
| 316 } | 317 } |
| 317 | 318 |
| 318 | 319 |
| 319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 320 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
| 321 Handle<String> sample_subject, | |
| 322 bool is_ascii) { | |
| 320 // Compile the RegExp. | 323 // Compile the RegExp. |
| 321 Isolate* isolate = re->GetIsolate(); | 324 Isolate* isolate = re->GetIsolate(); |
| 322 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); | 325 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); |
| 323 PostponeInterruptsScope postpone(isolate); | 326 PostponeInterruptsScope postpone(isolate); |
| 324 // If we had a compilation error the last time this is saved at the | 327 // If we had a compilation error the last time this is saved at the |
| 325 // saved code index. | 328 // saved code index. |
| 326 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); | 329 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); |
| 327 // When arriving here entry can only be a smi, either representing an | 330 // When arriving here entry can only be a smi, either representing an |
| 328 // uncompiled regexp, a previous compilation error, or code that has | 331 // uncompiled regexp, a previous compilation error, or code that has |
| 329 // been flushed. | 332 // been flushed. |
| (...skipping 28 matching lines...) Expand all Loading... | |
| 358 pattern, | 361 pattern, |
| 359 compile_data.error, | 362 compile_data.error, |
| 360 "malformed_regexp"); | 363 "malformed_regexp"); |
| 361 return false; | 364 return false; |
| 362 } | 365 } |
| 363 RegExpEngine::CompilationResult result = | 366 RegExpEngine::CompilationResult result = |
| 364 RegExpEngine::Compile(&compile_data, | 367 RegExpEngine::Compile(&compile_data, |
| 365 flags.is_ignore_case(), | 368 flags.is_ignore_case(), |
| 366 flags.is_multiline(), | 369 flags.is_multiline(), |
| 367 pattern, | 370 pattern, |
| 371 sample_subject, | |
| 368 is_ascii); | 372 is_ascii); |
| 369 if (result.error_message != NULL) { | 373 if (result.error_message != NULL) { |
| 370 // Unable to compile regexp. | 374 // Unable to compile regexp. |
| 371 Handle<String> error_message = | 375 Handle<String> error_message = |
| 372 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); | 376 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); |
| 373 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); | 377 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); |
| 374 return false; | 378 return false; |
| 375 } | 379 } |
| 376 | 380 |
| 377 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); | 381 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 428 capture_count); | 432 capture_count); |
| 429 } | 433 } |
| 430 | 434 |
| 431 | 435 |
| 432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, | 436 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, |
| 433 Handle<String> subject) { | 437 Handle<String> subject) { |
| 434 if (!subject->IsFlat()) FlattenString(subject); | 438 if (!subject->IsFlat()) FlattenString(subject); |
| 435 | 439 |
| 436 // Check the asciiness of the underlying storage. | 440 // Check the asciiness of the underlying storage. |
| 437 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 441 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
| 438 if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; | 442 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; |
| 439 | 443 |
| 440 #ifdef V8_INTERPRETED_REGEXP | 444 #ifdef V8_INTERPRETED_REGEXP |
| 441 // Byte-code regexp needs space allocated for all its registers. | 445 // Byte-code regexp needs space allocated for all its registers. |
| 442 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); | 446 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); |
| 443 #else // V8_INTERPRETED_REGEXP | 447 #else // V8_INTERPRETED_REGEXP |
| 444 // Native regexp only needs room to output captures. Registers are handled | 448 // Native regexp only needs room to output captures. Registers are handled |
| 445 // internally. | 449 // internally. |
| 446 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; | 450 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; |
| 447 #endif // V8_INTERPRETED_REGEXP | 451 #endif // V8_INTERPRETED_REGEXP |
| 448 } | 452 } |
| (...skipping 10 matching lines...) Expand all Loading... | |
| 459 | 463 |
| 460 ASSERT(index >= 0); | 464 ASSERT(index >= 0); |
| 461 ASSERT(index <= subject->length()); | 465 ASSERT(index <= subject->length()); |
| 462 ASSERT(subject->IsFlat()); | 466 ASSERT(subject->IsFlat()); |
| 463 | 467 |
| 464 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 468 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
| 465 | 469 |
| 466 #ifndef V8_INTERPRETED_REGEXP | 470 #ifndef V8_INTERPRETED_REGEXP |
| 467 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); | 471 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
| 468 do { | 472 do { |
| 469 EnsureCompiledIrregexp(regexp, is_ascii); | 473 EnsureCompiledIrregexp(regexp, subject, is_ascii); |
| 470 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); | 474 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); |
| 471 NativeRegExpMacroAssembler::Result res = | 475 NativeRegExpMacroAssembler::Result res = |
| 472 NativeRegExpMacroAssembler::Match(code, | 476 NativeRegExpMacroAssembler::Match(code, |
| 473 subject, | 477 subject, |
| 474 output.start(), | 478 output.start(), |
| 475 output.length(), | 479 output.length(), |
| 476 index, | 480 index, |
| 477 isolate); | 481 isolate); |
| 478 if (res != NativeRegExpMacroAssembler::RETRY) { | 482 if (res != NativeRegExpMacroAssembler::RETRY) { |
| 479 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || | 483 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || |
| (...skipping 297 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { | 781 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { |
| 778 if (table_ == NULL) { | 782 if (table_ == NULL) { |
| 779 table_ = new DispatchTable(); | 783 table_ = new DispatchTable(); |
| 780 DispatchTableConstructor cons(table_, ignore_case); | 784 DispatchTableConstructor cons(table_, ignore_case); |
| 781 cons.BuildTable(this); | 785 cons.BuildTable(this); |
| 782 } | 786 } |
| 783 return table_; | 787 return table_; |
| 784 } | 788 } |
| 785 | 789 |
| 786 | 790 |
| 791 class FrequencyCollator { | |
| 792 public: | |
| 793 FrequencyCollator() : total_frequencies_(0) { | |
| 794 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { | |
| 795 frequencies_[i] = CharacterFrequency(i); | |
| 796 } | |
| 797 } | |
| 798 | |
| 799 void CountCharacter(int character) { | |
| 800 int index = (character & RegExpMacroAssembler::kTableMask); | |
| 801 frequencies_[index].Increment(); | |
| 802 total_frequencies_++; | |
| 803 } | |
| 804 | |
| 805 void SortFrequencies() { | |
| 806 typedef int (*RawComparer)(const void*, const void*); | |
| 807 memcpy( | |
| 808 &sorted_frequencies_[0], &frequencies_[0], sizeof(sorted_frequencies_)); | |
|
ulan
2012/03/30 13:04:48
A nit: for consistency I would prefer breaking mem
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 809 qsort(&sorted_frequencies_[0], | |
| 810 RegExpMacroAssembler::kTableSize, | |
| 811 sizeof(sorted_frequencies_[0]), | |
| 812 reinterpret_cast<RawComparer>(&CharacterFrequency::Compare)); | |
| 813 } | |
| 814 | |
| 815 bool IsFrequent(int in_character) { | |
| 816 if (total_frequencies_ < 10) return false; // Too little statistical basis. | |
| 817 int freq = frequencies_[in_character].counter(); | |
|
ulan
2012/03/30 13:04:48
Either mask the in_character or ASSERT(in_characte
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 818 bool popular = ((freq * 100) / total_frequencies_ > 20); | |
| 819 return popular; | |
|
ulan
2012/03/30 13:04:48
Named magic numbers become less magic :)
int freq
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 820 } | |
| 821 | |
| 822 private: | |
| 823 class CharacterFrequency { | |
| 824 public: | |
| 825 CharacterFrequency() : counter_(0), character_(-1) { } | |
| 826 explicit CharacterFrequency(int character) : counter_(0), character_(charact er) { } | |
|
ulan
2012/03/30 13:04:48
Long line.
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 827 | |
| 828 static int Compare(const CharacterFrequency* a, | |
| 829 const CharacterFrequency* b) { | |
| 830 return b->counter_ - a->counter_; | |
| 831 } | |
| 832 | |
| 833 void Increment() { counter_++; } | |
| 834 int counter() { return counter_; } | |
| 835 int character() { return character_; } | |
| 836 | |
| 837 private: | |
| 838 int counter_; | |
| 839 int character_; | |
| 840 }; | |
| 841 | |
| 842 | |
| 843 private: | |
| 844 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; | |
| 845 CharacterFrequency sorted_frequencies_[RegExpMacroAssembler::kTableSize]; | |
| 846 int total_frequencies_; | |
| 847 }; | |
| 848 | |
| 849 | |
| 787 class RegExpCompiler { | 850 class RegExpCompiler { |
| 788 public: | 851 public: |
| 789 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); | 852 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); |
| 790 | 853 |
| 791 int AllocateRegister() { | 854 int AllocateRegister() { |
| 792 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { | 855 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { |
| 793 reg_exp_too_big_ = true; | 856 reg_exp_too_big_ = true; |
| 794 return next_register_; | 857 return next_register_; |
| 795 } | 858 } |
| 796 return next_register_++; | 859 return next_register_++; |
| (...skipping 15 matching lines...) Expand all Loading... | |
| 812 | 875 |
| 813 static const int kMaxRecursion = 100; | 876 static const int kMaxRecursion = 100; |
| 814 inline int recursion_depth() { return recursion_depth_; } | 877 inline int recursion_depth() { return recursion_depth_; } |
| 815 inline void IncrementRecursionDepth() { recursion_depth_++; } | 878 inline void IncrementRecursionDepth() { recursion_depth_++; } |
| 816 inline void DecrementRecursionDepth() { recursion_depth_--; } | 879 inline void DecrementRecursionDepth() { recursion_depth_--; } |
| 817 | 880 |
| 818 void SetRegExpTooBig() { reg_exp_too_big_ = true; } | 881 void SetRegExpTooBig() { reg_exp_too_big_ = true; } |
| 819 | 882 |
| 820 inline bool ignore_case() { return ignore_case_; } | 883 inline bool ignore_case() { return ignore_case_; } |
| 821 inline bool ascii() { return ascii_; } | 884 inline bool ascii() { return ascii_; } |
| 885 FrequencyCollator* frequency_collator() { return &frequency_collator_; } | |
| 822 | 886 |
| 823 int current_expansion_factor() { return current_expansion_factor_; } | 887 int current_expansion_factor() { return current_expansion_factor_; } |
| 824 void set_current_expansion_factor(int value) { | 888 void set_current_expansion_factor(int value) { |
| 825 current_expansion_factor_ = value; | 889 current_expansion_factor_ = value; |
| 826 } | 890 } |
| 827 | 891 |
| 828 static const int kNoRegister = -1; | 892 static const int kNoRegister = -1; |
| 829 | 893 |
| 830 private: | 894 private: |
| 831 EndNode* accept_; | 895 EndNode* accept_; |
| 832 int next_register_; | 896 int next_register_; |
| 833 List<RegExpNode*>* work_list_; | 897 List<RegExpNode*>* work_list_; |
| 834 int recursion_depth_; | 898 int recursion_depth_; |
| 835 RegExpMacroAssembler* macro_assembler_; | 899 RegExpMacroAssembler* macro_assembler_; |
| 836 bool ignore_case_; | 900 bool ignore_case_; |
| 837 bool ascii_; | 901 bool ascii_; |
| 838 bool reg_exp_too_big_; | 902 bool reg_exp_too_big_; |
| 839 int current_expansion_factor_; | 903 int current_expansion_factor_; |
| 904 FrequencyCollator frequency_collator_; | |
| 840 }; | 905 }; |
| 841 | 906 |
| 842 | 907 |
| 843 class RecursionCheck { | 908 class RecursionCheck { |
| 844 public: | 909 public: |
| 845 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { | 910 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { |
| 846 compiler->IncrementRecursionDepth(); | 911 compiler->IncrementRecursionDepth(); |
| 847 } | 912 } |
| 848 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } | 913 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } |
| 849 private: | 914 private: |
| 850 RegExpCompiler* compiler_; | 915 RegExpCompiler* compiler_; |
| 851 }; | 916 }; |
| 852 | 917 |
| 853 | 918 |
| 854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { | 919 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { |
| 855 return RegExpEngine::CompilationResult("RegExp too big"); | 920 return RegExpEngine::CompilationResult("RegExp too big"); |
| 856 } | 921 } |
| 857 | 922 |
| 858 | 923 |
| 859 // Attempts to compile the regexp using an Irregexp code generator. Returns | 924 // Attempts to compile the regexp using an Irregexp code generator. Returns |
| 860 // a fixed array or a null handle depending on whether it succeeded. | 925 // a fixed array or a null handle depending on whether it succeeded. |
| 861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) | 926 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) |
| 862 : next_register_(2 * (capture_count + 1)), | 927 : next_register_(2 * (capture_count + 1)), |
| 863 work_list_(NULL), | 928 work_list_(NULL), |
| 864 recursion_depth_(0), | 929 recursion_depth_(0), |
| 865 ignore_case_(ignore_case), | 930 ignore_case_(ignore_case), |
| 866 ascii_(ascii), | 931 ascii_(ascii), |
| 867 reg_exp_too_big_(false), | 932 reg_exp_too_big_(false), |
| 868 current_expansion_factor_(1) { | 933 current_expansion_factor_(1), |
| 934 frequency_collator_() { | |
| 869 accept_ = new EndNode(EndNode::ACCEPT); | 935 accept_ = new EndNode(EndNode::ACCEPT); |
| 870 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); | 936 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); |
| 871 } | 937 } |
| 872 | 938 |
| 873 | 939 |
| 874 RegExpEngine::CompilationResult RegExpCompiler::Assemble( | 940 RegExpEngine::CompilationResult RegExpCompiler::Assemble( |
| 875 RegExpMacroAssembler* macro_assembler, | 941 RegExpMacroAssembler* macro_assembler, |
| 876 RegExpNode* start, | 942 RegExpNode* start, |
| 877 int capture_count, | 943 int capture_count, |
| 878 Handle<String> pattern) { | 944 Handle<String> pattern) { |
| (...skipping 1170 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2049 int recursion_depth, | 2115 int recursion_depth, |
| 2050 bool not_at_start) { | 2116 bool not_at_start) { |
| 2051 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2117 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
| 2052 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! | 2118 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! |
| 2053 return on_success()->EatsAtLeast(still_to_find, | 2119 return on_success()->EatsAtLeast(still_to_find, |
| 2054 recursion_depth + 1, | 2120 recursion_depth + 1, |
| 2055 not_at_start); | 2121 not_at_start); |
| 2056 } | 2122 } |
| 2057 | 2123 |
| 2058 | 2124 |
| 2125 void ActionNode::FillInBMInfo( | |
| 2126 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
|
ulan
2012/03/30 13:04:48
For consistency with the surrounding code consider
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 2127 if (type_ == BEGIN_SUBMATCH) { | |
| 2128 bm->SetRest(offset); | |
| 2129 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { | |
| 2130 on_success()->FillInBMInfo(offset, bm, not_at_start); | |
| 2131 } | |
| 2132 } | |
| 2133 | |
| 2134 | |
| 2059 int AssertionNode::EatsAtLeast(int still_to_find, | 2135 int AssertionNode::EatsAtLeast(int still_to_find, |
| 2060 int recursion_depth, | 2136 int recursion_depth, |
| 2061 bool not_at_start) { | 2137 bool not_at_start) { |
| 2062 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2138 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
| 2063 // If we know we are not at the start and we are asked "how many characters | 2139 // If we know we are not at the start and we are asked "how many characters |
| 2064 // will you match if you succeed?" then we can answer anything since false | 2140 // will you match if you succeed?" then we can answer anything since false |
| 2065 // implies false. So lets just return the max answer (still_to_find) since | 2141 // implies false. So lets just return the max answer (still_to_find) since |
| 2066 // that won't prevent us from preloading a lot of characters for the other | 2142 // that won't prevent us from preloading a lot of characters for the other |
| 2067 // branches in the node graph. | 2143 // branches in the node graph. |
| 2068 if (type() == AT_START && not_at_start) return still_to_find; | 2144 if (type() == AT_START && not_at_start) return still_to_find; |
| 2069 return on_success()->EatsAtLeast(still_to_find, | 2145 return on_success()->EatsAtLeast(still_to_find, |
| 2070 recursion_depth + 1, | 2146 recursion_depth + 1, |
| 2071 not_at_start); | 2147 not_at_start); |
| 2072 } | 2148 } |
| 2073 | 2149 |
| 2074 | 2150 |
| 2151 void AssertionNode::FillInBMInfo( | |
| 2152 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
| 2153 // Match the behaviour of EatsAtLeast on this node. | |
| 2154 if (type() == AT_START && not_at_start) return; | |
| 2155 on_success()->FillInBMInfo(offset, bm, not_at_start); | |
| 2156 } | |
| 2157 | |
| 2158 | |
| 2075 int BackReferenceNode::EatsAtLeast(int still_to_find, | 2159 int BackReferenceNode::EatsAtLeast(int still_to_find, |
| 2076 int recursion_depth, | 2160 int recursion_depth, |
| 2077 bool not_at_start) { | 2161 bool not_at_start) { |
| 2078 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2162 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
| 2079 return on_success()->EatsAtLeast(still_to_find, | 2163 return on_success()->EatsAtLeast(still_to_find, |
| 2080 recursion_depth + 1, | 2164 recursion_depth + 1, |
| 2081 not_at_start); | 2165 not_at_start); |
| 2082 } | 2166 } |
| 2083 | 2167 |
| 2084 | 2168 |
| (...skipping 412 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2497 bool not_at_start) { | 2581 bool not_at_start) { |
| 2498 if (body_can_be_zero_length_ || info()->visited) return; | 2582 if (body_can_be_zero_length_ || info()->visited) return; |
| 2499 VisitMarker marker(info()); | 2583 VisitMarker marker(info()); |
| 2500 return ChoiceNode::GetQuickCheckDetails(details, | 2584 return ChoiceNode::GetQuickCheckDetails(details, |
| 2501 compiler, | 2585 compiler, |
| 2502 characters_filled_in, | 2586 characters_filled_in, |
| 2503 not_at_start); | 2587 not_at_start); |
| 2504 } | 2588 } |
| 2505 | 2589 |
| 2506 | 2590 |
| 2591 void LoopChoiceNode::FillInBMInfo( | |
| 2592 int offset, BoyerMooreLookahead* bm, bool nas) { | |
| 2593 if (body_can_be_zero_length_) { // || info()->visited) { | |
|
ulan
2012/03/30 13:04:48
Forgotten comment.
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 2594 bm->SetRest(offset); | |
| 2595 return; | |
| 2596 } | |
| 2597 // VisitMarker marker(info()); | |
|
ulan
2012/03/30 13:04:48
Comment.
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 2598 ChoiceNode::FillInBMInfo(offset, bm, nas); | |
| 2599 } | |
| 2600 | |
| 2601 | |
| 2507 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2602 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| 2508 RegExpCompiler* compiler, | 2603 RegExpCompiler* compiler, |
| 2509 int characters_filled_in, | 2604 int characters_filled_in, |
| 2510 bool not_at_start) { | 2605 bool not_at_start) { |
| 2511 not_at_start = (not_at_start || not_at_start_); | 2606 not_at_start = (not_at_start || not_at_start_); |
| 2512 int choice_count = alternatives_->length(); | 2607 int choice_count = alternatives_->length(); |
| 2513 ASSERT(choice_count > 0); | 2608 ASSERT(choice_count > 0); |
| 2514 alternatives_->at(0).node()->GetQuickCheckDetails(details, | 2609 alternatives_->at(0).node()->GetQuickCheckDetails(details, |
| 2515 compiler, | 2610 compiler, |
| 2516 characters_filled_in, | 2611 characters_filled_in, |
| (...skipping 476 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2993 int TextNode::GreedyLoopTextLength() { | 3088 int TextNode::GreedyLoopTextLength() { |
| 2994 TextElement elm = elms_->at(elms_->length() - 1); | 3089 TextElement elm = elms_->at(elms_->length() - 1); |
| 2995 if (elm.type == TextElement::CHAR_CLASS) { | 3090 if (elm.type == TextElement::CHAR_CLASS) { |
| 2996 return elm.cp_offset + 1; | 3091 return elm.cp_offset + 1; |
| 2997 } else { | 3092 } else { |
| 2998 return elm.cp_offset + elm.data.u_atom->data().length(); | 3093 return elm.cp_offset + elm.data.u_atom->data().length(); |
| 2999 } | 3094 } |
| 3000 } | 3095 } |
| 3001 | 3096 |
| 3002 | 3097 |
| 3098 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( | |
| 3099 RegExpCompiler* compiler) { | |
| 3100 if (elms_->length() != 1) return NULL; | |
| 3101 TextElement elm = elms_->at(0); | |
| 3102 if (elm.type != TextElement::CHAR_CLASS) return NULL; | |
| 3103 RegExpCharacterClass* node = elm.data.u_char_class; | |
| 3104 ZoneList<CharacterRange>* ranges = node->ranges(); | |
| 3105 if (!CharacterRange::IsCanonical(ranges)) { | |
| 3106 CharacterRange::Canonicalize(ranges); | |
| 3107 } | |
| 3108 if (node->is_negated() && ranges->length() == 0) return on_success(); | |
| 3109 if (ranges->length() != 1) return NULL; | |
| 3110 if (ranges->at(0).from() != 0) return NULL; | |
| 3111 int max_char; | |
| 3112 if (compiler->ascii()) { | |
|
ulan
2012/03/30 13:04:48
A nit, I like
max_char = compiler->ascii() ? Stri
Erik Corry
2012/04/01 00:48:46
Yes, but a bug in gcc means you get a linker error
| |
| 3113 max_char = String::kMaxAsciiCharCode; | |
| 3114 } else { | |
| 3115 max_char = String::kMaxUtf16CodeUnit; | |
| 3116 } | |
| 3117 if (ranges->at(0).to() < max_char) return NULL; | |
|
ulan
2012/03/30 13:04:48
Consider using ranges(0)->IsEverything()
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3118 return on_success(); | |
| 3119 } | |
| 3120 | |
| 3121 | |
| 3003 // Finds the fixed match length of a sequence of nodes that goes from | 3122 // Finds the fixed match length of a sequence of nodes that goes from |
| 3004 // this alternative and back to this choice node. If there are variable | 3123 // this alternative and back to this choice node. If there are variable |
| 3005 // length nodes or other complications in the way then return a sentinel | 3124 // length nodes or other complications in the way then return a sentinel |
| 3006 // value indicating that a greedy loop cannot be constructed. | 3125 // value indicating that a greedy loop cannot be constructed. |
| 3007 int ChoiceNode::GreedyLoopTextLengthForAlternative( | 3126 int ChoiceNode::GreedyLoopTextLengthForAlternative( |
| 3008 GuardedAlternative* alternative) { | 3127 GuardedAlternative* alternative) { |
| 3009 int length = 0; | 3128 int length = 0; |
| 3010 RegExpNode* node = alternative->node(); | 3129 RegExpNode* node = alternative->node(); |
| 3011 // Later we will generate code for all these text nodes using recursion | 3130 // Later we will generate code for all these text nodes using recursion |
| 3012 // so we have to limit the max number. | 3131 // so we have to limit the max number. |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3057 ASSERT(trace->stop_node() == NULL); | 3176 ASSERT(trace->stop_node() == NULL); |
| 3058 if (!trace->is_trivial()) { | 3177 if (!trace->is_trivial()) { |
| 3059 trace->Flush(compiler, this); | 3178 trace->Flush(compiler, this); |
| 3060 return; | 3179 return; |
| 3061 } | 3180 } |
| 3062 ChoiceNode::Emit(compiler, trace); | 3181 ChoiceNode::Emit(compiler, trace); |
| 3063 } | 3182 } |
| 3064 | 3183 |
| 3065 | 3184 |
| 3066 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, | 3185 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
| 3067 bool not_at_start) { | 3186 int eats_at_least) { |
| 3068 int preload_characters = EatsAtLeast(4, 0, not_at_start); | 3187 int preload_characters = eats_at_least > 4 ? 4 : eats_at_least; |
|
ulan
2012/03/30 13:04:48
Min(4, eats_at_least);
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3069 if (compiler->macro_assembler()->CanReadUnaligned()) { | 3188 if (compiler->macro_assembler()->CanReadUnaligned()) { |
| 3070 bool ascii = compiler->ascii(); | 3189 bool ascii = compiler->ascii(); |
| 3071 if (ascii) { | 3190 if (ascii) { |
| 3072 if (preload_characters > 4) preload_characters = 4; | 3191 if (preload_characters > 4) preload_characters = 4; |
| 3073 // We can't preload 3 characters because there is no machine instruction | 3192 // We can't preload 3 characters because there is no machine instruction |
| 3074 // to do that. We can't just load 4 because we could be reading | 3193 // to do that. We can't just load 4 because we could be reading |
| 3075 // beyond the end of the string, which could cause a memory fault. | 3194 // beyond the end of the string, which could cause a memory fault. |
| 3076 if (preload_characters == 3) preload_characters = 2; | 3195 if (preload_characters == 3) preload_characters = 2; |
| 3077 } else { | 3196 } else { |
| 3078 if (preload_characters > 2) preload_characters = 2; | 3197 if (preload_characters > 2) preload_characters = 2; |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3124 return alt_gens_[i]; | 3243 return alt_gens_[i]; |
| 3125 } | 3244 } |
| 3126 | 3245 |
| 3127 private: | 3246 private: |
| 3128 static const int kAFew = 10; | 3247 static const int kAFew = 10; |
| 3129 ZoneList<AlternativeGeneration*> alt_gens_; | 3248 ZoneList<AlternativeGeneration*> alt_gens_; |
| 3130 AlternativeGeneration a_few_alt_gens_[kAFew]; | 3249 AlternativeGeneration a_few_alt_gens_[kAFew]; |
| 3131 }; | 3250 }; |
| 3132 | 3251 |
| 3133 | 3252 |
| 3253 BoyerMooreLookahead::BoyerMooreLookahead( | |
| 3254 int length, int map_length, RegExpCompiler* compiler) | |
| 3255 : length_(length), | |
| 3256 map_length_(map_length), | |
|
ulan
2012/03/30 13:04:48
the Set() method assumes that map_length is a powe
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3257 compiler_(compiler) { | |
| 3258 if (compiler->ascii()) { | |
| 3259 max_char_ = String::kMaxAsciiCharCode; | |
| 3260 } else { | |
| 3261 max_char_ = String::kMaxUtf16CodeUnit; | |
| 3262 } | |
| 3263 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); | |
|
ulan
2012/03/30 13:04:48
I wonder if bit-packing (ZoneList<bool>) would imp
Erik Corry
2012/04/01 00:48:46
We normally only do this once per regexp, so I don
| |
| 3264 for (int i = 0; i < length; i++) { | |
| 3265 bitmaps_->Add(new ZoneList<bool>(map_length)); | |
| 3266 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3267 for (int i = 0; i < map_length; i++) { | |
| 3268 map->Add(false); | |
| 3269 } | |
| 3270 } | |
| 3271 } | |
| 3272 | |
| 3273 | |
| 3274 bool BoyerMooreLookahead::TooVague(int offset, int limit) { | |
| 3275 if (bitmaps_->at(offset) == NULL) return true; | |
| 3276 if (Count(offset) > limit) return true; | |
| 3277 return false; | |
| 3278 } | |
| 3279 | |
| 3280 | |
| 3281 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { | |
| 3282 // The characters that are very frequent in the sample subject string should | |
| 3283 // not be used for Boyer-Moore operations, so lets mark those positions as | |
| 3284 // being too vague. | |
| 3285 for (int i = 0; i < length_; i++) { | |
| 3286 if (Count(i) >= kTooManyCharacters) { | |
| 3287 SetAll(i); | |
| 3288 } else { | |
| 3289 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3290 for (int ch = 0; ch < map_length_; ch++) { | |
| 3291 if (map->at(ch) && compiler_->frequency_collator()->IsFrequent(ch)) { | |
| 3292 SetAll(i); | |
| 3293 break; | |
| 3294 } | |
| 3295 } | |
| 3296 } | |
| 3297 } | |
| 3298 const int kNoPoints = -1; | |
| 3299 int biggest_points = kNoPoints; | |
| 3300 int biggest_interval_max = 0; | |
| 3301 int biggest_interval = 0; | |
| 3302 int vagueness_factor = 10; | |
|
ulan
2012/03/30 13:04:48
Could you please write a comment describing the ma
Erik Corry
2012/04/01 00:48:46
I refactored this function because it reflected fa
| |
| 3303 for (int initial_max_vagueness = 4; | |
| 3304 initial_max_vagueness <= kTooManyCharacters; | |
| 3305 initial_max_vagueness *= 2, vagueness_factor--) { | |
| 3306 int max = length_ - 1; | |
|
ulan
2012/03/30 13:04:48
This 'max' can just be called 'i'. I think there a
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3307 while (true) { | |
| 3308 while (max >= 0 && TooVague(max, initial_max_vagueness)) max--; | |
| 3309 if (max < 0) break; | |
| 3310 int remembered_max = max; | |
| 3311 int max_vagueness = initial_max_vagueness; | |
| 3312 while (max >= 0 && !TooVague(max, max_vagueness)) { | |
| 3313 max--; | |
| 3314 if (max_vagueness < kTooManyCharacters) max_vagueness *= 2; | |
|
ulan
2012/03/30 13:04:48
What is the rationale behind this?
Nit: max_vague
Erik Corry
2012/04/01 00:48:46
All refactored away.
| |
| 3315 } | |
| 3316 int points = | |
| 3317 ((remembered_max - max) * kTooManyCharacters) * vagueness_factor; | |
| 3318 if (points > biggest_points) { | |
| 3319 biggest_interval_max = remembered_max; | |
| 3320 biggest_interval = remembered_max - max; | |
|
ulan
2012/03/30 13:04:48
Consider maintaining (left, right) instead of (big
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3321 biggest_points = points; | |
| 3322 } | |
| 3323 if (max < 0) break; | |
| 3324 } | |
| 3325 } | |
| 3326 if (biggest_points == kNoPoints) return false; | |
| 3327 if (biggest_interval < 1) return false; | |
| 3328 *to = biggest_interval_max; | |
| 3329 *from = 1 + biggest_interval_max - biggest_interval; | |
| 3330 return true; | |
| 3331 } | |
| 3332 | |
| 3333 | |
| 3334 static int SortChar(const char* a, const char* b) { | |
| 3335 return static_cast<int>(*a) - *b; | |
| 3336 } | |
| 3337 | |
| 3338 | |
| 3339 int BoyerMooreLookahead::GetSkipTable(int max_lookahead, | |
| 3340 Handle<ByteArray> boolean_skip_table) { | |
| 3341 const int kSize = RegExpMacroAssembler::kTableSize; | |
| 3342 char table[kSize]; | |
| 3343 | |
| 3344 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { | |
| 3345 table[i] = max_lookahead + 1; | |
| 3346 } | |
| 3347 | |
| 3348 for (int i = max_lookahead; i >= 0; i--) { | |
| 3349 int skip = max_lookahead - i; | |
|
ulan
2012/03/30 16:54:51
This is the crucial part. A comment explaining why
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3350 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3351 if (map == NULL) { | |
| 3352 for (int j = 0; j < map_length_; j++) { | |
| 3353 if (table[j] > skip) table[j] = skip; | |
| 3354 } | |
| 3355 break; | |
| 3356 } | |
| 3357 for (int j = 0; j < map_length_; j++) { | |
| 3358 if (map->at(j)) { | |
| 3359 if (table[j] > skip) table[j] = skip; | |
| 3360 } | |
| 3361 } | |
| 3362 } | |
| 3363 | |
| 3364 char sorted_table[kSize]; | |
| 3365 memcpy(sorted_table, table, kSize); | |
| 3366 typedef int (*RawComparer)(const void*, const void*); | |
| 3367 qsort(sorted_table, kSize, 1, reinterpret_cast<RawComparer>(&SortChar)); | |
| 3368 | |
| 3369 int boolean_skip_distance = sorted_table[kSize / 2]; | |
|
ulan
2012/03/30 16:54:51
Instead of kSize / 2 we can take any i in [0..kSiz
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3370 for (int i = 0; i < kSize; i++) { | |
| 3371 boolean_skip_table->set(i, table[i] >= boolean_skip_distance ? 0 : 1); | |
|
ulan
2012/03/30 13:04:48
Can (x >= y ? 0 : 1) be replaced with (x < y) ?
Erik Corry
2012/04/01 00:48:46
We don't normally allow implicit bool->int and int
| |
| 3372 } | |
| 3373 return boolean_skip_distance; | |
| 3374 } | |
| 3375 | |
| 3376 | |
| 3377 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | |
| 3378 int min_lookahead = 0; | |
| 3379 int max_lookahead = 0; | |
| 3380 | |
| 3381 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | |
| 3382 | |
| 3383 bool found_single_character = false; | |
|
ulan
2012/03/30 16:54:51
Consider using a counter:
character_count = 0;
.
Erik Corry
2012/04/01 00:48:46
I want to bail out when I find the second characte
| |
| 3384 bool abandoned_search_for_single_character = false; | |
| 3385 int single_character = 0; | |
| 3386 for (int i = max_lookahead; i >= min_lookahead; i--) { | |
| 3387 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3388 for (int j = 0; j < map_length_; j++) { | |
| 3389 if (map->at(j)) { | |
| 3390 if (found_single_character) { | |
| 3391 found_single_character = false; // Found two. | |
| 3392 abandoned_search_for_single_character = true; | |
| 3393 break; | |
| 3394 } else { | |
| 3395 found_single_character = true; | |
| 3396 single_character = j; | |
| 3397 } | |
| 3398 } | |
| 3399 } | |
| 3400 if (abandoned_search_for_single_character) break; | |
| 3401 } | |
| 3402 | |
| 3403 int lookahead_width = max_lookahead + 1 - min_lookahead; | |
| 3404 | |
| 3405 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | |
| 3406 // The mask-compare can probably handle this better. | |
| 3407 return false; | |
| 3408 } | |
| 3409 | |
| 3410 if (found_single_character) { | |
| 3411 Label cont, again; | |
| 3412 masm->Bind(&again); | |
| 3413 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
| 3414 if (max_char_ > String::kMaxAsciiCharCode || | |
| 3415 map_length_ < RegExpMacroAssembler::kTableSize) { | |
| 3416 masm->CheckCharacterAfterAnd(single_character, | |
| 3417 RegExpMacroAssembler::kTableMask, | |
| 3418 &cont); | |
| 3419 } else { | |
| 3420 masm->CheckCharacter(single_character, &cont); | |
|
ulan
2012/03/30 16:54:51
I think this implicitly assumes that
String::kMaxA
Erik Corry
2012/04/01 00:48:46
Fixed.
| |
| 3421 } | |
| 3422 masm->AdvanceCurrentPosition(lookahead_width); | |
| 3423 masm->GoTo(&again); | |
| 3424 masm->Bind(&cont); | |
| 3425 return true; | |
| 3426 } | |
| 3427 | |
| 3428 Handle<ByteArray> boolean_skip_table = | |
| 3429 FACTORY->NewByteArray(map_length_, TENURED); | |
| 3430 int boolean_skip_distance = GetSkipTable(max_lookahead, boolean_skip_table); | |
| 3431 if (boolean_skip_distance == 0) return false; | |
| 3432 | |
| 3433 Label cont, again; | |
| 3434 masm->Bind(&again); | |
| 3435 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
| 3436 masm->CheckBitInTable(boolean_skip_table, &cont); | |
| 3437 masm->AdvanceCurrentPosition(boolean_skip_distance); | |
| 3438 masm->GoTo(&again); | |
| 3439 masm->Bind(&cont); | |
| 3440 | |
| 3441 return true; | |
| 3442 } | |
| 3443 | |
| 3134 /* Code generation for choice nodes. | 3444 /* Code generation for choice nodes. |
| 3135 * | 3445 * |
| 3136 * We generate quick checks that do a mask and compare to eliminate a | 3446 * We generate quick checks that do a mask and compare to eliminate a |
| 3137 * choice. If the quick check succeeds then it jumps to the continuation to | 3447 * choice. If the quick check succeeds then it jumps to the continuation to |
| 3138 * do slow checks and check subsequent nodes. If it fails (the common case) | 3448 * do slow checks and check subsequent nodes. If it fails (the common case) |
| 3139 * it falls through to the next choice. | 3449 * it falls through to the next choice. |
| 3140 * | 3450 * |
| 3141 * Here is the desired flow graph. Nodes directly below each other imply | 3451 * Here is the desired flow graph. Nodes directly below each other imply |
| 3142 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative | 3452 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative |
| 3143 * 3 doesn't have a quick check so we have to call the slow check. | 3453 * 3 doesn't have a quick check so we have to call the slow check. |
| (...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3267 greedy_match_trace.set_loop_label(&loop_label); | 3577 greedy_match_trace.set_loop_label(&loop_label); |
| 3268 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); | 3578 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); |
| 3269 macro_assembler->Bind(&greedy_match_failed); | 3579 macro_assembler->Bind(&greedy_match_failed); |
| 3270 } | 3580 } |
| 3271 | 3581 |
| 3272 Label second_choice; // For use in greedy matches. | 3582 Label second_choice; // For use in greedy matches. |
| 3273 macro_assembler->Bind(&second_choice); | 3583 macro_assembler->Bind(&second_choice); |
| 3274 | 3584 |
| 3275 int first_normal_choice = greedy_loop ? 1 : 0; | 3585 int first_normal_choice = greedy_loop ? 1 : 0; |
| 3276 | 3586 |
| 3277 int preload_characters = | 3587 bool not_at_start = current_trace->at_start() == Trace::FALSE; |
| 3278 CalculatePreloadCharacters(compiler, | 3588 const int kEatsAtLeastNotYetInitialized = -1; |
| 3279 current_trace->at_start() == Trace::FALSE); | 3589 int eats_at_least = kEatsAtLeastNotYetInitialized; |
| 3280 bool preload_is_current = | 3590 |
| 3591 bool skip_was_emitted = false; | |
| 3592 | |
| 3593 // More makes code generation slower, less makes V8 benchmark score lower. | |
| 3594 const int kMaxLookaheadForBoyerMoore = 8; | |
| 3595 | |
| 3596 if (!greedy_loop && choice_count == 2) { | |
| 3597 GuardedAlternative alt1 = alternatives_->at(1); | |
| 3598 if (alt1.guards() == NULL || | |
|
ulan
2012/03/30 16:54:51
This fits in one line.
Erik Corry
2012/04/01 00:48:46
Done.
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3599 alt1.guards()->length() == 0) { | |
| 3600 RegExpNode* eats_anything_node = alt1.node(); | |
| 3601 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | |
| 3602 this) { | |
| 3603 // At this point we know that we are at a non-greedy loop that will eat | |
| 3604 // any character one at a time. Any non-anchored regexp has such a | |
| 3605 // loop prepended to it in order to find where it starts. We look for | |
| 3606 // a pattern of the form ...abc... where we can look 6 characters ahead | |
|
ulan
2012/03/30 16:54:51
Can we put a similar comment in description of Emi
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 3607 // and step forwards 3 if the character is not one of abc. Abc need | |
| 3608 // not be atoms, they can be any reasonably limited character class or | |
| 3609 // small alternation. | |
| 3610 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | |
| 3611 eats_at_least = | |
| 3612 Min(kMaxLookaheadForBoyerMoore, | |
| 3613 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | |
| 3614 if (eats_at_least >= 1) { | |
| 3615 BoyerMooreLookahead bm(eats_at_least, | |
| 3616 RegExpMacroAssembler::kTableSize, | |
| 3617 compiler); | |
| 3618 GuardedAlternative alt0 = alternatives_->at(0); | |
| 3619 alt0.node()->FillInBMInfo(0, &bm, not_at_start); | |
| 3620 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); | |
| 3621 } | |
| 3622 } | |
| 3623 } | |
| 3624 } | |
| 3625 | |
| 3626 if (eats_at_least == kEatsAtLeastNotYetInitialized) { | |
| 3627 // Save some time by looking at most one machine word ahead. | |
| 3628 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); | |
| 3629 } | |
| 3630 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); | |
| 3631 | |
| 3632 bool preload_is_current = !skip_was_emitted && | |
| 3281 (current_trace->characters_preloaded() == preload_characters); | 3633 (current_trace->characters_preloaded() == preload_characters); |
| 3282 bool preload_has_checked_bounds = preload_is_current; | 3634 bool preload_has_checked_bounds = preload_is_current; |
| 3283 | 3635 |
| 3284 AlternativeGenerationList alt_gens(choice_count); | 3636 AlternativeGenerationList alt_gens(choice_count); |
| 3285 | 3637 |
| 3286 // For now we just call all choices one after the other. The idea ultimately | 3638 // For now we just call all choices one after the other. The idea ultimately |
| 3287 // is to use the Dispatch table to try only the relevant ones. | 3639 // is to use the Dispatch table to try only the relevant ones. |
| 3288 for (int i = first_normal_choice; i < choice_count; i++) { | 3640 for (int i = first_normal_choice; i < choice_count; i++) { |
| 3289 GuardedAlternative alternative = alternatives_->at(i); | 3641 GuardedAlternative alternative = alternatives_->at(i); |
| 3290 AlternativeGeneration* alt_gen = alt_gens.at(i); | 3642 AlternativeGeneration* alt_gen = alt_gens.at(i); |
| (...skipping 2134 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5425 // We don't know anything about the first character of a backreference | 5777 // We don't know anything about the first character of a backreference |
| 5426 // at this point. | 5778 // at this point. |
| 5427 // The potential first characters are the first characters of the capture, | 5779 // The potential first characters are the first characters of the capture, |
| 5428 // and the first characters of the on_success node, depending on whether the | 5780 // and the first characters of the on_success node, depending on whether the |
| 5429 // capture can be empty and whether it is known to be participating or known | 5781 // capture can be empty and whether it is known to be participating or known |
| 5430 // not to be. | 5782 // not to be. |
| 5431 return kComputeFirstCharacterSetFail; | 5783 return kComputeFirstCharacterSetFail; |
| 5432 } | 5784 } |
| 5433 | 5785 |
| 5434 | 5786 |
| 5787 void ChoiceNode::FillInBMInfo( | |
| 5788 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
| 5789 ZoneList<GuardedAlternative>* alts = alternatives(); | |
| 5790 for (int i = 0; i < alts->length(); i++) { | |
| 5791 GuardedAlternative& alt = alts->at(i); | |
| 5792 if (alt.guards() != NULL && | |
|
ulan
2012/03/30 13:04:48
This fits in one line.
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 5793 alt.guards()->length() != 0) { | |
| 5794 bm->SetRest(offset); // Give up trying to fill in info. | |
| 5795 return; | |
| 5796 } | |
| 5797 alt.node()->FillInBMInfo(offset, bm, not_at_start); | |
| 5798 } | |
| 5799 } | |
| 5800 | |
| 5801 | |
| 5802 void TextNode::FillInBMInfo( | |
| 5803 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
| 5804 if (offset >= bm->length()) return; | |
| 5805 int max_char = bm->max_char(); | |
| 5806 for (int i = 0; i < elements()->length(); i++) { | |
| 5807 if (offset >= bm->length()) return; | |
| 5808 TextElement text = elements()->at(i); | |
| 5809 if (text.type == TextElement::ATOM) { | |
| 5810 RegExpAtom* atom = text.data.u_atom; | |
| 5811 for (int j = 0; j < atom->length(); j++, offset++) { | |
| 5812 if (offset >= bm->length()) return; | |
| 5813 uc16 character = atom->data()[j]; | |
| 5814 if (bm->compiler()->ignore_case()) { | |
| 5815 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
| 5816 int length = GetCaseIndependentLetters( | |
| 5817 ISOLATE, | |
| 5818 character, | |
| 5819 bm->max_char() == String::kMaxAsciiCharCode, | |
| 5820 chars); | |
| 5821 for (int j = 0; j < length; j++) { | |
| 5822 bm->Set(offset, chars[j]); | |
| 5823 } | |
| 5824 } else { | |
| 5825 if (character <= max_char) bm->Set(offset, character); | |
| 5826 } | |
| 5827 } | |
| 5828 } else { | |
| 5829 ASSERT(text.type == TextElement::CHAR_CLASS); | |
| 5830 RegExpCharacterClass* char_class = text.data.u_char_class; | |
| 5831 ZoneList<CharacterRange>* ranges = char_class->ranges(); | |
| 5832 if (char_class->is_negated()) { | |
| 5833 bm->SetAll(offset); | |
| 5834 } else { | |
| 5835 for (int k = 0; k < ranges->length(); k++) { | |
| 5836 CharacterRange& range = ranges->at(k); | |
| 5837 if (range.from() > max_char) continue; | |
| 5838 int to = Min(max_char, static_cast<int>(range.to())); | |
| 5839 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { | |
| 5840 bm->SetAll(offset); | |
| 5841 break; | |
| 5842 } | |
| 5843 for (int m = range.from(); m <= to; m++) { | |
|
ulan
2012/03/30 16:54:51
Why don't we handle ignore_case() here like in the
Erik Corry
2012/04/01 00:48:46
The character classes have had case-independence r
| |
| 5844 bm->Set(offset, m); | |
| 5845 } | |
| 5846 } | |
| 5847 } | |
| 5848 offset++; | |
| 5849 } | |
| 5850 } | |
| 5851 if (offset >= bm->length()) return; | |
| 5852 on_success()->FillInBMInfo(offset, | |
|
ulan
2012/03/30 13:04:48
This fits in line if you put the comment in anothe
Erik Corry
2012/04/01 00:48:46
But the comment only applies to the line it is on,
| |
| 5853 bm, | |
| 5854 true); // Not at start after a text node. | |
| 5855 } | |
| 5856 | |
| 5857 | |
| 5435 int TextNode::ComputeFirstCharacterSet(int budget) { | 5858 int TextNode::ComputeFirstCharacterSet(int budget) { |
| 5436 budget--; | 5859 budget--; |
| 5437 if (budget >= 0) { | 5860 if (budget >= 0) { |
| 5438 ASSERT_NE(0, elements()->length()); | 5861 ASSERT_NE(0, elements()->length()); |
| 5439 TextElement text = elements()->at(0); | 5862 TextElement text = elements()->at(0); |
| 5440 if (text.type == TextElement::ATOM) { | 5863 if (text.type == TextElement::ATOM) { |
| 5441 RegExpAtom* atom = text.data.u_atom; | 5864 RegExpAtom* atom = text.data.u_atom; |
| 5442 ASSERT_NE(0, atom->length()); | 5865 ASSERT_NE(0, atom->length()); |
| 5443 uc16 first_char = atom->data()[0]; | 5866 uc16 first_char = atom->data()[0]; |
| 5444 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); | 5867 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); |
| (...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5582 } | 6005 } |
| 5583 } | 6006 } |
| 5584 | 6007 |
| 5585 | 6008 |
| 5586 void DispatchTableConstructor::VisitAction(ActionNode* that) { | 6009 void DispatchTableConstructor::VisitAction(ActionNode* that) { |
| 5587 RegExpNode* target = that->on_success(); | 6010 RegExpNode* target = that->on_success(); |
| 5588 target->Accept(this); | 6011 target->Accept(this); |
| 5589 } | 6012 } |
| 5590 | 6013 |
| 5591 | 6014 |
| 5592 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, | 6015 RegExpEngine::CompilationResult RegExpEngine::Compile( |
| 5593 bool ignore_case, | 6016 RegExpCompileData* data, |
| 5594 bool is_multiline, | 6017 bool ignore_case, |
| 5595 Handle<String> pattern, | 6018 bool is_multiline, |
| 5596 bool is_ascii) { | 6019 Handle<String> pattern, |
| 6020 Handle<String> sample_subject, | |
| 6021 bool is_ascii) { | |
| 5597 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { | 6022 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
| 5598 return IrregexpRegExpTooBig(); | 6023 return IrregexpRegExpTooBig(); |
| 5599 } | 6024 } |
| 5600 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); | 6025 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); |
| 6026 | |
| 6027 FlattenString(sample_subject); | |
| 6028 int chars_sampled = 0; | |
| 6029 int half_way = sample_subject->length() / 2; | |
|
ulan
2012/03/30 13:04:48
Why do we consider only the upper half of the samp
Erik Corry
2012/04/01 00:48:46
We consider a 100-character chunk in the middle of
| |
| 6030 for (int i = half_way; i < sample_subject->length(); i++) { | |
| 6031 if (chars_sampled++ > 100) break; | |
|
ulan
2012/03/30 13:04:48
A nit: consider moving chars_sampled++ either to t
Erik Corry
2012/04/01 00:48:46
Done.
| |
| 6032 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); | |
| 6033 } | |
| 6034 compiler.frequency_collator()->SortFrequencies(); | |
| 6035 | |
| 5601 // Wrap the body of the regexp in capture #0. | 6036 // Wrap the body of the regexp in capture #0. |
| 5602 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, | 6037 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, |
| 5603 0, | 6038 0, |
| 5604 &compiler, | 6039 &compiler, |
| 5605 compiler.accept()); | 6040 compiler.accept()); |
| 5606 RegExpNode* node = captured_body; | 6041 RegExpNode* node = captured_body; |
| 5607 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); | 6042 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); |
| 5608 bool is_start_anchored = data->tree->IsAnchoredAtStart(); | 6043 bool is_start_anchored = data->tree->IsAnchoredAtStart(); |
| 5609 int max_length = data->tree->max_match(); | 6044 int max_length = data->tree->max_match(); |
| 5610 if (!is_start_anchored) { | 6045 if (!is_start_anchored) { |
| (...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5673 } | 6108 } |
| 5674 | 6109 |
| 5675 return compiler.Assemble(¯o_assembler, | 6110 return compiler.Assemble(¯o_assembler, |
| 5676 node, | 6111 node, |
| 5677 data->capture_count, | 6112 data->capture_count, |
| 5678 pattern); | 6113 pattern); |
| 5679 } | 6114 } |
| 5680 | 6115 |
| 5681 | 6116 |
| 5682 }} // namespace v8::internal | 6117 }} // namespace v8::internal |
| OLD | NEW |