Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2011 the V8 project authors. All rights reserved. | 1 // Copyright 2011 the V8 project authors. All rights reserved. |
| 2 // Redistribution and use in source and binary forms, with or without | 2 // Redistribution and use in source and binary forms, with or without |
| 3 // modification, are permitted provided that the following conditions are | 3 // modification, are permitted provided that the following conditions are |
| 4 // met: | 4 // met: |
| 5 // | 5 // |
| 6 // * Redistributions of source code must retain the above copyright | 6 // * Redistributions of source code must retain the above copyright |
| 7 // notice, this list of conditions and the following disclaimer. | 7 // notice, this list of conditions and the following disclaimer. |
| 8 // * Redistributions in binary form must reproduce the above | 8 // * Redistributions in binary form must reproduce the above |
| 9 // copyright notice, this list of conditions and the following | 9 // copyright notice, this list of conditions and the following |
| 10 // disclaimer in the documentation and/or other materials provided | 10 // disclaimer in the documentation and/or other materials provided |
| (...skipping 262 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 273 | 273 |
| 274 | 274 |
| 275 // Irregexp implementation. | 275 // Irregexp implementation. |
| 276 | 276 |
| 277 // Ensures that the regexp object contains a compiled version of the | 277 // Ensures that the regexp object contains a compiled version of the |
| 278 // source for either ASCII or non-ASCII strings. | 278 // source for either ASCII or non-ASCII strings. |
| 279 // If the compiled version doesn't already exist, it is compiled | 279 // If the compiled version doesn't already exist, it is compiled |
| 280 // from the source pattern. | 280 // from the source pattern. |
| 281 // If compilation fails, an exception is thrown and this function | 281 // If compilation fails, an exception is thrown and this function |
| 282 // returns false. | 282 // returns false. |
| 283 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 283 bool RegExpImpl::EnsureCompiledIrregexp( |
| 284 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) { | |
| 284 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); | 285 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii)); |
| 285 #ifdef V8_INTERPRETED_REGEXP | 286 #ifdef V8_INTERPRETED_REGEXP |
| 286 if (compiled_code->IsByteArray()) return true; | 287 if (compiled_code->IsByteArray()) return true; |
| 287 #else // V8_INTERPRETED_REGEXP (RegExp native code) | 288 #else // V8_INTERPRETED_REGEXP (RegExp native code) |
| 288 if (compiled_code->IsCode()) return true; | 289 if (compiled_code->IsCode()) return true; |
| 289 #endif | 290 #endif |
| 290 // We could potentially have marked this as flushable, but have kept | 291 // We could potentially have marked this as flushable, but have kept |
| 291 // a saved version if we did not flush it yet. | 292 // a saved version if we did not flush it yet. |
| 292 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); | 293 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii)); |
| 293 if (saved_code->IsCode()) { | 294 if (saved_code->IsCode()) { |
| 294 // Reinstate the code in the original place. | 295 // Reinstate the code in the original place. |
| 295 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); | 296 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code); |
| 296 ASSERT(compiled_code->IsSmi()); | 297 ASSERT(compiled_code->IsSmi()); |
| 297 return true; | 298 return true; |
| 298 } | 299 } |
| 299 return CompileIrregexp(re, is_ascii); | 300 return CompileIrregexp(re, sample_subject, is_ascii); |
| 300 } | 301 } |
| 301 | 302 |
| 302 | 303 |
| 303 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, | 304 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re, |
| 304 bool is_ascii, | 305 bool is_ascii, |
| 305 Handle<String> error_message, | 306 Handle<String> error_message, |
| 306 Isolate* isolate) { | 307 Isolate* isolate) { |
| 307 Factory* factory = isolate->factory(); | 308 Factory* factory = isolate->factory(); |
| 308 Handle<FixedArray> elements = factory->NewFixedArray(2); | 309 Handle<FixedArray> elements = factory->NewFixedArray(2); |
| 309 elements->set(0, re->Pattern()); | 310 elements->set(0, re->Pattern()); |
| 310 elements->set(1, *error_message); | 311 elements->set(1, *error_message); |
| 311 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); | 312 Handle<JSArray> array = factory->NewJSArrayWithElements(elements); |
| 312 Handle<Object> regexp_err = | 313 Handle<Object> regexp_err = |
| 313 factory->NewSyntaxError("malformed_regexp", array); | 314 factory->NewSyntaxError("malformed_regexp", array); |
| 314 isolate->Throw(*regexp_err); | 315 isolate->Throw(*regexp_err); |
| 315 return false; | 316 return false; |
| 316 } | 317 } |
| 317 | 318 |
| 318 | 319 |
| 319 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) { | 320 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, |
| 321 Handle<String> sample_subject, | |
| 322 bool is_ascii) { | |
| 320 // Compile the RegExp. | 323 // Compile the RegExp. |
| 321 Isolate* isolate = re->GetIsolate(); | 324 Isolate* isolate = re->GetIsolate(); |
| 322 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); | 325 ZoneScope zone_scope(isolate, DELETE_ON_EXIT); |
| 323 PostponeInterruptsScope postpone(isolate); | 326 PostponeInterruptsScope postpone(isolate); |
| 324 // If we had a compilation error the last time this is saved at the | 327 // If we had a compilation error the last time this is saved at the |
| 325 // saved code index. | 328 // saved code index. |
| 326 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); | 329 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii)); |
| 327 // When arriving here entry can only be a smi, either representing an | 330 // When arriving here entry can only be a smi, either representing an |
| 328 // uncompiled regexp, a previous compilation error, or code that has | 331 // uncompiled regexp, a previous compilation error, or code that has |
| 329 // been flushed. | 332 // been flushed. |
| (...skipping 28 matching lines...) Expand all Loading... | |
| 358 pattern, | 361 pattern, |
| 359 compile_data.error, | 362 compile_data.error, |
| 360 "malformed_regexp"); | 363 "malformed_regexp"); |
| 361 return false; | 364 return false; |
| 362 } | 365 } |
| 363 RegExpEngine::CompilationResult result = | 366 RegExpEngine::CompilationResult result = |
| 364 RegExpEngine::Compile(&compile_data, | 367 RegExpEngine::Compile(&compile_data, |
| 365 flags.is_ignore_case(), | 368 flags.is_ignore_case(), |
| 366 flags.is_multiline(), | 369 flags.is_multiline(), |
| 367 pattern, | 370 pattern, |
| 371 sample_subject, | |
| 368 is_ascii); | 372 is_ascii); |
| 369 if (result.error_message != NULL) { | 373 if (result.error_message != NULL) { |
| 370 // Unable to compile regexp. | 374 // Unable to compile regexp. |
| 371 Handle<String> error_message = | 375 Handle<String> error_message = |
| 372 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); | 376 isolate->factory()->NewStringFromUtf8(CStrVector(result.error_message)); |
| 373 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); | 377 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate); |
| 374 return false; | 378 return false; |
| 375 } | 379 } |
| 376 | 380 |
| 377 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); | 381 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data())); |
| (...skipping 50 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 428 capture_count); | 432 capture_count); |
| 429 } | 433 } |
| 430 | 434 |
| 431 | 435 |
| 432 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, | 436 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp, |
| 433 Handle<String> subject) { | 437 Handle<String> subject) { |
| 434 if (!subject->IsFlat()) FlattenString(subject); | 438 if (!subject->IsFlat()) FlattenString(subject); |
| 435 | 439 |
| 436 // Check the asciiness of the underlying storage. | 440 // Check the asciiness of the underlying storage. |
| 437 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 441 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
| 438 if (!EnsureCompiledIrregexp(regexp, is_ascii)) return -1; | 442 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1; |
| 439 | 443 |
| 440 #ifdef V8_INTERPRETED_REGEXP | 444 #ifdef V8_INTERPRETED_REGEXP |
| 441 // Byte-code regexp needs space allocated for all its registers. | 445 // Byte-code regexp needs space allocated for all its registers. |
| 442 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); | 446 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())); |
| 443 #else // V8_INTERPRETED_REGEXP | 447 #else // V8_INTERPRETED_REGEXP |
| 444 // Native regexp only needs room to output captures. Registers are handled | 448 // Native regexp only needs room to output captures. Registers are handled |
| 445 // internally. | 449 // internally. |
| 446 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; | 450 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2; |
| 447 #endif // V8_INTERPRETED_REGEXP | 451 #endif // V8_INTERPRETED_REGEXP |
| 448 } | 452 } |
| (...skipping 10 matching lines...) Expand all Loading... | |
| 459 | 463 |
| 460 ASSERT(index >= 0); | 464 ASSERT(index >= 0); |
| 461 ASSERT(index <= subject->length()); | 465 ASSERT(index <= subject->length()); |
| 462 ASSERT(subject->IsFlat()); | 466 ASSERT(subject->IsFlat()); |
| 463 | 467 |
| 464 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); | 468 bool is_ascii = subject->IsAsciiRepresentationUnderneath(); |
| 465 | 469 |
| 466 #ifndef V8_INTERPRETED_REGEXP | 470 #ifndef V8_INTERPRETED_REGEXP |
| 467 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); | 471 ASSERT(output.length() >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2); |
| 468 do { | 472 do { |
| 469 EnsureCompiledIrregexp(regexp, is_ascii); | 473 EnsureCompiledIrregexp(regexp, subject, is_ascii); |
| 470 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); | 474 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate); |
| 471 NativeRegExpMacroAssembler::Result res = | 475 NativeRegExpMacroAssembler::Result res = |
| 472 NativeRegExpMacroAssembler::Match(code, | 476 NativeRegExpMacroAssembler::Match(code, |
| 473 subject, | 477 subject, |
| 474 output.start(), | 478 output.start(), |
| 475 output.length(), | 479 output.length(), |
| 476 index, | 480 index, |
| 477 isolate); | 481 isolate); |
| 478 if (res != NativeRegExpMacroAssembler::RETRY) { | 482 if (res != NativeRegExpMacroAssembler::RETRY) { |
| 479 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || | 483 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION || |
| (...skipping 297 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 777 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { | 781 DispatchTable* ChoiceNode::GetTable(bool ignore_case) { |
| 778 if (table_ == NULL) { | 782 if (table_ == NULL) { |
| 779 table_ = new DispatchTable(); | 783 table_ = new DispatchTable(); |
| 780 DispatchTableConstructor cons(table_, ignore_case); | 784 DispatchTableConstructor cons(table_, ignore_case); |
| 781 cons.BuildTable(this); | 785 cons.BuildTable(this); |
| 782 } | 786 } |
| 783 return table_; | 787 return table_; |
| 784 } | 788 } |
| 785 | 789 |
| 786 | 790 |
| 791 class FrequencyCollator { | |
| 792 public: | |
| 793 FrequencyCollator() : total_samples_(0) { | |
| 794 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { | |
| 795 frequencies_[i] = CharacterFrequency(i); | |
| 796 } | |
| 797 } | |
| 798 | |
| 799 void CountCharacter(int character) { | |
| 800 int index = (character & RegExpMacroAssembler::kTableMask); | |
| 801 frequencies_[index].Increment(); | |
| 802 total_samples_++; | |
| 803 } | |
| 804 | |
| 805 // Does not measure in percent, but rather per-128 (the table size from the | |
| 806 // regexp macro assembler). | |
| 807 int Frequency(int in_character) { | |
| 808 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character); | |
| 809 if (total_samples_ < 1) return 1; // Division by zero. | |
| 810 int freq_in_per128 = | |
| 811 (frequencies_[in_character].counter() * 128) / total_samples_; | |
| 812 return freq_in_per128; | |
| 813 } | |
| 814 | |
| 815 private: | |
| 816 class CharacterFrequency { | |
| 817 public: | |
| 818 CharacterFrequency() : counter_(0), character_(-1) { } | |
| 819 explicit CharacterFrequency(int character) | |
| 820 : counter_(0), character_(character) { } | |
| 821 | |
| 822 void Increment() { counter_++; } | |
| 823 int counter() { return counter_; } | |
| 824 int character() { return character_; } | |
| 825 | |
| 826 private: | |
| 827 int counter_; | |
| 828 int character_; | |
| 829 }; | |
| 830 | |
| 831 | |
| 832 private: | |
| 833 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; | |
| 834 int total_samples_; | |
| 835 }; | |
| 836 | |
| 837 | |
| 787 class RegExpCompiler { | 838 class RegExpCompiler { |
| 788 public: | 839 public: |
| 789 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); | 840 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii); |
| 790 | 841 |
| 791 int AllocateRegister() { | 842 int AllocateRegister() { |
| 792 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { | 843 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { |
| 793 reg_exp_too_big_ = true; | 844 reg_exp_too_big_ = true; |
| 794 return next_register_; | 845 return next_register_; |
| 795 } | 846 } |
| 796 return next_register_++; | 847 return next_register_++; |
| (...skipping 15 matching lines...) Expand all Loading... | |
| 812 | 863 |
| 813 static const int kMaxRecursion = 100; | 864 static const int kMaxRecursion = 100; |
| 814 inline int recursion_depth() { return recursion_depth_; } | 865 inline int recursion_depth() { return recursion_depth_; } |
| 815 inline void IncrementRecursionDepth() { recursion_depth_++; } | 866 inline void IncrementRecursionDepth() { recursion_depth_++; } |
| 816 inline void DecrementRecursionDepth() { recursion_depth_--; } | 867 inline void DecrementRecursionDepth() { recursion_depth_--; } |
| 817 | 868 |
| 818 void SetRegExpTooBig() { reg_exp_too_big_ = true; } | 869 void SetRegExpTooBig() { reg_exp_too_big_ = true; } |
| 819 | 870 |
| 820 inline bool ignore_case() { return ignore_case_; } | 871 inline bool ignore_case() { return ignore_case_; } |
| 821 inline bool ascii() { return ascii_; } | 872 inline bool ascii() { return ascii_; } |
| 873 FrequencyCollator* frequency_collator() { return &frequency_collator_; } | |
| 822 | 874 |
| 823 int current_expansion_factor() { return current_expansion_factor_; } | 875 int current_expansion_factor() { return current_expansion_factor_; } |
| 824 void set_current_expansion_factor(int value) { | 876 void set_current_expansion_factor(int value) { |
| 825 current_expansion_factor_ = value; | 877 current_expansion_factor_ = value; |
| 826 } | 878 } |
| 827 | 879 |
| 828 static const int kNoRegister = -1; | 880 static const int kNoRegister = -1; |
| 829 | 881 |
| 830 private: | 882 private: |
| 831 EndNode* accept_; | 883 EndNode* accept_; |
| 832 int next_register_; | 884 int next_register_; |
| 833 List<RegExpNode*>* work_list_; | 885 List<RegExpNode*>* work_list_; |
| 834 int recursion_depth_; | 886 int recursion_depth_; |
| 835 RegExpMacroAssembler* macro_assembler_; | 887 RegExpMacroAssembler* macro_assembler_; |
| 836 bool ignore_case_; | 888 bool ignore_case_; |
| 837 bool ascii_; | 889 bool ascii_; |
| 838 bool reg_exp_too_big_; | 890 bool reg_exp_too_big_; |
| 839 int current_expansion_factor_; | 891 int current_expansion_factor_; |
| 892 FrequencyCollator frequency_collator_; | |
| 840 }; | 893 }; |
| 841 | 894 |
| 842 | 895 |
| 843 class RecursionCheck { | 896 class RecursionCheck { |
| 844 public: | 897 public: |
| 845 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { | 898 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) { |
| 846 compiler->IncrementRecursionDepth(); | 899 compiler->IncrementRecursionDepth(); |
| 847 } | 900 } |
| 848 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } | 901 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); } |
| 849 private: | 902 private: |
| 850 RegExpCompiler* compiler_; | 903 RegExpCompiler* compiler_; |
| 851 }; | 904 }; |
| 852 | 905 |
| 853 | 906 |
| 854 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { | 907 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { |
| 855 return RegExpEngine::CompilationResult("RegExp too big"); | 908 return RegExpEngine::CompilationResult("RegExp too big"); |
| 856 } | 909 } |
| 857 | 910 |
| 858 | 911 |
| 859 // Attempts to compile the regexp using an Irregexp code generator. Returns | 912 // Attempts to compile the regexp using an Irregexp code generator. Returns |
| 860 // a fixed array or a null handle depending on whether it succeeded. | 913 // a fixed array or a null handle depending on whether it succeeded. |
| 861 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) | 914 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii) |
| 862 : next_register_(2 * (capture_count + 1)), | 915 : next_register_(2 * (capture_count + 1)), |
| 863 work_list_(NULL), | 916 work_list_(NULL), |
| 864 recursion_depth_(0), | 917 recursion_depth_(0), |
| 865 ignore_case_(ignore_case), | 918 ignore_case_(ignore_case), |
| 866 ascii_(ascii), | 919 ascii_(ascii), |
| 867 reg_exp_too_big_(false), | 920 reg_exp_too_big_(false), |
| 868 current_expansion_factor_(1) { | 921 current_expansion_factor_(1), |
| 922 frequency_collator_() { | |
| 869 accept_ = new EndNode(EndNode::ACCEPT); | 923 accept_ = new EndNode(EndNode::ACCEPT); |
| 870 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); | 924 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister); |
| 871 } | 925 } |
| 872 | 926 |
| 873 | 927 |
| 874 RegExpEngine::CompilationResult RegExpCompiler::Assemble( | 928 RegExpEngine::CompilationResult RegExpCompiler::Assemble( |
| 875 RegExpMacroAssembler* macro_assembler, | 929 RegExpMacroAssembler* macro_assembler, |
| 876 RegExpNode* start, | 930 RegExpNode* start, |
| 877 int capture_count, | 931 int capture_count, |
| 878 Handle<String> pattern) { | 932 Handle<String> pattern) { |
| (...skipping 1170 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2049 int recursion_depth, | 2103 int recursion_depth, |
| 2050 bool not_at_start) { | 2104 bool not_at_start) { |
| 2051 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2105 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
| 2052 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! | 2106 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input! |
| 2053 return on_success()->EatsAtLeast(still_to_find, | 2107 return on_success()->EatsAtLeast(still_to_find, |
| 2054 recursion_depth + 1, | 2108 recursion_depth + 1, |
| 2055 not_at_start); | 2109 not_at_start); |
| 2056 } | 2110 } |
| 2057 | 2111 |
| 2058 | 2112 |
| 2113 void ActionNode::FillInBMInfo(int offset, | |
| 2114 BoyerMooreLookahead* bm, | |
| 2115 bool not_at_start) { | |
| 2116 if (type_ == BEGIN_SUBMATCH) { | |
| 2117 bm->SetRest(offset); | |
| 2118 } else if (type_ != POSITIVE_SUBMATCH_SUCCESS) { | |
| 2119 on_success()->FillInBMInfo(offset, bm, not_at_start); | |
| 2120 } | |
| 2121 } | |
| 2122 | |
| 2123 | |
| 2059 int AssertionNode::EatsAtLeast(int still_to_find, | 2124 int AssertionNode::EatsAtLeast(int still_to_find, |
| 2060 int recursion_depth, | 2125 int recursion_depth, |
| 2061 bool not_at_start) { | 2126 bool not_at_start) { |
| 2062 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2127 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
| 2063 // If we know we are not at the start and we are asked "how many characters | 2128 // If we know we are not at the start and we are asked "how many characters |
| 2064 // will you match if you succeed?" then we can answer anything since false | 2129 // will you match if you succeed?" then we can answer anything since false |
| 2065 // implies false. So lets just return the max answer (still_to_find) since | 2130 // implies false. So lets just return the max answer (still_to_find) since |
| 2066 // that won't prevent us from preloading a lot of characters for the other | 2131 // that won't prevent us from preloading a lot of characters for the other |
| 2067 // branches in the node graph. | 2132 // branches in the node graph. |
| 2068 if (type() == AT_START && not_at_start) return still_to_find; | 2133 if (type() == AT_START && not_at_start) return still_to_find; |
| 2069 return on_success()->EatsAtLeast(still_to_find, | 2134 return on_success()->EatsAtLeast(still_to_find, |
| 2070 recursion_depth + 1, | 2135 recursion_depth + 1, |
| 2071 not_at_start); | 2136 not_at_start); |
| 2072 } | 2137 } |
| 2073 | 2138 |
| 2074 | 2139 |
| 2140 void AssertionNode::FillInBMInfo( | |
| 2141 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
| 2142 // Match the behaviour of EatsAtLeast on this node. | |
| 2143 if (type() == AT_START && not_at_start) return; | |
| 2144 on_success()->FillInBMInfo(offset, bm, not_at_start); | |
| 2145 } | |
| 2146 | |
| 2147 | |
| 2075 int BackReferenceNode::EatsAtLeast(int still_to_find, | 2148 int BackReferenceNode::EatsAtLeast(int still_to_find, |
| 2076 int recursion_depth, | 2149 int recursion_depth, |
| 2077 bool not_at_start) { | 2150 bool not_at_start) { |
| 2078 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; | 2151 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0; |
| 2079 return on_success()->EatsAtLeast(still_to_find, | 2152 return on_success()->EatsAtLeast(still_to_find, |
| 2080 recursion_depth + 1, | 2153 recursion_depth + 1, |
| 2081 not_at_start); | 2154 not_at_start); |
| 2082 } | 2155 } |
| 2083 | 2156 |
| 2084 | 2157 |
| (...skipping 412 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2497 bool not_at_start) { | 2570 bool not_at_start) { |
| 2498 if (body_can_be_zero_length_ || info()->visited) return; | 2571 if (body_can_be_zero_length_ || info()->visited) return; |
| 2499 VisitMarker marker(info()); | 2572 VisitMarker marker(info()); |
| 2500 return ChoiceNode::GetQuickCheckDetails(details, | 2573 return ChoiceNode::GetQuickCheckDetails(details, |
| 2501 compiler, | 2574 compiler, |
| 2502 characters_filled_in, | 2575 characters_filled_in, |
| 2503 not_at_start); | 2576 not_at_start); |
| 2504 } | 2577 } |
| 2505 | 2578 |
| 2506 | 2579 |
| 2580 void LoopChoiceNode::FillInBMInfo( | |
| 2581 int offset, BoyerMooreLookahead* bm, bool nas) { | |
| 2582 if (body_can_be_zero_length_) { | |
| 2583 bm->SetRest(offset); | |
| 2584 return; | |
| 2585 } | |
| 2586 ChoiceNode::FillInBMInfo(offset, bm, nas); | |
| 2587 } | |
| 2588 | |
| 2589 | |
| 2507 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, | 2590 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details, |
| 2508 RegExpCompiler* compiler, | 2591 RegExpCompiler* compiler, |
| 2509 int characters_filled_in, | 2592 int characters_filled_in, |
| 2510 bool not_at_start) { | 2593 bool not_at_start) { |
| 2511 not_at_start = (not_at_start || not_at_start_); | 2594 not_at_start = (not_at_start || not_at_start_); |
| 2512 int choice_count = alternatives_->length(); | 2595 int choice_count = alternatives_->length(); |
| 2513 ASSERT(choice_count > 0); | 2596 ASSERT(choice_count > 0); |
| 2514 alternatives_->at(0).node()->GetQuickCheckDetails(details, | 2597 alternatives_->at(0).node()->GetQuickCheckDetails(details, |
| 2515 compiler, | 2598 compiler, |
| 2516 characters_filled_in, | 2599 characters_filled_in, |
| (...skipping 476 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 2993 int TextNode::GreedyLoopTextLength() { | 3076 int TextNode::GreedyLoopTextLength() { |
| 2994 TextElement elm = elms_->at(elms_->length() - 1); | 3077 TextElement elm = elms_->at(elms_->length() - 1); |
| 2995 if (elm.type == TextElement::CHAR_CLASS) { | 3078 if (elm.type == TextElement::CHAR_CLASS) { |
| 2996 return elm.cp_offset + 1; | 3079 return elm.cp_offset + 1; |
| 2997 } else { | 3080 } else { |
| 2998 return elm.cp_offset + elm.data.u_atom->data().length(); | 3081 return elm.cp_offset + elm.data.u_atom->data().length(); |
| 2999 } | 3082 } |
| 3000 } | 3083 } |
| 3001 | 3084 |
| 3002 | 3085 |
| 3086 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( | |
| 3087 RegExpCompiler* compiler) { | |
| 3088 if (elms_->length() != 1) return NULL; | |
| 3089 TextElement elm = elms_->at(0); | |
| 3090 if (elm.type != TextElement::CHAR_CLASS) return NULL; | |
| 3091 RegExpCharacterClass* node = elm.data.u_char_class; | |
| 3092 ZoneList<CharacterRange>* ranges = node->ranges(); | |
| 3093 if (!CharacterRange::IsCanonical(ranges)) { | |
| 3094 CharacterRange::Canonicalize(ranges); | |
| 3095 } | |
| 3096 if (node->is_negated()) { | |
| 3097 return ranges->length() == 0 ? on_success() : NULL; | |
| 3098 } | |
| 3099 if (ranges->length() != 1) return NULL; | |
| 3100 uint32_t max_char; | |
| 3101 if (compiler->ascii()) { | |
| 3102 max_char = String::kMaxAsciiCharCode; | |
| 3103 } else { | |
| 3104 max_char = String::kMaxUtf16CodeUnit; | |
| 3105 } | |
| 3106 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL; | |
| 3107 } | |
| 3108 | |
| 3109 | |
| 3003 // Finds the fixed match length of a sequence of nodes that goes from | 3110 // Finds the fixed match length of a sequence of nodes that goes from |
| 3004 // this alternative and back to this choice node. If there are variable | 3111 // this alternative and back to this choice node. If there are variable |
| 3005 // length nodes or other complications in the way then return a sentinel | 3112 // length nodes or other complications in the way then return a sentinel |
| 3006 // value indicating that a greedy loop cannot be constructed. | 3113 // value indicating that a greedy loop cannot be constructed. |
| 3007 int ChoiceNode::GreedyLoopTextLengthForAlternative( | 3114 int ChoiceNode::GreedyLoopTextLengthForAlternative( |
| 3008 GuardedAlternative* alternative) { | 3115 GuardedAlternative* alternative) { |
| 3009 int length = 0; | 3116 int length = 0; |
| 3010 RegExpNode* node = alternative->node(); | 3117 RegExpNode* node = alternative->node(); |
| 3011 // Later we will generate code for all these text nodes using recursion | 3118 // Later we will generate code for all these text nodes using recursion |
| 3012 // so we have to limit the max number. | 3119 // so we have to limit the max number. |
| (...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3057 ASSERT(trace->stop_node() == NULL); | 3164 ASSERT(trace->stop_node() == NULL); |
| 3058 if (!trace->is_trivial()) { | 3165 if (!trace->is_trivial()) { |
| 3059 trace->Flush(compiler, this); | 3166 trace->Flush(compiler, this); |
| 3060 return; | 3167 return; |
| 3061 } | 3168 } |
| 3062 ChoiceNode::Emit(compiler, trace); | 3169 ChoiceNode::Emit(compiler, trace); |
| 3063 } | 3170 } |
| 3064 | 3171 |
| 3065 | 3172 |
| 3066 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, | 3173 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler, |
| 3067 bool not_at_start) { | 3174 int eats_at_least) { |
| 3068 int preload_characters = EatsAtLeast(4, 0, not_at_start); | 3175 int preload_characters = Min(4, eats_at_least); |
| 3069 if (compiler->macro_assembler()->CanReadUnaligned()) { | 3176 if (compiler->macro_assembler()->CanReadUnaligned()) { |
| 3070 bool ascii = compiler->ascii(); | 3177 bool ascii = compiler->ascii(); |
| 3071 if (ascii) { | 3178 if (ascii) { |
| 3072 if (preload_characters > 4) preload_characters = 4; | 3179 if (preload_characters > 4) preload_characters = 4; |
| 3073 // We can't preload 3 characters because there is no machine instruction | 3180 // We can't preload 3 characters because there is no machine instruction |
| 3074 // to do that. We can't just load 4 because we could be reading | 3181 // to do that. We can't just load 4 because we could be reading |
| 3075 // beyond the end of the string, which could cause a memory fault. | 3182 // beyond the end of the string, which could cause a memory fault. |
| 3076 if (preload_characters == 3) preload_characters = 2; | 3183 if (preload_characters == 3) preload_characters = 2; |
| 3077 } else { | 3184 } else { |
| 3078 if (preload_characters > 2) preload_characters = 2; | 3185 if (preload_characters > 2) preload_characters = 2; |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3124 return alt_gens_[i]; | 3231 return alt_gens_[i]; |
| 3125 } | 3232 } |
| 3126 | 3233 |
| 3127 private: | 3234 private: |
| 3128 static const int kAFew = 10; | 3235 static const int kAFew = 10; |
| 3129 ZoneList<AlternativeGeneration*> alt_gens_; | 3236 ZoneList<AlternativeGeneration*> alt_gens_; |
| 3130 AlternativeGeneration a_few_alt_gens_[kAFew]; | 3237 AlternativeGeneration a_few_alt_gens_[kAFew]; |
| 3131 }; | 3238 }; |
| 3132 | 3239 |
| 3133 | 3240 |
| 3241 BoyerMooreLookahead::BoyerMooreLookahead( | |
| 3242 int length, int map_length, RegExpCompiler* compiler) | |
| 3243 : length_(length), | |
| 3244 map_length_(map_length), | |
| 3245 compiler_(compiler) { | |
| 3246 ASSERT(IsPowerOf2(map_length)); | |
| 3247 if (compiler->ascii()) { | |
| 3248 max_char_ = String::kMaxAsciiCharCode; | |
| 3249 } else { | |
| 3250 max_char_ = String::kMaxUtf16CodeUnit; | |
| 3251 } | |
| 3252 bitmaps_ = new ZoneList<ZoneList<bool>*>(length); | |
| 3253 for (int i = 0; i < length; i++) { | |
| 3254 bitmaps_->Add(new ZoneList<bool>(map_length)); | |
| 3255 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3256 for (int i = 0; i < map_length; i++) { | |
| 3257 map->Add(false); | |
| 3258 } | |
| 3259 } | |
| 3260 } | |
| 3261 | |
| 3262 | |
| 3263 // Find the longest range of lookahead that has the fewest number of different | |
| 3264 // characters that can occur at a given position. Since we are optimizing two | |
| 3265 // different parameters at once this is a tradeoff. | |
| 3266 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) { | |
| 3267 int biggest_points = 0; | |
| 3268 for (int max_number_of_chars = 4; | |
| 3269 max_number_of_chars < kTooManyCharacters; | |
| 3270 max_number_of_chars *= 2) { | |
| 3271 biggest_points = | |
| 3272 FindBestInterval(max_number_of_chars, biggest_points, from, to); | |
| 3273 } | |
| 3274 if (biggest_points == 0) return false; | |
| 3275 return true; | |
| 3276 } | |
| 3277 | |
| 3278 | |
| 3279 // Find the highest-points range between 0 and length_ where the character | |
| 3280 // information is not too vague. 'Too vague' means that there are more than | |
| 3281 // max_number_of_chars that can occur at this position. Calculates the number | |
| 3282 // of points as the product of width-of-the-range and | |
| 3283 // probability-of-finding-one-of-the-characters, where the probability is | |
| 3284 // calculated using the frequency distribution of the sample subject string. | |
| 3285 int BoyerMooreLookahead::FindBestInterval( | |
| 3286 int max_number_of_chars, int old_biggest_points, int* from, int* to) { | |
| 3287 int biggest_points = old_biggest_points; | |
| 3288 static const int kSize = RegExpMacroAssembler::kTableSize; | |
| 3289 for (int i = 0; i < length_; ) { | |
| 3290 while (i < length_ && Count(i) > max_number_of_chars) i++; | |
| 3291 if (i == length_) break; | |
| 3292 int remembered_from = i; | |
| 3293 bool union_map[kSize]; | |
| 3294 for (int j = 0; j < kSize; j++) union_map[j] = false; | |
| 3295 while (i < length_ && Count(i) <= max_number_of_chars) { | |
| 3296 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3297 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j); | |
| 3298 i++; | |
| 3299 } | |
| 3300 int frequency = 0; | |
| 3301 for (int j = 0; j < kSize; j++) { | |
| 3302 if (union_map[j]) { | |
| 3303 // Add 1 to the frequency to give a small per-character boost for | |
| 3304 // the cases where our sampling is not good enough and many | |
| 3305 // characters have a frequency of zero. | |
| 3306 frequency += compiler_->frequency_collator()->Frequency(j) + 1; | |
| 3307 } | |
| 3308 } | |
| 3309 // We use the probability of skipping times the distance we are skipping to | |
| 3310 // judge the effectiveness of this. Actually we have a cut-off: By | |
| 3311 // dividing by 2 we switch off the skipping if the probability of skipping | |
| 3312 // is less than 50%. This is because the multibyte mask-and-compare | |
| 3313 // skipping in quickcheck is more likely to do well on this case. | |
| 3314 bool in_quickcheck_range = ((i - remembered_from < 4) || | |
| 3315 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2)); | |
| 3316 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency; | |
|
ulan
2012/04/02 08:12:24
This shouldn't affect the correctness but:
if I r
Erik Corry
2012/04/02 09:36:22
This code is as intended. I added a few comments.
| |
| 3317 int points = (i - remembered_from) * probability; | |
| 3318 if (points > biggest_points) { | |
| 3319 *from = remembered_from; | |
| 3320 *to = i - 1; | |
| 3321 biggest_points = points; | |
| 3322 } | |
| 3323 } | |
| 3324 return biggest_points; | |
| 3325 } | |
| 3326 | |
| 3327 | |
| 3328 // Take all the characters that will not prevent a successful match if they | |
| 3329 // occur occur in the subject string in the range between min_lookahead and | |
|
ulan
2012/04/02 08:12:24
'occur' twice.
Erik Corry
2012/04/02 09:36:22
Done.
| |
| 3330 // max_lookahead (inclusive) measured from the current position. If the | |
| 3331 // character at max_lookahead offset is not one of these characters, then we | |
| 3332 // can safely skip forwards by the number of characters in the range. | |
| 3333 int BoyerMooreLookahead::GetSkipTable(int min_lookahead, | |
| 3334 int max_lookahead, | |
| 3335 Handle<ByteArray> boolean_skip_table) { | |
| 3336 const int kSize = RegExpMacroAssembler::kTableSize; | |
| 3337 | |
| 3338 const int kSkipArrayEntry = 0; | |
| 3339 const int kDontSkipArrayEntry = 1; | |
| 3340 | |
| 3341 for (int i = 0; i < kSize; i++) { | |
| 3342 boolean_skip_table->set(i, kSkipArrayEntry); | |
| 3343 } | |
| 3344 int skip = max_lookahead + 1 - min_lookahead; | |
| 3345 | |
| 3346 for (int i = max_lookahead; i >= min_lookahead; i--) { | |
| 3347 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3348 for (int j = 0; j < map_length_; j++) { | |
| 3349 if (map->at(j)) { | |
| 3350 boolean_skip_table->set(j, kDontSkipArrayEntry); | |
| 3351 } | |
| 3352 } | |
| 3353 } | |
| 3354 | |
| 3355 return skip; | |
| 3356 } | |
| 3357 | |
| 3358 | |
| 3359 // See comment above on the implementation of GetSkipTable. | |
| 3360 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) { | |
| 3361 int min_lookahead = 0; | |
| 3362 int max_lookahead = 0; | |
| 3363 | |
| 3364 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false; | |
| 3365 | |
| 3366 bool found_single_character = false; | |
| 3367 bool abandoned_search_for_single_character = false; | |
| 3368 int single_character = 0; | |
| 3369 for (int i = max_lookahead; i >= min_lookahead; i--) { | |
| 3370 ZoneList<bool>* map = bitmaps_->at(i); | |
| 3371 for (int j = 0; j < map_length_; j++) { | |
| 3372 if (map->at(j)) { | |
| 3373 if (found_single_character) { | |
| 3374 found_single_character = false; // Found two. | |
| 3375 abandoned_search_for_single_character = true; | |
| 3376 break; | |
| 3377 } else { | |
| 3378 found_single_character = true; | |
| 3379 single_character = j; | |
| 3380 } | |
| 3381 } | |
| 3382 } | |
| 3383 if (abandoned_search_for_single_character) break; | |
| 3384 } | |
| 3385 | |
| 3386 int lookahead_width = max_lookahead + 1 - min_lookahead; | |
| 3387 | |
| 3388 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) { | |
| 3389 // The mask-compare can probably handle this better. | |
| 3390 return false; | |
| 3391 } | |
| 3392 | |
| 3393 if (found_single_character) { | |
| 3394 Label cont, again; | |
| 3395 masm->Bind(&again); | |
| 3396 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
| 3397 if (max_char_ > map_length_) { | |
| 3398 ASSERT(map_length_ == RegExpMacroAssembler::kTableSize); | |
| 3399 masm->CheckCharacterAfterAnd(single_character, | |
| 3400 RegExpMacroAssembler::kTableMask, | |
| 3401 &cont); | |
| 3402 } else { | |
| 3403 masm->CheckCharacter(single_character, &cont); | |
| 3404 } | |
| 3405 masm->AdvanceCurrentPosition(lookahead_width); | |
| 3406 masm->GoTo(&again); | |
| 3407 masm->Bind(&cont); | |
| 3408 return true; | |
| 3409 } | |
| 3410 | |
| 3411 Handle<ByteArray> boolean_skip_table = | |
| 3412 FACTORY->NewByteArray(map_length_, TENURED); | |
| 3413 int skip_distance = GetSkipTable( | |
| 3414 min_lookahead, max_lookahead, boolean_skip_table); | |
| 3415 ASSERT(skip_distance != 0); | |
| 3416 | |
| 3417 Label cont, again; | |
| 3418 masm->Bind(&again); | |
| 3419 masm->LoadCurrentCharacter(max_lookahead, &cont, true); | |
| 3420 masm->CheckBitInTable(boolean_skip_table, &cont); | |
| 3421 masm->AdvanceCurrentPosition(skip_distance); | |
| 3422 masm->GoTo(&again); | |
| 3423 masm->Bind(&cont); | |
| 3424 | |
| 3425 return true; | |
| 3426 } | |
| 3427 | |
| 3134 /* Code generation for choice nodes. | 3428 /* Code generation for choice nodes. |
| 3135 * | 3429 * |
| 3136 * We generate quick checks that do a mask and compare to eliminate a | 3430 * We generate quick checks that do a mask and compare to eliminate a |
| 3137 * choice. If the quick check succeeds then it jumps to the continuation to | 3431 * choice. If the quick check succeeds then it jumps to the continuation to |
| 3138 * do slow checks and check subsequent nodes. If it fails (the common case) | 3432 * do slow checks and check subsequent nodes. If it fails (the common case) |
| 3139 * it falls through to the next choice. | 3433 * it falls through to the next choice. |
| 3140 * | 3434 * |
| 3141 * Here is the desired flow graph. Nodes directly below each other imply | 3435 * Here is the desired flow graph. Nodes directly below each other imply |
| 3142 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative | 3436 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative |
| 3143 * 3 doesn't have a quick check so we have to call the slow check. | 3437 * 3 doesn't have a quick check so we have to call the slow check. |
| (...skipping 123 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 3267 greedy_match_trace.set_loop_label(&loop_label); | 3561 greedy_match_trace.set_loop_label(&loop_label); |
| 3268 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); | 3562 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace); |
| 3269 macro_assembler->Bind(&greedy_match_failed); | 3563 macro_assembler->Bind(&greedy_match_failed); |
| 3270 } | 3564 } |
| 3271 | 3565 |
| 3272 Label second_choice; // For use in greedy matches. | 3566 Label second_choice; // For use in greedy matches. |
| 3273 macro_assembler->Bind(&second_choice); | 3567 macro_assembler->Bind(&second_choice); |
| 3274 | 3568 |
| 3275 int first_normal_choice = greedy_loop ? 1 : 0; | 3569 int first_normal_choice = greedy_loop ? 1 : 0; |
| 3276 | 3570 |
| 3277 int preload_characters = | 3571 bool not_at_start = current_trace->at_start() == Trace::FALSE; |
| 3278 CalculatePreloadCharacters(compiler, | 3572 const int kEatsAtLeastNotYetInitialized = -1; |
| 3279 current_trace->at_start() == Trace::FALSE); | 3573 int eats_at_least = kEatsAtLeastNotYetInitialized; |
| 3280 bool preload_is_current = | 3574 |
| 3575 bool skip_was_emitted = false; | |
| 3576 | |
| 3577 // More makes code generation slower, less makes V8 benchmark score lower. | |
| 3578 const int kMaxLookaheadForBoyerMoore = 8; | |
| 3579 | |
| 3580 if (!greedy_loop && choice_count == 2) { | |
| 3581 GuardedAlternative alt1 = alternatives_->at(1); | |
| 3582 if (alt1.guards() == NULL || alt1.guards()->length() == 0) { | |
| 3583 RegExpNode* eats_anything_node = alt1.node(); | |
| 3584 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) == | |
| 3585 this) { | |
| 3586 // At this point we know that we are at a non-greedy loop that will eat | |
| 3587 // any character one at a time. Any non-anchored regexp has such a | |
| 3588 // loop prepended to it in order to find where it starts. We look for | |
| 3589 // a pattern of the form ...abc... where we can look 6 characters ahead | |
| 3590 // and step forwards 3 if the character is not one of abc. Abc need | |
| 3591 // not be atoms, they can be any reasonably limited character class or | |
| 3592 // small alternation. | |
| 3593 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes. | |
| 3594 eats_at_least = | |
| 3595 Min(kMaxLookaheadForBoyerMoore, | |
| 3596 EatsAtLeast(kMaxLookaheadForBoyerMoore, 0, not_at_start)); | |
| 3597 if (eats_at_least >= 1) { | |
| 3598 BoyerMooreLookahead bm(eats_at_least, | |
| 3599 RegExpMacroAssembler::kTableSize, | |
| 3600 compiler); | |
| 3601 GuardedAlternative alt0 = alternatives_->at(0); | |
| 3602 alt0.node()->FillInBMInfo(0, &bm, not_at_start); | |
| 3603 skip_was_emitted = bm.EmitSkipInstructions(macro_assembler); | |
| 3604 } | |
| 3605 } | |
| 3606 } | |
| 3607 } | |
| 3608 | |
| 3609 if (eats_at_least == kEatsAtLeastNotYetInitialized) { | |
| 3610 // Save some time by looking at most one machine word ahead. | |
| 3611 eats_at_least = EatsAtLeast(compiler->ascii() ? 4 : 2, 0, not_at_start); | |
| 3612 } | |
| 3613 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least); | |
| 3614 | |
| 3615 bool preload_is_current = !skip_was_emitted && | |
| 3281 (current_trace->characters_preloaded() == preload_characters); | 3616 (current_trace->characters_preloaded() == preload_characters); |
| 3282 bool preload_has_checked_bounds = preload_is_current; | 3617 bool preload_has_checked_bounds = preload_is_current; |
| 3283 | 3618 |
| 3284 AlternativeGenerationList alt_gens(choice_count); | 3619 AlternativeGenerationList alt_gens(choice_count); |
| 3285 | 3620 |
| 3286 // For now we just call all choices one after the other. The idea ultimately | 3621 // For now we just call all choices one after the other. The idea ultimately |
| 3287 // is to use the Dispatch table to try only the relevant ones. | 3622 // is to use the Dispatch table to try only the relevant ones. |
| 3288 for (int i = first_normal_choice; i < choice_count; i++) { | 3623 for (int i = first_normal_choice; i < choice_count; i++) { |
| 3289 GuardedAlternative alternative = alternatives_->at(i); | 3624 GuardedAlternative alternative = alternatives_->at(i); |
| 3290 AlternativeGeneration* alt_gen = alt_gens.at(i); | 3625 AlternativeGeneration* alt_gen = alt_gens.at(i); |
| (...skipping 2134 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5425 // We don't know anything about the first character of a backreference | 5760 // We don't know anything about the first character of a backreference |
| 5426 // at this point. | 5761 // at this point. |
| 5427 // The potential first characters are the first characters of the capture, | 5762 // The potential first characters are the first characters of the capture, |
| 5428 // and the first characters of the on_success node, depending on whether the | 5763 // and the first characters of the on_success node, depending on whether the |
| 5429 // capture can be empty and whether it is known to be participating or known | 5764 // capture can be empty and whether it is known to be participating or known |
| 5430 // not to be. | 5765 // not to be. |
| 5431 return kComputeFirstCharacterSetFail; | 5766 return kComputeFirstCharacterSetFail; |
| 5432 } | 5767 } |
| 5433 | 5768 |
| 5434 | 5769 |
| 5770 void ChoiceNode::FillInBMInfo( | |
| 5771 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
| 5772 ZoneList<GuardedAlternative>* alts = alternatives(); | |
| 5773 for (int i = 0; i < alts->length(); i++) { | |
| 5774 GuardedAlternative& alt = alts->at(i); | |
| 5775 if (alt.guards() != NULL && alt.guards()->length() != 0) { | |
| 5776 bm->SetRest(offset); // Give up trying to fill in info. | |
| 5777 return; | |
| 5778 } | |
| 5779 alt.node()->FillInBMInfo(offset, bm, not_at_start); | |
| 5780 } | |
| 5781 } | |
| 5782 | |
| 5783 | |
| 5784 void TextNode::FillInBMInfo( | |
| 5785 int offset, BoyerMooreLookahead* bm, bool not_at_start) { | |
| 5786 if (offset >= bm->length()) return; | |
| 5787 int max_char = bm->max_char(); | |
| 5788 for (int i = 0; i < elements()->length(); i++) { | |
| 5789 if (offset >= bm->length()) return; | |
| 5790 TextElement text = elements()->at(i); | |
| 5791 if (text.type == TextElement::ATOM) { | |
| 5792 RegExpAtom* atom = text.data.u_atom; | |
| 5793 for (int j = 0; j < atom->length(); j++, offset++) { | |
| 5794 if (offset >= bm->length()) return; | |
| 5795 uc16 character = atom->data()[j]; | |
| 5796 if (bm->compiler()->ignore_case()) { | |
| 5797 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; | |
| 5798 int length = GetCaseIndependentLetters( | |
| 5799 ISOLATE, | |
| 5800 character, | |
| 5801 bm->max_char() == String::kMaxAsciiCharCode, | |
| 5802 chars); | |
| 5803 for (int j = 0; j < length; j++) { | |
| 5804 bm->Set(offset, chars[j]); | |
| 5805 } | |
| 5806 } else { | |
| 5807 if (character <= max_char) bm->Set(offset, character); | |
| 5808 } | |
| 5809 } | |
| 5810 } else { | |
| 5811 ASSERT(text.type == TextElement::CHAR_CLASS); | |
| 5812 RegExpCharacterClass* char_class = text.data.u_char_class; | |
| 5813 ZoneList<CharacterRange>* ranges = char_class->ranges(); | |
| 5814 if (char_class->is_negated()) { | |
| 5815 bm->SetAll(offset); | |
| 5816 } else { | |
| 5817 for (int k = 0; k < ranges->length(); k++) { | |
| 5818 CharacterRange& range = ranges->at(k); | |
| 5819 if (range.from() > max_char) continue; | |
| 5820 int to = Min(max_char, static_cast<int>(range.to())); | |
| 5821 if (to - range.from() >= BoyerMooreLookahead::kTooManyCharacters) { | |
| 5822 bm->SetAll(offset); | |
| 5823 break; | |
| 5824 } | |
| 5825 for (int m = range.from(); m <= to; m++) { | |
| 5826 bm->Set(offset, m); | |
| 5827 } | |
| 5828 } | |
| 5829 } | |
| 5830 offset++; | |
| 5831 } | |
| 5832 } | |
| 5833 if (offset >= bm->length()) return; | |
| 5834 on_success()->FillInBMInfo(offset, | |
| 5835 bm, | |
| 5836 true); // Not at start after a text node. | |
| 5837 } | |
| 5838 | |
| 5839 | |
| 5435 int TextNode::ComputeFirstCharacterSet(int budget) { | 5840 int TextNode::ComputeFirstCharacterSet(int budget) { |
| 5436 budget--; | 5841 budget--; |
| 5437 if (budget >= 0) { | 5842 if (budget >= 0) { |
| 5438 ASSERT_NE(0, elements()->length()); | 5843 ASSERT_NE(0, elements()->length()); |
| 5439 TextElement text = elements()->at(0); | 5844 TextElement text = elements()->at(0); |
| 5440 if (text.type == TextElement::ATOM) { | 5845 if (text.type == TextElement::ATOM) { |
| 5441 RegExpAtom* atom = text.data.u_atom; | 5846 RegExpAtom* atom = text.data.u_atom; |
| 5442 ASSERT_NE(0, atom->length()); | 5847 ASSERT_NE(0, atom->length()); |
| 5443 uc16 first_char = atom->data()[0]; | 5848 uc16 first_char = atom->data()[0]; |
| 5444 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); | 5849 ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1); |
| (...skipping 137 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5582 } | 5987 } |
| 5583 } | 5988 } |
| 5584 | 5989 |
| 5585 | 5990 |
| 5586 void DispatchTableConstructor::VisitAction(ActionNode* that) { | 5991 void DispatchTableConstructor::VisitAction(ActionNode* that) { |
| 5587 RegExpNode* target = that->on_success(); | 5992 RegExpNode* target = that->on_success(); |
| 5588 target->Accept(this); | 5993 target->Accept(this); |
| 5589 } | 5994 } |
| 5590 | 5995 |
| 5591 | 5996 |
| 5592 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data, | 5997 RegExpEngine::CompilationResult RegExpEngine::Compile( |
| 5593 bool ignore_case, | 5998 RegExpCompileData* data, |
| 5594 bool is_multiline, | 5999 bool ignore_case, |
| 5595 Handle<String> pattern, | 6000 bool is_multiline, |
| 5596 bool is_ascii) { | 6001 Handle<String> pattern, |
| 6002 Handle<String> sample_subject, | |
| 6003 bool is_ascii) { | |
| 5597 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { | 6004 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { |
| 5598 return IrregexpRegExpTooBig(); | 6005 return IrregexpRegExpTooBig(); |
| 5599 } | 6006 } |
| 5600 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); | 6007 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii); |
| 6008 | |
| 6009 // Sample some characters from the middle of the string. | |
| 6010 static const int kSampleSize = 128; | |
| 6011 | |
| 6012 FlattenString(sample_subject); | |
| 6013 int chars_sampled = 0; | |
| 6014 int half_way = (sample_subject->length() - kSampleSize) / 2; | |
| 6015 for (int i = Max(0, half_way); | |
| 6016 i < sample_subject->length(); | |
| 6017 i++, chars_sampled++) { | |
| 6018 if (chars_sampled > kSampleSize) break; | |
|
ulan
2012/04/02 08:12:24
Now we can move this into the for loop condition.
Erik Corry
2012/04/02 09:36:22
Done.
| |
| 6019 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i)); | |
| 6020 } | |
| 6021 | |
| 5601 // Wrap the body of the regexp in capture #0. | 6022 // Wrap the body of the regexp in capture #0. |
| 5602 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, | 6023 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree, |
| 5603 0, | 6024 0, |
| 5604 &compiler, | 6025 &compiler, |
| 5605 compiler.accept()); | 6026 compiler.accept()); |
| 5606 RegExpNode* node = captured_body; | 6027 RegExpNode* node = captured_body; |
| 5607 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); | 6028 bool is_end_anchored = data->tree->IsAnchoredAtEnd(); |
| 5608 bool is_start_anchored = data->tree->IsAnchoredAtStart(); | 6029 bool is_start_anchored = data->tree->IsAnchoredAtStart(); |
| 5609 int max_length = data->tree->max_match(); | 6030 int max_length = data->tree->max_match(); |
| 5610 if (!is_start_anchored) { | 6031 if (!is_start_anchored) { |
| (...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 5673 } | 6094 } |
| 5674 | 6095 |
| 5675 return compiler.Assemble(¯o_assembler, | 6096 return compiler.Assemble(¯o_assembler, |
| 5676 node, | 6097 node, |
| 5677 data->capture_count, | 6098 data->capture_count, |
| 5678 pattern); | 6099 pattern); |
| 5679 } | 6100 } |
| 5680 | 6101 |
| 5681 | 6102 |
| 5682 }} // namespace v8::internal | 6103 }} // namespace v8::internal |
| OLD | NEW |