Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(400)

Side by Side Diff: src/runtime.cc

Issue 3291021: Move string-search functions to separate file. (Closed)
Patch Set: Address (some)co Created 10 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/string-search.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2006-2009 the V8 project authors. All rights reserved. 1 // Copyright 2006-2009 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without 2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are 3 // modification, are permitted provided that the following conditions are
4 // met: 4 // met:
5 // 5 //
6 // * Redistributions of source code must retain the above copyright 6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer. 7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above 8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following 9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided 10 // disclaimer in the documentation and/or other materials provided
(...skipping 29 matching lines...) Expand all
40 #include "execution.h" 40 #include "execution.h"
41 #include "jsregexp.h" 41 #include "jsregexp.h"
42 #include "liveedit.h" 42 #include "liveedit.h"
43 #include "parser.h" 43 #include "parser.h"
44 #include "platform.h" 44 #include "platform.h"
45 #include "runtime.h" 45 #include "runtime.h"
46 #include "scopeinfo.h" 46 #include "scopeinfo.h"
47 #include "smart-pointer.h" 47 #include "smart-pointer.h"
48 #include "stub-cache.h" 48 #include "stub-cache.h"
49 #include "v8threads.h" 49 #include "v8threads.h"
50 #include "string-search.h"
50 51
51 namespace v8 { 52 namespace v8 {
52 namespace internal { 53 namespace internal {
53 54
54 55
55 #define RUNTIME_ASSERT(value) \ 56 #define RUNTIME_ASSERT(value) \
56 if (!(value)) return Top::ThrowIllegalOperation(); 57 if (!(value)) return Top::ThrowIllegalOperation();
57 58
58 // Cast the given object to a value of the specified type and store 59 // Cast the given object to a value of the specified type and store
59 // it in a variable with the given name. If the object is not of the 60 // it in a variable with the given name. If the object is not of the
(...skipping 2504 matching lines...) Expand 10 before | Expand all | Expand 10 after
2564 } 2565 }
2565 } 2566 }
2566 2567
2567 return StringReplaceRegExpWithString(subject, 2568 return StringReplaceRegExpWithString(subject,
2568 regexp, 2569 regexp,
2569 replacement, 2570 replacement,
2570 last_match_info); 2571 last_match_info);
2571 } 2572 }
2572 2573
2573 2574
2574 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
2575 // limit, we can fix the size of tables.
2576 static const int kBMMaxShift = 0xff;
2577 // Reduce alphabet to this size.
2578 static const int kBMAlphabetSize = 0x100;
2579 // For patterns below this length, the skip length of Boyer-Moore is too short
2580 // to compensate for the algorithmic overhead compared to simple brute force.
2581 static const int kBMMinPatternLength = 7;
2582
2583 // Holds the two buffers used by Boyer-Moore string search's Good Suffix
2584 // shift. Only allows the last kBMMaxShift characters of the needle
2585 // to be indexed.
2586 class BMGoodSuffixBuffers {
2587 public:
2588 BMGoodSuffixBuffers() {}
2589 inline void init(int needle_length) {
2590 ASSERT(needle_length > 1);
2591 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;
2592 int len = needle_length - start;
2593 biased_suffixes_ = suffixes_ - start;
2594 biased_good_suffix_shift_ = good_suffix_shift_ - start;
2595 for (int i = 0; i <= len; i++) {
2596 good_suffix_shift_[i] = len;
2597 }
2598 }
2599 inline int& suffix(int index) {
2600 ASSERT(biased_suffixes_ + index >= suffixes_);
2601 return biased_suffixes_[index];
2602 }
2603 inline int& shift(int index) {
2604 ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);
2605 return biased_good_suffix_shift_[index];
2606 }
2607 private:
2608 int suffixes_[kBMMaxShift + 1];
2609 int good_suffix_shift_[kBMMaxShift + 1];
2610 int* biased_suffixes_;
2611 int* biased_good_suffix_shift_;
2612 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);
2613 };
2614
2615 // buffers reused by BoyerMoore
2616 static int bad_char_occurrence[kBMAlphabetSize];
2617 static BMGoodSuffixBuffers bmgs_buffers;
2618
2619 // State of the string match tables.
2620 // SIMPLE: No usable content in the buffers.
2621 // BOYER_MOORE_HORSPOOL: The bad_char_occurences table has been populated.
2622 // BOYER_MOORE: The bmgs_buffers tables have also been populated.
2623 // Whenever starting with a new needle, one should call InitializeStringSearch
2624 // to determine which search strategy to use, and in the case of a long-needle
2625 // strategy, the call also initializes the algorithm to SIMPLE.
2626 enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };
2627 static StringSearchAlgorithm algorithm;
2628
2629
2630 // Compute the bad-char table for Boyer-Moore in the static buffer.
2631 template <typename pchar>
2632 static void BoyerMoorePopulateBadCharTable(Vector<const pchar> pattern) {
2633 // Only preprocess at most kBMMaxShift last characters of pattern.
2634 int start = pattern.length() < kBMMaxShift ? 0
2635 : pattern.length() - kBMMaxShift;
2636 // Run forwards to populate bad_char_table, so that *last* instance
2637 // of character equivalence class is the one registered.
2638 // Notice: Doesn't include the last character.
2639 int table_size = (sizeof(pchar) == 1) ? String::kMaxAsciiCharCode + 1
2640 : kBMAlphabetSize;
2641 if (start == 0) { // All patterns less than kBMMaxShift in length.
2642 memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
2643 } else {
2644 for (int i = 0; i < table_size; i++) {
2645 bad_char_occurrence[i] = start - 1;
2646 }
2647 }
2648 for (int i = start; i < pattern.length() - 1; i++) {
2649 pchar c = pattern[i];
2650 int bucket = (sizeof(pchar) ==1) ? c : c % kBMAlphabetSize;
2651 bad_char_occurrence[bucket] = i;
2652 }
2653 }
2654
2655
2656 template <typename pchar>
2657 static void BoyerMoorePopulateGoodSuffixTable(Vector<const pchar> pattern) {
2658 int m = pattern.length();
2659 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
2660 int len = m - start;
2661 // Compute Good Suffix tables.
2662 bmgs_buffers.init(m);
2663
2664 bmgs_buffers.shift(m-1) = 1;
2665 bmgs_buffers.suffix(m) = m + 1;
2666 pchar last_char = pattern[m - 1];
2667 int suffix = m + 1;
2668 for (int i = m; i > start;) {
2669 for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {
2670 if (bmgs_buffers.shift(suffix) == len) {
2671 bmgs_buffers.shift(suffix) = suffix - i;
2672 }
2673 suffix = bmgs_buffers.suffix(suffix);
2674 }
2675 i--;
2676 suffix--;
2677 bmgs_buffers.suffix(i) = suffix;
2678 if (suffix == m) {
2679 // No suffix to extend, so we check against last_char only.
2680 while (i > start && pattern[i - 1] != last_char) {
2681 if (bmgs_buffers.shift(m) == len) {
2682 bmgs_buffers.shift(m) = m - i;
2683 }
2684 i--;
2685 bmgs_buffers.suffix(i) = m;
2686 }
2687 if (i > start) {
2688 i--;
2689 suffix--;
2690 bmgs_buffers.suffix(i) = suffix;
2691 }
2692 }
2693 }
2694 if (suffix < m) {
2695 for (int i = start; i <= m; i++) {
2696 if (bmgs_buffers.shift(i) == len) {
2697 bmgs_buffers.shift(i) = suffix - start;
2698 }
2699 if (i == suffix) {
2700 suffix = bmgs_buffers.suffix(suffix);
2701 }
2702 }
2703 }
2704 }
2705
2706
2707 template <typename schar, typename pchar>
2708 static inline int CharOccurrence(int char_code) {
2709 if (sizeof(schar) == 1) {
2710 return bad_char_occurrence[char_code];
2711 }
2712 if (sizeof(pchar) == 1) {
2713 if (char_code > String::kMaxAsciiCharCode) {
2714 return -1;
2715 }
2716 return bad_char_occurrence[char_code];
2717 }
2718 return bad_char_occurrence[char_code % kBMAlphabetSize];
2719 }
2720
2721
2722 // Restricted simplified Boyer-Moore string matching.
2723 // Uses only the bad-shift table of Boyer-Moore and only uses it
2724 // for the character compared to the last character of the needle.
2725 template <typename schar, typename pchar>
2726 static int BoyerMooreHorspool(Vector<const schar> subject,
2727 Vector<const pchar> pattern,
2728 int start_index,
2729 bool* complete) {
2730 ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);
2731 int n = subject.length();
2732 int m = pattern.length();
2733
2734 int badness = -m;
2735
2736 // How bad we are doing without a good-suffix table.
2737 int idx; // No matches found prior to this index.
2738 pchar last_char = pattern[m - 1];
2739 int last_char_shift = m - 1 - CharOccurrence<schar, pchar>(last_char);
2740 // Perform search
2741 for (idx = start_index; idx <= n - m;) {
2742 int j = m - 1;
2743 int c;
2744 while (last_char != (c = subject[idx + j])) {
2745 int bc_occ = CharOccurrence<schar, pchar>(c);
2746 int shift = j - bc_occ;
2747 idx += shift;
2748 badness += 1 - shift; // at most zero, so badness cannot increase.
2749 if (idx > n - m) {
2750 *complete = true;
2751 return -1;
2752 }
2753 }
2754 j--;
2755 while (j >= 0 && pattern[j] == (subject[idx + j])) j--;
2756 if (j < 0) {
2757 *complete = true;
2758 return idx;
2759 } else {
2760 idx += last_char_shift;
2761 // Badness increases by the number of characters we have
2762 // checked, and decreases by the number of characters we
2763 // can skip by shifting. It's a measure of how we are doing
2764 // compared to reading each character exactly once.
2765 badness += (m - j) - last_char_shift;
2766 if (badness > 0) {
2767 *complete = false;
2768 return idx;
2769 }
2770 }
2771 }
2772 *complete = true;
2773 return -1;
2774 }
2775
2776
2777 template <typename schar, typename pchar>
2778 static int BoyerMooreIndexOf(Vector<const schar> subject,
2779 Vector<const pchar> pattern,
2780 int idx) {
2781 ASSERT(algorithm <= BOYER_MOORE);
2782 int n = subject.length();
2783 int m = pattern.length();
2784 // Only preprocess at most kBMMaxShift last characters of pattern.
2785 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;
2786
2787 pchar last_char = pattern[m - 1];
2788 // Continue search from i.
2789 while (idx <= n - m) {
2790 int j = m - 1;
2791 schar c;
2792 while (last_char != (c = subject[idx + j])) {
2793 int shift = j - CharOccurrence<schar, pchar>(c);
2794 idx += shift;
2795 if (idx > n - m) {
2796 return -1;
2797 }
2798 }
2799 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;
2800 if (j < 0) {
2801 return idx;
2802 } else if (j < start) {
2803 // we have matched more than our tables allow us to be smart about.
2804 // Fall back on BMH shift.
2805 idx += m - 1 - CharOccurrence<schar, pchar>(last_char);
2806 } else {
2807 int gs_shift = bmgs_buffers.shift(j + 1); // Good suffix shift.
2808 int bc_occ = CharOccurrence<schar, pchar>(c);
2809 int shift = j - bc_occ; // Bad-char shift.
2810 if (gs_shift > shift) {
2811 shift = gs_shift;
2812 }
2813 idx += shift;
2814 }
2815 }
2816
2817 return -1;
2818 }
2819
2820
2821 // Trivial string search for shorter strings.
2822 // On return, if "complete" is set to true, the return value is the
2823 // final result of searching for the patter in the subject.
2824 // If "complete" is set to false, the return value is the index where
2825 // further checking should start, i.e., it's guaranteed that the pattern
2826 // does not occur at a position prior to the returned index.
2827 template <typename pchar, typename schar>
2828 static int SimpleIndexOf(Vector<const schar> subject,
2829 Vector<const pchar> pattern,
2830 int idx,
2831 bool* complete) {
2832 ASSERT(pattern.length() > 1);
2833 int pattern_length = pattern.length();
2834 // Badness is a count of how much work we have done. When we have
2835 // done enough work we decide it's probably worth switching to a better
2836 // algorithm.
2837 int badness = -10 - (pattern_length << 2);
2838
2839 // We know our pattern is at least 2 characters, we cache the first so
2840 // the common case of the first character not matching is faster.
2841 pchar pattern_first_char = pattern[0];
2842 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
2843 badness++;
2844 if (badness > 0) {
2845 *complete = false;
2846 return i;
2847 }
2848 if (sizeof(schar) == 1 && sizeof(pchar) == 1) {
2849 const schar* pos = reinterpret_cast<const schar*>(
2850 memchr(subject.start() + i,
2851 pattern_first_char,
2852 n - i + 1));
2853 if (pos == NULL) {
2854 *complete = true;
2855 return -1;
2856 }
2857 i = static_cast<int>(pos - subject.start());
2858 } else {
2859 if (subject[i] != pattern_first_char) continue;
2860 }
2861 int j = 1;
2862 do {
2863 if (pattern[j] != subject[i+j]) {
2864 break;
2865 }
2866 j++;
2867 } while (j < pattern_length);
2868 if (j == pattern_length) {
2869 *complete = true;
2870 return i;
2871 }
2872 badness += j;
2873 }
2874 *complete = true;
2875 return -1;
2876 }
2877
2878 // Simple indexOf that never bails out. For short patterns only.
2879 template <typename pchar, typename schar>
2880 static int SimpleIndexOf(Vector<const schar> subject,
2881 Vector<const pchar> pattern,
2882 int idx) {
2883 int pattern_length = pattern.length();
2884 pchar pattern_first_char = pattern[0];
2885 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {
2886 if (sizeof(schar) == 1 && sizeof(pchar) == 1) {
2887 const schar* pos = reinterpret_cast<const schar*>(
2888 memchr(subject.start() + i,
2889 pattern_first_char,
2890 n - i + 1));
2891 if (pos == NULL) return -1;
2892 i = static_cast<int>(pos - subject.start());
2893 } else {
2894 if (subject[i] != pattern_first_char) continue;
2895 }
2896 int j = 1;
2897 while (j < pattern_length) {
2898 if (pattern[j] != subject[i+j]) {
2899 break;
2900 }
2901 j++;
2902 }
2903 if (j == pattern_length) {
2904 return i;
2905 }
2906 }
2907 return -1;
2908 }
2909
2910
2911 // Strategy for searching for a string in another string.
2912 enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };
2913
2914
2915 template <typename pchar>
2916 static inline StringSearchStrategy InitializeStringSearch(
2917 Vector<const pchar> pat, bool ascii_subject) {
2918 // We have an ASCII haystack and a non-ASCII needle. Check if there
2919 // really is a non-ASCII character in the needle and bail out if there
2920 // is.
2921 if (ascii_subject && sizeof(pchar) > 1) {
2922 for (int i = 0; i < pat.length(); i++) {
2923 uc16 c = pat[i];
2924 if (c > String::kMaxAsciiCharCode) {
2925 return SEARCH_FAIL;
2926 }
2927 }
2928 }
2929 if (pat.length() < kBMMinPatternLength) {
2930 return SEARCH_SHORT;
2931 }
2932 algorithm = SIMPLE_SEARCH;
2933 return SEARCH_LONG;
2934 }
2935
2936
2937 // Dispatch long needle searches to different algorithms.
2938 template <typename schar, typename pchar>
2939 static int ComplexIndexOf(Vector<const schar> sub,
2940 Vector<const pchar> pat,
2941 int start_index) {
2942 ASSERT(pat.length() >= kBMMinPatternLength);
2943 // Try algorithms in order of increasing setup cost and expected performance.
2944 bool complete;
2945 int idx = start_index;
2946 switch (algorithm) {
2947 case SIMPLE_SEARCH:
2948 idx = SimpleIndexOf(sub, pat, idx, &complete);
2949 if (complete) return idx;
2950 BoyerMoorePopulateBadCharTable(pat);
2951 algorithm = BOYER_MOORE_HORSPOOL;
2952 // FALLTHROUGH.
2953 case BOYER_MOORE_HORSPOOL:
2954 idx = BoyerMooreHorspool(sub, pat, idx, &complete);
2955 if (complete) return idx;
2956 // Build the Good Suffix table and continue searching.
2957 BoyerMoorePopulateGoodSuffixTable(pat);
2958 algorithm = BOYER_MOORE;
2959 // FALLTHROUGH.
2960 case BOYER_MOORE:
2961 return BoyerMooreIndexOf(sub, pat, idx);
2962 }
2963 UNREACHABLE();
2964 return -1;
2965 }
2966
2967
2968 // Dispatch to different search strategies for a single search.
2969 // If searching multiple times on the same needle, the search
2970 // strategy should only be computed once and then dispatch to different
2971 // loops.
2972 template <typename schar, typename pchar>
2973 static int StringSearch(Vector<const schar> sub,
2974 Vector<const pchar> pat,
2975 int start_index) {
2976 bool ascii_subject = (sizeof(schar) == 1);
2977 StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);
2978 switch (strategy) {
2979 case SEARCH_FAIL: return -1;
2980 case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);
2981 case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);
2982 }
2983 UNREACHABLE();
2984 return -1;
2985 }
2986
2987
2988 // Perform string match of pattern on subject, starting at start index. 2575 // Perform string match of pattern on subject, starting at start index.
2989 // Caller must ensure that 0 <= start_index <= sub->length(), 2576 // Caller must ensure that 0 <= start_index <= sub->length(),
2990 // and should check that pat->length() + start_index <= sub->length() 2577 // and should check that pat->length() + start_index <= sub->length()
2991 int Runtime::StringMatch(Handle<String> sub, 2578 int Runtime::StringMatch(Handle<String> sub,
2992 Handle<String> pat, 2579 Handle<String> pat,
2993 int start_index) { 2580 int start_index) {
2994 ASSERT(0 <= start_index); 2581 ASSERT(0 <= start_index);
2995 ASSERT(start_index <= sub->length()); 2582 ASSERT(start_index <= sub->length());
2996 2583
2997 int pattern_length = pat->length(); 2584 int pattern_length = pat->length();
(...skipping 2165 matching lines...) Expand 10 before | Expand all | Expand 10 after
5163 int right = length; 4750 int right = length;
5164 if (trimRight) { 4751 if (trimRight) {
5165 while (right > left && IsTrimWhiteSpace(s->Get(right - 1))) { 4752 while (right > left && IsTrimWhiteSpace(s->Get(right - 1))) {
5166 right--; 4753 right--;
5167 } 4754 }
5168 } 4755 }
5169 return s->SubString(left, right); 4756 return s->SubString(left, right);
5170 } 4757 }
5171 4758
5172 4759
4760 // Define storage for buffers declared in header file.
4761 // TODO(lrn): Remove these when rewriting search code.
4762 int BMBuffers::bad_char_occurrence[kBMAlphabetSize];
4763 BMGoodSuffixBuffers BMBuffers::bmgs_buffers;
4764
4765
5173 template <typename schar, typename pchar> 4766 template <typename schar, typename pchar>
5174 void FindStringIndices(Vector<const schar> subject, 4767 void FindStringIndices(Vector<const schar> subject,
5175 Vector<const pchar> pattern, 4768 Vector<const pchar> pattern,
5176 ZoneList<int>* indices, 4769 ZoneList<int>* indices,
5177 unsigned int limit) { 4770 unsigned int limit) {
5178 ASSERT(limit > 0); 4771 ASSERT(limit > 0);
5179 // Collect indices of pattern in subject, and the end-of-string index. 4772 // Collect indices of pattern in subject, and the end-of-string index.
5180 // Stop after finding at most limit values. 4773 // Stop after finding at most limit values.
5181 StringSearchStrategy strategy = 4774 StringSearchStrategy strategy =
5182 InitializeStringSearch(pattern, sizeof(schar) == 1); 4775 InitializeStringSearch(pattern, sizeof(schar) == 1);
(...skipping 5350 matching lines...) Expand 10 before | Expand all | Expand 10 after
10533 } else { 10126 } else {
10534 // Handle last resort GC and make sure to allow future allocations 10127 // Handle last resort GC and make sure to allow future allocations
10535 // to grow the heap without causing GCs (if possible). 10128 // to grow the heap without causing GCs (if possible).
10536 Counters::gc_last_resort_from_js.Increment(); 10129 Counters::gc_last_resort_from_js.Increment();
10537 Heap::CollectAllGarbage(false); 10130 Heap::CollectAllGarbage(false);
10538 } 10131 }
10539 } 10132 }
10540 10133
10541 10134
10542 } } // namespace v8::internal 10135 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « no previous file | src/string-search.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698