src/runtime.cc - Issue 7114: * Changed string search to full-blown Boyer-Moore.

Side by Side Diff: src/runtime.cc

Issue 7114: * Changed string search to full-blown Boyer-Moore. (Closed)

Patch Set: Addressed review comments Created 12 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2006-2008 the V8 project authors. All rights reserved.	1 // Copyright 2006-2008 the V8 project authors. All rights reserved.

2 // Redistribution and use in source and binary forms, with or without	2 // Redistribution and use in source and binary forms, with or without

3 // modification, are permitted provided that the following conditions are	3 // modification, are permitted provided that the following conditions are

4 // met:	4 // met:

5 //	5 //

6 // * Redistributions of source code must retain the above copyright	6 // * Redistributions of source code must retain the above copyright

7 // notice, this list of conditions and the following disclaimer.	7 // notice, this list of conditions and the following disclaimer.

8 // * Redistributions in binary form must reproduce the above	8 // * Redistributions in binary form must reproduce the above

9 // copyright notice, this list of conditions and the following	9 // copyright notice, this list of conditions and the following

10 // disclaimer in the documentation and/or other materials provided	10 // disclaimer in the documentation and/or other materials provided

(...skipping 934 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
945 uint32_t code;	945 uint32_t code;

946 if (Array::IndexFromObject(args[0], &code)) {	946 if (Array::IndexFromObject(args[0], &code)) {

947 if (code <= 0xffff) {	947 if (code <= 0xffff) {

948 return Heap::LookupSingleCharacterStringFromCode(code);	948 return Heap::LookupSingleCharacterStringFromCode(code);

949 }	949 }

950 }	950 }

951 return Heap::empty_string();	951 return Heap::empty_string();

952 }	952 }

953	953

954	954

	955 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a

	956 // limit, we can fix the size of tables.

	957 static const int kBMMaxShift = 0xff;

	958 static const int kBMAlphabetSize = 0x100; // Reduce alphabet to this size.

	959

	960 // Holds the two buffers used by Boyer-Moore string search's Good Suffix

	961 // shift. Only allows the last kBMMaxShift characters of the needle

	962 // to be indexed.

	963 class BMGoodSuffixBuffers: public AllStatic {

	964 public:

	965 BMGoodSuffixBuffers() {}

	966 inline void init(int needle_length) {

	967 ASSERT(needle_length > 1);

	968 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;

	969 int len = needle_length - start;

	970 biased_suffixes_ = suffixes_ - start;

	971 biased_good_suffix_shift_ = good_suffix_shift_ - start;

	972 for (int i = 0; i <= len; i++) {

	973 good_suffix_shift_[i] = len;

	974 }

	975 }

	976 inline int& suffix(int index) {

	977 ASSERT(biased_suffixes_ + index >= suffixes_);

	978 return biased_suffixes_[index];

	979 }

	980 inline int& shift(int index) {

	981 ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);

	982 return biased_good_suffix_shift_[index];

	983 }

	984 private:

	985 int suffixes_[kBMMaxShift + 1];

	986 int good_suffix_shift_[kBMMaxShift + 1];

	987 int *biased_suffixes_;

	988 int *biased_good_suffix_shift_;

	989 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);

	990 };

	991

	992 // buffers reused by BoyerMoore

	993 static int bad_char_occurence[kBMAlphabetSize];

	994 static BMGoodSuffixBuffers bmgs_buffers;

	995

	996 // Restricted Boyer-Moore string matching. Restricts tables to a

	997 // suffix of long pattern strings and handles only equivalence classes

	998 // of the full alphabet. This allows us to ensure that tables take only

	999 // a fixed amount of space.

	1000 template <typename schar, typename pchar>

	1001 static int BoyerMooreIndexOf(Vector<const schar> subject,

	1002 Vector<const pchar> pattern,

	1003 int start_index) {

	1004 int m = pattern.length();

	1005 int n = subject.length();

	1006

	1007 // Only preprocess at most kBMMaxShift last characters of pattern.

	1008 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

	1009 int len = m - start;

	1010

	1011 // Run forwards to populate bad_char_table, so that last instance

	1012 // of character equivalence class is the one registered.

	1013 // Notice: Doesn't include last character.

	1014 for (int i = 0; i < kBMAlphabetSize; i++) {

	1015 bad_char_occurence[i] = start - 1;

	1016 }

	1017 for (int i = start; i < m; i++) {

	1018 uc32 c = pattern[i];

	1019 bad_char_occurence[c % kBMAlphabetSize] = i;

	1020 if (sizeof(schar) == 1 &&

	1021 sizeof(pchar) > 1 &&

	1022 c > String::kMaxAsciiCharCode) {

	1023 return -1;

	1024 }

	1025 }

	1026 // End of Bad Char computation.

	1027

	1028 // Compute Good Suffix shift table.

	1029 bmgs_buffers.init(m);

	1030

	1031 bmgs_buffers.shift(m-1) = 1;

	1032 bmgs_buffers.suffix(m) = m + 1;

	1033 pchar last_char = pattern[m - 1];

	1034 int suffix = m + 1;

	1035 for (int i = m; i > start;) {

	1036 for (pchar c = pattern[i - 1]; suffix <= m && c != pattern[suffix - 1];) {

	1037 if (bmgs_buffers.shift(suffix) == len) {

	1038 bmgs_buffers.shift(suffix) = suffix - i;

	1039 }

	1040 suffix = bmgs_buffers.suffix(suffix);

	1041 }

	1042 i--;

	1043 suffix--;

	1044 bmgs_buffers.suffix(i) = suffix;

	1045 if (suffix == m) {

	1046 // no suffix to extend, so we check against last_char only.

	1047 while (i > start && pattern[i - 1] != last_char) {

	1048 if (bmgs_buffers.shift(m) == len) {

	1049 bmgs_buffers.shift(m) = m - i;

	1050 }

	1051 i--;

	1052 bmgs_buffers.suffix(i) = m;

	1053 }

	1054 if (i > start) {

	1055 i--;

	1056 suffix--;

	1057 bmgs_buffers.suffix(i) = suffix;

	1058 }

	1059 }

	1060 }

	1061 if (suffix < m) {

	1062 for (int i = start; i <= m; i++) {

	1063 if (bmgs_buffers.shift(i) == len) {

	1064 bmgs_buffers.shift(i) = suffix - start;

	1065 }

	1066 if (i == suffix) {

	1067 suffix = bmgs_buffers.suffix(suffix);

	1068 }

	1069 }

	1070 }

	1071 // End of Good Suffix computation.

	1072

	1073

	1074 // Perform search

	1075 for (int i = start_index; i <= n - m;) {

	1076 int j = m - 1;

	1077 schar c;

	1078 while (j >= 0 && pattern[j] == (c = subject[i + j])) j--;

	1079 if (j < 0) {

	1080 return i;

	1081 } else if (j < start) {

	1082 // we have matched more than our tables allow us to be smart about.

	1083 i += 1;

	1084 } else {

	1085 int gs_shift = bmgs_buffers.shift(j + 1);

	1086 int bc_occ = bad_char_occurence[c % kBMAlphabetSize];

	1087 int bc_shift = j - bc_occ;

	1088 i += (gs_shift > bc_shift) ? gs_shift : bc_shift;

	1089 }

	1090 }

	1091 return -1;

	1092 }

	1093

955 template <typename schar, typename pchar>	1094 template <typename schar, typename pchar>

956 static int SingleCharIndexOf(Vector<const schar> string,	1095 static int SingleCharIndexOf(Vector<const schar> string,

957 pchar pattern_char,	1096 pchar pattern_char,

958 int start_index) {	1097 int start_index) {

959 for (int i = start_index, n = string.length(); i < n; i++) {	1098 for (int i = start_index, n = string.length(); i < n; i++) {

960 if (pattern_char == string[i]) {	1099 if (pattern_char == string[i]) {

961 return i;	1100 return i;

962 }	1101 }

963 }	1102 }

964 return -1;	1103 return -1;

965 }	1104 }

966	1105

967 // Trivial string search for shorter strings.	1106 // Trivial string search for shorter strings.

968 template <typename pchar, typename schar>	1107 template <typename pchar, typename schar>

969 static int SimpleIndexOf(Vector<const schar> subject,	1108 static int SimpleIndexOf(Vector<const schar> subject,

970 Vector<const pchar> pattern,	1109 Vector<const pchar> pattern,

971 int start_index) {	1110 int start_index) {

972 int pattern_length = pattern.length();	1111 int pattern_length = pattern.length();

973 int subject_length = subject.length();	1112 int subject_length = subject.length();

974 // We know our pattern is at least 2 characters, we cache the first so	1113 // We know our pattern is at least 2 characters, we cache the first so

975 // the common case of the first character not matching is faster.	1114 // the common case of the first character not matching is faster.

976 pchar pattern_first_char = pattern[0];	1115 pchar pattern_first_char = pattern[0];

977 for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {	1116 for (int i = start_index, n = subject_length - pattern_length; i <= n; i++) {

978 if (subject[i] != pattern_first_char) continue;	1117 if (subject[i] != pattern_first_char) continue;

979	1118 int j = 1;

980 bool failure = false;	1119 while (pattern[j] == subject[j+i]) {

981 for (int j = 1; j < pattern_length; j++) {	1120 j++;

982 if (pattern[j] != subject[j+i]) {	1121 if (j == pattern_length) {

983 failure = true;	1122 return i;

984 break;

985 }	1123 }

986 }	1124 }

987 if (!failure) {

988 return i;

989 }

990 }	1125 }

991 return -1;	1126 return -1;

992 }

993

994 // Maximal length (+1) of suffix that is indexed. Also the size of the

995 // maximal bad-character skip.

996 static const int kBMHSignificantSuffixLength = 0xff;

997

998 // Significant bits taken from characters to use in bad-character

999 // skips, to reduce size of the table for Unicode letters.

1000 static const int kBMHSignificantBitsMask = 0xff;

1001

1002 // Number of elements in bad-char table.

1003 static const int kBMHBadCharCount = kBMHSignificantBitsMask + 1;

1004

1005 // Simplified Boyer-Moore string matching. Only uses bad-char skipping,

1006 // and restricts table to a suffix of long strings (also restricting

1007 // the maximum possible skip-length) in order to reduce space.

1008 template <typename schar, typename pchar>

1009 static int BoyerMooreHorspoolIndexOf(Vector<const schar> subject,

1010 Vector<const pchar> pattern,

1011 int start_index) {

1012 ASSERT(kBMHSignificantSuffixLength < 0x100); // We can use bytes as skips.

1013 static byte bad_char_map[kBMHBadCharCount];

1014

1015 int m = pattern.length();

1016 int n = subject.length();

1017 // Cap bad char table to last p chars of pattern. Also max skip value.

1018 int p = m < kBMHSignificantSuffixLength ? m : kBMHSignificantSuffixLength;

1019

1020 memset(bad_char_map, p, kBMHBadCharCount);

1021

1022 // Run forwards to populate bad_char_table, so that last instance

1023 // of character equivalence class is the one registered.

1024 // Notice: Doesn't include last character.

1025 for (int i = p < m ? m - p : 0; i < m - 1; i++) {

1026 uc32 c = pattern[i];

1027 if (sizeof(schar) == 1 &&

1028 sizeof(pchar) > 1 &&

1029 c > String::kMaxAsciiCharCode) {

1030 return -1;

1031 }

1032 bad_char_map[c & kBMHSignificantBitsMask] = m - 1 - i;

1033 }

1034

1035 for (int i = start_index + m - 1, j = m - 1; i < n;) {

1036 schar c = subject[i];

1037 if (c == pattern[j]) {

1038 if (j == 0) {

1039 return i;

1040 }

1041 j--;

1042 i--;

1043 } else {

1044 int skip = bad_char_map[c & kBMHSignificantBitsMask];

1045 if (skip < (m - j)) {

1046 skip = m - j;

1047 }

1048 i += skip;

1049 j = m - 1;

1050 }

1051 }

1052 return -1;

1053 }

1054

1055

1056 // Full KMP pattern match.

1057 template <typename schar, typename pchar> // Pattern & subject char types

1058 static int KMPIndexOf(Vector<const schar> subject,

1059 Vector<const pchar> pattern,

1060 int start_index) {

1061 int subject_length = subject.length();

1062 int pattern_length = pattern.length();

1063 SmartPointer<int> next_table(NewArray<int>(pattern_length));

1064

1065 // Compute KMP "next" table

1066 int i = 0;

1067 int j = -1;

1068 next_table[0] = -1;

1069

1070 pchar p = pattern[0];

1071 while (i < pattern_length - 1) {

1072 while (j > -1 && p != pattern[j]) {

1073 j = next_table[j];

1074 }

1075 i++;

1076 j++;

1077 p = pattern[i];

1078 if (p == pattern[j]) {

1079 next_table[i] = next_table[j];

1080 } else {

1081 next_table[i] = j;

1082 }

1083 }

1084

1085 // Search using the 'next' table.

1086 int pattern_index = 0;

1087 int subject_index = start_index;

1088 while (subject_index < subject_length) {

1089 schar subject_char = subject[subject_index];

1090 while (pattern_index > -1 && pattern[pattern_index] != subject_char) {

1091 pattern_index = next_table[pattern_index];

1092 }

1093 pattern_index++;

1094 subject_index++;

1095 if (pattern_index >= pattern_length) {

1096 return subject_index - pattern_index;

1097 }

1098 }

1099 return -1;

1100 }	1127 }

1101	1128

1102 // Dispatch to different algorithms for different length of pattern/subject	1129 // Dispatch to different algorithms for different length of pattern/subject

1103 template <typename schar, typename pchar>	1130 template <typename schar, typename pchar>

1104 static int StringMatchStrategy(Vector<const schar> sub,	1131 static int StringMatchStrategy(Vector<const schar> sub,

1105 Vector<const pchar> pat,	1132 Vector<const pchar> pat,

1106 int start_index) {	1133 int start_index) {

1107 int pattern_length = pat.length();	1134 int pattern_length = pat.length();

1108 // Searching for one specific character is common. For one	1135 ASSERT(pattern_length > 1);

1109 // character patterns the KMP algorithm is guaranteed to slow down

1110 // the search, so we just run through the subject string.

1111 if (pattern_length == 1) {

1112 return SingleCharIndexOf(sub, pat[0], start_index);

1113 }

1114	1136

1115 // For small searches, a complex sort is not worth the setup overhead.	1137 // For small searches, a complex sort is not worth the setup overhead.

1116 if (sub.length() - start_index < 25) {	1138 int subject_length = sub.length() - start_index;

	1139 if (subject_length < 100 \|\| pattern_length < 4) {

1117 return SimpleIndexOf(sub, pat, start_index);	1140 return SimpleIndexOf(sub, pat, start_index);

1118 }	1141 }

1119	1142

1120 return BoyerMooreHorspoolIndexOf(sub, pat, start_index);	1143 return BoyerMooreIndexOf(sub, pat, start_index);

1121 }	1144 }

1122	1145

1123 // Perform string match of pattern on subject, starting at start index.	1146 // Perform string match of pattern on subject, starting at start index.

1124 // Caller must ensure that 0 <= start_index <= sub->length(),	1147 // Caller must ensure that 0 <= start_index <= sub->length(),

1125 // and should check that pat->length() + start_index <= sub->length()	1148 // and should check that pat->length() + start_index <= sub->length()

1126 int Runtime::StringMatch(Handle<String> sub,	1149 int Runtime::StringMatch(Handle<String> sub,

1127 Handle<String> pat,	1150 Handle<String> pat,

1128 int start_index) {	1151 int start_index) {

1129 ASSERT(0 <= start_index);	1152 ASSERT(0 <= start_index);

1130 ASSERT(start_index <= sub->length());	1153 ASSERT(start_index <= sub->length());

1131	1154

1132 int pattern_length = pat->length();	1155 int pattern_length = pat->length();

1133 if (pattern_length == 0) return start_index;	1156 if (pattern_length == 0) return start_index;

1134	1157

1135 int subject_length = sub->length();	1158 int subject_length = sub->length();

1136 if (start_index + pattern_length > subject_length) return -1;	1159 if (start_index + pattern_length > subject_length) return -1;

1137	1160

1138 FlattenString(sub);	1161 FlattenString(sub);

	1162 // Searching for one specific character is common. For one

	1163 // character patterns linear search is necessary, so any smart

	1164 // algorithm is unnecessary overhead.

	1165 if (pattern_length == 1) {

	1166 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid

	1167 if (sub->is_ascii_representation()) {

	1168 return SingleCharIndexOf(sub->ToAsciiVector(), pat->Get(0), start_index);

	1169 }

	1170 return SingleCharIndexOf(sub->ToUC16Vector(), pat->Get(0), start_index);

	1171 }

	1172

1139 FlattenString(pat);	1173 FlattenString(pat);

1140	1174

1141 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid	1175 AssertNoAllocation no_heap_allocation; // ensure vectors stay valid

1142 // dispatch on type of strings	1176 // dispatch on type of strings

1143 if (pat->is_ascii_representation()) {	1177 if (pat->is_ascii_representation()) {

1144 Vector<const char> pat_vector = pat->ToAsciiVector();	1178 Vector<const char> pat_vector = pat->ToAsciiVector();

1145 if (sub->is_ascii_representation()) {	1179 if (sub->is_ascii_representation()) {

1146 return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index);	1180 return StringMatchStrategy(sub->ToAsciiVector(), pat_vector, start_index);

1147 }	1181 }

1148 return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index);	1182 return StringMatchStrategy(sub->ToUC16Vector(), pat_vector, start_index);

(...skipping 4073 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5222	5256

5223 void Runtime::PerformGC(Object* result) {	5257 void Runtime::PerformGC(Object* result) {

5224 Failure* failure = Failure::cast(result);	5258 Failure* failure = Failure::cast(result);

5225 // Try to do a garbage collection; ignore it if it fails. The C	5259 // Try to do a garbage collection; ignore it if it fails. The C

5226 // entry stub will throw an out-of-memory exception in that case.	5260 // entry stub will throw an out-of-memory exception in that case.

5227 Heap::CollectGarbage(failure->requested(), failure->allocation_space());	5261 Heap::CollectGarbage(failure->requested(), failure->allocation_space());

5228 }	5262 }

5229	5263

5230	5264

5231 } } // namespace v8::internal	5265 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « no previous file | test/mjsunit/string-indexof.js » ('j') | no next file with comments »