src/string-search.h - Issue 3291021: Move string-search functions to separate file.

Side by Side Diff: src/string-search.h

Issue 3291021: Move string-search functions to separate file. (Closed)

Patch Set: Address (some)co Created 10 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2010 the V8 project authors. All rights reserved.

	2 // Redistribution and use in source and binary forms, with or without

	3 // modification, are permitted provided that the following conditions are

	4 // met:

	5 //

	6 // * Redistributions of source code must retain the above copyright

	7 // notice, this list of conditions and the following disclaimer.

	8 // * Redistributions in binary form must reproduce the above

	9 // copyright notice, this list of conditions and the following

	10 // disclaimer in the documentation and/or other materials provided

	11 // with the distribution.

	12 // * Neither the name of Google Inc. nor the names of its

	13 // contributors may be used to endorse or promote products derived

	14 // from this software without specific prior written permission.

	15 //

	16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	27

	28 #ifndef V8_STRING_SEARCH_H_

	29 #define V8_STRING_SEARCH_H_

	30

	31 namespace v8 {

	32 namespace internal {

	33

	34

	35 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a

	36 // limit, we can fix the size of tables. For a needle longer than this limit,

	37 // search will not be optimal, since we only build tables for a smaller suffix

	38 // of the string, which is a safe approximation.

	39 static const int kBMMaxShift = 250;

	40 // Reduce alphabet to this size.

	41 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size

	42 // proportional to the input alphabet. We reduce the alphabet size by

	43 // equating input characters modulo a smaller alphabet size. This gives

	44 // a potentially less efficient searching, but is a safe approximation.

	45 // For needles using only characters in the same Unicode 256-code point page,

	46 // there is no search speed degradation.

	47 static const int kBMAlphabetSize = 256;

	48 // For patterns below this length, the skip length of Boyer-Moore is too short

	49 // to compensate for the algorithmic overhead compared to simple brute force.

	50 static const int kBMMinPatternLength = 7;

	51

	52 // Holds the two buffers used by Boyer-Moore string search's Good Suffix

	53 // shift. Only allows the last kBMMaxShift characters of the needle

	54 // to be indexed.

	55 class BMGoodSuffixBuffers {

	56 public:

	57 BMGoodSuffixBuffers() {}

	58 inline void Initialize(int needle_length) {

	59 ASSERT(needle_length > 1);

	60 int start = needle_length < kBMMaxShift ? 0 : needle_length - kBMMaxShift;

	61 int len = needle_length - start;

	62 biased_suffixes_ = suffixes_ - start;

	63 biased_good_suffix_shift_ = good_suffix_shift_ - start;

	64 for (int i = 0; i <= len; i++) {

	65 good_suffix_shift_[i] = len;

	66 }

	67 }

	68 inline int& suffix(int index) {

	69 ASSERT(biased_suffixes_ + index >= suffixes_);

	70 return biased_suffixes_[index];

	71 }

	72 inline int& shift(int index) {

	73 ASSERT(biased_good_suffix_shift_ + index >= good_suffix_shift_);

	74 return biased_good_suffix_shift_[index];

	75 }

	76 private:

	77 int suffixes_[kBMMaxShift + 1];

	78 int good_suffix_shift_[kBMMaxShift + 1];

	79 int* biased_suffixes_;

	80 int* biased_good_suffix_shift_;

	81 DISALLOW_COPY_AND_ASSIGN(BMGoodSuffixBuffers);

	82 };

	83

	84 // buffers reused by BoyerMoore

	85 struct BMBuffers {

	86 public:

	87 static int bad_char_occurrence[kBMAlphabetSize];

	88 static BMGoodSuffixBuffers bmgs_buffers;

	89 };

	90

	91 // State of the string match tables.

	92 // SIMPLE: No usable content in the buffers.

	93 // BOYER_MOORE_HORSPOOL: The bad_char_occurence table has been populated.

	94 // BOYER_MOORE: The bmgs_buffers tables have also been populated.

	95 // Whenever starting with a new needle, one should call InitializeStringSearch

	96 // to determine which search strategy to use, and in the case of a long-needle

	97 // strategy, the call also initializes the algorithm to SIMPLE.

	98 enum StringSearchAlgorithm { SIMPLE_SEARCH, BOYER_MOORE_HORSPOOL, BOYER_MOORE };

	99 static StringSearchAlgorithm algorithm;

	100

	101

	102 // Compute the bad-char table for Boyer-Moore in the static buffer.

	103 template <typename PatternChar>

	104 static void BoyerMoorePopulateBadCharTable(Vector<const PatternChar> pattern) {

	105 // Only preprocess at most kBMMaxShift last characters of pattern.

	106 int start = Max(pattern.length() - kBMMaxShift, 0);

	107 // Run forwards to populate bad_char_table, so that last instance

	108 // of character equivalence class is the one registered.

	109 // Notice: Doesn't include the last character.

	110 int table_size = (sizeof(PatternChar) == 1) ? String::kMaxAsciiCharCode + 1

	111 : kBMAlphabetSize;

	112 if (start == 0) { // All patterns less than kBMMaxShift in length.

	113 memset(BMBuffers::bad_char_occurrence,

	114 -1,

	115 table_size * sizeof(*BMBuffers::bad_char_occurrence));

	116 } else {

	117 for (int i = 0; i < table_size; i++) {

	118 BMBuffers::bad_char_occurrence[i] = start - 1;

	119 }

	120 }

	121 for (int i = start; i < pattern.length() - 1; i++) {

	122 PatternChar c = pattern[i];

	123 int bucket = (sizeof(PatternChar) ==1) ? c : c % kBMAlphabetSize;

	124 BMBuffers::bad_char_occurrence[bucket] = i;

	125 }

	126 }

	127

	128

	129 template <typename PatternChar>

	130 static void BoyerMoorePopulateGoodSuffixTable(

	131 Vector<const PatternChar> pattern) {

	132 int m = pattern.length();

	133 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

	134 int len = m - start;

	135 // Compute Good Suffix tables.

	136 BMBuffers::bmgs_buffers.Initialize(m);

	137

	138 BMBuffers::bmgs_buffers.shift(m-1) = 1;

	139 BMBuffers::bmgs_buffers.suffix(m) = m + 1;

	140 PatternChar last_char = pattern[m - 1];

	141 int suffix = m + 1;

	142 {

	143 int i = m;

	144 while (i > start) {

	145 PatternChar c = pattern[i - 1];

	146 while (suffix <= m && c != pattern[suffix - 1]) {

	147 if (BMBuffers::bmgs_buffers.shift(suffix) == len) {

	148 BMBuffers::bmgs_buffers.shift(suffix) = suffix - i;

	149 }

	150 suffix = BMBuffers::bmgs_buffers.suffix(suffix);

	151 }

	152 BMBuffers::bmgs_buffers.suffix(--i) = --suffix;

	153 if (suffix == m) {

	154 // No suffix to extend, so we check against last_char only.

	155 while ((i > start) && (pattern[i - 1] != last_char)) {

	156 if (BMBuffers::bmgs_buffers.shift(m) == len) {

	157 BMBuffers::bmgs_buffers.shift(m) = m - i;

	158 }

	159 BMBuffers::bmgs_buffers.suffix(--i) = m;

	160 }

	161 if (i > start) {

	162 BMBuffers::bmgs_buffers.suffix(--i) = --suffix;

	163 }

	164 }

	165 }

	166 }

	167 if (suffix < m) {

	168 for (int i = start; i <= m; i++) {

	169 if (BMBuffers::bmgs_buffers.shift(i) == len) {

	170 BMBuffers::bmgs_buffers.shift(i) = suffix - start;

	171 }

	172 if (i == suffix) {

	173 suffix = BMBuffers::bmgs_buffers.suffix(suffix);

	174 }

	175 }

	176 }

	177 }

	178

	179

	180 template <typename SubjectChar, typename PatternChar>

	181 static inline int CharOccurrence(int char_code) {

	182 if (sizeof(SubjectChar) == 1) {

	183 return BMBuffers::bad_char_occurrence[char_code];

	184 }

	185 if (sizeof(PatternChar) == 1) {

	186 if (char_code > String::kMaxAsciiCharCode) {

	187 return -1;

	188 }

	189 return BMBuffers::bad_char_occurrence[char_code];

	190 }

	191 return BMBuffers::bad_char_occurrence[char_code % kBMAlphabetSize];

	192 }

	193

	194

	195 // Restricted simplified Boyer-Moore string matching.

	196 // Uses only the bad-shift table of Boyer-Moore and only uses it

	197 // for the character compared to the last character of the needle.

	198 template <typename SubjectChar, typename PatternChar>

	199 static int BoyerMooreHorspool(Vector<const SubjectChar> subject,

	200 Vector<const PatternChar> pattern,

	201 int start_index,

	202 bool* complete) {

	203 ASSERT(algorithm <= BOYER_MOORE_HORSPOOL);

	204 int n = subject.length();

	205 int m = pattern.length();

	206

	207 int badness = -m;

	208

	209 // How bad we are doing without a good-suffix table.

	210 int idx; // No matches found prior to this index.

	211 PatternChar last_char = pattern[m - 1];

	212 int last_char_shift =

	213 m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);

	214 // Perform search

	215 for (idx = start_index; idx <= n - m;) {

	216 int j = m - 1;

	217 int c;

	218 while (last_char != (c = subject[idx + j])) {

	219 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);

	220 int shift = j - bc_occ;

	221 idx += shift;

	222 badness += 1 - shift; // at most zero, so badness cannot increase.

	223 if (idx > n - m) {

	224 *complete = true;

	225 return -1;

	226 }

	227 }

	228 j--;

	229 while (j >= 0 && pattern[j] == (subject[idx + j])) j--;

	230 if (j < 0) {

	231 *complete = true;

	232 return idx;

	233 } else {

	234 idx += last_char_shift;

	235 // Badness increases by the number of characters we have

	236 // checked, and decreases by the number of characters we

	237 // can skip by shifting. It's a measure of how we are doing

	238 // compared to reading each character exactly once.

	239 badness += (m - j) - last_char_shift;

	240 if (badness > 0) {

	241 *complete = false;

	242 return idx;

	243 }

	244 }

	245 }

	246 *complete = true;

	247 return -1;

	248 }

	249

	250

	251 template <typename SubjectChar, typename PatternChar>

	252 static int BoyerMooreIndexOf(Vector<const SubjectChar> subject,

	253 Vector<const PatternChar> pattern,

	254 int idx) {

	255 ASSERT(algorithm <= BOYER_MOORE);

	256 int n = subject.length();

	257 int m = pattern.length();

	258 // Only preprocess at most kBMMaxShift last characters of pattern.

	259 int start = m < kBMMaxShift ? 0 : m - kBMMaxShift;

	260

	261 PatternChar last_char = pattern[m - 1];

	262 // Continue search from i.

	263 while (idx <= n - m) {

	264 int j = m - 1;

	265 SubjectChar c;

	266 while (last_char != (c = subject[idx + j])) {

	267 int shift = j - CharOccurrence<SubjectChar, PatternChar>(c);

	268 idx += shift;

	269 if (idx > n - m) {

	270 return -1;

	271 }

	272 }

	273 while (j >= 0 && pattern[j] == (c = subject[idx + j])) j--;

	274 if (j < 0) {

	275 return idx;

	276 } else if (j < start) {

	277 // we have matched more than our tables allow us to be smart about.

	278 // Fall back on BMH shift.

	279 idx += m - 1 - CharOccurrence<SubjectChar, PatternChar>(last_char);

	280 } else {

	281 int gs_shift = BMBuffers::bmgs_buffers.shift(j + 1); // Good suffix shift.

	282 int bc_occ = CharOccurrence<SubjectChar, PatternChar>(c);

	283 int shift = j - bc_occ; // Bad-char shift.

	284 if (gs_shift > shift) {

	285 shift = gs_shift;

	286 }

	287 idx += shift;

	288 }

	289 }

	290

	291 return -1;

	292 }

	293

	294

	295 // Trivial string search for shorter strings.

	296 // On return, if "complete" is set to true, the return value is the

	297 // final result of searching for the patter in the subject.

	298 // If "complete" is set to false, the return value is the index where

	299 // further checking should start, i.e., it's guaranteed that the pattern

	300 // does not occur at a position prior to the returned index.

	301 template <typename PatternChar, typename SubjectChar>

	302 static int SimpleIndexOf(Vector<const SubjectChar> subject,

	303 Vector<const PatternChar> pattern,

	304 int idx,

	305 bool* complete) {

	306 ASSERT(pattern.length() > 1);

	307 int pattern_length = pattern.length();

	308 // Badness is a count of how much work we have done. When we have

	309 // done enough work we decide it's probably worth switching to a better

	310 // algorithm.

	311 int badness = -10 - (pattern_length << 2);

	312

	313 // We know our pattern is at least 2 characters, we cache the first so

	314 // the common case of the first character not matching is faster.

	315 PatternChar pattern_first_char = pattern[0];

	316 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {

	317 badness++;

	318 if (badness > 0) {

	319 *complete = false;

	320 return i;

	321 }

	322 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {

	323 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(

	324 memchr(subject.start() + i,

	325 pattern_first_char,

	326 n - i + 1));

	327 if (pos == NULL) {

	328 *complete = true;

	329 return -1;

	330 }

	331 i = static_cast<int>(pos - subject.start());

	332 } else {

	333 if (subject[i] != pattern_first_char) continue;

	334 }

	335 int j = 1;

	336 do {

	337 if (pattern[j] != subject[i+j]) {

	338 break;

	339 }

	340 j++;

	341 } while (j < pattern_length);

	342 if (j == pattern_length) {

	343 *complete = true;

	344 return i;

	345 }

	346 badness += j;

	347 }

	348 *complete = true;

	349 return -1;

	350 }

	351

	352 // Simple indexOf that never bails out. For short patterns only.

	353 template <typename PatternChar, typename SubjectChar>

	354 static int SimpleIndexOf(Vector<const SubjectChar> subject,

	355 Vector<const PatternChar> pattern,

	356 int idx) {

	357 int pattern_length = pattern.length();

	358 PatternChar pattern_first_char = pattern[0];

	359 for (int i = idx, n = subject.length() - pattern_length; i <= n; i++) {

	360 if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {

	361 const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(

	362 memchr(subject.start() + i,

	363 pattern_first_char,

	364 n - i + 1));

	365 if (pos == NULL) return -1;

	366 i = static_cast<int>(pos - subject.start());

	367 } else {

	368 if (subject[i] != pattern_first_char) continue;

	369 }

	370 int j = 1;

	371 while (j < pattern_length) {

	372 if (pattern[j] != subject[i+j]) {

	373 break;

	374 }

	375 j++;

	376 }

	377 if (j == pattern_length) {

	378 return i;

	379 }

	380 }

	381 return -1;

	382 }

	383

	384

	385 // Strategy for searching for a string in another string.

	386 enum StringSearchStrategy { SEARCH_FAIL, SEARCH_SHORT, SEARCH_LONG };

	387

	388

	389 template <typename PatternChar>

	390 static inline StringSearchStrategy InitializeStringSearch(

	391 Vector<const PatternChar> pat, bool ascii_subject) {

	392 // We have an ASCII haystack and a non-ASCII needle. Check if there

	393 // really is a non-ASCII character in the needle and bail out if there

	394 // is.

	395 if (ascii_subject && sizeof(PatternChar) > 1) {

	396 for (int i = 0; i < pat.length(); i++) {

	397 uc16 c = pat[i];

	398 if (c > String::kMaxAsciiCharCode) {

	399 return SEARCH_FAIL;

	400 }

	401 }

	402 }

	403 if (pat.length() < kBMMinPatternLength) {

	404 return SEARCH_SHORT;

	405 }

	406 algorithm = SIMPLE_SEARCH;

	407 return SEARCH_LONG;

	408 }

	409

	410

	411 // Dispatch long needle searches to different algorithms.

	412 template <typename SubjectChar, typename PatternChar>

	413 static int ComplexIndexOf(Vector<const SubjectChar> sub,

	414 Vector<const PatternChar> pat,

	415 int start_index) {

	416 ASSERT(pat.length() >= kBMMinPatternLength);

	417 // Try algorithms in order of increasing setup cost and expected performance.

	418 bool complete;

	419 int idx = start_index;

	420 switch (algorithm) {

	421 case SIMPLE_SEARCH:

	422 idx = SimpleIndexOf(sub, pat, idx, &complete);

	423 if (complete) return idx;

	424 BoyerMoorePopulateBadCharTable(pat);

	425 algorithm = BOYER_MOORE_HORSPOOL;

	426 // FALLTHROUGH.

	427 case BOYER_MOORE_HORSPOOL:

	428 idx = BoyerMooreHorspool(sub, pat, idx, &complete);

	429 if (complete) return idx;

	430 // Build the Good Suffix table and continue searching.

	431 BoyerMoorePopulateGoodSuffixTable(pat);

	432 algorithm = BOYER_MOORE;

	433 // FALLTHROUGH.

	434 case BOYER_MOORE:

	435 return BoyerMooreIndexOf(sub, pat, idx);

	436 }

	437 UNREACHABLE();

	438 return -1;

	439 }

	440

	441

	442 // Dispatch to different search strategies for a single search.

	443 // If searching multiple times on the same needle, the search

	444 // strategy should only be computed once and then dispatch to different

	445 // loops.

	446 template <typename SubjectChar, typename PatternChar>

	447 static int StringSearch(Vector<const SubjectChar> sub,

	448 Vector<const PatternChar> pat,

	449 int start_index) {

	450 bool ascii_subject = (sizeof(SubjectChar) == 1);

	451 StringSearchStrategy strategy = InitializeStringSearch(pat, ascii_subject);

	452 switch (strategy) {

	453 case SEARCH_FAIL: return -1;

	454 case SEARCH_SHORT: return SimpleIndexOf(sub, pat, start_index);

	455 case SEARCH_LONG: return ComplexIndexOf(sub, pat, start_index);

	456 }

	457 UNREACHABLE();

	458 return -1;

	459 }

	460

	461 }} // namespace v8::internal

	462

	463 #endif // V8_STRING_SEARCH_H_

OLD	NEW

« no previous file with comments | « src/runtime.cc ('k') | no next file » | no next file with comments »