chrome/installer/zucchini/suffix_array.h - Issue 2963463002: [Zucchini] Generic suffix array algorithms.

Side by Side Diff: chrome/installer/zucchini/suffix_array.h

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)

Patch Set: Fix comment issue Created 3 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2017 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

	6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

	7

	8 #include <algorithm>

	9 #include <cassert>

	10 #include <iterator>

	11 #include <numeric>

	12 #include <vector>

	13

	14 #include "base/logging.h"

	15

	16 namespace zucchini {

	17

	18 // A functor class that implements the naive suffix sorting algorithm that uses

	19 // std::sort with lexicographical compare. This is only meant as reference of

	20 // the interface.

	21 class NaiveSuffixSort {

	22 public:

	23 // Type requirements:

	24 // \|InputRng\| is an input random access range.

	25 // \|KeyType\| is an unsigned integer type.

	26 // \|SAIt\| is a random access iterator with mutable references.

	27 template <class InputRng, class KeyType, class SAIt>

	28 // \|str\| is the input string on which suffix sort is applied.

	29 // Characters found in \|str\| must be in the range [0, \|key_bound\|)

	30 // \|suffix_array\| is the beginning of the destination range, which is at least

	31 // as large as \|str\|.

	32 void operator()(const InputRng& str,

	33 KeyType key_bound,

	34 SAIt suffix_array) const {

	35 using size_type = typename SAIt::value_type;

	36

	37 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

	38

	39 // \|suffix_array\| is first filled with ordered indices of \|str\|.

	40 // Those indices are then sorted with lexicographical comparisons in \|str\|.

	41 std::iota(suffix_array, suffix_array + n, 0);

	42 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {

	43 return std::lexicographical_compare(std::begin(str) + i, std::end(str),

	44 std::begin(str) + j, std::end(str));

	45 });

	46 }

	47 };

	48

	49 // A functor class that implements suffix array induced sorting (SA-IS)

	50 // algorithm with linear time and memory complexity,

	51 // see http://ieeexplore.ieee.org/abstract/document/5582081/

	52 class Sais {

	53 public:

	54 // Type requirements:

	55 // \|InputRng\| is an input random access range.

	56 // \|KeyType\| is an unsigned integer type.

	57 // \|SAIt\| is a random access iterator with mutable values.

	58 template <class InputRng, class KeyType, class SAIt>

	59 // \|str\| is the input string on which suffix sort is applied.

	60 // Characters found in \|str\| must be in the range [0, \|key_bound\|)

	61 // \|suffix_array\| is the beginning of the destination range, which is at least

	62 // as large as \|str\|.

	63 void operator()(const InputRng& str,

	64 KeyType key_bound,

	65 SAIt suffix_array) const {

	66 using value_type = typename InputRng::value_type;

	67 using size_type = typename SAIt::value_type;

	68

	69 static_assert(std::is_unsigned<value_type>::value,

	70 "Sais only supports input string with unsigned values");

	71 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");

	72

	73 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));

	74

	75 Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,

	76 key_bound, suffix_array);

	77 }

	78

	79 // Given string S of length n. We assume S is terminated by a unique sentinel

	80 // $, which is considered as the smallest character. This sentinel does not

	81 // exist in memory and is only treated implicitly, hence \|n\| does not count

	82 // the sentinel in this implementation. We denote suf(S,i) the suffix formed

	83 // by S[i..n).

	84

	85 // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)

	86 // or suf(S,i) > suf(S,i+1), respectively.

	87 enum SLType : bool { SType, LType };

	88

	89 // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is

	90 // S-type or L-type, respectively.

	91

	92 // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and

	93 // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS

	94 // character.

	95

	96 // An LMS-substring is a substring S[i..j) with both S[i] and S[j], being LMS

	97 // characters, and there is no other LMS character in the substring, or the

	98 // sentinel itself

	99

	100 // A substring S[i..j) is an LMS-substring if
	huangs 2017/07/10 20:01:18 I meant to have this replace the above comment -- I meant to have this replace the above comment -- but I'll leave it to you. etiennep1 2017/07/10 22:57:08 Oops, I forgot to remove the above. Show quoted text On 2017/07/10 20:01:18, huangs wrote: > I meant to have this replace the above comment -- but I'll leave it to you. Oops, I forgot to remove the above.
	101 // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other

	102 // LMS characters, or

	103 // (2) S[i..j) is the sentinel $.

	104

	105 template <class SizeType, class KeyType>

	106 struct Implementation {

	107 static_assert(std::is_unsigned<SizeType>::value,

	108 "SizeType must be unsigned");

	109 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");

	110 using size_type = SizeType;

	111 using key_type = KeyType;

	112

	113 using iterator = typename std::vector<size_type>::iterator;

	114 using const_iterator = typename std::vector<size_type>::const_iterator;

	115

	116 // Partition every suffix based on SL-type. Returns the number of LMS

	117 // suffixes.

	118 template <class StrIt>

	119 static size_type BuildSLPartition(

	120 StrIt str,

	121 size_type length,

	122 key_type key_bound,

	123 std::vector<SLType>::reverse_iterator sl_partition_it) {

	124 // We will count LMS suffixes (S to L-type or last S-type).

	125 size_type lms_count = 0;

	126

	127 // \|previous_type\| is initialized to L-type to avoid counting an extra

	128 // LMS suffix at the end

	129 SLType previous_type = LType;

	130

	131 // Initialized to dummy, impossible key.

	132 key_type previous_key = key_bound;

	133

	134 // We're travelling backward to determine the partition,

	135 // as if we prepend one character at a time to the string, ex:

	136 // b$ is L-type because b > $.

	137 // ab$ is S-type because a < b, implying ab$ < b$.

	138 // bab$ is L-type because b > a, implying bab$ > ab$.

	139 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.

	140 for (auto str_it = std::reverse_iterator<StrIt>(str + length);

	141 str_it != std::reverse_iterator<StrIt>(str);

	142 ++str_it, ++sl_partition_it) {

	143 key_type current_key = *str_it;

	144

	145 if (current_key > previous_key \|\| previous_key == key_bound) {

	146 // S[i] > S[i + 1] or S[i] is last character.

	147 if (previous_type == SType)

	148 // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,

	149 // suf(S,i+1) was a LMS suffix.

	150 ++lms_count;

	151

	152 previous_type = LType; // For next round.

	153 } else if (current_key < previous_key) {

	154 // S[i] < S[i + 1]

	155 previous_type = SType; // For next round.

	156 }

	157 // Else, S[i] == S[i + 1]:

	158 // The next character that differs determines the SL-type,

	159 // so we reuse the last seen type.

	160

	161 *sl_partition_it = previous_type;

	162 previous_key = current_key; // For next round.

	163 }

	164

	165 return lms_count;

	166 }

	167

	168 // Find indices of LMS suffixes and write result to \|lms_indices\|.

	169 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,

	170 iterator lms_indices) {

	171 // \|previous_type\| is initialized to S-type to avoid counting an extra

	172 // LMS suffix at the beginning

	173 SLType previous_type = SType;

	174 for (size_type i = 0; i < sl_partition.size(); ++i) {

	175 if (sl_partition[i] == SType && previous_type == LType)

	176 *lms_indices++ = i;

	177 previous_type = sl_partition[i];

	178 }

	179 }

	180

	181 template <class StrIt>

	182 static std::vector<size_type> MakeBucketCount(StrIt str,

	183 size_type length,

	184 key_type key_bound) {

	185 // Occurrence of every unique character is counted in \|buckets\|

	186 std::vector<size_type> buckets(static_cast<size_type>(key_bound));

	187

	188 for (auto it = str; it != str + length; ++it)

	189 ++buckets[*it];

	190 return buckets;

	191 }

	192

	193 // Apply induced sort from \|lms_indices\| to \|suffix_array\| associated with

	194 // the string \|str\|.

	195 template <class StrIt, class SAIt>

	196 static void InducedSort(StrIt str,

	197 size_type length,

	198 const std::vector<SLType>& sl_partition,

	199 const std::vector<size_type>& lms_indices,

	200 const std::vector<size_type>& buckets,

	201 SAIt suffix_array) {

	202 // All indices are first marked as unset with the illegal value \|length\|.

	203 std::fill(suffix_array, suffix_array + length, length);

	204

	205 // Used to mark bucket boundaries (head or end) as indices in str.

	206 DCHECK(!buckets.empty());

	207 std::vector<size_type> bucket_bounds(buckets.size());

	208

	209 // Step 1: Assign indices for LMS suffixes, populating the end of

	210 // respective buckets but keeping relative order.

	211

	212 // Find the end of each bucket and write it to \|bucket_bounds\|.

	213 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

	214

	215 // Process each \|lms_indices\| backward, and assign them to the end of

	216 // their respective buckets, so relative order is preserved.

	217 for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {

	218 key_type key = str[*it];

	219 suffix_array[--bucket_bounds[key]] = *it;

	220 }

	221

	222 // Step 2

	223 // Scan forward \|suffix_array\|; for each modified suf(S,i) for which

	224 // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current

	225 // head of the corresponding bucket and forward the bucket head to the

	226 // right.

	227

	228 // Find the head of each bucket and write it to \|bucket_bounds\|. Since

	229 // only LMS suffixes where inserted in \|suffix_array\| during Step 1,

	230 // \|bucket_bounds\| does not contains the head of each bucket and needs to

	231 // be updated.

	232 bucket_bounds[0] = 0;

	233 std::partial_sum(buckets.begin(), buckets.end() - 1,

	234 bucket_bounds.begin() + 1);

	235

	236 // From Step 1, the sentinel $, which we treat implicitly, would have

	237 // been placed at the beginning of \|suffix_array\|, since $ is always

	238 // considered as the smallest character. We then have to deal with the

	239 // previous (last) suffix.

	240 if (sl_partition[length - 1] == LType) {

	241 key_type key = str[length - 1];

	242 suffix_array[bucket_bounds[key]++] = length - 1;

	243 }

	244 for (auto it = suffix_array; it != suffix_array + length; ++it) {

	245 size_type suffix_index = *it;

	246

	247 // While the original algorithm marks unset suffixes with -1,

	248 // we found that marking them with \|length\| is also possible and more

	249 // convenient because we are working with unsigned integers.

	250 if (suffix_index != length && suffix_index > 0 &&

	251 sl_partition[--suffix_index] == LType) {

	252 key_type key = str[suffix_index];

	253 suffix_array[bucket_bounds[key]++] = suffix_index;

	254 }

	255 }

	256

	257 // Step 3

	258 // Scan backward \|suffix_array\|; for each modified suf(S, i) for which

	259 // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current

	260 // end of the corresponding bucket and forward the bucket head to the

	261 // left.

	262

	263 // Find the end of each bucket and write it to \|bucket_bounds\|. Since

	264 // only L-type suffixes where inserted in \|suffix_array\| during Step 2,

	265 // \|bucket_bounds\| does not contain the end of each bucket and needs to

	266 // be updated.

	267 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());

	268

	269 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);

	270 it != std::reverse_iterator<SAIt>(suffix_array); ++it) {

	271 size_type suffix_index = *it;

	272 if (suffix_index != length && suffix_index > 0 &&

	273 sl_partition[--suffix_index] == SType) {

	274 key_type key = str[suffix_index];

	275 suffix_array[--bucket_bounds[key]] = suffix_index;

	276 }

	277 }

	278 // Deals with the last suffix, because of the sentinel.

	279 if (sl_partition[length - 1] == SType) {

	280 key_type key = str[length - 1];

	281 suffix_array[--bucket_bounds[key]] = length - 1;

	282 }

	283 }

	284

	285 // Given a string S starting at \|str\| with length \|length\|, an array

	286 // starting at \|substring_array\| containing lexicographically ordered LMS

	287 // terminated substring indices of S and an SL-Type partition \|sl_partition\|

	288 // of S, assigns a unique label to every unique LMS substring. The sorted

	289 // labels for all LMS substrings are written to \|lms_str\|, while the indices

	290 // of LMS suffixes are written to \|lms_indices\|. In addition, returns the

	291 // total number of unique labels.

	292 template <class StrIt, class SAIt>

	293 static size_type LabelLmsSubstrings(StrIt str,

	294 size_type length,

	295 const std::vector<SLType>& sl_partition,

	296 SAIt suffix_array,

	297 iterator lms_indices,

	298 iterator lms_str) {

	299 // Labelling starts at 0.

	300 size_type label = 0;

	301

	302 // \|previous_lms\| is initialized to 0 to indicate it is unset.

	303 // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in

	304 // lexicographical order.

	305 size_type previous_lms = 0;

	306 for (auto it = suffix_array; it != suffix_array + length; ++it) {

	307 if (it > 0 && sl_partition[it] == SType &&

	308 sl_partition[*it - 1] == LType) {

	309 // suf(S, *it) is a LMS suffix.

	310

	311 size_type current_lms = *it;

	312 if (previous_lms != 0) {

	313 // There was a previous LMS suffix. Check if the current LMS

	314 // substring is equal to the previous one.

	315 SLType current_lms_type = SType;

	316 SLType previous_lms_type = SType;

	317 for (size_type k = 0;; ++k) {

	318 // \|current_lms_end\| and \|previous_lms_end\| denote whether we have

	319 // reached the end of the current and previous LMS substring,

	320 // respectively

	321 bool current_lms_end = false;

	322 bool previous_lms_end = false;

	323

	324 // Check for both previous and current substring ends.

	325 // Note that it is more convenient to check if

	326 // suf(S,current_lms + k) is an LMS suffix than to retrieve it

	327 // from lms_indices.

	328 if (current_lms + k >= length \|\|

	329 (current_lms_type == LType &&

	330 sl_partition[current_lms + k] == SType)) {

	331 current_lms_end = true;

	332 }

	333 if (previous_lms + k >= length \|\|

	334 (previous_lms_type == LType &&

	335 sl_partition[previous_lms + k] == SType)) {

	336 previous_lms_end = true;

	337 }

	338

	339 if (current_lms_end && previous_lms_end) {

	340 break; // Previous and current substrings are identical.

	341 } else if (current_lms_end != previous_lms_end \|\|

	342 str[current_lms + k] != str[previous_lms + k]) {

	343 // Previous and current substrings differ, a new label is used.

	344 ++label;

	345 break;

	346 }

	347

	348 current_lms_type = sl_partition[current_lms + k];

	349 previous_lms_type = sl_partition[previous_lms + k];

	350 }

	351 }

	352 lms_indices++ = it;

	353 *lms_str++ = label;

	354 previous_lms = current_lms;

	355 }

	356 }

	357

	358 return label + 1;

	359 }

	360

	361 // Implementation of the SA-IS algorithm. \|str\| must be a random access

	362 // iterator pointing at the beginning of S with length \|length\|. The result

	363 // is writtend in \|suffix_array\|, a random access iterator.

	364 template <class StrIt, class SAIt>

	365 static void SuffixSort(StrIt str,

	366 size_type length,

	367 key_type key_bound,

	368 SAIt suffix_array) {

	369 if (length == 1)

	370 *suffix_array = 0;

	371 if (length < 2)

	372 return;

	373

	374 std::vector<SLType> sl_partition(length);

	375 size_type lms_count =

	376 BuildSLPartition(str, length, key_bound, sl_partition.rbegin());

	377 std::vector<size_type> lms_indices(lms_count);

	378 FindLmsSuffixes(sl_partition, lms_indices.begin());

	379 std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);

	380

	381 if (lms_indices.size() > 1) {

	382 // Given \|lms_indices\| in the same order they appear in \|str\|, induce

	383 // LMS substrings relative order and write result to \|suffix_array\|.

	384 InducedSort(str, length, sl_partition, lms_indices, buckets,

	385 suffix_array);

	386 std::vector<size_type> lms_str(lms_indices.size());

	387

	388 // Given LMS substrings in relative order found in \|suffix_array\|,

	389 // map LMS substrings to unique labels to form a new string, \|lms_str\|.

	390 size_type label_count =

	391 LabelLmsSubstrings(str, length, sl_partition, suffix_array,

	392 lms_indices.begin(), lms_str.begin());

	393

	394 if (label_count < lms_str.size()) {

	395 // Reorder \|lms_str\| to have LMS suffixes in the same order they

	396 // appear in \|str\|.

	397 for (size_type i = 0; i < lms_indices.size(); ++i)

	398 suffix_array[lms_indices[i]] = lms_str[i];

	399

	400 SLType previous_type = SType;

	401 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {

	402 if (sl_partition[i] == SType && previous_type == LType) {

	403 lms_str[j] = suffix_array[i];

	404 lms_indices[j++] = i;

	405 }

	406 previous_type = sl_partition[i];

	407 }

	408

	409 // Recursively apply SuffixSort on \|lms_str\|, which is formed from

	410 // labeled LMS suffixes in the same order they appear in \|str\|.

	411 // Note that \|KeyType\| will be size_type because \|lms_str\| contains

	412 // indices. \|lms_str\| is at most half the length of \|str\|.

	413 Implementation<size_type, size_type>::SuffixSort(

	414 lms_str.begin(), static_cast<size_type>(lms_str.size()),

	415 label_count, suffix_array);

	416

	417 // Map LMS labels back to indices in \|str\| and write result to

	418 // \|lms_indices\|. We're using \|suffix_array\| as a temporary buffer.

	419 for (size_type i = 0; i < lms_indices.size(); ++i)

	420 suffix_array[i] = lms_indices[suffix_array[i]];

	421 std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());

	422

	423 // At this point, \|lms_indices\| contains sorted LMS suffixes of \|str\|.

	424 }

	425 }

	426 // Given \|lms_indices\| where LMS suffixes are sorted, induce the full

	427 // order of suffixes in \|str\|.

	428 InducedSort(str, length, sl_partition, lms_indices, buckets,

	429 suffix_array);

	430 }

	431 };

	432 };

	433

	434 // Generates a sorted suffix array for the input string \|str\| using the functor

	435 // \|Algorithm\| which provides an interface equivalent to NaiveSuffixSort.

	436 /// Characters found in \|str\| are assumed to be in range [0, \|key_bound\|).

	437 // Returns the suffix array as a vector.

	438 // \|StrRng\| is an input random access range.

	439 // \|KeyType\| is an unsigned integer type.

	440 template <class Algorithm, class StrRng, class KeyType>

	441 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,

	442 KeyType key_bound) {

	443 Algorithm sort;

	444 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());

	445 sort(str, key_bound, suffix_array.begin());

	446 return suffix_array;

	447 }

	448

	449 // Type requirements:

	450 // \|SARng\| is an input random access range.

	451 // \|StrIt1\| is a random access iterator.

	452 // \|StrIt2\| is a forward iterator.

	453 template <class SARng, class StrIt1, class StrIt2>

	454 // Lexicographical lower bound using binary search for

	455 // [\|str2_first\|, \|str2_last\|) in the suffix array \|suffix_array\| of a string

	456 // starting at \|str1_first\|. This does not necessarily return the index of

	457 // the longest matching substring.

	458 auto SuffixLowerBound(const SARng& suffix_array,

	459 StrIt1 str1_first,

	460 StrIt2 str2_first,

	461 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {

	462 using size_type = typename SARng::value_type;

	463

	464 size_t n = std::end(suffix_array) - std::begin(suffix_array);

	465 auto it = std::lower_bound(

	466 std::begin(suffix_array), std::end(suffix_array), str2_first,

	467 [str1_first, str2_last, n](size_type a, StrIt2 b) {

	468 return std::lexicographical_compare(str1_first + a, str1_first + n, b,

	469 str2_last);

	470 });

	471 return it;

	472 }

	473

	474 } // namespace zucchini

	475

	476 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

OLD	NEW

« no previous file with comments | « chrome/installer/zucchini/BUILD.gn ('k') | chrome/installer/zucchini/suffix_array_unittest.cc » ('j') | no next file with comments »