chrome/installer/zucchini/suffix_array.h - Issue 2963463002: [Zucchini] Generic suffix array algorithms.

Side by Side Diff: chrome/installer/zucchini/suffix_array.h

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)

Patch Set: Make gcc happy Created 3 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2017 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

	6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

	7

	8 #include <algorithm>

	9 #include <cassert>

	10 #include <iterator>

	11 #include <numeric>

	12 #include <vector>

	13

	14 namespace zucchini {

	15

	16 // A functor class that implements the naive suffix sorting algorithm

	17 // that uses std::sort with lexicographical compare.

	18 // This is only meant as reference of the interface.

	19 class NaiveSuffixSort {

	20 public:

	21 // \|str\| must be a random access input range.

	22 // Characters found in \|str\| must be in the range [0, \|max_key\|)

	23 // \|suffix_array\| is a random access mutable range containing the result.

	24 template <class InputRng, class KeyType, class SAIt>

	25 void operator()(const InputRng& str,

	26 KeyType max_key,

	27 SAIt suffix_array) const {

	28 using size_type = typename SAIt::value_type;

	29

	30 size_type n = str.end() - str.begin();

	31

	32 // \|suffix_array\| is first filled with ordered indices of \|str\|.

	33 // Those indices are then sorted with lexicographical comparisons in \|str\|.

	34 std::iota(suffix_array, suffix_array + n, 0);

	35 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {

	36 return std::lexicographical_compare(std::begin(str) + i, std::end(str),

	37 std::begin(str) + j, std::end(str));

	38 });

	39 }

	40 };

	41

	42 // A functor class that implements suffix array induced sorting (SA-IS)

	43 // algorithm with linear time and memory complexity,

	44 // see http://ieeexplore.ieee.org/abstract/document/5582081/

	45 class Sais {

	46 public:

	47 // \|str\| must be a random access input range.

	48 // Characters found in \|str\| must be in the range [0, \|max_key\|)

	49 // \|suffix_array\| is a random access mutable range containing the result.

	50 template <class InputRng, class KeyType, class SAIt>

	51 void operator()(const InputRng& str,

	52 KeyType max_key,

	53 SAIt suffix_array) const {

	54 using value_type = typename InputRng::value_type;

	55 using size_type = typename SAIt::value_type;

	56

	57 static_assert(std::is_unsigned<value_type>::value,

	58 "Sais only supports input string with unsigned values");

	59

	60 size_type n = static_cast<size_type>(str.end() - str.begin());

	61

	62 Implementation<size_type, typename std::make_unsigned<KeyType>::type>::

	63 SuffixSort(std::begin(str), n, max_key, suffix_array);

	64 }

	65

	66 // Given the string S of length n.

	67 // We assume S is terminated by a unique sentinel $, which is considered as

	68 // the smallest character.

	69 // This sentinel does not exist in memory and is only treated implicitely.
	huangs 2017/06/27 20:31:21 Unwrap comments to fit. TYPO: implicitly Period Unwrap comments to fit. TYPO: implicitly Period at end. etiennep1 2017/06/28 19:26:08 Done. Show quoted text On 2017/06/27 20:31:21, huangs wrote: > Unwrap comments to fit. > > TYPO: implicitly > > Period at end. Done.
	70 // We denote suf(S, i) the suffix formed by S[i, n)

	71

	72 // A suffix suf(S, i) is said to be S-type or L-type,

	73 // if suf(S, i) < suf(S, i+1) or suf(S, i) > suf(S, i+1), respectively.

	74 enum SLType : bool { SType, LType };

	75

	76 // A character S[i] is said to be S-type or L-type if the suffix suf(S, i) is

	77 // S-type or L-type, respectively.

	78

	79 // A character S[i], i is called LMS (leftmost S-type),

	80 // if S[i] is S-type and S[i-1] is L-type.

	81 // A suffix suf(S, i) is called LMS, if S[i] is an LMS character.

	82

	83 // An LMS-substring is a substring S[i..j) with both S[i] and S[j],

	84 // being LMS characters, and there is no other LMS character in the substring,

	85 // or the sentinel itself

	86

	87 template <class SizeType, class KeyType>

	88 struct Implementation {

	89 static_assert(std::is_unsigned<SizeType>::value,

	90 "SizeType must be unsigned");

	91 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");

	92 using size_type = SizeType;

	93 using key_type = KeyType;

	94

	95 using iterator = typename std::vector<size_type>::iterator;

	96 using const_iterator = typename std::vector<size_type>::const_iterator;

	97

	98 // Partition every suffix based on SL-type.

	99 template <class StrIt>

	100 static size_type BuildSLPartition(

	101 StrIt str,

	102 size_type length,

	103 key_type max_key,

	104 std::vector<SLType>::reverse_iterator sl_partition) {

	105 // We will count LMS suffixes (S to L-type or last S-type)

	106 size_type lms_count = 0;

	107

	108 // \|previous_type\| is initialized to L-type to avoid counting an extra

	109 // LMS suffix at the end

	110 SLType previous_type = LType;

	111

	112 key_type previous_key = max_key; // initialized to dummy, impossible key

	113

	114 // We're travelling backward to determine the partition,

	115 // as if we prepend one character at a time to the string, ex:

	116 // b$ is L-type because b > $

	117 // ab$ is S-type because a < b, implying ab$ < b$

	118 // bab$ is L-type because b > a, implying bab$ > ab$

	119 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$

	120 for (auto str_it = std::reverse_iterator<StrIt>(str + length);

	121 str_it != std::reverse_iterator<StrIt>(str);

	122 ++str_it, ++sl_partition) {

	123 key_type current_key = *str_it;

	124

	125 if (current_key > previous_key \|\| previous_key == max_key) {

	126 // S[i] > S[i + 1] or S[i] is last character

	127 *sl_partition = LType;

	128 if (previous_type == SType)

	129 // suf(S, i) is L-type and suf(S, i + 1) is S-type,

	130 // therefore, suf(S, i+1) was a LMS suffix.

	131 ++lms_count;

	132

	133 previous_type = LType; // for next round

	134 } else if (current_key < previous_key) {

	135 // S[i] < S[i + 1]

	136 *sl_partition = SType;

	137 previous_type = SType; // for next round

	138 } else {

	139 // S[i] == S[i + 1]

	140 // The next character that differs determines the SL-type,

	141 // so we reuse the last seen type.

	142 *sl_partition = previous_type;

	143 }

	144 previous_key = current_key; // for next round

	145 }

	146

	147 return lms_count;

	148 }

	149

	150 // Find indices of LMS suffixes and write result to \|lms_indices\|.

	151 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,

	152 iterator lms_indices) {

	153 // \|previous_type\| is initialized to S-type to avoid counting an extra

	154 // LMS suffix at the beginning

	155 SLType previous_type = SType;

	156 size_type j = 0;

	157 for (size_type i = 0; i < sl_partition.size(); ++i) {

	158 if (sl_partition[i] == SType && previous_type == LType)

	159 lms_indices[j++] = i;

	160 previous_type = sl_partition[i];

	161 }

	162 }

	163

	164 template <class StrIt>

	165 static std::vector<size_type> MakeBucketCount(StrIt str,

	166 size_type length,

	167 key_type max_key) {

	168 // Occurence of every unique character is counted in \|buckets\|

	169 std::vector<size_type> buckets(static_cast<size_type>(max_key));

	170

	171 for (auto it = str; it != str + length; ++it)

	172 ++buckets[*it];

	173 return buckets;

	174 }

	175

	176 // Apply induced sort from \|lms_indices\| to \|suffix_array\| associated with

	177 // the string \|str\|.

	178 template <class StrIt, class SAIt>

	179 static void InducedSort(StrIt str,

	180 size_type length,

	181 const std::vector<SLType>& sl_partition,

	182 const std::vector<size_type>& lms_substrings,

	183 const std::vector<size_type>& buckets,

	184 SAIt suffix_array) {

	185 // All indices are first marked as unset with 0.

	186 std::fill(suffix_array, suffix_array + length, 0);

	187

	188 // Used to mark bucket boundaries (head or end) as indices in str.

	189 std::vector<size_type> bucket_bounds(buckets.size());

	190

	191 // Step 1

	192 // for each leftmost S-type suffix suf(S, i) found in \|lms_indices\|

	193 // scanned backward, place suf(S, i) at the end of the corresponding

	194 // bucket and forward the bucket end to the left.

	195

	196 // By corresponding bucket for suf(S, i), we mean the bucket associated

	197 // with the character S(i).

	198

	199 // find the end of each bucket

	200 bucket_bounds[0] = buckets[0];

	201 for (key_type i = 1; i < buckets.size(); ++i)

	202 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i];

	203

	204 for (auto it = lms_substrings.crbegin(); it != lms_substrings.crend();

	205 ++it) {

	206 key_type key = str[*it];

	207 suffix_array[--bucket_bounds[key]] = *it;

	208 }

	209

	210 // Step 2

	211 // for each modified suf(S, i), scanned forward, for which

	212 // suf(S, SA(i) - 1) is L-type,

	213 // place suf(S, SA(i) - 1) to the current head of the corresponding

	214 // bucket and forward the bucket head to the right.

	215

	216 // find the head of each bucket

	217 bucket_bounds[0] = 0;

	218 for (key_type i = 1; i < buckets.size(); ++i)

	219 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i - 1];

	220

	221 // from Step 1, the sentinel $, which we treat implicitely, would have

	222 // been place at the beginning of \|suffix_array\| since $ is always

	223 // considered as the smallest character.

	224 // We then have to deal with the previous (last) suffix.

	225 if (sl_partition[length - 1] == LType) {

	226 key_type key = str[length - 1];

	227 suffix_array[bucket_bounds[key]++] = length - 1;

	228 }

	229 for (auto it = suffix_array; it != suffix_array + length; ++it) {

	230 size_type suffix_index = *it;

	231

	232 // While the original algorithm marks unset suffixes with -1,

	233 // we found that marking them with 0 is also possible, since suf(S, 0)

	234 // has no previous suffix, and also more convenient because we are

	235 // working with unsigned integers.

	236 if (suffix_index > 0 && sl_partition[--suffix_index] == LType) {

	237 key_type key = str[suffix_index];

	238 suffix_array[bucket_bounds[key]++] = suffix_index;

	239 }

	240 }

	241

	242 // Step 3

	243 // for each modified suf(S, i), scanned backward, for which

	244 // suf(S, SA(i) - 1) is S-type,

	245 // place suf(S, SA(i) - 1) to the current end of the corresponding

	246 // bucket and forward the bucket head to the left.

	247

	248 // find the end of each bucket

	249 bucket_bounds[0] = buckets[0];

	250 for (size_type i = 1; i < buckets.size(); ++i)

	251 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i];

	252

	253 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);

	254 it != std::reverse_iterator<SAIt>(suffix_array); ++it) {

	255 size_type suffix_index = *it;

	256 if (suffix_index > 0 && sl_partition[--suffix_index] == SType) {

	257 key_type key = str[suffix_index];

	258 suffix_array[--bucket_bounds[key]] = suffix_index;

	259 }

	260 }

	261 // Deals with the last suffix, because of the sentinel.

	262 if (sl_partition[length - 1] == SType) {

	263 key_type key = str[length - 1];

	264 suffix_array[--bucket_bounds[key]] = length - 1;

	265 }

	266 }

	267

	268 // Given a string S starting at \|str\| with length \|length\|,

	269 // an array starting at \|substring_array\| containing ordered LMS terminated

	270 // substring indices of S and an SL-Type partition \|sl_partition\| of S,

	271 // assigns a unique label to every unique LMS substring.

	272 // The sorted labels for all LMS substrings are written to \|lms_str\|,

	273 // while the indices of LMS suffixes are written to \|lms_indices\|.

	274 // In addition, returns the total number of unique labels.

	275 template <class StrIt, class SAIt>

	276 static size_type LabelLmsSubstrings(StrIt str,

	277 size_type length,

	278 const std::vector<SLType>& sl_partition,

	279 SAIt suffix_array,

	280 iterator lms_indices,

	281 iterator lms_str) {

	282 // Labelling starts at 0

	283 size_type label = 0;

	284

	285 // \|previous_lms\| is initialized to 0 to indicate it is unset.

	286 // Note that suf(S, 0) is never a LMS suffix.

	287 // Substrings will be visited in relative order

	288 size_type previous_lms = 0;

	289 for (auto it = suffix_array; it != suffix_array + length; ++it) {

	290 if (it > 0 && sl_partition[it] == SType &&

	291 sl_partition[*it - 1] == LType) {

	292 // suf(S, *it) is a LMS suffix

	293

	294 size_type current_lms = *it;

	295 if (previous_lms != 0) {

	296 // There was a previous LMS suffix

	297 // Check if the current LMS substring is equal to the previous one

	298 SLType current_lms_type = SType, previous_lms_type = SType;

	299 for (size_type k = 0;; ++k) {

	300 // \|current_lms_end\| and \|previous_lms_end\| denote weither we have

	301 // reached the end of the current and previous LMS substring,

	302 // respectively

	303 bool current_lms_end = false, previous_lms_end = false;

	304

	305 // Check both previous and current substrings end ie

	306 // Note: it is more convenient to check if suf(S, current_lms + k)

	307 // is an LMS suffix than to retrieve it from lms_indices.

	308 if (current_lms + k >= length \|\|

	309 (current_lms_type == LType &&

	310 sl_partition[current_lms + k] == SType)) {

	311 current_lms_end = true;

	312 }

	313 if (previous_lms + k >= length \|\|

	314 (previous_lms_type == LType &&

	315 sl_partition[previous_lms + k] == SType)) {

	316 previous_lms_end = true;

	317 }

	318

	319 if (current_lms_end && previous_lms_end) {

	320 break;

	321 } else if (current_lms_end != previous_lms_end \|\|

	322 str[current_lms + k] != str[previous_lms + k]) {

	323 // previous and current substrings differ,

	324 ++label; // use a new label

	325 break;

	326 }

	327

	328 current_lms_type = sl_partition[current_lms + k];

	329 previous_lms_type = sl_partition[previous_lms + k];

	330 }

	331 }

	332 lms_indices++ = it;

	333 *lms_str++ = label;

	334 previous_lms = current_lms;

	335 }

	336 }

	337

	338 return ++label;

	339 }

	340

	341 // Implementation of the SA-IS algorithm.

	342 // \|str\| must be a random access iterator pointing at the beginning of S

	343 // with length \|length\|.

	344 // The result is writtend in \|suffix_array\|, a random access iterator.

	345 template <class StrIt, class SAIt>

	346 static void SuffixSort(StrIt str,

	347 size_type length,

	348 key_type max_key,

	349 SAIt suffix_array) {

	350 if (length < 2)
	huangs 2017/06/27 20:31:21 Should \|suffix_array\| be assigned [0] for the case Should \|suffix_array\| be assigned [0] for the case length == 1? etiennep1 2017/06/28 19:26:09 That sounds like a good idea Show quoted text On 2017/06/27 20:31:21, huangs wrote: > Should \|suffix_array\| be assigned [0] for the case length == 1? That sounds like a good idea
	351 return;

	352

	353 std::vector<SLType> sl_partition(length);

	354 size_type lms_count =

	355 BuildSLPartition(str, length, max_key, sl_partition.rbegin());

	356 std::vector<size_type> lms_indices(lms_count);

	357 FindLmsSuffixes(sl_partition, lms_indices.begin());

	358 std::vector<size_type> buckets = MakeBucketCount(str, length, max_key);

	359

	360 if (lms_indices.size() > 1) {

	361 // Given \|lms_indices\| in order of apparition, induce LMS substrings

	362 // relative order and write result to \|suffix_array\|.

	363 InducedSort(str, length, sl_partition, lms_indices, buckets,

	364 suffix_array);

	365 std::vector<size_type> lms_str(lms_indices.size());

	366

	367 // Given LMS substrings in relative order found in \|suffix_array\|,

	368 // map LMS substrings to unique labels to form a new string, \|lms_str\|.

	369 size_type label_count =

	370 LabelLmsSubstrings(str, length, sl_partition, suffix_array,

	371 lms_indices.begin(), lms_str.begin());

	372

	373 if (label_count < lms_str.size()) {

	374 // Reorder \|lms_str\| to have LMS suffixes in order of apparition.

	375 for (size_type i = 0; i < lms_indices.size(); ++i)

	376 suffix_array[lms_indices[i]] = lms_str[i];

	377

	378 SLType previous_type = SType;

	379 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {

	380 if (sl_partition[i] == SType && previous_type == LType) {

	381 lms_str[j] = suffix_array[i];

	382 lms_indices[j++] = i;

	383 }

	384 previous_type = sl_partition[i];

	385 }

	386

	387 // Recursively apply SuffixSort on \|lms_str\|, which is

	388 // formed from labeled LMS suffixes in order of apparition.

	389 // \|lms_str\| is at most half the length of \|str\|.

	390 Implementation<size_type, size_type>::SuffixSort(

	391 lms_str.begin(), static_cast<size_type>(lms_str.size()),

	392 label_count, suffix_array);

	393

	394 // Map LMS labels back to indices in \|str\| and

	395 // write result to \|lms_indices\|.

	396 // We're using \|suffix_array\| as a temporary buffer.

	397 for (size_type i = 0; i < lms_indices.size(); ++i)

	398 suffix_array[i] = lms_indices[suffix_array[i]];

	399 for (size_type i = 0; i < lms_indices.size(); ++i)

	400 lms_indices[i] = suffix_array[i];

	401

	402 // At this point, \|lms_indices\| contains sorted LMS suffixes of \|str\|.

	403 }

	404 }

	405 // Given \|lms_indices\| where LMS suffixes are sorted,

	406 // induce the full order of suffixes in \|str\|

	407 InducedSort(str, length, sl_partition, lms_indices, buckets,

	408 suffix_array);

	409 }

	410 };

	411 };

	412

	413 // Generates a suffix array from \|str\|, a random access input range from which

	414 // suffixes are sorted, using the functor \|algorithm\| which provides an

	415 // interface equivalent to NaiveSuffixSort.

	416 // Characters found in \|str\| are assumed to be in range [0, \|max_key\|).

	417 // Returns the suffix array as a vector.

	418 template <class Algorithm, class StrRng, class KeyType>

	419 std::vector<typename StrRng::size_type> MakeSuffixArray(Algorithm algorithm,
	huangs 2017/06/27 20:31:21 \|algorithm\| is unused? \|algorithm\| is unused? etiennep1 2017/06/28 19:26:09 It is used as a tag and allows the template parame Show quoted text On 2017/06/27 20:31:21, huangs wrote: > \|algorithm\| is unused? It is used as a tag and allows the template parameter Algorithm to be inferred. Instead of MakeSuffixSort<Sais>(...), we're using MakeSuffxSort(Sais(), ...) huangs 2017/06/28 20:02:55 I think the first form is preferred, since it's be I think the first form is preferred, since it's better to be explicit, and the second form instantialization.
	420 const StrRng& str,

	421 KeyType max_key) {

	422 Algorithm sort;

	423 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());

	424 sort(str, max_key, suffix_array.begin());

	425 return suffix_array;

	426 }

	427

	428 // Lexicographical lower bound of \|str2\| in the suffix array of \|str1\| using

	429 // binary search.

	430 // \|str1_first\| is a random access iterator pointing to the beginning of \|str1\|.

	431 // \|str2_first\| and \|str2_last\| are forward iterators pointing to the beginning

	432 // and end of \|str2\|, respectively.

	433 template <class SARng, class StrIt1, class StrIt2>

	434 auto SearchSuffixArray(const SARng& suffix_array,

	435 StrIt1 str1_first,

	436 StrIt2 str2_first,

	437 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {

	438 using size_type = typename SARng::value_type;

	439

	440 size_t n = std::end(suffix_array) - std::begin(suffix_array);

	441 auto it = std::lower_bound(

	442 std::begin(suffix_array), std::end(suffix_array), str2_first,

	443 [str1_first, str2_last, n](size_type a, StrIt2 b) {

	444 return std::lexicographical_compare(str1_first + a, str1_first + n, b,

	445 str2_last);

	446 });

	447 return it;

	448 }

	449

	450 } // namespace zucchini

	451

	452 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_

OLD	NEW