OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2017 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
7 | |
8 #include <algorithm> | |
9 #include <cassert> | |
10 #include <iterator> | |
11 #include <numeric> | |
12 #include <vector> | |
13 | |
14 namespace zucchini { | |
15 | |
16 // A functor class that implements the naive suffix sorting algorithm | |
17 // that uses std::sort with lexicographical compare. | |
18 // This is only meant as reference of the interface. | |
19 class NaiveSuffixSort { | |
20 public: | |
21 // |str| must be a random access input range. | |
22 // Characters found in |str| must be in the range [0, |max_key|) | |
23 // |suffix_array| is a random access mutable range containing the result. | |
24 template <class InputRng, class KeyType, class SAIt> | |
25 void operator()(const InputRng& str, | |
26 KeyType max_key, | |
27 SAIt suffix_array) const { | |
28 using size_type = typename SAIt::value_type; | |
29 | |
30 size_type n = str.end() - str.begin(); | |
31 | |
32 // |suffix_array| is first filled with ordered indices of |str|. | |
33 // Those indices are then sorted with lexicographical comparisons in |str|. | |
34 std::iota(suffix_array, suffix_array + n, 0); | |
35 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) { | |
36 return std::lexicographical_compare(std::begin(str) + i, std::end(str), | |
37 std::begin(str) + j, std::end(str)); | |
38 }); | |
39 } | |
40 }; | |
41 | |
42 // A functor class that implements suffix array induced sorting (SA-IS) | |
43 // algorithm with linear time and memory complexity, | |
44 // see http://ieeexplore.ieee.org/abstract/document/5582081/ | |
45 class Sais { | |
46 public: | |
47 // |str| must be a random access input range. | |
48 // Characters found in |str| must be in the range [0, |max_key|) | |
49 // |suffix_array| is a random access mutable range containing the result. | |
50 template <class InputRng, class KeyType, class SAIt> | |
51 void operator()(const InputRng& str, | |
52 KeyType max_key, | |
53 SAIt suffix_array) const { | |
54 using value_type = typename InputRng::value_type; | |
55 using size_type = typename SAIt::value_type; | |
56 | |
57 static_assert(std::is_unsigned<value_type>::value, | |
58 "Sais only supports input string with unsigned values"); | |
59 | |
60 size_type n = static_cast<size_type>(str.end() - str.begin()); | |
61 | |
62 Implementation<size_type, typename std::make_unsigned<KeyType>::type>:: | |
63 SuffixSort(std::begin(str), n, max_key, suffix_array); | |
64 } | |
65 | |
66 // Given the string S of length n. | |
67 // We assume S is terminated by a unique sentinel $, which is considered as | |
68 // the smallest character. | |
69 // This sentinel does not exist in memory and is only treated implicitely. | |
huangs
2017/06/27 20:31:21
Unwrap comments to fit.
TYPO: implicitly
Period
etiennep1
2017/06/28 19:26:08
Done.
| |
70 // We denote suf(S, i) the suffix formed by S[i, n) | |
71 | |
72 // A suffix suf(S, i) is said to be S-type or L-type, | |
73 // if suf(S, i) < suf(S, i+1) or suf(S, i) > suf(S, i+1), respectively. | |
74 enum SLType : bool { SType, LType }; | |
75 | |
76 // A character S[i] is said to be S-type or L-type if the suffix suf(S, i) is | |
77 // S-type or L-type, respectively. | |
78 | |
79 // A character S[i], i is called LMS (leftmost S-type), | |
80 // if S[i] is S-type and S[i-1] is L-type. | |
81 // A suffix suf(S, i) is called LMS, if S[i] is an LMS character. | |
82 | |
83 // An LMS-substring is a substring S[i..j) with both S[i] and S[j], | |
84 // being LMS characters, and there is no other LMS character in the substring, | |
85 // or the sentinel itself | |
86 | |
87 template <class SizeType, class KeyType> | |
88 struct Implementation { | |
89 static_assert(std::is_unsigned<SizeType>::value, | |
90 "SizeType must be unsigned"); | |
91 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); | |
92 using size_type = SizeType; | |
93 using key_type = KeyType; | |
94 | |
95 using iterator = typename std::vector<size_type>::iterator; | |
96 using const_iterator = typename std::vector<size_type>::const_iterator; | |
97 | |
98 // Partition every suffix based on SL-type. | |
99 template <class StrIt> | |
100 static size_type BuildSLPartition( | |
101 StrIt str, | |
102 size_type length, | |
103 key_type max_key, | |
104 std::vector<SLType>::reverse_iterator sl_partition) { | |
105 // We will count LMS suffixes (S to L-type or last S-type) | |
106 size_type lms_count = 0; | |
107 | |
108 // |previous_type| is initialized to L-type to avoid counting an extra | |
109 // LMS suffix at the end | |
110 SLType previous_type = LType; | |
111 | |
112 key_type previous_key = max_key; // initialized to dummy, impossible key | |
113 | |
114 // We're travelling backward to determine the partition, | |
115 // as if we prepend one character at a time to the string, ex: | |
116 // b$ is L-type because b > $ | |
117 // ab$ is S-type because a < b, implying ab$ < b$ | |
118 // bab$ is L-type because b > a, implying bab$ > ab$ | |
119 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$ | |
120 for (auto str_it = std::reverse_iterator<StrIt>(str + length); | |
121 str_it != std::reverse_iterator<StrIt>(str); | |
122 ++str_it, ++sl_partition) { | |
123 key_type current_key = *str_it; | |
124 | |
125 if (current_key > previous_key || previous_key == max_key) { | |
126 // S[i] > S[i + 1] or S[i] is last character | |
127 *sl_partition = LType; | |
128 if (previous_type == SType) | |
129 // suf(S, i) is L-type and suf(S, i + 1) is S-type, | |
130 // therefore, suf(S, i+1) was a LMS suffix. | |
131 ++lms_count; | |
132 | |
133 previous_type = LType; // for next round | |
134 } else if (current_key < previous_key) { | |
135 // S[i] < S[i + 1] | |
136 *sl_partition = SType; | |
137 previous_type = SType; // for next round | |
138 } else { | |
139 // S[i] == S[i + 1] | |
140 // The next character that differs determines the SL-type, | |
141 // so we reuse the last seen type. | |
142 *sl_partition = previous_type; | |
143 } | |
144 previous_key = current_key; // for next round | |
145 } | |
146 | |
147 return lms_count; | |
148 } | |
149 | |
150 // Find indices of LMS suffixes and write result to |lms_indices|. | |
151 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition, | |
152 iterator lms_indices) { | |
153 // |previous_type| is initialized to S-type to avoid counting an extra | |
154 // LMS suffix at the beginning | |
155 SLType previous_type = SType; | |
156 size_type j = 0; | |
157 for (size_type i = 0; i < sl_partition.size(); ++i) { | |
158 if (sl_partition[i] == SType && previous_type == LType) | |
159 lms_indices[j++] = i; | |
160 previous_type = sl_partition[i]; | |
161 } | |
162 } | |
163 | |
164 template <class StrIt> | |
165 static std::vector<size_type> MakeBucketCount(StrIt str, | |
166 size_type length, | |
167 key_type max_key) { | |
168 // Occurence of every unique character is counted in |buckets| | |
169 std::vector<size_type> buckets(static_cast<size_type>(max_key)); | |
170 | |
171 for (auto it = str; it != str + length; ++it) | |
172 ++buckets[*it]; | |
173 return buckets; | |
174 } | |
175 | |
176 // Apply induced sort from |lms_indices| to |suffix_array| associated with | |
177 // the string |str|. | |
178 template <class StrIt, class SAIt> | |
179 static void InducedSort(StrIt str, | |
180 size_type length, | |
181 const std::vector<SLType>& sl_partition, | |
182 const std::vector<size_type>& lms_substrings, | |
183 const std::vector<size_type>& buckets, | |
184 SAIt suffix_array) { | |
185 // All indices are first marked as unset with 0. | |
186 std::fill(suffix_array, suffix_array + length, 0); | |
187 | |
188 // Used to mark bucket boundaries (head or end) as indices in str. | |
189 std::vector<size_type> bucket_bounds(buckets.size()); | |
190 | |
191 // Step 1 | |
192 // for each leftmost S-type suffix suf(S, i) found in |lms_indices| | |
193 // scanned backward, place suf(S, i) at the end of the corresponding | |
194 // bucket and forward the bucket end to the left. | |
195 | |
196 // By corresponding bucket for suf(S, i), we mean the bucket associated | |
197 // with the character S(i). | |
198 | |
199 // find the end of each bucket | |
200 bucket_bounds[0] = buckets[0]; | |
201 for (key_type i = 1; i < buckets.size(); ++i) | |
202 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i]; | |
203 | |
204 for (auto it = lms_substrings.crbegin(); it != lms_substrings.crend(); | |
205 ++it) { | |
206 key_type key = str[*it]; | |
207 suffix_array[--bucket_bounds[key]] = *it; | |
208 } | |
209 | |
210 // Step 2 | |
211 // for each modified suf(S, i), scanned forward, for which | |
212 // suf(S, SA(i) - 1) is L-type, | |
213 // place suf(S, SA(i) - 1) to the current head of the corresponding | |
214 // bucket and forward the bucket head to the right. | |
215 | |
216 // find the head of each bucket | |
217 bucket_bounds[0] = 0; | |
218 for (key_type i = 1; i < buckets.size(); ++i) | |
219 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i - 1]; | |
220 | |
221 // from Step 1, the sentinel $, which we treat implicitely, would have | |
222 // been place at the beginning of |suffix_array| since $ is always | |
223 // considered as the smallest character. | |
224 // We then have to deal with the previous (last) suffix. | |
225 if (sl_partition[length - 1] == LType) { | |
226 key_type key = str[length - 1]; | |
227 suffix_array[bucket_bounds[key]++] = length - 1; | |
228 } | |
229 for (auto it = suffix_array; it != suffix_array + length; ++it) { | |
230 size_type suffix_index = *it; | |
231 | |
232 // While the original algorithm marks unset suffixes with -1, | |
233 // we found that marking them with 0 is also possible, since suf(S, 0) | |
234 // has no previous suffix, and also more convenient because we are | |
235 // working with unsigned integers. | |
236 if (suffix_index > 0 && sl_partition[--suffix_index] == LType) { | |
237 key_type key = str[suffix_index]; | |
238 suffix_array[bucket_bounds[key]++] = suffix_index; | |
239 } | |
240 } | |
241 | |
242 // Step 3 | |
243 // for each modified suf(S, i), scanned backward, for which | |
244 // suf(S, SA(i) - 1) is S-type, | |
245 // place suf(S, SA(i) - 1) to the current end of the corresponding | |
246 // bucket and forward the bucket head to the left. | |
247 | |
248 // find the end of each bucket | |
249 bucket_bounds[0] = buckets[0]; | |
250 for (size_type i = 1; i < buckets.size(); ++i) | |
251 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i]; | |
252 | |
253 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length); | |
254 it != std::reverse_iterator<SAIt>(suffix_array); ++it) { | |
255 size_type suffix_index = *it; | |
256 if (suffix_index > 0 && sl_partition[--suffix_index] == SType) { | |
257 key_type key = str[suffix_index]; | |
258 suffix_array[--bucket_bounds[key]] = suffix_index; | |
259 } | |
260 } | |
261 // Deals with the last suffix, because of the sentinel. | |
262 if (sl_partition[length - 1] == SType) { | |
263 key_type key = str[length - 1]; | |
264 suffix_array[--bucket_bounds[key]] = length - 1; | |
265 } | |
266 } | |
267 | |
268 // Given a string S starting at |str| with length |length|, | |
269 // an array starting at |substring_array| containing ordered LMS terminated | |
270 // substring indices of S and an SL-Type partition |sl_partition| of S, | |
271 // assigns a unique label to every unique LMS substring. | |
272 // The sorted labels for all LMS substrings are written to |lms_str|, | |
273 // while the indices of LMS suffixes are written to |lms_indices|. | |
274 // In addition, returns the total number of unique labels. | |
275 template <class StrIt, class SAIt> | |
276 static size_type LabelLmsSubstrings(StrIt str, | |
277 size_type length, | |
278 const std::vector<SLType>& sl_partition, | |
279 SAIt suffix_array, | |
280 iterator lms_indices, | |
281 iterator lms_str) { | |
282 // Labelling starts at 0 | |
283 size_type label = 0; | |
284 | |
285 // |previous_lms| is initialized to 0 to indicate it is unset. | |
286 // Note that suf(S, 0) is never a LMS suffix. | |
287 // Substrings will be visited in relative order | |
288 size_type previous_lms = 0; | |
289 for (auto it = suffix_array; it != suffix_array + length; ++it) { | |
290 if (*it > 0 && sl_partition[*it] == SType && | |
291 sl_partition[*it - 1] == LType) { | |
292 // suf(S, *it) is a LMS suffix | |
293 | |
294 size_type current_lms = *it; | |
295 if (previous_lms != 0) { | |
296 // There was a previous LMS suffix | |
297 // Check if the current LMS substring is equal to the previous one | |
298 SLType current_lms_type = SType, previous_lms_type = SType; | |
299 for (size_type k = 0;; ++k) { | |
300 // |current_lms_end| and |previous_lms_end| denote weither we have | |
301 // reached the end of the current and previous LMS substring, | |
302 // respectively | |
303 bool current_lms_end = false, previous_lms_end = false; | |
304 | |
305 // Check both previous and current substrings end ie | |
306 // Note: it is more convenient to check if suf(S, current_lms + k) | |
307 // is an LMS suffix than to retrieve it from lms_indices. | |
308 if (current_lms + k >= length || | |
309 (current_lms_type == LType && | |
310 sl_partition[current_lms + k] == SType)) { | |
311 current_lms_end = true; | |
312 } | |
313 if (previous_lms + k >= length || | |
314 (previous_lms_type == LType && | |
315 sl_partition[previous_lms + k] == SType)) { | |
316 previous_lms_end = true; | |
317 } | |
318 | |
319 if (current_lms_end && previous_lms_end) { | |
320 break; | |
321 } else if (current_lms_end != previous_lms_end || | |
322 str[current_lms + k] != str[previous_lms + k]) { | |
323 // previous and current substrings differ, | |
324 ++label; // use a new label | |
325 break; | |
326 } | |
327 | |
328 current_lms_type = sl_partition[current_lms + k]; | |
329 previous_lms_type = sl_partition[previous_lms + k]; | |
330 } | |
331 } | |
332 *lms_indices++ = *it; | |
333 *lms_str++ = label; | |
334 previous_lms = current_lms; | |
335 } | |
336 } | |
337 | |
338 return ++label; | |
339 } | |
340 | |
341 // Implementation of the SA-IS algorithm. | |
342 // |str| must be a random access iterator pointing at the beginning of S | |
343 // with length |length|. | |
344 // The result is writtend in |suffix_array|, a random access iterator. | |
345 template <class StrIt, class SAIt> | |
346 static void SuffixSort(StrIt str, | |
347 size_type length, | |
348 key_type max_key, | |
349 SAIt suffix_array) { | |
350 if (length < 2) | |
huangs
2017/06/27 20:31:21
Should |suffix_array| be assigned [0] for the case
etiennep1
2017/06/28 19:26:09
That sounds like a good idea
| |
351 return; | |
352 | |
353 std::vector<SLType> sl_partition(length); | |
354 size_type lms_count = | |
355 BuildSLPartition(str, length, max_key, sl_partition.rbegin()); | |
356 std::vector<size_type> lms_indices(lms_count); | |
357 FindLmsSuffixes(sl_partition, lms_indices.begin()); | |
358 std::vector<size_type> buckets = MakeBucketCount(str, length, max_key); | |
359 | |
360 if (lms_indices.size() > 1) { | |
361 // Given |lms_indices| in order of apparition, induce LMS substrings | |
362 // relative order and write result to |suffix_array|. | |
363 InducedSort(str, length, sl_partition, lms_indices, buckets, | |
364 suffix_array); | |
365 std::vector<size_type> lms_str(lms_indices.size()); | |
366 | |
367 // Given LMS substrings in relative order found in |suffix_array|, | |
368 // map LMS substrings to unique labels to form a new string, |lms_str|. | |
369 size_type label_count = | |
370 LabelLmsSubstrings(str, length, sl_partition, suffix_array, | |
371 lms_indices.begin(), lms_str.begin()); | |
372 | |
373 if (label_count < lms_str.size()) { | |
374 // Reorder |lms_str| to have LMS suffixes in order of apparition. | |
375 for (size_type i = 0; i < lms_indices.size(); ++i) | |
376 suffix_array[lms_indices[i]] = lms_str[i]; | |
377 | |
378 SLType previous_type = SType; | |
379 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) { | |
380 if (sl_partition[i] == SType && previous_type == LType) { | |
381 lms_str[j] = suffix_array[i]; | |
382 lms_indices[j++] = i; | |
383 } | |
384 previous_type = sl_partition[i]; | |
385 } | |
386 | |
387 // Recursively apply SuffixSort on |lms_str|, which is | |
388 // formed from labeled LMS suffixes in order of apparition. | |
389 // |lms_str| is at most half the length of |str|. | |
390 Implementation<size_type, size_type>::SuffixSort( | |
391 lms_str.begin(), static_cast<size_type>(lms_str.size()), | |
392 label_count, suffix_array); | |
393 | |
394 // Map LMS labels back to indices in |str| and | |
395 // write result to |lms_indices|. | |
396 // We're using |suffix_array| as a temporary buffer. | |
397 for (size_type i = 0; i < lms_indices.size(); ++i) | |
398 suffix_array[i] = lms_indices[suffix_array[i]]; | |
399 for (size_type i = 0; i < lms_indices.size(); ++i) | |
400 lms_indices[i] = suffix_array[i]; | |
401 | |
402 // At this point, |lms_indices| contains sorted LMS suffixes of |str|. | |
403 } | |
404 } | |
405 // Given |lms_indices| where LMS suffixes are sorted, | |
406 // induce the full order of suffixes in |str| | |
407 InducedSort(str, length, sl_partition, lms_indices, buckets, | |
408 suffix_array); | |
409 } | |
410 }; | |
411 }; | |
412 | |
413 // Generates a suffix array from |str|, a random access input range from which | |
414 // suffixes are sorted, using the functor |algorithm| which provides an | |
415 // interface equivalent to NaiveSuffixSort. | |
416 // Characters found in |str| are assumed to be in range [0, |max_key|). | |
417 // Returns the suffix array as a vector. | |
418 template <class Algorithm, class StrRng, class KeyType> | |
419 std::vector<typename StrRng::size_type> MakeSuffixArray(Algorithm algorithm, | |
huangs
2017/06/27 20:31:21
|algorithm| is unused?
etiennep1
2017/06/28 19:26:09
It is used as a tag and allows the template parame
huangs
2017/06/28 20:02:55
I think the first form is preferred, since it's be
| |
420 const StrRng& str, | |
421 KeyType max_key) { | |
422 Algorithm sort; | |
423 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin()); | |
424 sort(str, max_key, suffix_array.begin()); | |
425 return suffix_array; | |
426 } | |
427 | |
428 // Lexicographical lower bound of |str2| in the suffix array of |str1| using | |
429 // binary search. | |
430 // |str1_first| is a random access iterator pointing to the beginning of |str1|. | |
431 // |str2_first| and |str2_last| are forward iterators pointing to the beginning | |
432 // and end of |str2|, respectively. | |
433 template <class SARng, class StrIt1, class StrIt2> | |
434 auto SearchSuffixArray(const SARng& suffix_array, | |
435 StrIt1 str1_first, | |
436 StrIt2 str2_first, | |
437 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) { | |
438 using size_type = typename SARng::value_type; | |
439 | |
440 size_t n = std::end(suffix_array) - std::begin(suffix_array); | |
441 auto it = std::lower_bound( | |
442 std::begin(suffix_array), std::end(suffix_array), str2_first, | |
443 [str1_first, str2_last, n](size_type a, StrIt2 b) { | |
444 return std::lexicographical_compare(str1_first + a, str1_first + n, b, | |
445 str2_last); | |
446 }); | |
447 return it; | |
448 } | |
449 | |
450 } // namespace zucchini | |
451 | |
452 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
OLD | NEW |