Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1125)

Side by Side Diff: chrome/installer/zucchini/suffix_array.h

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)
Patch Set: Fix comment issue Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « chrome/installer/zucchini/BUILD.gn ('k') | chrome/installer/zucchini/suffix_array_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
7
8 #include <algorithm>
9 #include <cassert>
10 #include <iterator>
11 #include <numeric>
12 #include <vector>
13
14 #include "base/logging.h"
15
16 namespace zucchini {
17
18 // A functor class that implements the naive suffix sorting algorithm that uses
19 // std::sort with lexicographical compare. This is only meant as reference of
20 // the interface.
21 class NaiveSuffixSort {
22 public:
23 // Type requirements:
24 // |InputRng| is an input random access range.
25 // |KeyType| is an unsigned integer type.
26 // |SAIt| is a random access iterator with mutable references.
27 template <class InputRng, class KeyType, class SAIt>
28 // |str| is the input string on which suffix sort is applied.
29 // Characters found in |str| must be in the range [0, |key_bound|)
30 // |suffix_array| is the beginning of the destination range, which is at least
31 // as large as |str|.
32 void operator()(const InputRng& str,
33 KeyType key_bound,
34 SAIt suffix_array) const {
35 using size_type = typename SAIt::value_type;
36
37 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
38
39 // |suffix_array| is first filled with ordered indices of |str|.
40 // Those indices are then sorted with lexicographical comparisons in |str|.
41 std::iota(suffix_array, suffix_array + n, 0);
42 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
43 return std::lexicographical_compare(std::begin(str) + i, std::end(str),
44 std::begin(str) + j, std::end(str));
45 });
46 }
47 };
48
49 // A functor class that implements suffix array induced sorting (SA-IS)
50 // algorithm with linear time and memory complexity,
51 // see http://ieeexplore.ieee.org/abstract/document/5582081/
52 class Sais {
53 public:
54 // Type requirements:
55 // |InputRng| is an input random access range.
56 // |KeyType| is an unsigned integer type.
57 // |SAIt| is a random access iterator with mutable values.
58 template <class InputRng, class KeyType, class SAIt>
59 // |str| is the input string on which suffix sort is applied.
60 // Characters found in |str| must be in the range [0, |key_bound|)
61 // |suffix_array| is the beginning of the destination range, which is at least
62 // as large as |str|.
63 void operator()(const InputRng& str,
64 KeyType key_bound,
65 SAIt suffix_array) const {
66 using value_type = typename InputRng::value_type;
67 using size_type = typename SAIt::value_type;
68
69 static_assert(std::is_unsigned<value_type>::value,
70 "Sais only supports input string with unsigned values");
71 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
72
73 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
74
75 Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
76 key_bound, suffix_array);
77 }
78
79 // Given string S of length n. We assume S is terminated by a unique sentinel
80 // $, which is considered as the smallest character. This sentinel does not
81 // exist in memory and is only treated implicitly, hence |n| does not count
82 // the sentinel in this implementation. We denote suf(S,i) the suffix formed
83 // by S[i..n).
84
85 // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
86 // or suf(S,i) > suf(S,i+1), respectively.
87 enum SLType : bool { SType, LType };
88
89 // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
90 // S-type or L-type, respectively.
91
92 // A character S[i] is called LMS (leftmost S-type), if S[i] is S-type and
93 // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
94 // character.
95
96 // An LMS-substring is a substring S[i..j) with both S[i] and S[j], being LMS
97 // characters, and there is no other LMS character in the substring, or the
98 // sentinel itself
99
100 // A substring S[i..j) is an LMS-substring if
huangs 2017/07/10 20:01:18 I meant to have this replace the above comment --
etiennep1 2017/07/10 22:57:08 Oops, I forgot to remove the above.
101 // (1) S[i] is LMS, S[j] is LMS or the sentinel $, and S[i..j) has no other
102 // LMS characters, or
103 // (2) S[i..j) is the sentinel $.
104
105 template <class SizeType, class KeyType>
106 struct Implementation {
107 static_assert(std::is_unsigned<SizeType>::value,
108 "SizeType must be unsigned");
109 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
110 using size_type = SizeType;
111 using key_type = KeyType;
112
113 using iterator = typename std::vector<size_type>::iterator;
114 using const_iterator = typename std::vector<size_type>::const_iterator;
115
116 // Partition every suffix based on SL-type. Returns the number of LMS
117 // suffixes.
118 template <class StrIt>
119 static size_type BuildSLPartition(
120 StrIt str,
121 size_type length,
122 key_type key_bound,
123 std::vector<SLType>::reverse_iterator sl_partition_it) {
124 // We will count LMS suffixes (S to L-type or last S-type).
125 size_type lms_count = 0;
126
127 // |previous_type| is initialized to L-type to avoid counting an extra
128 // LMS suffix at the end
129 SLType previous_type = LType;
130
131 // Initialized to dummy, impossible key.
132 key_type previous_key = key_bound;
133
134 // We're travelling backward to determine the partition,
135 // as if we prepend one character at a time to the string, ex:
136 // b$ is L-type because b > $.
137 // ab$ is S-type because a < b, implying ab$ < b$.
138 // bab$ is L-type because b > a, implying bab$ > ab$.
139 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$.
140 for (auto str_it = std::reverse_iterator<StrIt>(str + length);
141 str_it != std::reverse_iterator<StrIt>(str);
142 ++str_it, ++sl_partition_it) {
143 key_type current_key = *str_it;
144
145 if (current_key > previous_key || previous_key == key_bound) {
146 // S[i] > S[i + 1] or S[i] is last character.
147 if (previous_type == SType)
148 // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
149 // suf(S,i+1) was a LMS suffix.
150 ++lms_count;
151
152 previous_type = LType; // For next round.
153 } else if (current_key < previous_key) {
154 // S[i] < S[i + 1]
155 previous_type = SType; // For next round.
156 }
157 // Else, S[i] == S[i + 1]:
158 // The next character that differs determines the SL-type,
159 // so we reuse the last seen type.
160
161 *sl_partition_it = previous_type;
162 previous_key = current_key; // For next round.
163 }
164
165 return lms_count;
166 }
167
168 // Find indices of LMS suffixes and write result to |lms_indices|.
169 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
170 iterator lms_indices) {
171 // |previous_type| is initialized to S-type to avoid counting an extra
172 // LMS suffix at the beginning
173 SLType previous_type = SType;
174 for (size_type i = 0; i < sl_partition.size(); ++i) {
175 if (sl_partition[i] == SType && previous_type == LType)
176 *lms_indices++ = i;
177 previous_type = sl_partition[i];
178 }
179 }
180
181 template <class StrIt>
182 static std::vector<size_type> MakeBucketCount(StrIt str,
183 size_type length,
184 key_type key_bound) {
185 // Occurrence of every unique character is counted in |buckets|
186 std::vector<size_type> buckets(static_cast<size_type>(key_bound));
187
188 for (auto it = str; it != str + length; ++it)
189 ++buckets[*it];
190 return buckets;
191 }
192
193 // Apply induced sort from |lms_indices| to |suffix_array| associated with
194 // the string |str|.
195 template <class StrIt, class SAIt>
196 static void InducedSort(StrIt str,
197 size_type length,
198 const std::vector<SLType>& sl_partition,
199 const std::vector<size_type>& lms_indices,
200 const std::vector<size_type>& buckets,
201 SAIt suffix_array) {
202 // All indices are first marked as unset with the illegal value |length|.
203 std::fill(suffix_array, suffix_array + length, length);
204
205 // Used to mark bucket boundaries (head or end) as indices in str.
206 DCHECK(!buckets.empty());
207 std::vector<size_type> bucket_bounds(buckets.size());
208
209 // Step 1: Assign indices for LMS suffixes, populating the end of
210 // respective buckets but keeping relative order.
211
212 // Find the end of each bucket and write it to |bucket_bounds|.
213 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
214
215 // Process each |lms_indices| backward, and assign them to the end of
216 // their respective buckets, so relative order is preserved.
217 for (auto it = lms_indices.crbegin(); it != lms_indices.crend(); ++it) {
218 key_type key = str[*it];
219 suffix_array[--bucket_bounds[key]] = *it;
220 }
221
222 // Step 2
223 // Scan forward |suffix_array|; for each modified suf(S,i) for which
224 // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
225 // head of the corresponding bucket and forward the bucket head to the
226 // right.
227
228 // Find the head of each bucket and write it to |bucket_bounds|. Since
229 // only LMS suffixes where inserted in |suffix_array| during Step 1,
230 // |bucket_bounds| does not contains the head of each bucket and needs to
231 // be updated.
232 bucket_bounds[0] = 0;
233 std::partial_sum(buckets.begin(), buckets.end() - 1,
234 bucket_bounds.begin() + 1);
235
236 // From Step 1, the sentinel $, which we treat implicitly, would have
237 // been placed at the beginning of |suffix_array|, since $ is always
238 // considered as the smallest character. We then have to deal with the
239 // previous (last) suffix.
240 if (sl_partition[length - 1] == LType) {
241 key_type key = str[length - 1];
242 suffix_array[bucket_bounds[key]++] = length - 1;
243 }
244 for (auto it = suffix_array; it != suffix_array + length; ++it) {
245 size_type suffix_index = *it;
246
247 // While the original algorithm marks unset suffixes with -1,
248 // we found that marking them with |length| is also possible and more
249 // convenient because we are working with unsigned integers.
250 if (suffix_index != length && suffix_index > 0 &&
251 sl_partition[--suffix_index] == LType) {
252 key_type key = str[suffix_index];
253 suffix_array[bucket_bounds[key]++] = suffix_index;
254 }
255 }
256
257 // Step 3
258 // Scan backward |suffix_array|; for each modified suf(S, i) for which
259 // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
260 // end of the corresponding bucket and forward the bucket head to the
261 // left.
262
263 // Find the end of each bucket and write it to |bucket_bounds|. Since
264 // only L-type suffixes where inserted in |suffix_array| during Step 2,
265 // |bucket_bounds| does not contain the end of each bucket and needs to
266 // be updated.
267 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
268
269 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
270 it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
271 size_type suffix_index = *it;
272 if (suffix_index != length && suffix_index > 0 &&
273 sl_partition[--suffix_index] == SType) {
274 key_type key = str[suffix_index];
275 suffix_array[--bucket_bounds[key]] = suffix_index;
276 }
277 }
278 // Deals with the last suffix, because of the sentinel.
279 if (sl_partition[length - 1] == SType) {
280 key_type key = str[length - 1];
281 suffix_array[--bucket_bounds[key]] = length - 1;
282 }
283 }
284
285 // Given a string S starting at |str| with length |length|, an array
286 // starting at |substring_array| containing lexicographically ordered LMS
287 // terminated substring indices of S and an SL-Type partition |sl_partition|
288 // of S, assigns a unique label to every unique LMS substring. The sorted
289 // labels for all LMS substrings are written to |lms_str|, while the indices
290 // of LMS suffixes are written to |lms_indices|. In addition, returns the
291 // total number of unique labels.
292 template <class StrIt, class SAIt>
293 static size_type LabelLmsSubstrings(StrIt str,
294 size_type length,
295 const std::vector<SLType>& sl_partition,
296 SAIt suffix_array,
297 iterator lms_indices,
298 iterator lms_str) {
299 // Labelling starts at 0.
300 size_type label = 0;
301
302 // |previous_lms| is initialized to 0 to indicate it is unset.
303 // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
304 // lexicographical order.
305 size_type previous_lms = 0;
306 for (auto it = suffix_array; it != suffix_array + length; ++it) {
307 if (*it > 0 && sl_partition[*it] == SType &&
308 sl_partition[*it - 1] == LType) {
309 // suf(S, *it) is a LMS suffix.
310
311 size_type current_lms = *it;
312 if (previous_lms != 0) {
313 // There was a previous LMS suffix. Check if the current LMS
314 // substring is equal to the previous one.
315 SLType current_lms_type = SType;
316 SLType previous_lms_type = SType;
317 for (size_type k = 0;; ++k) {
318 // |current_lms_end| and |previous_lms_end| denote whether we have
319 // reached the end of the current and previous LMS substring,
320 // respectively
321 bool current_lms_end = false;
322 bool previous_lms_end = false;
323
324 // Check for both previous and current substring ends.
325 // Note that it is more convenient to check if
326 // suf(S,current_lms + k) is an LMS suffix than to retrieve it
327 // from lms_indices.
328 if (current_lms + k >= length ||
329 (current_lms_type == LType &&
330 sl_partition[current_lms + k] == SType)) {
331 current_lms_end = true;
332 }
333 if (previous_lms + k >= length ||
334 (previous_lms_type == LType &&
335 sl_partition[previous_lms + k] == SType)) {
336 previous_lms_end = true;
337 }
338
339 if (current_lms_end && previous_lms_end) {
340 break; // Previous and current substrings are identical.
341 } else if (current_lms_end != previous_lms_end ||
342 str[current_lms + k] != str[previous_lms + k]) {
343 // Previous and current substrings differ, a new label is used.
344 ++label;
345 break;
346 }
347
348 current_lms_type = sl_partition[current_lms + k];
349 previous_lms_type = sl_partition[previous_lms + k];
350 }
351 }
352 *lms_indices++ = *it;
353 *lms_str++ = label;
354 previous_lms = current_lms;
355 }
356 }
357
358 return label + 1;
359 }
360
361 // Implementation of the SA-IS algorithm. |str| must be a random access
362 // iterator pointing at the beginning of S with length |length|. The result
363 // is writtend in |suffix_array|, a random access iterator.
364 template <class StrIt, class SAIt>
365 static void SuffixSort(StrIt str,
366 size_type length,
367 key_type key_bound,
368 SAIt suffix_array) {
369 if (length == 1)
370 *suffix_array = 0;
371 if (length < 2)
372 return;
373
374 std::vector<SLType> sl_partition(length);
375 size_type lms_count =
376 BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
377 std::vector<size_type> lms_indices(lms_count);
378 FindLmsSuffixes(sl_partition, lms_indices.begin());
379 std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);
380
381 if (lms_indices.size() > 1) {
382 // Given |lms_indices| in the same order they appear in |str|, induce
383 // LMS substrings relative order and write result to |suffix_array|.
384 InducedSort(str, length, sl_partition, lms_indices, buckets,
385 suffix_array);
386 std::vector<size_type> lms_str(lms_indices.size());
387
388 // Given LMS substrings in relative order found in |suffix_array|,
389 // map LMS substrings to unique labels to form a new string, |lms_str|.
390 size_type label_count =
391 LabelLmsSubstrings(str, length, sl_partition, suffix_array,
392 lms_indices.begin(), lms_str.begin());
393
394 if (label_count < lms_str.size()) {
395 // Reorder |lms_str| to have LMS suffixes in the same order they
396 // appear in |str|.
397 for (size_type i = 0; i < lms_indices.size(); ++i)
398 suffix_array[lms_indices[i]] = lms_str[i];
399
400 SLType previous_type = SType;
401 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
402 if (sl_partition[i] == SType && previous_type == LType) {
403 lms_str[j] = suffix_array[i];
404 lms_indices[j++] = i;
405 }
406 previous_type = sl_partition[i];
407 }
408
409 // Recursively apply SuffixSort on |lms_str|, which is formed from
410 // labeled LMS suffixes in the same order they appear in |str|.
411 // Note that |KeyType| will be size_type because |lms_str| contains
412 // indices. |lms_str| is at most half the length of |str|.
413 Implementation<size_type, size_type>::SuffixSort(
414 lms_str.begin(), static_cast<size_type>(lms_str.size()),
415 label_count, suffix_array);
416
417 // Map LMS labels back to indices in |str| and write result to
418 // |lms_indices|. We're using |suffix_array| as a temporary buffer.
419 for (size_type i = 0; i < lms_indices.size(); ++i)
420 suffix_array[i] = lms_indices[suffix_array[i]];
421 std::copy_n(suffix_array, lms_indices.size(), lms_indices.begin());
422
423 // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
424 }
425 }
426 // Given |lms_indices| where LMS suffixes are sorted, induce the full
427 // order of suffixes in |str|.
428 InducedSort(str, length, sl_partition, lms_indices, buckets,
429 suffix_array);
430 }
431 };
432 };
433
434 // Generates a sorted suffix array for the input string |str| using the functor
435 // |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
436 /// Characters found in |str| are assumed to be in range [0, |key_bound|).
437 // Returns the suffix array as a vector.
438 // |StrRng| is an input random access range.
439 // |KeyType| is an unsigned integer type.
440 template <class Algorithm, class StrRng, class KeyType>
441 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
442 KeyType key_bound) {
443 Algorithm sort;
444 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
445 sort(str, key_bound, suffix_array.begin());
446 return suffix_array;
447 }
448
449 // Type requirements:
450 // |SARng| is an input random access range.
451 // |StrIt1| is a random access iterator.
452 // |StrIt2| is a forward iterator.
453 template <class SARng, class StrIt1, class StrIt2>
454 // Lexicographical lower bound using binary search for
455 // [|str2_first|, |str2_last|) in the suffix array |suffix_array| of a string
456 // starting at |str1_first|. This does not necessarily return the index of
457 // the longest matching substring.
458 auto SuffixLowerBound(const SARng& suffix_array,
459 StrIt1 str1_first,
460 StrIt2 str2_first,
461 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
462 using size_type = typename SARng::value_type;
463
464 size_t n = std::end(suffix_array) - std::begin(suffix_array);
465 auto it = std::lower_bound(
466 std::begin(suffix_array), std::end(suffix_array), str2_first,
467 [str1_first, str2_last, n](size_type a, StrIt2 b) {
468 return std::lexicographical_compare(str1_first + a, str1_first + n, b,
469 str2_last);
470 });
471 return it;
472 }
473
474 } // namespace zucchini
475
476 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
OLDNEW
« no previous file with comments | « chrome/installer/zucchini/BUILD.gn ('k') | chrome/installer/zucchini/suffix_array_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698