Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(796)

Side by Side Diff: chrome/installer/zucchini/suffix_array.h

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)
Patch Set: NIT corrections in comments Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
7
8 #include <algorithm>
9 #include <cassert>
10 #include <iterator>
11 #include <numeric>
12 #include <vector>
13
14 namespace zucchini {
15
16 // A functor class that implements the naive suffix sorting algorithm that uses
17 // std::sort with lexicographical compare. This is only meant as reference of
18 // the interface.
19 class NaiveSuffixSort {
20 public:
21 // |str| is the input string on which suffix sort is applied.
huangs 2017/07/05 05:51:58 It's a bit odd that the variables are mentioned be
etiennep1 2017/07/10 17:51:08 Following reference for stl, I added "Type require
22 // Characters found in |str| must be in the range [0, |key_bound|)
23 // |suffix_array| is the beginning of the destination range, which is at least
24 // as large as |str|.
25 // |InputRng| is an input random access range.
26 // |KeyType| is an unsigned integer type.
27 // |SAIt| is an random access iterator with mutable references.
28 template <class InputRng, class KeyType, class SAIt>
29 void operator()(const InputRng& str,
30 KeyType key_bound,
31 SAIt suffix_array) const {
32 using size_type = typename SAIt::value_type;
33
34 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
35
36 // |suffix_array| is first filled with ordered indices of |str|.
37 // Those indices are then sorted with lexicographical comparisons in |str|.
38 std::iota(suffix_array, suffix_array + n, 0);
39 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
40 return std::lexicographical_compare(std::begin(str) + i, std::end(str),
41 std::begin(str) + j, std::end(str));
42 });
43 }
44 };
45
46 // A functor class that implements suffix array induced sorting (SA-IS)
47 // algorithm with linear time and memory complexity,
48 // see http://ieeexplore.ieee.org/abstract/document/5582081/
49 class Sais {
50 public:
51 // |str| is the input string on which suffix sort is applied.
52 // Characters found in |str| must be in the range [0, |key_bound|)
53 // |suffix_array| is the beginning of the destination range, which is at least
54 // as large as |str|.
55 // |InputRng| is an input random access range.
56 // |KeyType| is an unsigned integer type.
57 // |SAIt| is an random access iterator with mutable values.
58 template <class InputRng, class KeyType, class SAIt>
59 void operator()(const InputRng& str,
60 KeyType key_bound,
61 SAIt suffix_array) const {
62 using value_type = typename InputRng::value_type;
63 using size_type = typename SAIt::value_type;
64
65 static_assert(std::is_unsigned<value_type>::value,
66 "Sais only supports input string with unsigned values");
67 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
68
69 size_type n = static_cast<size_type>(std::end(str) - std::begin(str));
70
71 Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n,
72 key_bound, suffix_array);
73 }
74
75 // Given the string S of length n. We assume S is terminated by a unique
huangs 2017/07/05 05:51:57 NIT: Remove "the" in "the string". NIT: In the pa
etiennep1 2017/07/10 17:51:08 Done. |n| (or |length|) does not count the sentine
76 // sentinel $, which is considered as the smallest character. This sentinel
77 // does not exist in memory and is only treated implicitly. We denote
78 // suf(S,i) the suffix formed by S[i..n).
79
80 // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1)
81 // or suf(S,i) > suf(S,i+1), respectively.
82 enum SLType : bool { SType, LType };
83
84 // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is
85 // S-type or L-type, respectively.
86
87 // A character S[i], i is called LMS (leftmost S-type), if S[i] is S-type and
huangs 2017/07/05 05:51:58 ", i" seems redundant?
etiennep1 2017/07/10 17:51:07 Yeah, don't know how it got there.
88 // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS
89 // character.
90
91 // An LMS-substring is a substring S[i..j) with both S[i] and S[j], being LMS
huangs 2017/07/05 05:51:58 // A substring S[i..j) is an LMS-substring if (1)
etiennep1 2017/07/10 17:51:07 Done.
92 // characters, and there is no other LMS character in the substring, or the
93 // sentinel itself
94
95 template <class SizeType, class KeyType>
96 struct Implementation {
97 static_assert(std::is_unsigned<SizeType>::value,
98 "SizeType must be unsigned");
99 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
100 using size_type = SizeType;
101 using key_type = KeyType;
102
103 using iterator = typename std::vector<size_type>::iterator;
104 using const_iterator = typename std::vector<size_type>::const_iterator;
105
106 // Partition every suffix based on SL-type.
huangs 2017/07/05 05:51:57 // ... Returns the number of LMS suffixes.
etiennep1 2017/07/10 17:51:07 Done.
107 template <class StrIt>
108 static size_type BuildSLPartition(
109 StrIt str,
110 size_type length,
111 key_type key_bound,
112 std::vector<SLType>::reverse_iterator sl_partition) {
huangs 2017/07/10 02:45:29 NIT: |sl_partition| is actually an interator, and
etiennep1 2017/07/10 17:51:08 Done.
113 // We will count LMS suffixes (S to L-type or last S-type).
114 size_type lms_count = 0;
115
116 // |previous_type| is initialized to L-type to avoid counting an extra
117 // LMS suffix at the end
118 SLType previous_type = LType;
119
120 // Initialized to dummy, impossible key.
121 key_type previous_key = key_bound;
122
123 // We're travelling backward to determine the partition,
124 // as if we prepend one character at a time to the string, ex:
125 // b$ is L-type because b > $
126 // ab$ is S-type because a < b, implying ab$ < b$
127 // bab$ is L-type because b > a, implying bab$ > ab$
128 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$
129 for (auto str_it = std::reverse_iterator<StrIt>(str + length);
130 str_it != std::reverse_iterator<StrIt>(str);
131 ++str_it, ++sl_partition) {
132 key_type current_key = *str_it;
133
134 if (current_key > previous_key || previous_key == key_bound) {
135 // S[i] > S[i + 1] or S[i] is last character
huangs 2017/07/05 05:51:58 NIT: Period at end. Please apply to all comments b
etiennep1 2017/07/10 17:51:07 Done.
136 *sl_partition = LType;
137 if (previous_type == SType)
138 // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore,
139 // suf(S,i+1) was a LMS suffix.
140 ++lms_count;
141
142 previous_type = LType; // For next round.
143 } else if (current_key < previous_key) {
144 // S[i] < S[i + 1]
145 *sl_partition = SType;
146 previous_type = SType; // For next round.
147 } else {
148 // S[i] == S[i + 1]
149 // The next character that differs determines the SL-type,
150 // so we reuse the last seen type.
151 *sl_partition = previous_type;
huangs 2017/07/05 05:51:58 These 3 cases can be combined by assigning |previo
etiennep1 2017/07/10 17:51:07 Done.
152 }
153 previous_key = current_key; // For next round.
154 }
155
156 return lms_count;
157 }
158
159 // Find indices of LMS suffixes and write result to |lms_indices|.
160 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
huangs 2017/07/05 05:51:58 This is called once, and not tested independently.
huangs 2017/07/10 02:45:29 NVM, keep it the way it is.
etiennep1 2017/07/10 17:51:08 It is called once, but is tested in TestSlPartitio
161 iterator lms_indices) {
162 // |previous_type| is initialized to S-type to avoid counting an extra
163 // LMS suffix at the beginning
164 SLType previous_type = SType;
165 for (size_type i = 0; i < sl_partition.size(); ++i) {
166 if (sl_partition[i] == SType && previous_type == LType)
167 *lms_indices++ = i;
168 previous_type = sl_partition[i];
169 }
170 }
171
172 template <class StrIt>
173 static std::vector<size_type> MakeBucketCount(StrIt str,
huangs 2017/07/05 05:51:57 Same: Embed into call site?
huangs 2017/07/10 02:45:29 NVM, keep it the way it is.
etiennep1 2017/07/10 17:51:08 This is also tested in BucketCount.
174 size_type length,
175 key_type key_bound) {
176 // Occurrence of every unique character is counted in |buckets|
177 std::vector<size_type> buckets(static_cast<size_type>(key_bound));
178
179 for (auto it = str; it != str + length; ++it)
180 ++buckets[*it];
181 return buckets;
182 }
183
184 // Apply induced sort from |lms_indices| to |suffix_array| associated with
huangs 2017/07/05 05:51:58 Now called |lms_substrings|? Please reconcile the
etiennep1 2017/07/10 17:51:08 I switched to lms_indices.
185 // the string |str|.
186 template <class StrIt, class SAIt>
187 static void InducedSort(StrIt str,
188 size_type length,
189 const std::vector<SLType>& sl_partition,
190 const std::vector<size_type>& lms_substrings,
191 const std::vector<size_type>& buckets,
192 SAIt suffix_array) {
193 // All indices are first marked as unset with 0.
huangs 2017/07/05 05:51:58 0 is a valid final value. It's better for error ch
etiennep1 2017/07/10 17:51:07 I switched to |length|
194 std::fill(suffix_array, suffix_array + length, 0);
195
huangs 2017/07/05 05:51:58 DCHECK(!buckets.empty()); (reason: We use buckets
etiennep1 2017/07/10 17:51:09 Done.
196 // Used to mark bucket boundaries (head or end) as indices in str.
197 std::vector<size_type> bucket_bounds(buckets.size());
198
199 // Step 1
huangs 2017/07/05 05:51:57 // Step 1: Assign indices for LMS suffixes, popula
etiennep1 2017/07/10 17:51:08 Done.
200 // Scan backway |lms_indices|; for each LMS suffix suf(S,i) found in
201 // |lms_indices|, place suf(S,i) at the end of the corresponding bucket
202 // and forward the bucket end to the left.
203 // By bucket corresponding to suf(S, i), we mean the bucket associated
204 // with the character S(i).
205
206 // Find the end of each bucket and write it to |bucket_bounds|.
207 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
208
huangs 2017/07/05 05:51:58 // Process each |lms_indices| backward, and assign
etiennep1 2017/07/10 17:51:08 Done.
209 for (auto it = lms_substrings.crbegin(); it != lms_substrings.crend();
210 ++it) {
211 key_type key = str[*it];
212 suffix_array[--bucket_bounds[key]] = *it;
213 }
214
215 // Step 2
216 // Scan forward |suffix_array|; for each modified suf(S,i) for which
217 // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current
218 // head of the corresponding bucket and forward the bucket head to the
219 // right.
220
221 // Find the head of each bucket and write it to |bucket_bounds|. Since
222 // only LMS suffixes where inserted in |suffix_array| during Step 1,
223 // |bucket_bounds| does not contains the head of each bucket and needs to
224 // be updated.
225 bucket_bounds[0] = 0;
226 std::partial_sum(buckets.begin(), buckets.end() - 1,
227 bucket_bounds.begin() + 1);
228
229 // From Step 1, the sentinel $, which we treat implicitely, would have
huangs 2017/07/05 05:51:58 TYPO: implicitely -> implicitly
etiennep1 2017/07/10 17:51:08 Done.
230 // been placed at the beginning of |suffix_array| since $ is always
231 // considered as the smallest character. We then have to deal with the
232 // previous (last) suffix.
233 if (sl_partition[length - 1] == LType) {
234 key_type key = str[length - 1];
235 suffix_array[bucket_bounds[key]++] = length - 1;
236 }
237 for (auto it = suffix_array; it != suffix_array + length; ++it) {
238 size_type suffix_index = *it;
239
240 // While the original algorithm marks unset suffixes with -1,
huangs 2017/07/05 05:51:57 Update comment if we use kUnassigned. Inconsiste
etiennep1 2017/07/10 17:51:07 Done.
241 // we found that marking them with 0 is also possible, since suf(S, 0)
242 // has no previous suffix, and also more convenient because we are
243 // working with unsigned integers.
244 if (suffix_index > 0 && sl_partition[--suffix_index] == LType) {
245 key_type key = str[suffix_index];
246 suffix_array[bucket_bounds[key]++] = suffix_index;
247 }
248 }
249
250 // Step 3
251 // Scan backward |suffix_array|; for each modified suf(S, i) for which
252 // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current
253 // end of the corresponding bucket and forward the bucket head to the
254 // left.
255
256 // Find the end of each bucket and write it to |bucket_bounds|. Since
257 // only L-type suffixes where inserted in |suffix_array| during Step21,
huangs 2017/07/05 05:51:58 Step21?
etiennep1 2017/07/10 17:51:08 Step 2
258 // |bucket_bounds| does not contains the end of each bucket and needs to
huangs 2017/07/05 05:51:57 s/contains/contain/
etiennep1 2017/07/10 17:51:09 Done.
259 // be updated.
260 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin());
261
262 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
263 it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
264 size_type suffix_index = *it;
265 if (suffix_index > 0 && sl_partition[--suffix_index] == SType) {
266 key_type key = str[suffix_index];
267 suffix_array[--bucket_bounds[key]] = suffix_index;
268 }
269 }
270 // Deals with the last suffix, because of the sentinel.
271 if (sl_partition[length - 1] == SType) {
272 key_type key = str[length - 1];
273 suffix_array[--bucket_bounds[key]] = length - 1;
274 }
275 }
276
277 // Given a string S starting at |str| with length |length|, an array
278 // starting at |substring_array| containing lexicographically ordered LMS
279 // terminated substring indices of S and an SL-Type partition |sl_partition|
280 // of S, assigns a unique label to every unique LMS substring. The sorted
281 // labels for all LMS substrings are written to |lms_str|, while the indices
282 // of LMS suffixes are written to |lms_indices|. In addition, returns the
283 // total number of unique labels.
284 template <class StrIt, class SAIt>
285 static size_type LabelLmsSubstrings(StrIt str,
286 size_type length,
287 const std::vector<SLType>& sl_partition,
288 SAIt suffix_array,
289 iterator lms_indices,
290 iterator lms_str) {
291 // Labelling starts at 0.
292 size_type label = 0;
293
294 // |previous_lms| is initialized to 0 to indicate it is unset.
295 // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in
296 // lexicographical order.
297 size_type previous_lms = 0;
298 for (auto it = suffix_array; it != suffix_array + length; ++it) {
299 if (*it > 0 && sl_partition[*it] == SType &&
300 sl_partition[*it - 1] == LType) {
301 // suf(S, *it) is a LMS suffix.
302
303 size_type current_lms = *it;
304 if (previous_lms != 0) {
305 // There was a previous LMS suffix. Check if the current LMS
306 // substring is equal to the previous one.
307 SLType current_lms_type = SType, previous_lms_type = SType;
huangs 2017/07/05 05:51:57 Break into separate lines.
etiennep1 2017/07/10 17:51:07 Done.
308 for (size_type k = 0;; ++k) {
309 // |current_lms_end| and |previous_lms_end| denote whether we have
310 // reached the end of the current and previous LMS substring,
311 // respectively
312 bool current_lms_end = false, previous_lms_end = false;
huangs 2017/07/05 05:51:58 Break into separate lines.
etiennep1 2017/07/10 17:51:07 Done.
313
314 // Check for both previous and current substring ends.
315 // Note that it is more convenient to check if
316 // suf(S,current_lms + k) is an LMS suffix than to retrieve it
317 // from lms_indices.
318 if (current_lms + k >= length ||
319 (current_lms_type == LType &&
320 sl_partition[current_lms + k] == SType)) {
321 current_lms_end = true;
322 }
323 if (previous_lms + k >= length ||
324 (previous_lms_type == LType &&
325 sl_partition[previous_lms + k] == SType)) {
326 previous_lms_end = true;
327 }
328
329 if (current_lms_end && previous_lms_end) {
330 break; // Previous and current substrings are identical.
331 } else if (current_lms_end != previous_lms_end ||
332 str[current_lms + k] != str[previous_lms + k]) {
333 // Previous and current substrings differ, a new label is used.
334 ++label;
335 break;
336 }
337
338 current_lms_type = sl_partition[current_lms + k];
339 previous_lms_type = sl_partition[previous_lms + k];
340 }
341 }
342 *lms_indices++ = *it;
343 *lms_str++ = label;
344 previous_lms = current_lms;
345 }
346 }
347
348 return ++label;
huangs 2017/07/05 05:51:58 Style: Should be |label + 1|: The side effect of u
etiennep1 2017/07/10 17:51:07 Done.
349 }
350
351 // Implementation of the SA-IS algorithm. |str| must be a random access
352 // iterator pointing at the beginning of S with length |length|. The result
353 // is writtend in |suffix_array|, a random access iterator.
354 template <class StrIt, class SAIt>
355 static void SuffixSort(StrIt str,
356 size_type length,
357 key_type key_bound,
358 SAIt suffix_array) {
359 if (length == 1)
360 *suffix_array = 0;
361 if (length < 2)
362 return;
363
364 std::vector<SLType> sl_partition(length);
365 size_type lms_count =
366 BuildSLPartition(str, length, key_bound, sl_partition.rbegin());
367 std::vector<size_type> lms_indices(lms_count);
368 FindLmsSuffixes(sl_partition, lms_indices.begin());
369 std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound);
370
371 if (lms_indices.size() > 1) {
372 // Given |lms_indices| in the same order they appear in |str|, induce
373 // LMS substrings relative order and write result to |suffix_array|.
374 InducedSort(str, length, sl_partition, lms_indices, buckets,
375 suffix_array);
376 std::vector<size_type> lms_str(lms_indices.size());
377
378 // Given LMS substrings in relative order found in |suffix_array|,
379 // map LMS substrings to unique labels to form a new string, |lms_str|.
380 size_type label_count =
381 LabelLmsSubstrings(str, length, sl_partition, suffix_array,
382 lms_indices.begin(), lms_str.begin());
383
384 if (label_count < lms_str.size()) {
385 // Reorder |lms_str| to have LMS suffixes in the same order they
386 // appear in |str|.
387 for (size_type i = 0; i < lms_indices.size(); ++i)
388 suffix_array[lms_indices[i]] = lms_str[i];
389
390 SLType previous_type = SType;
391 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
392 if (sl_partition[i] == SType && previous_type == LType) {
393 lms_str[j] = suffix_array[i];
394 lms_indices[j++] = i;
395 }
396 previous_type = sl_partition[i];
397 }
398
399 // Recursively apply SuffixSort on |lms_str|, which is formed from
400 // labeled LMS suffixes in the same order they appear in |str|.
401 // |lms_str| is at most half the length of |str|.
402 Implementation<size_type, size_type>::SuffixSort(
403 lms_str.begin(), static_cast<size_type>(lms_str.size()),
404 label_count, suffix_array);
huangs 2017/07/10 02:45:29 |suffix_array| is SAIt, but in recursion it's now
etiennep1 2017/07/10 17:51:08 Not sure what you mean. SAIt stays the same in the
huangs 2017/07/10 20:01:18 Note that |suffix_array| was a parameter into this
etiennep1 2017/07/10 22:57:08 We pass it as is (the last argument). SAIt is an i
huangs 2017/07/11 14:52:55 Oh I see, the full signature is Implementation<siz
405
406 // Map LMS labels back to indices in |str| and write result to
407 // |lms_indices|. We're using |suffix_array| as a temporary buffer.
408 for (size_type i = 0; i < lms_indices.size(); ++i)
409 suffix_array[i] = lms_indices[suffix_array[i]];
410 for (size_type i = 0; i < lms_indices.size(); ++i)
huangs 2017/07/05 05:51:58 std::copy()?
etiennep1 2017/07/10 17:51:08 Done.
411 lms_indices[i] = suffix_array[i];
412
413 // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
414 }
415 }
416 // Given |lms_indices| where LMS suffixes are sorted, induce the full
417 // order of suffixes in |str|.
418 InducedSort(str, length, sl_partition, lms_indices, buckets,
419 suffix_array);
420 }
421 };
422 };
423
424 // Generates a sorted suffix array for the input string |str| using the functor
425 // |Algorithm| which provides an interface equivalent to NaiveSuffixSort.
426 /// Characters found in |str| are assumed to be in range [0, |key_bound|).
427 // Returns the suffix array as a vector.
428 // |StrRng| is an input random access range.
429 // |KeyType| is an unsigned integer type.
430 template <class Algorithm, class StrRng, class KeyType>
431 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
432 KeyType key_bound) {
433 Algorithm sort;
434 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
435 sort(str, key_bound, suffix_array.begin());
436 return suffix_array;
437 }
438
439 // Lexicographical lower bound of |str2| in the suffix array of |str1| using
440 // binary search. |str1_first| is a random access iterator pointing to the
441 // beginning of |str1|. |str2_first| and |str2_last| are forward iterators
442 // pointing to the beginning and end of |str2|, respectively.
443 template <class SARng, class StrIt1, class StrIt2>
444 auto SearchSuffixArray(const SARng& suffix_array,
445 StrIt1 str1_first,
446 StrIt2 str2_first,
447 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
448 using size_type = typename SARng::value_type;
449
450 size_t n = std::end(suffix_array) - std::begin(suffix_array);
451 auto it = std::lower_bound(
452 std::begin(suffix_array), std::end(suffix_array), str2_first,
453 [str1_first, str2_last, n](size_type a, StrIt2 b) {
454 return std::lexicographical_compare(str1_first + a, str1_first + n, b,
455 str2_last);
456 });
457 return it;
huangs 2017/07/05 05:51:58 What happened to the code that uses SA to solve ma
huangs 2017/07/05 15:46:35 For a more realistic example: Query "aaaam" in ["a
etiennep1 2017/07/10 17:51:08 This is a lower level function, that only computes
458 }
459
460 } // namespace zucchini
461
462 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698