Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(958)

Side by Side Diff: chrome/installer/zucchini/suffix_array.h

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)
Patch Set: Make ustring conversions pretty Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « chrome/installer/zucchini/BUILD.gn ('k') | chrome/installer/zucchini/suffix_array_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
7
8 #include <algorithm>
9 #include <cassert>
10 #include <iterator>
11 #include <numeric>
12 #include <vector>
13
14 namespace zucchini {
15
16 // A functor class that implements the naive suffix sorting algorithm
17 // that uses std::sort with lexicographical compare.
18 // This is only meant as reference of the interface.
19 class NaiveSuffixSort {
20 public:
21 // |str| must be a random access input range.
etiennep1 2017/06/30 17:18:17 Split argument description and type requirements.
22 // Characters found in |str| must be in the range [0, |max_key|)
huangs 2017/06/30 04:57:32 NIT: |max_key| is a misnomer, since it's not a "ma
etiennep1 2017/06/30 17:18:17 Agreed
23 // |suffix_array| is a random access mutable range containing the result.
24 template <class InputRng, class KeyType, class SAIt>
25 void operator()(const InputRng& str,
26 KeyType max_key,
27 SAIt suffix_array) const {
28 using size_type = typename SAIt::value_type;
29
30 size_type n = str.end() - str.begin();
huangs 2017/06/30 04:57:32 std::end(str) - std::begin(str)? Same below. Als
etiennep1 2017/06/30 17:18:17 Done.
31
32 // |suffix_array| is first filled with ordered indices of |str|.
33 // Those indices are then sorted with lexicographical comparisons in |str|.
34 std::iota(suffix_array, suffix_array + n, 0);
35 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
huangs 2017/06/30 04:57:32 Note that you're using lambda with capture here. S
etiennep1 2017/06/30 17:18:17 I disagree. From the link: "Be careful with defaul
36 return std::lexicographical_compare(std::begin(str) + i, std::end(str),
37 std::begin(str) + j, std::end(str));
38 });
39 }
40 };
41
42 // A functor class that implements suffix array induced sorting (SA-IS)
43 // algorithm with linear time and memory complexity,
44 // see http://ieeexplore.ieee.org/abstract/document/5582081/
45 class Sais {
46 public:
47 // |str| must be a random access input range.
48 // Characters found in |str| must be in the range [0, |max_key|)
49 // |suffix_array| is a random access mutable range containing the result.
huangs 2017/06/30 04:57:32 Mention that |suffix_array| should have length at
etiennep1 2017/06/30 17:18:17 Done. Same for NaiveSuffixSort.
50 template <class InputRng, class KeyType, class SAIt>
51 void operator()(const InputRng& str,
52 KeyType max_key,
53 SAIt suffix_array) const {
54 using value_type = typename InputRng::value_type;
55 using size_type = typename SAIt::value_type;
56
57 static_assert(std::is_unsigned<value_type>::value,
58 "Sais only supports input string with unsigned values");
59
60 size_type n = static_cast<size_type>(str.end() - str.begin());
61
62 Implementation<size_type, typename std::make_unsigned<KeyType>::type>::
huangs 2017/06/30 04:57:32 Do we need std::make_unsigned<>? We used static_as
etiennep1 2017/06/30 17:18:18 We don't. That comes from an failed attempt to sup
63 SuffixSort(std::begin(str), n, max_key, suffix_array);
64 }
65
66 // Given the string S of length n. We assume S is terminated by a unique
67 // sentinel $, which is considered as the smallest character. This sentinel
68 // does not exist in memory and is only treated implicitly. We denote
69 // suf(S,i) the suffix formed by S[i, n).
huangs 2017/06/30 04:57:32 NIT: suf(S,i) has inconsistent spacing with later
etiennep1 2017/06/30 17:18:17 Done.
70
71 // A suffix suf(S, i) is said to be S-type or L-type,
72 // if suf(S, i) < suf(S, i+1) or suf(S, i) > suf(S, i+1), respectively.
73 enum SLType : bool { SType, LType };
74
75 // A character S[i] is said to be S-type or L-type if the suffix suf(S, i) is
76 // S-type or L-type, respectively.
77
78 // A character S[i], i is called LMS (leftmost S-type), if S[i] is S-type and
79 // S[i-1] is L-type. A suffix suf(S, i) is called LMS, if S[i] is an LMS
80 // character.
81
82 // An LMS-substring is a substring S[i..j) with both S[i] and S[j], being LMS
huangs 2017/06/30 04:57:32 S[i..j): Inconsistent notatoin with S[i,n) above.
etiennep1 2017/06/30 17:18:17 Done.
83 // characters, and there is no other LMS character in the substring, or the
84 // sentinel itself
85
86 template <class SizeType, class KeyType>
87 struct Implementation {
88 static_assert(std::is_unsigned<SizeType>::value,
89 "SizeType must be unsigned");
90 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
91 using size_type = SizeType;
92 using key_type = KeyType;
93
94 using iterator = typename std::vector<size_type>::iterator;
95 using const_iterator = typename std::vector<size_type>::const_iterator;
96
97 // Partition every suffix based on SL-type.
98 template <class StrIt>
99 static size_type BuildSLPartition(
100 StrIt str,
101 size_type length,
102 key_type max_key,
103 std::vector<SLType>::reverse_iterator sl_partition) {
104 // We will count LMS suffixes (S to L-type or last S-type)
105 size_type lms_count = 0;
106
107 // |previous_type| is initialized to L-type to avoid counting an extra
108 // LMS suffix at the end
109 SLType previous_type = LType;
110
111 key_type previous_key = max_key; // initialized to dummy, impossible key
112
113 // We're travelling backward to determine the partition,
114 // as if we prepend one character at a time to the string, ex:
115 // b$ is L-type because b > $
116 // ab$ is S-type because a < b, implying ab$ < b$
117 // bab$ is L-type because b > a, implying bab$ > ab$
118 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$
119 for (auto str_it = std::reverse_iterator<StrIt>(str + length);
120 str_it != std::reverse_iterator<StrIt>(str);
121 ++str_it, ++sl_partition) {
122 key_type current_key = *str_it;
123
124 if (current_key > previous_key || previous_key == max_key) {
125 // S[i] > S[i + 1] or S[i] is last character
126 *sl_partition = LType;
127 if (previous_type == SType)
128 // suf(S, i) is L-type and suf(S, i + 1) is S-type,
129 // therefore, suf(S, i+1) was a LMS suffix.
130 ++lms_count;
131
132 previous_type = LType; // for next round
133 } else if (current_key < previous_key) {
134 // S[i] < S[i + 1]
135 *sl_partition = SType;
136 previous_type = SType; // for next round
137 } else {
138 // S[i] == S[i + 1]
139 // The next character that differs determines the SL-type,
140 // so we reuse the last seen type.
141 *sl_partition = previous_type;
142 }
143 previous_key = current_key; // for next round
144 }
145
146 return lms_count;
147 }
148
149 // Find indices of LMS suffixes and write result to |lms_indices|.
150 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
151 iterator lms_indices) {
152 // |previous_type| is initialized to S-type to avoid counting an extra
153 // LMS suffix at the beginning
154 SLType previous_type = SType;
155 size_type j = 0;
huangs 2017/06/30 04:57:32 Do we need |j|? Can we just omit it, and have *
etiennep1 2017/06/30 17:18:17 Good point.
156 for (size_type i = 0; i < sl_partition.size(); ++i) {
157 if (sl_partition[i] == SType && previous_type == LType)
158 lms_indices[j++] = i;
159 previous_type = sl_partition[i];
160 }
161 }
162
163 template <class StrIt>
164 static std::vector<size_type> MakeBucketCount(StrIt str,
165 size_type length,
166 key_type max_key) {
167 // Occurence of every unique character is counted in |buckets|
huangs 2017/06/30 04:57:32 Typo: Occurrence.
etiennep1 2017/06/30 17:18:18 Done.
168 std::vector<size_type> buckets(static_cast<size_type>(max_key));
169
170 for (auto it = str; it != str + length; ++it)
171 ++buckets[*it];
172 return buckets;
173 }
174
175 // Apply induced sort from |lms_indices| to |suffix_array| associated with
176 // the string |str|.
177 template <class StrIt, class SAIt>
178 static void InducedSort(StrIt str,
179 size_type length,
180 const std::vector<SLType>& sl_partition,
181 const std::vector<size_type>& lms_substrings,
182 const std::vector<size_type>& buckets,
183 SAIt suffix_array) {
184 // All indices are first marked as unset with 0.
185 std::fill(suffix_array, suffix_array + length, 0);
186
187 // Used to mark bucket boundaries (head or end) as indices in str.
188 std::vector<size_type> bucket_bounds(buckets.size());
189
190 // Step 1
191 // for each leftmost S-type suffix suf(S, i) found in |lms_indices|
huangs 2017/06/30 04:57:33 NIT: Replace "leftmost S-type" with LMS, since we'
etiennep1 2017/06/30 17:18:17 Done.
192 // scanned backward, place suf(S, i) at the end of the corresponding
193 // bucket and forward the bucket end to the left.
194
195 // By corresponding bucket for suf(S, i), we mean the bucket associated
196 // with the character S(i).
197
198 // find the end of each bucket
199 bucket_bounds[0] = buckets[0];
huangs 2017/06/30 04:57:32 Use std::partial_sum() for this?
etiennep1 2017/06/30 17:18:17 Done.
200 for (key_type i = 1; i < buckets.size(); ++i)
201 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i];
202
203 for (auto it = lms_substrings.crbegin(); it != lms_substrings.crend();
204 ++it) {
205 key_type key = str[*it];
206 suffix_array[--bucket_bounds[key]] = *it;
207 }
208
209 // Step 2
210 // for each modified suf(S, i), scanned forward, for which
211 // suf(S, SA(i) - 1) is L-type, place suf(S, SA(i) - 1) to the current
212 // head of the corresponding bucket and forward the bucket head to the
213 // right.
214
215 // find the head of each bucket
216 bucket_bounds[0] = 0;
huangs 2017/06/30 04:57:33 Is this step needed? I'd think that after the prev
etiennep1 2017/06/30 17:18:17 This step is needed, yes.
217 for (key_type i = 1; i < buckets.size(); ++i)
218 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i - 1];
219
220 // from Step 1, the sentinel $, which we treat implicitely, would have
221 // been place at the beginning of |suffix_array| since $ is always
huangs 2017/06/30 04:57:32 NIT: s/place/placed/
etiennep1 2017/06/30 17:18:17 Done.
222 // considered as the smallest character. We then have to deal with the
223 // previous (last) suffix.
224 if (sl_partition[length - 1] == LType) {
225 key_type key = str[length - 1];
226 suffix_array[bucket_bounds[key]++] = length - 1;
227 }
228 for (auto it = suffix_array; it != suffix_array + length; ++it) {
229 size_type suffix_index = *it;
230
231 // While the original algorithm marks unset suffixes with -1,
232 // we found that marking them with 0 is also possible, since suf(S, 0)
233 // has no previous suffix, and also more convenient because we are
234 // working with unsigned integers.
235 if (suffix_index > 0 && sl_partition[--suffix_index] == LType) {
236 key_type key = str[suffix_index];
237 suffix_array[bucket_bounds[key]++] = suffix_index;
238 }
239 }
240
241 // Step 3
242 // for each modified suf(S, i), scanned backward, for which
243 // suf(S, SA(i) - 1) is S-type, place suf(S, SA(i) - 1) to the current
244 // end of the corresponding bucket and forward the bucket head to the
245 // left.
246
247 // find the end of each bucket
248 bucket_bounds[0] = buckets[0];
249 for (size_type i = 1; i < buckets.size(); ++i)
250 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i];
251
252 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
253 it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
254 size_type suffix_index = *it;
255 if (suffix_index > 0 && sl_partition[--suffix_index] == SType) {
256 key_type key = str[suffix_index];
257 suffix_array[--bucket_bounds[key]] = suffix_index;
258 }
259 }
260 // Deals with the last suffix, because of the sentinel.
261 if (sl_partition[length - 1] == SType) {
262 key_type key = str[length - 1];
263 suffix_array[--bucket_bounds[key]] = length - 1;
264 }
265 }
266
267 // Given a string S starting at |str| with length |length|, an array
268 // starting at |substring_array| containing ordered LMS terminated substring
269 // indices of S and an SL-Type partition |sl_partition| of S, assigns a
270 // unique label to every unique LMS substring. The sorted labels for all LMS
271 // substrings are written to |lms_str|, while the indices of LMS suffixes
272 // are written to |lms_indices|. In addition, returns the total number of
273 // unique labels.
274 template <class StrIt, class SAIt>
275 static size_type LabelLmsSubstrings(StrIt str,
276 size_type length,
277 const std::vector<SLType>& sl_partition,
278 SAIt suffix_array,
279 iterator lms_indices,
280 iterator lms_str) {
281 // Labelling starts at 0
282 size_type label = 0;
283
284 // |previous_lms| is initialized to 0 to indicate it is unset.
285 // Note that suf(S, 0) is never a LMS suffix.
286 // Substrings will be visited in relative order.
287 size_type previous_lms = 0;
288 for (auto it = suffix_array; it != suffix_array + length; ++it) {
289 if (*it > 0 && sl_partition[*it] == SType &&
290 sl_partition[*it - 1] == LType) {
291 // suf(S, *it) is a LMS suffix
292
293 size_type current_lms = *it;
294 if (previous_lms != 0) {
295 // There was a previous LMS suffix
296 // Check if the current LMS substring is equal to the previous one
297 SLType current_lms_type = SType, previous_lms_type = SType;
298 for (size_type k = 0;; ++k) {
299 // |current_lms_end| and |previous_lms_end| denote weither we have
300 // reached the end of the current and previous LMS substring,
301 // respectively
302 bool current_lms_end = false, previous_lms_end = false;
303
304 // Check both previous and current substrings end ie
305 // Note: it is more convenient to check if suf(S, current_lms + k)
306 // is an LMS suffix than to retrieve it from lms_indices.
307 if (current_lms + k >= length ||
308 (current_lms_type == LType &&
309 sl_partition[current_lms + k] == SType)) {
310 current_lms_end = true;
311 }
312 if (previous_lms + k >= length ||
313 (previous_lms_type == LType &&
314 sl_partition[previous_lms + k] == SType)) {
315 previous_lms_end = true;
316 }
317
318 if (current_lms_end && previous_lms_end) {
319 break;
320 } else if (current_lms_end != previous_lms_end ||
321 str[current_lms + k] != str[previous_lms + k]) {
322 // previous and current substrings differ,
323 ++label; // use a new label
324 break;
325 }
326
327 current_lms_type = sl_partition[current_lms + k];
328 previous_lms_type = sl_partition[previous_lms + k];
329 }
330 }
331 *lms_indices++ = *it;
332 *lms_str++ = label;
333 previous_lms = current_lms;
334 }
335 }
336
337 return ++label;
338 }
339
340 // Implementation of the SA-IS algorithm. |str| must be a random access
341 // iterator pointing at the beginning of S with length |length|. The result
342 // is writtend in |suffix_array|, a random access iterator.
343 template <class StrIt, class SAIt>
344 static void SuffixSort(StrIt str,
345 size_type length,
346 key_type max_key,
347 SAIt suffix_array) {
348 if (length == 1)
349 *suffix_array = 0;
350 if (length < 2)
351 return;
352
353 std::vector<SLType> sl_partition(length);
354 size_type lms_count =
355 BuildSLPartition(str, length, max_key, sl_partition.rbegin());
356 std::vector<size_type> lms_indices(lms_count);
357 FindLmsSuffixes(sl_partition, lms_indices.begin());
358 std::vector<size_type> buckets = MakeBucketCount(str, length, max_key);
359
360 if (lms_indices.size() > 1) {
361 // Given |lms_indices| in order of apparition, induce LMS substrings
huangs 2017/06/30 04:57:32 "apparition" means "ghost"? 2 more instances below
etiennep1 2017/06/30 17:18:17 Wrong translation, meant appearance
362 // relative order and write result to |suffix_array|.
363 InducedSort(str, length, sl_partition, lms_indices, buckets,
364 suffix_array);
365 std::vector<size_type> lms_str(lms_indices.size());
366
367 // Given LMS substrings in relative order found in |suffix_array|,
368 // map LMS substrings to unique labels to form a new string, |lms_str|.
369 size_type label_count =
370 LabelLmsSubstrings(str, length, sl_partition, suffix_array,
371 lms_indices.begin(), lms_str.begin());
372
373 if (label_count < lms_str.size()) {
374 // Reorder |lms_str| to have LMS suffixes in order of apparition.
375 for (size_type i = 0; i < lms_indices.size(); ++i)
376 suffix_array[lms_indices[i]] = lms_str[i];
377
378 SLType previous_type = SType;
379 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
380 if (sl_partition[i] == SType && previous_type == LType) {
381 lms_str[j] = suffix_array[i];
382 lms_indices[j++] = i;
383 }
384 previous_type = sl_partition[i];
385 }
386
387 // Recursively apply SuffixSort on |lms_str|, which is
388 // formed from labeled LMS suffixes in order of apparition.
389 // |lms_str| is at most half the length of |str|.
390 Implementation<size_type, size_type>::SuffixSort(
391 lms_str.begin(), static_cast<size_type>(lms_str.size()),
392 label_count, suffix_array);
393
394 // Map LMS labels back to indices in |str| and
395 // write result to |lms_indices|.
396 // We're using |suffix_array| as a temporary buffer.
397 for (size_type i = 0; i < lms_indices.size(); ++i)
398 suffix_array[i] = lms_indices[suffix_array[i]];
399 for (size_type i = 0; i < lms_indices.size(); ++i)
400 lms_indices[i] = suffix_array[i];
401
402 // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
403 }
404 }
405 // Given |lms_indices| where LMS suffixes are sorted,
406 // induce the full order of suffixes in |str|
407 InducedSort(str, length, sl_partition, lms_indices, buckets,
408 suffix_array);
409 }
410 };
411 };
412
413 // Generates a suffix array from |str|, a random access input range from which
414 // suffixes are sorted, using the functor |algorithm| which provides an
415 // interface equivalent to NaiveSuffixSort. Characters found in |str| are
416 // assumed to be in range [0, |max_key|). Returns the suffix array as a vector.
417 template <class Algorithm, class StrRng, class KeyType>
418 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
419 KeyType max_key) {
420 Algorithm sort;
421 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
422 sort(str, max_key, suffix_array.begin());
423 return suffix_array;
424 }
425
426 // Lexicographical lower bound of |str2| in the suffix array of |str1| using
427 // binary search. |str1_first| is a random access iterator pointing to the
428 // beginning of |str1|. |str2_first| and |str2_last| are forward iterators
429 // pointing to the beginning and end of |str2|, respectively.
430 template <class SARng, class StrIt1, class StrIt2>
431 auto SearchSuffixArray(const SARng& suffix_array,
432 StrIt1 str1_first,
433 StrIt2 str2_first,
434 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
435 using size_type = typename SARng::value_type;
436
437 size_t n = std::end(suffix_array) - std::begin(suffix_array);
438 auto it = std::lower_bound(
439 std::begin(suffix_array), std::end(suffix_array), str2_first,
440 [str1_first, str2_last, n](size_type a, StrIt2 b) {
441 return std::lexicographical_compare(str1_first + a, str1_first + n, b,
442 str2_last);
443 });
444 return it;
445 }
446
447 } // namespace zucchini
448
449 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
OLDNEW
« no previous file with comments | « chrome/installer/zucchini/BUILD.gn ('k') | chrome/installer/zucchini/suffix_array_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698