Index: chrome/installer/zucchini/suffix_array.h |
diff --git a/chrome/installer/zucchini/suffix_array.h b/chrome/installer/zucchini/suffix_array.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..e06e35e319841d434b08ca39439891b0cf353d28 |
--- /dev/null |
+++ b/chrome/installer/zucchini/suffix_array.h |
@@ -0,0 +1,449 @@ |
+// Copyright 2017 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ |
+#define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ |
+ |
+#include <algorithm> |
+#include <cassert> |
+#include <iterator> |
+#include <numeric> |
+#include <vector> |
+ |
+namespace zucchini { |
+ |
+// A functor class that implements the naive suffix sorting algorithm |
+// that uses std::sort with lexicographical compare. |
+// This is only meant as reference of the interface. |
+class NaiveSuffixSort { |
+ public: |
+ // |str| must be a random access input range. |
etiennep1
2017/06/30 17:18:17
Split argument description and type requirements.
|
+ // Characters found in |str| must be in the range [0, |max_key|) |
huangs
2017/06/30 04:57:32
NIT: |max_key| is a misnomer, since it's not a "ma
etiennep1
2017/06/30 17:18:17
Agreed
|
+ // |suffix_array| is a random access mutable range containing the result. |
+ template <class InputRng, class KeyType, class SAIt> |
+ void operator()(const InputRng& str, |
+ KeyType max_key, |
+ SAIt suffix_array) const { |
+ using size_type = typename SAIt::value_type; |
+ |
+ size_type n = str.end() - str.begin(); |
huangs
2017/06/30 04:57:32
std::end(str) - std::begin(str)? Same below.
Als
etiennep1
2017/06/30 17:18:17
Done.
|
+ |
+ // |suffix_array| is first filled with ordered indices of |str|. |
+ // Those indices are then sorted with lexicographical comparisons in |str|. |
+ std::iota(suffix_array, suffix_array + n, 0); |
+ std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) { |
huangs
2017/06/30 04:57:32
Note that you're using lambda with capture here. S
etiennep1
2017/06/30 17:18:17
I disagree.
From the link:
"Be careful with defaul
|
+ return std::lexicographical_compare(std::begin(str) + i, std::end(str), |
+ std::begin(str) + j, std::end(str)); |
+ }); |
+ } |
+}; |
+ |
+// A functor class that implements suffix array induced sorting (SA-IS) |
+// algorithm with linear time and memory complexity, |
+// see http://ieeexplore.ieee.org/abstract/document/5582081/ |
+class Sais { |
+ public: |
+ // |str| must be a random access input range. |
+ // Characters found in |str| must be in the range [0, |max_key|) |
+ // |suffix_array| is a random access mutable range containing the result. |
huangs
2017/06/30 04:57:32
Mention that |suffix_array| should have length at
etiennep1
2017/06/30 17:18:17
Done. Same for NaiveSuffixSort.
|
+ template <class InputRng, class KeyType, class SAIt> |
+ void operator()(const InputRng& str, |
+ KeyType max_key, |
+ SAIt suffix_array) const { |
+ using value_type = typename InputRng::value_type; |
+ using size_type = typename SAIt::value_type; |
+ |
+ static_assert(std::is_unsigned<value_type>::value, |
+ "Sais only supports input string with unsigned values"); |
+ |
+ size_type n = static_cast<size_type>(str.end() - str.begin()); |
+ |
+ Implementation<size_type, typename std::make_unsigned<KeyType>::type>:: |
huangs
2017/06/30 04:57:32
Do we need std::make_unsigned<>? We used static_as
etiennep1
2017/06/30 17:18:18
We don't. That comes from an failed attempt to sup
|
+ SuffixSort(std::begin(str), n, max_key, suffix_array); |
+ } |
+ |
+ // Given the string S of length n. We assume S is terminated by a unique |
+ // sentinel $, which is considered as the smallest character. This sentinel |
+ // does not exist in memory and is only treated implicitly. We denote |
+ // suf(S,i) the suffix formed by S[i, n). |
huangs
2017/06/30 04:57:32
NIT: suf(S,i) has inconsistent spacing with later
etiennep1
2017/06/30 17:18:17
Done.
|
+ |
+ // A suffix suf(S, i) is said to be S-type or L-type, |
+ // if suf(S, i) < suf(S, i+1) or suf(S, i) > suf(S, i+1), respectively. |
+ enum SLType : bool { SType, LType }; |
+ |
+ // A character S[i] is said to be S-type or L-type if the suffix suf(S, i) is |
+ // S-type or L-type, respectively. |
+ |
+ // A character S[i], i is called LMS (leftmost S-type), if S[i] is S-type and |
+ // S[i-1] is L-type. A suffix suf(S, i) is called LMS, if S[i] is an LMS |
+ // character. |
+ |
+ // An LMS-substring is a substring S[i..j) with both S[i] and S[j], being LMS |
huangs
2017/06/30 04:57:32
S[i..j): Inconsistent notatoin with S[i,n) above.
etiennep1
2017/06/30 17:18:17
Done.
|
+ // characters, and there is no other LMS character in the substring, or the |
+ // sentinel itself |
+ |
+ template <class SizeType, class KeyType> |
+ struct Implementation { |
+ static_assert(std::is_unsigned<SizeType>::value, |
+ "SizeType must be unsigned"); |
+ static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); |
+ using size_type = SizeType; |
+ using key_type = KeyType; |
+ |
+ using iterator = typename std::vector<size_type>::iterator; |
+ using const_iterator = typename std::vector<size_type>::const_iterator; |
+ |
+ // Partition every suffix based on SL-type. |
+ template <class StrIt> |
+ static size_type BuildSLPartition( |
+ StrIt str, |
+ size_type length, |
+ key_type max_key, |
+ std::vector<SLType>::reverse_iterator sl_partition) { |
+ // We will count LMS suffixes (S to L-type or last S-type) |
+ size_type lms_count = 0; |
+ |
+ // |previous_type| is initialized to L-type to avoid counting an extra |
+ // LMS suffix at the end |
+ SLType previous_type = LType; |
+ |
+ key_type previous_key = max_key; // initialized to dummy, impossible key |
+ |
+ // We're travelling backward to determine the partition, |
+ // as if we prepend one character at a time to the string, ex: |
+ // b$ is L-type because b > $ |
+ // ab$ is S-type because a < b, implying ab$ < b$ |
+ // bab$ is L-type because b > a, implying bab$ > ab$ |
+ // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$ |
+ for (auto str_it = std::reverse_iterator<StrIt>(str + length); |
+ str_it != std::reverse_iterator<StrIt>(str); |
+ ++str_it, ++sl_partition) { |
+ key_type current_key = *str_it; |
+ |
+ if (current_key > previous_key || previous_key == max_key) { |
+ // S[i] > S[i + 1] or S[i] is last character |
+ *sl_partition = LType; |
+ if (previous_type == SType) |
+ // suf(S, i) is L-type and suf(S, i + 1) is S-type, |
+ // therefore, suf(S, i+1) was a LMS suffix. |
+ ++lms_count; |
+ |
+ previous_type = LType; // for next round |
+ } else if (current_key < previous_key) { |
+ // S[i] < S[i + 1] |
+ *sl_partition = SType; |
+ previous_type = SType; // for next round |
+ } else { |
+ // S[i] == S[i + 1] |
+ // The next character that differs determines the SL-type, |
+ // so we reuse the last seen type. |
+ *sl_partition = previous_type; |
+ } |
+ previous_key = current_key; // for next round |
+ } |
+ |
+ return lms_count; |
+ } |
+ |
+ // Find indices of LMS suffixes and write result to |lms_indices|. |
+ static void FindLmsSuffixes(const std::vector<SLType>& sl_partition, |
+ iterator lms_indices) { |
+ // |previous_type| is initialized to S-type to avoid counting an extra |
+ // LMS suffix at the beginning |
+ SLType previous_type = SType; |
+ size_type j = 0; |
huangs
2017/06/30 04:57:32
Do we need |j|? Can we just omit it, and have
*
etiennep1
2017/06/30 17:18:17
Good point.
|
+ for (size_type i = 0; i < sl_partition.size(); ++i) { |
+ if (sl_partition[i] == SType && previous_type == LType) |
+ lms_indices[j++] = i; |
+ previous_type = sl_partition[i]; |
+ } |
+ } |
+ |
+ template <class StrIt> |
+ static std::vector<size_type> MakeBucketCount(StrIt str, |
+ size_type length, |
+ key_type max_key) { |
+ // Occurence of every unique character is counted in |buckets| |
huangs
2017/06/30 04:57:32
Typo: Occurrence.
etiennep1
2017/06/30 17:18:18
Done.
|
+ std::vector<size_type> buckets(static_cast<size_type>(max_key)); |
+ |
+ for (auto it = str; it != str + length; ++it) |
+ ++buckets[*it]; |
+ return buckets; |
+ } |
+ |
+ // Apply induced sort from |lms_indices| to |suffix_array| associated with |
+ // the string |str|. |
+ template <class StrIt, class SAIt> |
+ static void InducedSort(StrIt str, |
+ size_type length, |
+ const std::vector<SLType>& sl_partition, |
+ const std::vector<size_type>& lms_substrings, |
+ const std::vector<size_type>& buckets, |
+ SAIt suffix_array) { |
+ // All indices are first marked as unset with 0. |
+ std::fill(suffix_array, suffix_array + length, 0); |
+ |
+ // Used to mark bucket boundaries (head or end) as indices in str. |
+ std::vector<size_type> bucket_bounds(buckets.size()); |
+ |
+ // Step 1 |
+ // for each leftmost S-type suffix suf(S, i) found in |lms_indices| |
huangs
2017/06/30 04:57:33
NIT: Replace "leftmost S-type" with LMS, since we'
etiennep1
2017/06/30 17:18:17
Done.
|
+ // scanned backward, place suf(S, i) at the end of the corresponding |
+ // bucket and forward the bucket end to the left. |
+ |
+ // By corresponding bucket for suf(S, i), we mean the bucket associated |
+ // with the character S(i). |
+ |
+ // find the end of each bucket |
+ bucket_bounds[0] = buckets[0]; |
huangs
2017/06/30 04:57:32
Use std::partial_sum() for this?
etiennep1
2017/06/30 17:18:17
Done.
|
+ for (key_type i = 1; i < buckets.size(); ++i) |
+ bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i]; |
+ |
+ for (auto it = lms_substrings.crbegin(); it != lms_substrings.crend(); |
+ ++it) { |
+ key_type key = str[*it]; |
+ suffix_array[--bucket_bounds[key]] = *it; |
+ } |
+ |
+ // Step 2 |
+ // for each modified suf(S, i), scanned forward, for which |
+ // suf(S, SA(i) - 1) is L-type, place suf(S, SA(i) - 1) to the current |
+ // head of the corresponding bucket and forward the bucket head to the |
+ // right. |
+ |
+ // find the head of each bucket |
+ bucket_bounds[0] = 0; |
huangs
2017/06/30 04:57:33
Is this step needed? I'd think that after the prev
etiennep1
2017/06/30 17:18:17
This step is needed, yes.
|
+ for (key_type i = 1; i < buckets.size(); ++i) |
+ bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i - 1]; |
+ |
+ // from Step 1, the sentinel $, which we treat implicitely, would have |
+ // been place at the beginning of |suffix_array| since $ is always |
huangs
2017/06/30 04:57:32
NIT: s/place/placed/
etiennep1
2017/06/30 17:18:17
Done.
|
+ // considered as the smallest character. We then have to deal with the |
+ // previous (last) suffix. |
+ if (sl_partition[length - 1] == LType) { |
+ key_type key = str[length - 1]; |
+ suffix_array[bucket_bounds[key]++] = length - 1; |
+ } |
+ for (auto it = suffix_array; it != suffix_array + length; ++it) { |
+ size_type suffix_index = *it; |
+ |
+ // While the original algorithm marks unset suffixes with -1, |
+ // we found that marking them with 0 is also possible, since suf(S, 0) |
+ // has no previous suffix, and also more convenient because we are |
+ // working with unsigned integers. |
+ if (suffix_index > 0 && sl_partition[--suffix_index] == LType) { |
+ key_type key = str[suffix_index]; |
+ suffix_array[bucket_bounds[key]++] = suffix_index; |
+ } |
+ } |
+ |
+ // Step 3 |
+ // for each modified suf(S, i), scanned backward, for which |
+ // suf(S, SA(i) - 1) is S-type, place suf(S, SA(i) - 1) to the current |
+ // end of the corresponding bucket and forward the bucket head to the |
+ // left. |
+ |
+ // find the end of each bucket |
+ bucket_bounds[0] = buckets[0]; |
+ for (size_type i = 1; i < buckets.size(); ++i) |
+ bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i]; |
+ |
+ for (auto it = std::reverse_iterator<SAIt>(suffix_array + length); |
+ it != std::reverse_iterator<SAIt>(suffix_array); ++it) { |
+ size_type suffix_index = *it; |
+ if (suffix_index > 0 && sl_partition[--suffix_index] == SType) { |
+ key_type key = str[suffix_index]; |
+ suffix_array[--bucket_bounds[key]] = suffix_index; |
+ } |
+ } |
+ // Deals with the last suffix, because of the sentinel. |
+ if (sl_partition[length - 1] == SType) { |
+ key_type key = str[length - 1]; |
+ suffix_array[--bucket_bounds[key]] = length - 1; |
+ } |
+ } |
+ |
+ // Given a string S starting at |str| with length |length|, an array |
+ // starting at |substring_array| containing ordered LMS terminated substring |
+ // indices of S and an SL-Type partition |sl_partition| of S, assigns a |
+ // unique label to every unique LMS substring. The sorted labels for all LMS |
+ // substrings are written to |lms_str|, while the indices of LMS suffixes |
+ // are written to |lms_indices|. In addition, returns the total number of |
+ // unique labels. |
+ template <class StrIt, class SAIt> |
+ static size_type LabelLmsSubstrings(StrIt str, |
+ size_type length, |
+ const std::vector<SLType>& sl_partition, |
+ SAIt suffix_array, |
+ iterator lms_indices, |
+ iterator lms_str) { |
+ // Labelling starts at 0 |
+ size_type label = 0; |
+ |
+ // |previous_lms| is initialized to 0 to indicate it is unset. |
+ // Note that suf(S, 0) is never a LMS suffix. |
+ // Substrings will be visited in relative order. |
+ size_type previous_lms = 0; |
+ for (auto it = suffix_array; it != suffix_array + length; ++it) { |
+ if (*it > 0 && sl_partition[*it] == SType && |
+ sl_partition[*it - 1] == LType) { |
+ // suf(S, *it) is a LMS suffix |
+ |
+ size_type current_lms = *it; |
+ if (previous_lms != 0) { |
+ // There was a previous LMS suffix |
+ // Check if the current LMS substring is equal to the previous one |
+ SLType current_lms_type = SType, previous_lms_type = SType; |
+ for (size_type k = 0;; ++k) { |
+ // |current_lms_end| and |previous_lms_end| denote weither we have |
+ // reached the end of the current and previous LMS substring, |
+ // respectively |
+ bool current_lms_end = false, previous_lms_end = false; |
+ |
+ // Check both previous and current substrings end ie |
+ // Note: it is more convenient to check if suf(S, current_lms + k) |
+ // is an LMS suffix than to retrieve it from lms_indices. |
+ if (current_lms + k >= length || |
+ (current_lms_type == LType && |
+ sl_partition[current_lms + k] == SType)) { |
+ current_lms_end = true; |
+ } |
+ if (previous_lms + k >= length || |
+ (previous_lms_type == LType && |
+ sl_partition[previous_lms + k] == SType)) { |
+ previous_lms_end = true; |
+ } |
+ |
+ if (current_lms_end && previous_lms_end) { |
+ break; |
+ } else if (current_lms_end != previous_lms_end || |
+ str[current_lms + k] != str[previous_lms + k]) { |
+ // previous and current substrings differ, |
+ ++label; // use a new label |
+ break; |
+ } |
+ |
+ current_lms_type = sl_partition[current_lms + k]; |
+ previous_lms_type = sl_partition[previous_lms + k]; |
+ } |
+ } |
+ *lms_indices++ = *it; |
+ *lms_str++ = label; |
+ previous_lms = current_lms; |
+ } |
+ } |
+ |
+ return ++label; |
+ } |
+ |
+ // Implementation of the SA-IS algorithm. |str| must be a random access |
+ // iterator pointing at the beginning of S with length |length|. The result |
+ // is writtend in |suffix_array|, a random access iterator. |
+ template <class StrIt, class SAIt> |
+ static void SuffixSort(StrIt str, |
+ size_type length, |
+ key_type max_key, |
+ SAIt suffix_array) { |
+ if (length == 1) |
+ *suffix_array = 0; |
+ if (length < 2) |
+ return; |
+ |
+ std::vector<SLType> sl_partition(length); |
+ size_type lms_count = |
+ BuildSLPartition(str, length, max_key, sl_partition.rbegin()); |
+ std::vector<size_type> lms_indices(lms_count); |
+ FindLmsSuffixes(sl_partition, lms_indices.begin()); |
+ std::vector<size_type> buckets = MakeBucketCount(str, length, max_key); |
+ |
+ if (lms_indices.size() > 1) { |
+ // Given |lms_indices| in order of apparition, induce LMS substrings |
huangs
2017/06/30 04:57:32
"apparition" means "ghost"? 2 more instances below
etiennep1
2017/06/30 17:18:17
Wrong translation, meant appearance
|
+ // relative order and write result to |suffix_array|. |
+ InducedSort(str, length, sl_partition, lms_indices, buckets, |
+ suffix_array); |
+ std::vector<size_type> lms_str(lms_indices.size()); |
+ |
+ // Given LMS substrings in relative order found in |suffix_array|, |
+ // map LMS substrings to unique labels to form a new string, |lms_str|. |
+ size_type label_count = |
+ LabelLmsSubstrings(str, length, sl_partition, suffix_array, |
+ lms_indices.begin(), lms_str.begin()); |
+ |
+ if (label_count < lms_str.size()) { |
+ // Reorder |lms_str| to have LMS suffixes in order of apparition. |
+ for (size_type i = 0; i < lms_indices.size(); ++i) |
+ suffix_array[lms_indices[i]] = lms_str[i]; |
+ |
+ SLType previous_type = SType; |
+ for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) { |
+ if (sl_partition[i] == SType && previous_type == LType) { |
+ lms_str[j] = suffix_array[i]; |
+ lms_indices[j++] = i; |
+ } |
+ previous_type = sl_partition[i]; |
+ } |
+ |
+ // Recursively apply SuffixSort on |lms_str|, which is |
+ // formed from labeled LMS suffixes in order of apparition. |
+ // |lms_str| is at most half the length of |str|. |
+ Implementation<size_type, size_type>::SuffixSort( |
+ lms_str.begin(), static_cast<size_type>(lms_str.size()), |
+ label_count, suffix_array); |
+ |
+ // Map LMS labels back to indices in |str| and |
+ // write result to |lms_indices|. |
+ // We're using |suffix_array| as a temporary buffer. |
+ for (size_type i = 0; i < lms_indices.size(); ++i) |
+ suffix_array[i] = lms_indices[suffix_array[i]]; |
+ for (size_type i = 0; i < lms_indices.size(); ++i) |
+ lms_indices[i] = suffix_array[i]; |
+ |
+ // At this point, |lms_indices| contains sorted LMS suffixes of |str|. |
+ } |
+ } |
+ // Given |lms_indices| where LMS suffixes are sorted, |
+ // induce the full order of suffixes in |str| |
+ InducedSort(str, length, sl_partition, lms_indices, buckets, |
+ suffix_array); |
+ } |
+ }; |
+}; |
+ |
+// Generates a suffix array from |str|, a random access input range from which |
+// suffixes are sorted, using the functor |algorithm| which provides an |
+// interface equivalent to NaiveSuffixSort. Characters found in |str| are |
+// assumed to be in range [0, |max_key|). Returns the suffix array as a vector. |
+template <class Algorithm, class StrRng, class KeyType> |
+std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str, |
+ KeyType max_key) { |
+ Algorithm sort; |
+ std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin()); |
+ sort(str, max_key, suffix_array.begin()); |
+ return suffix_array; |
+} |
+ |
+// Lexicographical lower bound of |str2| in the suffix array of |str1| using |
+// binary search. |str1_first| is a random access iterator pointing to the |
+// beginning of |str1|. |str2_first| and |str2_last| are forward iterators |
+// pointing to the beginning and end of |str2|, respectively. |
+template <class SARng, class StrIt1, class StrIt2> |
+auto SearchSuffixArray(const SARng& suffix_array, |
+ StrIt1 str1_first, |
+ StrIt2 str2_first, |
+ StrIt2 str2_last) -> decltype(std::begin(suffix_array)) { |
+ using size_type = typename SARng::value_type; |
+ |
+ size_t n = std::end(suffix_array) - std::begin(suffix_array); |
+ auto it = std::lower_bound( |
+ std::begin(suffix_array), std::end(suffix_array), str2_first, |
+ [str1_first, str2_last, n](size_type a, StrIt2 b) { |
+ return std::lexicographical_compare(str1_first + a, str1_first + n, b, |
+ str2_last); |
+ }); |
+ return it; |
+} |
+ |
+} // namespace zucchini |
+ |
+#endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ |