| Index: chrome/installer/zucchini/suffix_array.h
|
| diff --git a/chrome/installer/zucchini/suffix_array.h b/chrome/installer/zucchini/suffix_array.h
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..e06e35e319841d434b08ca39439891b0cf353d28
|
| --- /dev/null
|
| +++ b/chrome/installer/zucchini/suffix_array.h
|
| @@ -0,0 +1,449 @@
|
| +// Copyright 2017 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
|
| +#define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
|
| +
|
| +#include <algorithm>
|
| +#include <cassert>
|
| +#include <iterator>
|
| +#include <numeric>
|
| +#include <vector>
|
| +
|
| +namespace zucchini {
|
| +
|
| +// A functor class that implements the naive suffix sorting algorithm
|
| +// that uses std::sort with lexicographical compare.
|
| +// This is only meant as reference of the interface.
|
| +class NaiveSuffixSort {
|
| + public:
|
| + // |str| must be a random access input range.
|
| + // Characters found in |str| must be in the range [0, |max_key|)
|
| + // |suffix_array| is a random access mutable range containing the result.
|
| + template <class InputRng, class KeyType, class SAIt>
|
| + void operator()(const InputRng& str,
|
| + KeyType max_key,
|
| + SAIt suffix_array) const {
|
| + using size_type = typename SAIt::value_type;
|
| +
|
| + size_type n = str.end() - str.begin();
|
| +
|
| + // |suffix_array| is first filled with ordered indices of |str|.
|
| + // Those indices are then sorted with lexicographical comparisons in |str|.
|
| + std::iota(suffix_array, suffix_array + n, 0);
|
| + std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) {
|
| + return std::lexicographical_compare(std::begin(str) + i, std::end(str),
|
| + std::begin(str) + j, std::end(str));
|
| + });
|
| + }
|
| +};
|
| +
|
| +// A functor class that implements suffix array induced sorting (SA-IS)
|
| +// algorithm with linear time and memory complexity,
|
| +// see http://ieeexplore.ieee.org/abstract/document/5582081/
|
| +class Sais {
|
| + public:
|
| + // |str| must be a random access input range.
|
| + // Characters found in |str| must be in the range [0, |max_key|)
|
| + // |suffix_array| is a random access mutable range containing the result.
|
| + template <class InputRng, class KeyType, class SAIt>
|
| + void operator()(const InputRng& str,
|
| + KeyType max_key,
|
| + SAIt suffix_array) const {
|
| + using value_type = typename InputRng::value_type;
|
| + using size_type = typename SAIt::value_type;
|
| +
|
| + static_assert(std::is_unsigned<value_type>::value,
|
| + "Sais only supports input string with unsigned values");
|
| +
|
| + size_type n = static_cast<size_type>(str.end() - str.begin());
|
| +
|
| + Implementation<size_type, typename std::make_unsigned<KeyType>::type>::
|
| + SuffixSort(std::begin(str), n, max_key, suffix_array);
|
| + }
|
| +
|
| + // Given the string S of length n. We assume S is terminated by a unique
|
| + // sentinel $, which is considered as the smallest character. This sentinel
|
| + // does not exist in memory and is only treated implicitly. We denote
|
| + // suf(S,i) the suffix formed by S[i, n).
|
| +
|
| + // A suffix suf(S, i) is said to be S-type or L-type,
|
| + // if suf(S, i) < suf(S, i+1) or suf(S, i) > suf(S, i+1), respectively.
|
| + enum SLType : bool { SType, LType };
|
| +
|
| + // A character S[i] is said to be S-type or L-type if the suffix suf(S, i) is
|
| + // S-type or L-type, respectively.
|
| +
|
| + // A character S[i], i is called LMS (leftmost S-type), if S[i] is S-type and
|
| + // S[i-1] is L-type. A suffix suf(S, i) is called LMS, if S[i] is an LMS
|
| + // character.
|
| +
|
| + // An LMS-substring is a substring S[i..j) with both S[i] and S[j], being LMS
|
| + // characters, and there is no other LMS character in the substring, or the
|
| + // sentinel itself
|
| +
|
| + template <class SizeType, class KeyType>
|
| + struct Implementation {
|
| + static_assert(std::is_unsigned<SizeType>::value,
|
| + "SizeType must be unsigned");
|
| + static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned");
|
| + using size_type = SizeType;
|
| + using key_type = KeyType;
|
| +
|
| + using iterator = typename std::vector<size_type>::iterator;
|
| + using const_iterator = typename std::vector<size_type>::const_iterator;
|
| +
|
| + // Partition every suffix based on SL-type.
|
| + template <class StrIt>
|
| + static size_type BuildSLPartition(
|
| + StrIt str,
|
| + size_type length,
|
| + key_type max_key,
|
| + std::vector<SLType>::reverse_iterator sl_partition) {
|
| + // We will count LMS suffixes (S to L-type or last S-type)
|
| + size_type lms_count = 0;
|
| +
|
| + // |previous_type| is initialized to L-type to avoid counting an extra
|
| + // LMS suffix at the end
|
| + SLType previous_type = LType;
|
| +
|
| + key_type previous_key = max_key; // initialized to dummy, impossible key
|
| +
|
| + // We're travelling backward to determine the partition,
|
| + // as if we prepend one character at a time to the string, ex:
|
| + // b$ is L-type because b > $
|
| + // ab$ is S-type because a < b, implying ab$ < b$
|
| + // bab$ is L-type because b > a, implying bab$ > ab$
|
| + // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$
|
| + for (auto str_it = std::reverse_iterator<StrIt>(str + length);
|
| + str_it != std::reverse_iterator<StrIt>(str);
|
| + ++str_it, ++sl_partition) {
|
| + key_type current_key = *str_it;
|
| +
|
| + if (current_key > previous_key || previous_key == max_key) {
|
| + // S[i] > S[i + 1] or S[i] is last character
|
| + *sl_partition = LType;
|
| + if (previous_type == SType)
|
| + // suf(S, i) is L-type and suf(S, i + 1) is S-type,
|
| + // therefore, suf(S, i+1) was a LMS suffix.
|
| + ++lms_count;
|
| +
|
| + previous_type = LType; // for next round
|
| + } else if (current_key < previous_key) {
|
| + // S[i] < S[i + 1]
|
| + *sl_partition = SType;
|
| + previous_type = SType; // for next round
|
| + } else {
|
| + // S[i] == S[i + 1]
|
| + // The next character that differs determines the SL-type,
|
| + // so we reuse the last seen type.
|
| + *sl_partition = previous_type;
|
| + }
|
| + previous_key = current_key; // for next round
|
| + }
|
| +
|
| + return lms_count;
|
| + }
|
| +
|
| + // Find indices of LMS suffixes and write result to |lms_indices|.
|
| + static void FindLmsSuffixes(const std::vector<SLType>& sl_partition,
|
| + iterator lms_indices) {
|
| + // |previous_type| is initialized to S-type to avoid counting an extra
|
| + // LMS suffix at the beginning
|
| + SLType previous_type = SType;
|
| + size_type j = 0;
|
| + for (size_type i = 0; i < sl_partition.size(); ++i) {
|
| + if (sl_partition[i] == SType && previous_type == LType)
|
| + lms_indices[j++] = i;
|
| + previous_type = sl_partition[i];
|
| + }
|
| + }
|
| +
|
| + template <class StrIt>
|
| + static std::vector<size_type> MakeBucketCount(StrIt str,
|
| + size_type length,
|
| + key_type max_key) {
|
| + // Occurence of every unique character is counted in |buckets|
|
| + std::vector<size_type> buckets(static_cast<size_type>(max_key));
|
| +
|
| + for (auto it = str; it != str + length; ++it)
|
| + ++buckets[*it];
|
| + return buckets;
|
| + }
|
| +
|
| + // Apply induced sort from |lms_indices| to |suffix_array| associated with
|
| + // the string |str|.
|
| + template <class StrIt, class SAIt>
|
| + static void InducedSort(StrIt str,
|
| + size_type length,
|
| + const std::vector<SLType>& sl_partition,
|
| + const std::vector<size_type>& lms_substrings,
|
| + const std::vector<size_type>& buckets,
|
| + SAIt suffix_array) {
|
| + // All indices are first marked as unset with 0.
|
| + std::fill(suffix_array, suffix_array + length, 0);
|
| +
|
| + // Used to mark bucket boundaries (head or end) as indices in str.
|
| + std::vector<size_type> bucket_bounds(buckets.size());
|
| +
|
| + // Step 1
|
| + // for each leftmost S-type suffix suf(S, i) found in |lms_indices|
|
| + // scanned backward, place suf(S, i) at the end of the corresponding
|
| + // bucket and forward the bucket end to the left.
|
| +
|
| + // By corresponding bucket for suf(S, i), we mean the bucket associated
|
| + // with the character S(i).
|
| +
|
| + // find the end of each bucket
|
| + bucket_bounds[0] = buckets[0];
|
| + for (key_type i = 1; i < buckets.size(); ++i)
|
| + bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i];
|
| +
|
| + for (auto it = lms_substrings.crbegin(); it != lms_substrings.crend();
|
| + ++it) {
|
| + key_type key = str[*it];
|
| + suffix_array[--bucket_bounds[key]] = *it;
|
| + }
|
| +
|
| + // Step 2
|
| + // for each modified suf(S, i), scanned forward, for which
|
| + // suf(S, SA(i) - 1) is L-type, place suf(S, SA(i) - 1) to the current
|
| + // head of the corresponding bucket and forward the bucket head to the
|
| + // right.
|
| +
|
| + // find the head of each bucket
|
| + bucket_bounds[0] = 0;
|
| + for (key_type i = 1; i < buckets.size(); ++i)
|
| + bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i - 1];
|
| +
|
| + // from Step 1, the sentinel $, which we treat implicitely, would have
|
| + // been place at the beginning of |suffix_array| since $ is always
|
| + // considered as the smallest character. We then have to deal with the
|
| + // previous (last) suffix.
|
| + if (sl_partition[length - 1] == LType) {
|
| + key_type key = str[length - 1];
|
| + suffix_array[bucket_bounds[key]++] = length - 1;
|
| + }
|
| + for (auto it = suffix_array; it != suffix_array + length; ++it) {
|
| + size_type suffix_index = *it;
|
| +
|
| + // While the original algorithm marks unset suffixes with -1,
|
| + // we found that marking them with 0 is also possible, since suf(S, 0)
|
| + // has no previous suffix, and also more convenient because we are
|
| + // working with unsigned integers.
|
| + if (suffix_index > 0 && sl_partition[--suffix_index] == LType) {
|
| + key_type key = str[suffix_index];
|
| + suffix_array[bucket_bounds[key]++] = suffix_index;
|
| + }
|
| + }
|
| +
|
| + // Step 3
|
| + // for each modified suf(S, i), scanned backward, for which
|
| + // suf(S, SA(i) - 1) is S-type, place suf(S, SA(i) - 1) to the current
|
| + // end of the corresponding bucket and forward the bucket head to the
|
| + // left.
|
| +
|
| + // find the end of each bucket
|
| + bucket_bounds[0] = buckets[0];
|
| + for (size_type i = 1; i < buckets.size(); ++i)
|
| + bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i];
|
| +
|
| + for (auto it = std::reverse_iterator<SAIt>(suffix_array + length);
|
| + it != std::reverse_iterator<SAIt>(suffix_array); ++it) {
|
| + size_type suffix_index = *it;
|
| + if (suffix_index > 0 && sl_partition[--suffix_index] == SType) {
|
| + key_type key = str[suffix_index];
|
| + suffix_array[--bucket_bounds[key]] = suffix_index;
|
| + }
|
| + }
|
| + // Deals with the last suffix, because of the sentinel.
|
| + if (sl_partition[length - 1] == SType) {
|
| + key_type key = str[length - 1];
|
| + suffix_array[--bucket_bounds[key]] = length - 1;
|
| + }
|
| + }
|
| +
|
| + // Given a string S starting at |str| with length |length|, an array
|
| + // starting at |substring_array| containing ordered LMS terminated substring
|
| + // indices of S and an SL-Type partition |sl_partition| of S, assigns a
|
| + // unique label to every unique LMS substring. The sorted labels for all LMS
|
| + // substrings are written to |lms_str|, while the indices of LMS suffixes
|
| + // are written to |lms_indices|. In addition, returns the total number of
|
| + // unique labels.
|
| + template <class StrIt, class SAIt>
|
| + static size_type LabelLmsSubstrings(StrIt str,
|
| + size_type length,
|
| + const std::vector<SLType>& sl_partition,
|
| + SAIt suffix_array,
|
| + iterator lms_indices,
|
| + iterator lms_str) {
|
| + // Labelling starts at 0
|
| + size_type label = 0;
|
| +
|
| + // |previous_lms| is initialized to 0 to indicate it is unset.
|
| + // Note that suf(S, 0) is never a LMS suffix.
|
| + // Substrings will be visited in relative order.
|
| + size_type previous_lms = 0;
|
| + for (auto it = suffix_array; it != suffix_array + length; ++it) {
|
| + if (*it > 0 && sl_partition[*it] == SType &&
|
| + sl_partition[*it - 1] == LType) {
|
| + // suf(S, *it) is a LMS suffix
|
| +
|
| + size_type current_lms = *it;
|
| + if (previous_lms != 0) {
|
| + // There was a previous LMS suffix
|
| + // Check if the current LMS substring is equal to the previous one
|
| + SLType current_lms_type = SType, previous_lms_type = SType;
|
| + for (size_type k = 0;; ++k) {
|
| + // |current_lms_end| and |previous_lms_end| denote weither we have
|
| + // reached the end of the current and previous LMS substring,
|
| + // respectively
|
| + bool current_lms_end = false, previous_lms_end = false;
|
| +
|
| + // Check both previous and current substrings end ie
|
| + // Note: it is more convenient to check if suf(S, current_lms + k)
|
| + // is an LMS suffix than to retrieve it from lms_indices.
|
| + if (current_lms + k >= length ||
|
| + (current_lms_type == LType &&
|
| + sl_partition[current_lms + k] == SType)) {
|
| + current_lms_end = true;
|
| + }
|
| + if (previous_lms + k >= length ||
|
| + (previous_lms_type == LType &&
|
| + sl_partition[previous_lms + k] == SType)) {
|
| + previous_lms_end = true;
|
| + }
|
| +
|
| + if (current_lms_end && previous_lms_end) {
|
| + break;
|
| + } else if (current_lms_end != previous_lms_end ||
|
| + str[current_lms + k] != str[previous_lms + k]) {
|
| + // previous and current substrings differ,
|
| + ++label; // use a new label
|
| + break;
|
| + }
|
| +
|
| + current_lms_type = sl_partition[current_lms + k];
|
| + previous_lms_type = sl_partition[previous_lms + k];
|
| + }
|
| + }
|
| + *lms_indices++ = *it;
|
| + *lms_str++ = label;
|
| + previous_lms = current_lms;
|
| + }
|
| + }
|
| +
|
| + return ++label;
|
| + }
|
| +
|
| + // Implementation of the SA-IS algorithm. |str| must be a random access
|
| + // iterator pointing at the beginning of S with length |length|. The result
|
| + // is writtend in |suffix_array|, a random access iterator.
|
| + template <class StrIt, class SAIt>
|
| + static void SuffixSort(StrIt str,
|
| + size_type length,
|
| + key_type max_key,
|
| + SAIt suffix_array) {
|
| + if (length == 1)
|
| + *suffix_array = 0;
|
| + if (length < 2)
|
| + return;
|
| +
|
| + std::vector<SLType> sl_partition(length);
|
| + size_type lms_count =
|
| + BuildSLPartition(str, length, max_key, sl_partition.rbegin());
|
| + std::vector<size_type> lms_indices(lms_count);
|
| + FindLmsSuffixes(sl_partition, lms_indices.begin());
|
| + std::vector<size_type> buckets = MakeBucketCount(str, length, max_key);
|
| +
|
| + if (lms_indices.size() > 1) {
|
| + // Given |lms_indices| in order of apparition, induce LMS substrings
|
| + // relative order and write result to |suffix_array|.
|
| + InducedSort(str, length, sl_partition, lms_indices, buckets,
|
| + suffix_array);
|
| + std::vector<size_type> lms_str(lms_indices.size());
|
| +
|
| + // Given LMS substrings in relative order found in |suffix_array|,
|
| + // map LMS substrings to unique labels to form a new string, |lms_str|.
|
| + size_type label_count =
|
| + LabelLmsSubstrings(str, length, sl_partition, suffix_array,
|
| + lms_indices.begin(), lms_str.begin());
|
| +
|
| + if (label_count < lms_str.size()) {
|
| + // Reorder |lms_str| to have LMS suffixes in order of apparition.
|
| + for (size_type i = 0; i < lms_indices.size(); ++i)
|
| + suffix_array[lms_indices[i]] = lms_str[i];
|
| +
|
| + SLType previous_type = SType;
|
| + for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) {
|
| + if (sl_partition[i] == SType && previous_type == LType) {
|
| + lms_str[j] = suffix_array[i];
|
| + lms_indices[j++] = i;
|
| + }
|
| + previous_type = sl_partition[i];
|
| + }
|
| +
|
| + // Recursively apply SuffixSort on |lms_str|, which is
|
| + // formed from labeled LMS suffixes in order of apparition.
|
| + // |lms_str| is at most half the length of |str|.
|
| + Implementation<size_type, size_type>::SuffixSort(
|
| + lms_str.begin(), static_cast<size_type>(lms_str.size()),
|
| + label_count, suffix_array);
|
| +
|
| + // Map LMS labels back to indices in |str| and
|
| + // write result to |lms_indices|.
|
| + // We're using |suffix_array| as a temporary buffer.
|
| + for (size_type i = 0; i < lms_indices.size(); ++i)
|
| + suffix_array[i] = lms_indices[suffix_array[i]];
|
| + for (size_type i = 0; i < lms_indices.size(); ++i)
|
| + lms_indices[i] = suffix_array[i];
|
| +
|
| + // At this point, |lms_indices| contains sorted LMS suffixes of |str|.
|
| + }
|
| + }
|
| + // Given |lms_indices| where LMS suffixes are sorted,
|
| + // induce the full order of suffixes in |str|
|
| + InducedSort(str, length, sl_partition, lms_indices, buckets,
|
| + suffix_array);
|
| + }
|
| + };
|
| +};
|
| +
|
| +// Generates a suffix array from |str|, a random access input range from which
|
| +// suffixes are sorted, using the functor |algorithm| which provides an
|
| +// interface equivalent to NaiveSuffixSort. Characters found in |str| are
|
| +// assumed to be in range [0, |max_key|). Returns the suffix array as a vector.
|
| +template <class Algorithm, class StrRng, class KeyType>
|
| +std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str,
|
| + KeyType max_key) {
|
| + Algorithm sort;
|
| + std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin());
|
| + sort(str, max_key, suffix_array.begin());
|
| + return suffix_array;
|
| +}
|
| +
|
| +// Lexicographical lower bound of |str2| in the suffix array of |str1| using
|
| +// binary search. |str1_first| is a random access iterator pointing to the
|
| +// beginning of |str1|. |str2_first| and |str2_last| are forward iterators
|
| +// pointing to the beginning and end of |str2|, respectively.
|
| +template <class SARng, class StrIt1, class StrIt2>
|
| +auto SearchSuffixArray(const SARng& suffix_array,
|
| + StrIt1 str1_first,
|
| + StrIt2 str2_first,
|
| + StrIt2 str2_last) -> decltype(std::begin(suffix_array)) {
|
| + using size_type = typename SARng::value_type;
|
| +
|
| + size_t n = std::end(suffix_array) - std::begin(suffix_array);
|
| + auto it = std::lower_bound(
|
| + std::begin(suffix_array), std::end(suffix_array), str2_first,
|
| + [str1_first, str2_last, n](size_type a, StrIt2 b) {
|
| + return std::lexicographical_compare(str1_first + a, str1_first + n, b,
|
| + str2_last);
|
| + });
|
| + return it;
|
| +}
|
| +
|
| +} // namespace zucchini
|
| +
|
| +#endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_
|
|
|