OLD | NEW |
(Empty) | |
| 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ |
| 6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ |
| 7 |
| 8 #include <algorithm> |
| 9 #include <cassert> |
| 10 #include <iterator> |
| 11 #include <numeric> |
| 12 #include <vector> |
| 13 |
| 14 namespace zucchini { |
| 15 |
| 16 // A functor class that implements the naive suffix sorting algorithm |
| 17 // that uses std::sort with lexicographical compare. |
| 18 // This is only meant as reference of the interface. |
| 19 class NaiveSuffixSort { |
| 20 public: |
| 21 // |str| must be a random access input range. |
| 22 // Characters found in |str| must be in the range [0, |max_key|) |
| 23 // |suffix_array| is a random access mutable range containing the result. |
| 24 template <class InputRng, class KeyType, class SAIt> |
| 25 void operator()(const InputRng& str, |
| 26 KeyType max_key, |
| 27 SAIt suffix_array) const { |
| 28 using size_type = typename SAIt::value_type; |
| 29 |
| 30 size_type n = str.end() - str.begin(); |
| 31 |
| 32 // |suffix_array| is first filled with ordered indices of |str|. |
| 33 // Those indices are then sorted with lexicographical comparisons in |str|. |
| 34 std::iota(suffix_array, suffix_array + n, 0); |
| 35 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) { |
| 36 return std::lexicographical_compare(std::begin(str) + i, std::end(str), |
| 37 std::begin(str) + j, std::end(str)); |
| 38 }); |
| 39 } |
| 40 }; |
| 41 |
| 42 // A functor class that implements suffix array induced sorting (SA-IS) |
| 43 // algorithm with linear time and memory complexity, |
| 44 // see http://ieeexplore.ieee.org/abstract/document/5582081/ |
| 45 class Sais { |
| 46 public: |
| 47 // |str| must be a random access input range. |
| 48 // Characters found in |str| must be in the range [0, |max_key|) |
| 49 // |suffix_array| is a random access mutable range containing the result. |
| 50 template <class InputRng, class KeyType, class SAIt> |
| 51 void operator()(const InputRng& str, |
| 52 KeyType max_key, |
| 53 SAIt suffix_array) const { |
| 54 using value_type = typename InputRng::value_type; |
| 55 using size_type = typename SAIt::value_type; |
| 56 |
| 57 static_assert(std::is_unsigned<value_type>::value, |
| 58 "Sais only supports input string with unsigned values"); |
| 59 |
| 60 size_type n = static_cast<size_type>(str.end() - str.begin()); |
| 61 |
| 62 Implementation<size_type, typename std::make_unsigned<KeyType>::type>:: |
| 63 SuffixSort(std::begin(str), n, max_key, suffix_array); |
| 64 } |
| 65 |
| 66 // Given the string S of length n. We assume S is terminated by a unique |
| 67 // sentinel $, which is considered as the smallest character. This sentinel |
| 68 // does not exist in memory and is only treated implicitly. We denote |
| 69 // suf(S,i) the suffix formed by S[i, n). |
| 70 |
| 71 // A suffix suf(S, i) is said to be S-type or L-type, |
| 72 // if suf(S, i) < suf(S, i+1) or suf(S, i) > suf(S, i+1), respectively. |
| 73 enum SLType : bool { SType, LType }; |
| 74 |
| 75 // A character S[i] is said to be S-type or L-type if the suffix suf(S, i) is |
| 76 // S-type or L-type, respectively. |
| 77 |
| 78 // A character S[i], i is called LMS (leftmost S-type), if S[i] is S-type and |
| 79 // S[i-1] is L-type. A suffix suf(S, i) is called LMS, if S[i] is an LMS |
| 80 // character. |
| 81 |
| 82 // An LMS-substring is a substring S[i..j) with both S[i] and S[j], being LMS |
| 83 // characters, and there is no other LMS character in the substring, or the |
| 84 // sentinel itself |
| 85 |
| 86 template <class SizeType, class KeyType> |
| 87 struct Implementation { |
| 88 static_assert(std::is_unsigned<SizeType>::value, |
| 89 "SizeType must be unsigned"); |
| 90 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); |
| 91 using size_type = SizeType; |
| 92 using key_type = KeyType; |
| 93 |
| 94 using iterator = typename std::vector<size_type>::iterator; |
| 95 using const_iterator = typename std::vector<size_type>::const_iterator; |
| 96 |
| 97 // Partition every suffix based on SL-type. |
| 98 template <class StrIt> |
| 99 static size_type BuildSLPartition( |
| 100 StrIt str, |
| 101 size_type length, |
| 102 key_type max_key, |
| 103 std::vector<SLType>::reverse_iterator sl_partition) { |
| 104 // We will count LMS suffixes (S to L-type or last S-type) |
| 105 size_type lms_count = 0; |
| 106 |
| 107 // |previous_type| is initialized to L-type to avoid counting an extra |
| 108 // LMS suffix at the end |
| 109 SLType previous_type = LType; |
| 110 |
| 111 key_type previous_key = max_key; // initialized to dummy, impossible key |
| 112 |
| 113 // We're travelling backward to determine the partition, |
| 114 // as if we prepend one character at a time to the string, ex: |
| 115 // b$ is L-type because b > $ |
| 116 // ab$ is S-type because a < b, implying ab$ < b$ |
| 117 // bab$ is L-type because b > a, implying bab$ > ab$ |
| 118 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$ |
| 119 for (auto str_it = std::reverse_iterator<StrIt>(str + length); |
| 120 str_it != std::reverse_iterator<StrIt>(str); |
| 121 ++str_it, ++sl_partition) { |
| 122 key_type current_key = *str_it; |
| 123 |
| 124 if (current_key > previous_key || previous_key == max_key) { |
| 125 // S[i] > S[i + 1] or S[i] is last character |
| 126 *sl_partition = LType; |
| 127 if (previous_type == SType) |
| 128 // suf(S, i) is L-type and suf(S, i + 1) is S-type, |
| 129 // therefore, suf(S, i+1) was a LMS suffix. |
| 130 ++lms_count; |
| 131 |
| 132 previous_type = LType; // for next round |
| 133 } else if (current_key < previous_key) { |
| 134 // S[i] < S[i + 1] |
| 135 *sl_partition = SType; |
| 136 previous_type = SType; // for next round |
| 137 } else { |
| 138 // S[i] == S[i + 1] |
| 139 // The next character that differs determines the SL-type, |
| 140 // so we reuse the last seen type. |
| 141 *sl_partition = previous_type; |
| 142 } |
| 143 previous_key = current_key; // for next round |
| 144 } |
| 145 |
| 146 return lms_count; |
| 147 } |
| 148 |
| 149 // Find indices of LMS suffixes and write result to |lms_indices|. |
| 150 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition, |
| 151 iterator lms_indices) { |
| 152 // |previous_type| is initialized to S-type to avoid counting an extra |
| 153 // LMS suffix at the beginning |
| 154 SLType previous_type = SType; |
| 155 size_type j = 0; |
| 156 for (size_type i = 0; i < sl_partition.size(); ++i) { |
| 157 if (sl_partition[i] == SType && previous_type == LType) |
| 158 lms_indices[j++] = i; |
| 159 previous_type = sl_partition[i]; |
| 160 } |
| 161 } |
| 162 |
| 163 template <class StrIt> |
| 164 static std::vector<size_type> MakeBucketCount(StrIt str, |
| 165 size_type length, |
| 166 key_type max_key) { |
| 167 // Occurence of every unique character is counted in |buckets| |
| 168 std::vector<size_type> buckets(static_cast<size_type>(max_key)); |
| 169 |
| 170 for (auto it = str; it != str + length; ++it) |
| 171 ++buckets[*it]; |
| 172 return buckets; |
| 173 } |
| 174 |
| 175 // Apply induced sort from |lms_indices| to |suffix_array| associated with |
| 176 // the string |str|. |
| 177 template <class StrIt, class SAIt> |
| 178 static void InducedSort(StrIt str, |
| 179 size_type length, |
| 180 const std::vector<SLType>& sl_partition, |
| 181 const std::vector<size_type>& lms_substrings, |
| 182 const std::vector<size_type>& buckets, |
| 183 SAIt suffix_array) { |
| 184 // All indices are first marked as unset with 0. |
| 185 std::fill(suffix_array, suffix_array + length, 0); |
| 186 |
| 187 // Used to mark bucket boundaries (head or end) as indices in str. |
| 188 std::vector<size_type> bucket_bounds(buckets.size()); |
| 189 |
| 190 // Step 1 |
| 191 // for each leftmost S-type suffix suf(S, i) found in |lms_indices| |
| 192 // scanned backward, place suf(S, i) at the end of the corresponding |
| 193 // bucket and forward the bucket end to the left. |
| 194 |
| 195 // By corresponding bucket for suf(S, i), we mean the bucket associated |
| 196 // with the character S(i). |
| 197 |
| 198 // find the end of each bucket |
| 199 bucket_bounds[0] = buckets[0]; |
| 200 for (key_type i = 1; i < buckets.size(); ++i) |
| 201 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i]; |
| 202 |
| 203 for (auto it = lms_substrings.crbegin(); it != lms_substrings.crend(); |
| 204 ++it) { |
| 205 key_type key = str[*it]; |
| 206 suffix_array[--bucket_bounds[key]] = *it; |
| 207 } |
| 208 |
| 209 // Step 2 |
| 210 // for each modified suf(S, i), scanned forward, for which |
| 211 // suf(S, SA(i) - 1) is L-type, place suf(S, SA(i) - 1) to the current |
| 212 // head of the corresponding bucket and forward the bucket head to the |
| 213 // right. |
| 214 |
| 215 // find the head of each bucket |
| 216 bucket_bounds[0] = 0; |
| 217 for (key_type i = 1; i < buckets.size(); ++i) |
| 218 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i - 1]; |
| 219 |
| 220 // from Step 1, the sentinel $, which we treat implicitely, would have |
| 221 // been place at the beginning of |suffix_array| since $ is always |
| 222 // considered as the smallest character. We then have to deal with the |
| 223 // previous (last) suffix. |
| 224 if (sl_partition[length - 1] == LType) { |
| 225 key_type key = str[length - 1]; |
| 226 suffix_array[bucket_bounds[key]++] = length - 1; |
| 227 } |
| 228 for (auto it = suffix_array; it != suffix_array + length; ++it) { |
| 229 size_type suffix_index = *it; |
| 230 |
| 231 // While the original algorithm marks unset suffixes with -1, |
| 232 // we found that marking them with 0 is also possible, since suf(S, 0) |
| 233 // has no previous suffix, and also more convenient because we are |
| 234 // working with unsigned integers. |
| 235 if (suffix_index > 0 && sl_partition[--suffix_index] == LType) { |
| 236 key_type key = str[suffix_index]; |
| 237 suffix_array[bucket_bounds[key]++] = suffix_index; |
| 238 } |
| 239 } |
| 240 |
| 241 // Step 3 |
| 242 // for each modified suf(S, i), scanned backward, for which |
| 243 // suf(S, SA(i) - 1) is S-type, place suf(S, SA(i) - 1) to the current |
| 244 // end of the corresponding bucket and forward the bucket head to the |
| 245 // left. |
| 246 |
| 247 // find the end of each bucket |
| 248 bucket_bounds[0] = buckets[0]; |
| 249 for (size_type i = 1; i < buckets.size(); ++i) |
| 250 bucket_bounds[i] = bucket_bounds[i - 1] + buckets[i]; |
| 251 |
| 252 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length); |
| 253 it != std::reverse_iterator<SAIt>(suffix_array); ++it) { |
| 254 size_type suffix_index = *it; |
| 255 if (suffix_index > 0 && sl_partition[--suffix_index] == SType) { |
| 256 key_type key = str[suffix_index]; |
| 257 suffix_array[--bucket_bounds[key]] = suffix_index; |
| 258 } |
| 259 } |
| 260 // Deals with the last suffix, because of the sentinel. |
| 261 if (sl_partition[length - 1] == SType) { |
| 262 key_type key = str[length - 1]; |
| 263 suffix_array[--bucket_bounds[key]] = length - 1; |
| 264 } |
| 265 } |
| 266 |
| 267 // Given a string S starting at |str| with length |length|, an array |
| 268 // starting at |substring_array| containing ordered LMS terminated substring |
| 269 // indices of S and an SL-Type partition |sl_partition| of S, assigns a |
| 270 // unique label to every unique LMS substring. The sorted labels for all LMS |
| 271 // substrings are written to |lms_str|, while the indices of LMS suffixes |
| 272 // are written to |lms_indices|. In addition, returns the total number of |
| 273 // unique labels. |
| 274 template <class StrIt, class SAIt> |
| 275 static size_type LabelLmsSubstrings(StrIt str, |
| 276 size_type length, |
| 277 const std::vector<SLType>& sl_partition, |
| 278 SAIt suffix_array, |
| 279 iterator lms_indices, |
| 280 iterator lms_str) { |
| 281 // Labelling starts at 0 |
| 282 size_type label = 0; |
| 283 |
| 284 // |previous_lms| is initialized to 0 to indicate it is unset. |
| 285 // Note that suf(S, 0) is never a LMS suffix. |
| 286 // Substrings will be visited in relative order. |
| 287 size_type previous_lms = 0; |
| 288 for (auto it = suffix_array; it != suffix_array + length; ++it) { |
| 289 if (*it > 0 && sl_partition[*it] == SType && |
| 290 sl_partition[*it - 1] == LType) { |
| 291 // suf(S, *it) is a LMS suffix |
| 292 |
| 293 size_type current_lms = *it; |
| 294 if (previous_lms != 0) { |
| 295 // There was a previous LMS suffix |
| 296 // Check if the current LMS substring is equal to the previous one |
| 297 SLType current_lms_type = SType, previous_lms_type = SType; |
| 298 for (size_type k = 0;; ++k) { |
| 299 // |current_lms_end| and |previous_lms_end| denote weither we have |
| 300 // reached the end of the current and previous LMS substring, |
| 301 // respectively |
| 302 bool current_lms_end = false, previous_lms_end = false; |
| 303 |
| 304 // Check both previous and current substrings end ie |
| 305 // Note: it is more convenient to check if suf(S, current_lms + k) |
| 306 // is an LMS suffix than to retrieve it from lms_indices. |
| 307 if (current_lms + k >= length || |
| 308 (current_lms_type == LType && |
| 309 sl_partition[current_lms + k] == SType)) { |
| 310 current_lms_end = true; |
| 311 } |
| 312 if (previous_lms + k >= length || |
| 313 (previous_lms_type == LType && |
| 314 sl_partition[previous_lms + k] == SType)) { |
| 315 previous_lms_end = true; |
| 316 } |
| 317 |
| 318 if (current_lms_end && previous_lms_end) { |
| 319 break; |
| 320 } else if (current_lms_end != previous_lms_end || |
| 321 str[current_lms + k] != str[previous_lms + k]) { |
| 322 // previous and current substrings differ, |
| 323 ++label; // use a new label |
| 324 break; |
| 325 } |
| 326 |
| 327 current_lms_type = sl_partition[current_lms + k]; |
| 328 previous_lms_type = sl_partition[previous_lms + k]; |
| 329 } |
| 330 } |
| 331 *lms_indices++ = *it; |
| 332 *lms_str++ = label; |
| 333 previous_lms = current_lms; |
| 334 } |
| 335 } |
| 336 |
| 337 return ++label; |
| 338 } |
| 339 |
| 340 // Implementation of the SA-IS algorithm. |str| must be a random access |
| 341 // iterator pointing at the beginning of S with length |length|. The result |
| 342 // is writtend in |suffix_array|, a random access iterator. |
| 343 template <class StrIt, class SAIt> |
| 344 static void SuffixSort(StrIt str, |
| 345 size_type length, |
| 346 key_type max_key, |
| 347 SAIt suffix_array) { |
| 348 if (length == 1) |
| 349 *suffix_array = 0; |
| 350 if (length < 2) |
| 351 return; |
| 352 |
| 353 std::vector<SLType> sl_partition(length); |
| 354 size_type lms_count = |
| 355 BuildSLPartition(str, length, max_key, sl_partition.rbegin()); |
| 356 std::vector<size_type> lms_indices(lms_count); |
| 357 FindLmsSuffixes(sl_partition, lms_indices.begin()); |
| 358 std::vector<size_type> buckets = MakeBucketCount(str, length, max_key); |
| 359 |
| 360 if (lms_indices.size() > 1) { |
| 361 // Given |lms_indices| in order of apparition, induce LMS substrings |
| 362 // relative order and write result to |suffix_array|. |
| 363 InducedSort(str, length, sl_partition, lms_indices, buckets, |
| 364 suffix_array); |
| 365 std::vector<size_type> lms_str(lms_indices.size()); |
| 366 |
| 367 // Given LMS substrings in relative order found in |suffix_array|, |
| 368 // map LMS substrings to unique labels to form a new string, |lms_str|. |
| 369 size_type label_count = |
| 370 LabelLmsSubstrings(str, length, sl_partition, suffix_array, |
| 371 lms_indices.begin(), lms_str.begin()); |
| 372 |
| 373 if (label_count < lms_str.size()) { |
| 374 // Reorder |lms_str| to have LMS suffixes in order of apparition. |
| 375 for (size_type i = 0; i < lms_indices.size(); ++i) |
| 376 suffix_array[lms_indices[i]] = lms_str[i]; |
| 377 |
| 378 SLType previous_type = SType; |
| 379 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) { |
| 380 if (sl_partition[i] == SType && previous_type == LType) { |
| 381 lms_str[j] = suffix_array[i]; |
| 382 lms_indices[j++] = i; |
| 383 } |
| 384 previous_type = sl_partition[i]; |
| 385 } |
| 386 |
| 387 // Recursively apply SuffixSort on |lms_str|, which is |
| 388 // formed from labeled LMS suffixes in order of apparition. |
| 389 // |lms_str| is at most half the length of |str|. |
| 390 Implementation<size_type, size_type>::SuffixSort( |
| 391 lms_str.begin(), static_cast<size_type>(lms_str.size()), |
| 392 label_count, suffix_array); |
| 393 |
| 394 // Map LMS labels back to indices in |str| and |
| 395 // write result to |lms_indices|. |
| 396 // We're using |suffix_array| as a temporary buffer. |
| 397 for (size_type i = 0; i < lms_indices.size(); ++i) |
| 398 suffix_array[i] = lms_indices[suffix_array[i]]; |
| 399 for (size_type i = 0; i < lms_indices.size(); ++i) |
| 400 lms_indices[i] = suffix_array[i]; |
| 401 |
| 402 // At this point, |lms_indices| contains sorted LMS suffixes of |str|. |
| 403 } |
| 404 } |
| 405 // Given |lms_indices| where LMS suffixes are sorted, |
| 406 // induce the full order of suffixes in |str| |
| 407 InducedSort(str, length, sl_partition, lms_indices, buckets, |
| 408 suffix_array); |
| 409 } |
| 410 }; |
| 411 }; |
| 412 |
| 413 // Generates a suffix array from |str|, a random access input range from which |
| 414 // suffixes are sorted, using the functor |algorithm| which provides an |
| 415 // interface equivalent to NaiveSuffixSort. Characters found in |str| are |
| 416 // assumed to be in range [0, |max_key|). Returns the suffix array as a vector. |
| 417 template <class Algorithm, class StrRng, class KeyType> |
| 418 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str, |
| 419 KeyType max_key) { |
| 420 Algorithm sort; |
| 421 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin()); |
| 422 sort(str, max_key, suffix_array.begin()); |
| 423 return suffix_array; |
| 424 } |
| 425 |
| 426 // Lexicographical lower bound of |str2| in the suffix array of |str1| using |
| 427 // binary search. |str1_first| is a random access iterator pointing to the |
| 428 // beginning of |str1|. |str2_first| and |str2_last| are forward iterators |
| 429 // pointing to the beginning and end of |str2|, respectively. |
| 430 template <class SARng, class StrIt1, class StrIt2> |
| 431 auto SearchSuffixArray(const SARng& suffix_array, |
| 432 StrIt1 str1_first, |
| 433 StrIt2 str2_first, |
| 434 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) { |
| 435 using size_type = typename SARng::value_type; |
| 436 |
| 437 size_t n = std::end(suffix_array) - std::begin(suffix_array); |
| 438 auto it = std::lower_bound( |
| 439 std::begin(suffix_array), std::end(suffix_array), str2_first, |
| 440 [str1_first, str2_last, n](size_type a, StrIt2 b) { |
| 441 return std::lexicographical_compare(str1_first + a, str1_first + n, b, |
| 442 str2_last); |
| 443 }); |
| 444 return it; |
| 445 } |
| 446 |
| 447 } // namespace zucchini |
| 448 |
| 449 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ |
OLD | NEW |