Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2017 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #ifndef CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
| 6 #define CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
| 7 | |
| 8 #include <algorithm> | |
| 9 #include <cassert> | |
| 10 #include <iterator> | |
| 11 #include <numeric> | |
| 12 #include <vector> | |
| 13 | |
| 14 namespace zucchini { | |
| 15 | |
| 16 // A functor class that implements the naive suffix sorting algorithm that uses | |
| 17 // std::sort with lexicographical compare. This is only meant as reference of | |
| 18 // the interface. | |
| 19 class NaiveSuffixSort { | |
| 20 public: | |
| 21 // |str| is the input string on which suffix sort is applied. | |
|
huangs
2017/07/05 05:51:58
It's a bit odd that the variables are mentioned be
etiennep1
2017/07/10 17:51:08
Following reference for stl, I added "Type require
| |
| 22 // Characters found in |str| must be in the range [0, |key_bound|) | |
| 23 // |suffix_array| is the beginning of the destination range, which is at least | |
| 24 // as large as |str|. | |
| 25 // |InputRng| is an input random access range. | |
| 26 // |KeyType| is an unsigned integer type. | |
| 27 // |SAIt| is an random access iterator with mutable references. | |
| 28 template <class InputRng, class KeyType, class SAIt> | |
| 29 void operator()(const InputRng& str, | |
| 30 KeyType key_bound, | |
| 31 SAIt suffix_array) const { | |
| 32 using size_type = typename SAIt::value_type; | |
| 33 | |
| 34 size_type n = static_cast<size_type>(std::end(str) - std::begin(str)); | |
| 35 | |
| 36 // |suffix_array| is first filled with ordered indices of |str|. | |
| 37 // Those indices are then sorted with lexicographical comparisons in |str|. | |
| 38 std::iota(suffix_array, suffix_array + n, 0); | |
| 39 std::sort(suffix_array, suffix_array + n, [&str](size_type i, size_type j) { | |
| 40 return std::lexicographical_compare(std::begin(str) + i, std::end(str), | |
| 41 std::begin(str) + j, std::end(str)); | |
| 42 }); | |
| 43 } | |
| 44 }; | |
| 45 | |
| 46 // A functor class that implements suffix array induced sorting (SA-IS) | |
| 47 // algorithm with linear time and memory complexity, | |
| 48 // see http://ieeexplore.ieee.org/abstract/document/5582081/ | |
| 49 class Sais { | |
| 50 public: | |
| 51 // |str| is the input string on which suffix sort is applied. | |
| 52 // Characters found in |str| must be in the range [0, |key_bound|) | |
| 53 // |suffix_array| is the beginning of the destination range, which is at least | |
| 54 // as large as |str|. | |
| 55 // |InputRng| is an input random access range. | |
| 56 // |KeyType| is an unsigned integer type. | |
| 57 // |SAIt| is an random access iterator with mutable values. | |
| 58 template <class InputRng, class KeyType, class SAIt> | |
| 59 void operator()(const InputRng& str, | |
| 60 KeyType key_bound, | |
| 61 SAIt suffix_array) const { | |
| 62 using value_type = typename InputRng::value_type; | |
| 63 using size_type = typename SAIt::value_type; | |
| 64 | |
| 65 static_assert(std::is_unsigned<value_type>::value, | |
| 66 "Sais only supports input string with unsigned values"); | |
| 67 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); | |
| 68 | |
| 69 size_type n = static_cast<size_type>(std::end(str) - std::begin(str)); | |
| 70 | |
| 71 Implementation<size_type, KeyType>::SuffixSort(std::begin(str), n, | |
| 72 key_bound, suffix_array); | |
| 73 } | |
| 74 | |
| 75 // Given the string S of length n. We assume S is terminated by a unique | |
|
huangs
2017/07/05 05:51:57
NIT: Remove "the" in "the string".
NIT: In the pa
etiennep1
2017/07/10 17:51:08
Done.
|n| (or |length|) does not count the sentine
| |
| 76 // sentinel $, which is considered as the smallest character. This sentinel | |
| 77 // does not exist in memory and is only treated implicitly. We denote | |
| 78 // suf(S,i) the suffix formed by S[i..n). | |
| 79 | |
| 80 // A suffix suf(S,i) is said to be S-type or L-type, if suf(S,i) < suf(S,i+1) | |
| 81 // or suf(S,i) > suf(S,i+1), respectively. | |
| 82 enum SLType : bool { SType, LType }; | |
| 83 | |
| 84 // A character S[i] is said to be S-type or L-type if the suffix suf(S,i) is | |
| 85 // S-type or L-type, respectively. | |
| 86 | |
| 87 // A character S[i], i is called LMS (leftmost S-type), if S[i] is S-type and | |
|
huangs
2017/07/05 05:51:58
", i" seems redundant?
etiennep1
2017/07/10 17:51:07
Yeah, don't know how it got there.
| |
| 88 // S[i-1] is L-type. A suffix suf(S,i) is called LMS, if S[i] is an LMS | |
| 89 // character. | |
| 90 | |
| 91 // An LMS-substring is a substring S[i..j) with both S[i] and S[j], being LMS | |
|
huangs
2017/07/05 05:51:58
// A substring S[i..j) is an LMS-substring if (1)
etiennep1
2017/07/10 17:51:07
Done.
| |
| 92 // characters, and there is no other LMS character in the substring, or the | |
| 93 // sentinel itself | |
| 94 | |
| 95 template <class SizeType, class KeyType> | |
| 96 struct Implementation { | |
| 97 static_assert(std::is_unsigned<SizeType>::value, | |
| 98 "SizeType must be unsigned"); | |
| 99 static_assert(std::is_unsigned<KeyType>::value, "KeyType must be unsigned"); | |
| 100 using size_type = SizeType; | |
| 101 using key_type = KeyType; | |
| 102 | |
| 103 using iterator = typename std::vector<size_type>::iterator; | |
| 104 using const_iterator = typename std::vector<size_type>::const_iterator; | |
| 105 | |
| 106 // Partition every suffix based on SL-type. | |
|
huangs
2017/07/05 05:51:57
// ... Returns the number of LMS suffixes.
etiennep1
2017/07/10 17:51:07
Done.
| |
| 107 template <class StrIt> | |
| 108 static size_type BuildSLPartition( | |
| 109 StrIt str, | |
| 110 size_type length, | |
| 111 key_type key_bound, | |
| 112 std::vector<SLType>::reverse_iterator sl_partition) { | |
|
huangs
2017/07/10 02:45:29
NIT: |sl_partition| is actually an interator, and
etiennep1
2017/07/10 17:51:08
Done.
| |
| 113 // We will count LMS suffixes (S to L-type or last S-type). | |
| 114 size_type lms_count = 0; | |
| 115 | |
| 116 // |previous_type| is initialized to L-type to avoid counting an extra | |
| 117 // LMS suffix at the end | |
| 118 SLType previous_type = LType; | |
| 119 | |
| 120 // Initialized to dummy, impossible key. | |
| 121 key_type previous_key = key_bound; | |
| 122 | |
| 123 // We're travelling backward to determine the partition, | |
| 124 // as if we prepend one character at a time to the string, ex: | |
| 125 // b$ is L-type because b > $ | |
| 126 // ab$ is S-type because a < b, implying ab$ < b$ | |
| 127 // bab$ is L-type because b > a, implying bab$ > ab$ | |
| 128 // bbab$ is L-type, because bab$ was also L-type, implying bbab$ > bab$ | |
| 129 for (auto str_it = std::reverse_iterator<StrIt>(str + length); | |
| 130 str_it != std::reverse_iterator<StrIt>(str); | |
| 131 ++str_it, ++sl_partition) { | |
| 132 key_type current_key = *str_it; | |
| 133 | |
| 134 if (current_key > previous_key || previous_key == key_bound) { | |
| 135 // S[i] > S[i + 1] or S[i] is last character | |
|
huangs
2017/07/05 05:51:58
NIT: Period at end. Please apply to all comments b
etiennep1
2017/07/10 17:51:07
Done.
| |
| 136 *sl_partition = LType; | |
| 137 if (previous_type == SType) | |
| 138 // suf(S,i) is L-type and suf(S,i + 1) is S-type, therefore, | |
| 139 // suf(S,i+1) was a LMS suffix. | |
| 140 ++lms_count; | |
| 141 | |
| 142 previous_type = LType; // For next round. | |
| 143 } else if (current_key < previous_key) { | |
| 144 // S[i] < S[i + 1] | |
| 145 *sl_partition = SType; | |
| 146 previous_type = SType; // For next round. | |
| 147 } else { | |
| 148 // S[i] == S[i + 1] | |
| 149 // The next character that differs determines the SL-type, | |
| 150 // so we reuse the last seen type. | |
| 151 *sl_partition = previous_type; | |
|
huangs
2017/07/05 05:51:58
These 3 cases can be combined by assigning |previo
etiennep1
2017/07/10 17:51:07
Done.
| |
| 152 } | |
| 153 previous_key = current_key; // For next round. | |
| 154 } | |
| 155 | |
| 156 return lms_count; | |
| 157 } | |
| 158 | |
| 159 // Find indices of LMS suffixes and write result to |lms_indices|. | |
| 160 static void FindLmsSuffixes(const std::vector<SLType>& sl_partition, | |
|
huangs
2017/07/05 05:51:58
This is called once, and not tested independently.
huangs
2017/07/10 02:45:29
NVM, keep it the way it is.
etiennep1
2017/07/10 17:51:08
It is called once, but is tested in TestSlPartitio
| |
| 161 iterator lms_indices) { | |
| 162 // |previous_type| is initialized to S-type to avoid counting an extra | |
| 163 // LMS suffix at the beginning | |
| 164 SLType previous_type = SType; | |
| 165 for (size_type i = 0; i < sl_partition.size(); ++i) { | |
| 166 if (sl_partition[i] == SType && previous_type == LType) | |
| 167 *lms_indices++ = i; | |
| 168 previous_type = sl_partition[i]; | |
| 169 } | |
| 170 } | |
| 171 | |
| 172 template <class StrIt> | |
| 173 static std::vector<size_type> MakeBucketCount(StrIt str, | |
|
huangs
2017/07/05 05:51:57
Same: Embed into call site?
huangs
2017/07/10 02:45:29
NVM, keep it the way it is.
etiennep1
2017/07/10 17:51:08
This is also tested in BucketCount.
| |
| 174 size_type length, | |
| 175 key_type key_bound) { | |
| 176 // Occurrence of every unique character is counted in |buckets| | |
| 177 std::vector<size_type> buckets(static_cast<size_type>(key_bound)); | |
| 178 | |
| 179 for (auto it = str; it != str + length; ++it) | |
| 180 ++buckets[*it]; | |
| 181 return buckets; | |
| 182 } | |
| 183 | |
| 184 // Apply induced sort from |lms_indices| to |suffix_array| associated with | |
|
huangs
2017/07/05 05:51:58
Now called |lms_substrings|? Please reconcile the
etiennep1
2017/07/10 17:51:08
I switched to lms_indices.
| |
| 185 // the string |str|. | |
| 186 template <class StrIt, class SAIt> | |
| 187 static void InducedSort(StrIt str, | |
| 188 size_type length, | |
| 189 const std::vector<SLType>& sl_partition, | |
| 190 const std::vector<size_type>& lms_substrings, | |
| 191 const std::vector<size_type>& buckets, | |
| 192 SAIt suffix_array) { | |
| 193 // All indices are first marked as unset with 0. | |
|
huangs
2017/07/05 05:51:58
0 is a valid final value. It's better for error ch
etiennep1
2017/07/10 17:51:07
I switched to |length|
| |
| 194 std::fill(suffix_array, suffix_array + length, 0); | |
| 195 | |
|
huangs
2017/07/05 05:51:58
DCHECK(!buckets.empty());
(reason: We use buckets
etiennep1
2017/07/10 17:51:09
Done.
| |
| 196 // Used to mark bucket boundaries (head or end) as indices in str. | |
| 197 std::vector<size_type> bucket_bounds(buckets.size()); | |
| 198 | |
| 199 // Step 1 | |
|
huangs
2017/07/05 05:51:57
// Step 1: Assign indices for LMS suffixes, popula
etiennep1
2017/07/10 17:51:08
Done.
| |
| 200 // Scan backway |lms_indices|; for each LMS suffix suf(S,i) found in | |
| 201 // |lms_indices|, place suf(S,i) at the end of the corresponding bucket | |
| 202 // and forward the bucket end to the left. | |
| 203 // By bucket corresponding to suf(S, i), we mean the bucket associated | |
| 204 // with the character S(i). | |
| 205 | |
| 206 // Find the end of each bucket and write it to |bucket_bounds|. | |
| 207 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); | |
| 208 | |
|
huangs
2017/07/05 05:51:58
// Process each |lms_indices| backward, and assign
etiennep1
2017/07/10 17:51:08
Done.
| |
| 209 for (auto it = lms_substrings.crbegin(); it != lms_substrings.crend(); | |
| 210 ++it) { | |
| 211 key_type key = str[*it]; | |
| 212 suffix_array[--bucket_bounds[key]] = *it; | |
| 213 } | |
| 214 | |
| 215 // Step 2 | |
| 216 // Scan forward |suffix_array|; for each modified suf(S,i) for which | |
| 217 // suf(S,SA(i) - 1) is L-type, place suf(S,SA(i) - 1) to the current | |
| 218 // head of the corresponding bucket and forward the bucket head to the | |
| 219 // right. | |
| 220 | |
| 221 // Find the head of each bucket and write it to |bucket_bounds|. Since | |
| 222 // only LMS suffixes where inserted in |suffix_array| during Step 1, | |
| 223 // |bucket_bounds| does not contains the head of each bucket and needs to | |
| 224 // be updated. | |
| 225 bucket_bounds[0] = 0; | |
| 226 std::partial_sum(buckets.begin(), buckets.end() - 1, | |
| 227 bucket_bounds.begin() + 1); | |
| 228 | |
| 229 // From Step 1, the sentinel $, which we treat implicitely, would have | |
|
huangs
2017/07/05 05:51:58
TYPO: implicitely -> implicitly
etiennep1
2017/07/10 17:51:08
Done.
| |
| 230 // been placed at the beginning of |suffix_array| since $ is always | |
| 231 // considered as the smallest character. We then have to deal with the | |
| 232 // previous (last) suffix. | |
| 233 if (sl_partition[length - 1] == LType) { | |
| 234 key_type key = str[length - 1]; | |
| 235 suffix_array[bucket_bounds[key]++] = length - 1; | |
| 236 } | |
| 237 for (auto it = suffix_array; it != suffix_array + length; ++it) { | |
| 238 size_type suffix_index = *it; | |
| 239 | |
| 240 // While the original algorithm marks unset suffixes with -1, | |
|
huangs
2017/07/05 05:51:57
Update comment if we use kUnassigned.
Inconsiste
etiennep1
2017/07/10 17:51:07
Done.
| |
| 241 // we found that marking them with 0 is also possible, since suf(S, 0) | |
| 242 // has no previous suffix, and also more convenient because we are | |
| 243 // working with unsigned integers. | |
| 244 if (suffix_index > 0 && sl_partition[--suffix_index] == LType) { | |
| 245 key_type key = str[suffix_index]; | |
| 246 suffix_array[bucket_bounds[key]++] = suffix_index; | |
| 247 } | |
| 248 } | |
| 249 | |
| 250 // Step 3 | |
| 251 // Scan backward |suffix_array|; for each modified suf(S, i) for which | |
| 252 // suf(S,SA(i) - 1) is S-type, place suf(S,SA(i) - 1) to the current | |
| 253 // end of the corresponding bucket and forward the bucket head to the | |
| 254 // left. | |
| 255 | |
| 256 // Find the end of each bucket and write it to |bucket_bounds|. Since | |
| 257 // only L-type suffixes where inserted in |suffix_array| during Step21, | |
|
huangs
2017/07/05 05:51:58
Step21?
etiennep1
2017/07/10 17:51:08
Step 2
| |
| 258 // |bucket_bounds| does not contains the end of each bucket and needs to | |
|
huangs
2017/07/05 05:51:57
s/contains/contain/
etiennep1
2017/07/10 17:51:09
Done.
| |
| 259 // be updated. | |
| 260 std::partial_sum(buckets.begin(), buckets.end(), bucket_bounds.begin()); | |
| 261 | |
| 262 for (auto it = std::reverse_iterator<SAIt>(suffix_array + length); | |
| 263 it != std::reverse_iterator<SAIt>(suffix_array); ++it) { | |
| 264 size_type suffix_index = *it; | |
| 265 if (suffix_index > 0 && sl_partition[--suffix_index] == SType) { | |
| 266 key_type key = str[suffix_index]; | |
| 267 suffix_array[--bucket_bounds[key]] = suffix_index; | |
| 268 } | |
| 269 } | |
| 270 // Deals with the last suffix, because of the sentinel. | |
| 271 if (sl_partition[length - 1] == SType) { | |
| 272 key_type key = str[length - 1]; | |
| 273 suffix_array[--bucket_bounds[key]] = length - 1; | |
| 274 } | |
| 275 } | |
| 276 | |
| 277 // Given a string S starting at |str| with length |length|, an array | |
| 278 // starting at |substring_array| containing lexicographically ordered LMS | |
| 279 // terminated substring indices of S and an SL-Type partition |sl_partition| | |
| 280 // of S, assigns a unique label to every unique LMS substring. The sorted | |
| 281 // labels for all LMS substrings are written to |lms_str|, while the indices | |
| 282 // of LMS suffixes are written to |lms_indices|. In addition, returns the | |
| 283 // total number of unique labels. | |
| 284 template <class StrIt, class SAIt> | |
| 285 static size_type LabelLmsSubstrings(StrIt str, | |
| 286 size_type length, | |
| 287 const std::vector<SLType>& sl_partition, | |
| 288 SAIt suffix_array, | |
| 289 iterator lms_indices, | |
| 290 iterator lms_str) { | |
| 291 // Labelling starts at 0. | |
| 292 size_type label = 0; | |
| 293 | |
| 294 // |previous_lms| is initialized to 0 to indicate it is unset. | |
| 295 // Note that suf(S,0) is never a LMS suffix. Substrings will be visited in | |
| 296 // lexicographical order. | |
| 297 size_type previous_lms = 0; | |
| 298 for (auto it = suffix_array; it != suffix_array + length; ++it) { | |
| 299 if (*it > 0 && sl_partition[*it] == SType && | |
| 300 sl_partition[*it - 1] == LType) { | |
| 301 // suf(S, *it) is a LMS suffix. | |
| 302 | |
| 303 size_type current_lms = *it; | |
| 304 if (previous_lms != 0) { | |
| 305 // There was a previous LMS suffix. Check if the current LMS | |
| 306 // substring is equal to the previous one. | |
| 307 SLType current_lms_type = SType, previous_lms_type = SType; | |
|
huangs
2017/07/05 05:51:57
Break into separate lines.
etiennep1
2017/07/10 17:51:07
Done.
| |
| 308 for (size_type k = 0;; ++k) { | |
| 309 // |current_lms_end| and |previous_lms_end| denote whether we have | |
| 310 // reached the end of the current and previous LMS substring, | |
| 311 // respectively | |
| 312 bool current_lms_end = false, previous_lms_end = false; | |
|
huangs
2017/07/05 05:51:58
Break into separate lines.
etiennep1
2017/07/10 17:51:07
Done.
| |
| 313 | |
| 314 // Check for both previous and current substring ends. | |
| 315 // Note that it is more convenient to check if | |
| 316 // suf(S,current_lms + k) is an LMS suffix than to retrieve it | |
| 317 // from lms_indices. | |
| 318 if (current_lms + k >= length || | |
| 319 (current_lms_type == LType && | |
| 320 sl_partition[current_lms + k] == SType)) { | |
| 321 current_lms_end = true; | |
| 322 } | |
| 323 if (previous_lms + k >= length || | |
| 324 (previous_lms_type == LType && | |
| 325 sl_partition[previous_lms + k] == SType)) { | |
| 326 previous_lms_end = true; | |
| 327 } | |
| 328 | |
| 329 if (current_lms_end && previous_lms_end) { | |
| 330 break; // Previous and current substrings are identical. | |
| 331 } else if (current_lms_end != previous_lms_end || | |
| 332 str[current_lms + k] != str[previous_lms + k]) { | |
| 333 // Previous and current substrings differ, a new label is used. | |
| 334 ++label; | |
| 335 break; | |
| 336 } | |
| 337 | |
| 338 current_lms_type = sl_partition[current_lms + k]; | |
| 339 previous_lms_type = sl_partition[previous_lms + k]; | |
| 340 } | |
| 341 } | |
| 342 *lms_indices++ = *it; | |
| 343 *lms_str++ = label; | |
| 344 previous_lms = current_lms; | |
| 345 } | |
| 346 } | |
| 347 | |
| 348 return ++label; | |
|
huangs
2017/07/05 05:51:58
Style: Should be |label + 1|: The side effect of u
etiennep1
2017/07/10 17:51:07
Done.
| |
| 349 } | |
| 350 | |
| 351 // Implementation of the SA-IS algorithm. |str| must be a random access | |
| 352 // iterator pointing at the beginning of S with length |length|. The result | |
| 353 // is writtend in |suffix_array|, a random access iterator. | |
| 354 template <class StrIt, class SAIt> | |
| 355 static void SuffixSort(StrIt str, | |
| 356 size_type length, | |
| 357 key_type key_bound, | |
| 358 SAIt suffix_array) { | |
| 359 if (length == 1) | |
| 360 *suffix_array = 0; | |
| 361 if (length < 2) | |
| 362 return; | |
| 363 | |
| 364 std::vector<SLType> sl_partition(length); | |
| 365 size_type lms_count = | |
| 366 BuildSLPartition(str, length, key_bound, sl_partition.rbegin()); | |
| 367 std::vector<size_type> lms_indices(lms_count); | |
| 368 FindLmsSuffixes(sl_partition, lms_indices.begin()); | |
| 369 std::vector<size_type> buckets = MakeBucketCount(str, length, key_bound); | |
| 370 | |
| 371 if (lms_indices.size() > 1) { | |
| 372 // Given |lms_indices| in the same order they appear in |str|, induce | |
| 373 // LMS substrings relative order and write result to |suffix_array|. | |
| 374 InducedSort(str, length, sl_partition, lms_indices, buckets, | |
| 375 suffix_array); | |
| 376 std::vector<size_type> lms_str(lms_indices.size()); | |
| 377 | |
| 378 // Given LMS substrings in relative order found in |suffix_array|, | |
| 379 // map LMS substrings to unique labels to form a new string, |lms_str|. | |
| 380 size_type label_count = | |
| 381 LabelLmsSubstrings(str, length, sl_partition, suffix_array, | |
| 382 lms_indices.begin(), lms_str.begin()); | |
| 383 | |
| 384 if (label_count < lms_str.size()) { | |
| 385 // Reorder |lms_str| to have LMS suffixes in the same order they | |
| 386 // appear in |str|. | |
| 387 for (size_type i = 0; i < lms_indices.size(); ++i) | |
| 388 suffix_array[lms_indices[i]] = lms_str[i]; | |
| 389 | |
| 390 SLType previous_type = SType; | |
| 391 for (size_type i = 0, j = 0; i < sl_partition.size(); ++i) { | |
| 392 if (sl_partition[i] == SType && previous_type == LType) { | |
| 393 lms_str[j] = suffix_array[i]; | |
| 394 lms_indices[j++] = i; | |
| 395 } | |
| 396 previous_type = sl_partition[i]; | |
| 397 } | |
| 398 | |
| 399 // Recursively apply SuffixSort on |lms_str|, which is formed from | |
| 400 // labeled LMS suffixes in the same order they appear in |str|. | |
| 401 // |lms_str| is at most half the length of |str|. | |
| 402 Implementation<size_type, size_type>::SuffixSort( | |
| 403 lms_str.begin(), static_cast<size_type>(lms_str.size()), | |
| 404 label_count, suffix_array); | |
|
huangs
2017/07/10 02:45:29
|suffix_array| is SAIt, but in recursion it's now
etiennep1
2017/07/10 17:51:08
Not sure what you mean. SAIt stays the same in the
huangs
2017/07/10 20:01:18
Note that |suffix_array| was a parameter into this
etiennep1
2017/07/10 22:57:08
We pass it as is (the last argument).
SAIt is an i
huangs
2017/07/11 14:52:55
Oh I see, the full signature is
Implementation<siz
| |
| 405 | |
| 406 // Map LMS labels back to indices in |str| and write result to | |
| 407 // |lms_indices|. We're using |suffix_array| as a temporary buffer. | |
| 408 for (size_type i = 0; i < lms_indices.size(); ++i) | |
| 409 suffix_array[i] = lms_indices[suffix_array[i]]; | |
| 410 for (size_type i = 0; i < lms_indices.size(); ++i) | |
|
huangs
2017/07/05 05:51:58
std::copy()?
etiennep1
2017/07/10 17:51:08
Done.
| |
| 411 lms_indices[i] = suffix_array[i]; | |
| 412 | |
| 413 // At this point, |lms_indices| contains sorted LMS suffixes of |str|. | |
| 414 } | |
| 415 } | |
| 416 // Given |lms_indices| where LMS suffixes are sorted, induce the full | |
| 417 // order of suffixes in |str|. | |
| 418 InducedSort(str, length, sl_partition, lms_indices, buckets, | |
| 419 suffix_array); | |
| 420 } | |
| 421 }; | |
| 422 }; | |
| 423 | |
| 424 // Generates a sorted suffix array for the input string |str| using the functor | |
| 425 // |Algorithm| which provides an interface equivalent to NaiveSuffixSort. | |
| 426 /// Characters found in |str| are assumed to be in range [0, |key_bound|). | |
| 427 // Returns the suffix array as a vector. | |
| 428 // |StrRng| is an input random access range. | |
| 429 // |KeyType| is an unsigned integer type. | |
| 430 template <class Algorithm, class StrRng, class KeyType> | |
| 431 std::vector<typename StrRng::size_type> MakeSuffixArray(const StrRng& str, | |
| 432 KeyType key_bound) { | |
| 433 Algorithm sort; | |
| 434 std::vector<typename StrRng::size_type> suffix_array(str.end() - str.begin()); | |
| 435 sort(str, key_bound, suffix_array.begin()); | |
| 436 return suffix_array; | |
| 437 } | |
| 438 | |
| 439 // Lexicographical lower bound of |str2| in the suffix array of |str1| using | |
| 440 // binary search. |str1_first| is a random access iterator pointing to the | |
| 441 // beginning of |str1|. |str2_first| and |str2_last| are forward iterators | |
| 442 // pointing to the beginning and end of |str2|, respectively. | |
| 443 template <class SARng, class StrIt1, class StrIt2> | |
| 444 auto SearchSuffixArray(const SARng& suffix_array, | |
| 445 StrIt1 str1_first, | |
| 446 StrIt2 str2_first, | |
| 447 StrIt2 str2_last) -> decltype(std::begin(suffix_array)) { | |
| 448 using size_type = typename SARng::value_type; | |
| 449 | |
| 450 size_t n = std::end(suffix_array) - std::begin(suffix_array); | |
| 451 auto it = std::lower_bound( | |
| 452 std::begin(suffix_array), std::end(suffix_array), str2_first, | |
| 453 [str1_first, str2_last, n](size_type a, StrIt2 b) { | |
| 454 return std::lexicographical_compare(str1_first + a, str1_first + n, b, | |
| 455 str2_last); | |
| 456 }); | |
| 457 return it; | |
|
huangs
2017/07/05 05:51:58
What happened to the code that uses SA to solve ma
huangs
2017/07/05 15:46:35
For a more realistic example: Query "aaaam" in ["a
etiennep1
2017/07/10 17:51:08
This is a lower level function, that only computes
| |
| 458 } | |
| 459 | |
| 460 } // namespace zucchini | |
| 461 | |
| 462 #endif // CHROME_INSTALLER_ZUCCHINI_SUFFIX_ARRAY_H_ | |
| OLD | NEW |