Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2017 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/installer/zucchini/suffix_array.h" | |
| 6 | |
| 7 #include <algorithm> | |
| 8 #include <cstddef> | |
| 9 #include <initializer_list> | |
| 10 #include <string> | |
| 11 #include <vector> | |
| 12 | |
| 13 #include "testing/gtest/include/gtest/gtest.h" | |
| 14 | |
| 15 namespace zucchini { | |
| 16 | |
| 17 using ustring = std::basic_string<unsigned char>; | |
| 18 | |
| 19 ustring MakeUnsignedString(const std::string& str) { | |
| 20 return {str.begin(), str.end()}; | |
| 21 } | |
| 22 | |
| 23 template <class T> | |
| 24 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) { | |
| 25 return {ilist.begin(), ilist.end()}; | |
| 26 } | |
| 27 | |
| 28 void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition, | |
| 29 std::initializer_list<size_t> expected_lms_indices, | |
| 30 std::string str) { | |
| 31 using SaisImpl = Sais::Implementation<size_t, size_t>; | |
| 32 | |
| 33 std::vector<Sais::SLType> sl_partition(str.size()); | |
| 34 EXPECT_EQ(expected_lms_indices.size(), | |
| 35 SaisImpl::BuildSLPartition(str.begin(), str.size(), 255, | |
| 36 sl_partition.rbegin())); | |
| 37 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition); | |
| 38 | |
| 39 std::vector<size_t> lms_indices(expected_lms_indices.size()); | |
| 40 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin()); | |
| 41 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices); | |
| 42 } | |
| 43 | |
| 44 TEST(SaisTest, BuildSLPartition) { | |
| 45 TestSlPartition({}, {}, ""); | |
| 46 TestSlPartition( | |
| 47 { | |
| 48 Sais::LType, | |
| 49 }, | |
| 50 {}, "a"); | |
| 51 TestSlPartition( | |
| 52 { | |
| 53 Sais::LType, Sais::LType, | |
| 54 }, | |
| 55 {}, "ba"); | |
| 56 TestSlPartition( | |
| 57 { | |
| 58 Sais::SType, Sais::LType, | |
| 59 }, | |
| 60 {}, "ab"); | |
| 61 TestSlPartition( | |
| 62 { | |
| 63 Sais::SType, Sais::SType, Sais::LType, | |
| 64 }, | |
| 65 {}, "aab"); | |
| 66 TestSlPartition( | |
| 67 { | |
| 68 Sais::LType, Sais::LType, Sais::LType, | |
| 69 }, | |
| 70 {}, "bba"); | |
| 71 TestSlPartition( | |
| 72 { | |
| 73 Sais::LType, Sais::SType, Sais::LType, | |
| 74 }, | |
| 75 {1}, "bab"); | |
| 76 TestSlPartition( | |
| 77 { | |
| 78 Sais::LType, Sais::SType, Sais::SType, Sais::LType, | |
| 79 }, | |
| 80 {1}, "baab"); | |
| 81 | |
| 82 TestSlPartition( | |
| 83 { | |
| 84 Sais::LType, // zucchini | |
| 85 Sais::LType, // ucchini | |
| 86 Sais::SType, // cchini | |
| 87 Sais::SType, // chini | |
| 88 Sais::SType, // hini | |
| 89 Sais::SType, // ini | |
| 90 Sais::LType, // ni | |
| 91 Sais::LType, // i | |
| 92 }, | |
| 93 {2}, "zucchini"); | |
| 94 } | |
| 95 | |
| 96 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str, | |
| 97 size_t max_key) { | |
| 98 using SaisImpl = Sais::Implementation<size_t, size_t>; | |
| 99 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key); | |
| 100 } | |
| 101 | |
| 102 TEST(SaisTest, BucketCount) { | |
| 103 using vec = std::vector<size_t>; | |
| 104 | |
| 105 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4)); | |
| 106 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4)); | |
| 107 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4)); | |
| 108 } | |
| 109 | |
| 110 std::vector<size_t> InducedSortSubstring(std::string str) { | |
| 111 using SaisImpl = Sais::Implementation<size_t, size_t>; | |
| 112 std::vector<Sais::SLType> sl_partition(str.size()); | |
| 113 size_t lms_count = SaisImpl::BuildSLPartition(str.begin(), str.size(), 255, | |
| 114 sl_partition.rbegin()); | |
| 115 std::vector<size_t> lms_indices(lms_count); | |
| 116 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin()); | |
| 117 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), 255); | |
| 118 | |
| 119 std::vector<size_t> suffix_array(str.size()); | |
| 120 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets, | |
| 121 suffix_array.begin()); | |
| 122 | |
| 123 return suffix_array; | |
| 124 } | |
| 125 | |
| 126 TEST(SaisTest, InducedSortSubstring) { | |
| 127 using vec = std::vector<size_t>; | |
| 128 | |
| 129 // L; a$ | |
| 130 EXPECT_EQ(vec({0}), InducedSortSubstring("a")); | |
| 131 | |
| 132 // SL; ab$, b$ | |
| 133 EXPECT_EQ(vec({0, 1}), InducedSortSubstring("ab")); | |
| 134 | |
| 135 // LL; a$, ba$ | |
| 136 EXPECT_EQ(vec({1, 0}), InducedSortSubstring("ba")); | |
| 137 | |
| 138 // SLL; a$, aba$, ba$ | |
| 139 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring("aba")); | |
| 140 | |
| 141 // LSL; ab$, b$, ba | |
| 142 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring("bab")); | |
| 143 | |
| 144 // SSL; aab$, ab$, b$ | |
| 145 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring("aab")); | |
| 146 | |
| 147 // LSSL; aab$, ab$, b$, ba | |
| 148 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring("baab")); | |
| 149 } | |
| 150 | |
| 151 template <class Algorithm> | |
| 152 void TestSuffixSort(ustring test_str) { | |
| 153 std::vector<size_t> suffix_array = MakeSuffixArray<Algorithm>(test_str, 256); | |
| 154 EXPECT_EQ(test_str.size(), suffix_array.size()); | |
| 155 | |
| 156 // Expect that I[] is a permutation of [0, len]. | |
| 157 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end()); | |
| 158 std::sort(sorted_suffix.begin(), sorted_suffix.end()); | |
| 159 for (size_t i = 0; i < test_str.size(); ++i) | |
| 160 EXPECT_EQ(i, sorted_suffix[i]); | |
| 161 | |
| 162 // Expect that all suffixes are strictly ordered. | |
| 163 auto end = test_str.end(); | |
| 164 for (size_t i = 1; i < test_str.size(); ++i) { | |
| 165 auto suf1 = test_str.begin() + suffix_array[i - 1]; | |
| 166 auto suf2 = test_str.begin() + suffix_array[i]; | |
| 167 bool is_less = std::lexicographical_compare(suf1, end, suf2, end); | |
| 168 EXPECT_TRUE(is_less); | |
| 169 } | |
| 170 } | |
| 171 | |
| 172 constexpr const char* test_strs[] = { | |
| 173 "", | |
| 174 "a", | |
| 175 "aa", | |
| 176 "za", | |
| 177 "CACAO", | |
| 178 "aaaaa", | |
| 179 "banana", | |
| 180 "tobeornottobe", | |
| 181 "The quick brown fox jumps over the lazy dog.", | |
| 182 "elephantelephantelephantelephantelephant", | |
| 183 "walawalawashington", | |
| 184 "-------------------------", | |
| 185 "011010011001011010010110011010010", | |
| 186 "3141592653589793238462643383279502884197169399375105", | |
| 187 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", | |
| 188 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba", | |
| 189 "0123456789876543210", | |
| 190 "9876543210123456789", | |
| 191 "aababcabcdabcdeabcdefabcdefg", | |
| 192 "asdhklgalksdjghalksdjghalksdjgh", | |
| 193 }; | |
| 194 | |
| 195 TEST(SuffixSortTest, NaiveSuffixSort) { | |
| 196 for (const std::string& test_str : test_strs) { | |
| 197 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str)); | |
| 198 } | |
| 199 } | |
| 200 | |
| 201 TEST(SuffixSortTest, SaisSort) { | |
| 202 for (const std::string& test_str : test_strs) { | |
| 203 TestSuffixSort<Sais>(MakeUnsignedString(test_str)); | |
| 204 } | |
| 205 } | |
| 206 | |
| 207 // Test with sequence that has every character. | |
| 208 TEST(SuffixSortTest, AllChar) { | |
| 209 const int kNumChar = 256; | |
| 210 std::vector<unsigned char> all_char(kNumChar); | |
| 211 std::iota(all_char.begin(), all_char.end(), 0); | |
| 212 | |
| 213 { | |
| 214 std::vector<size_t> suffix_array = | |
| 215 MakeSuffixArray<Sais>(all_char, kNumChar); | |
| 216 for (int i = 0; i < kNumChar; ++i) | |
| 217 EXPECT_EQ(i, suffix_array[i]); | |
| 218 } | |
| 219 | |
| 220 std::vector<unsigned char> all_char_reverse(all_char.rbegin(), | |
| 221 all_char.rend()); | |
| 222 { | |
| 223 std::vector<size_t> suffix_array = | |
| 224 MakeSuffixArray<Sais>(all_char_reverse, kNumChar); | |
| 225 for (int i = 0; i < kNumChar; ++i) | |
| 226 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]); | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 void TestSearchSuffixArray(std::string base_str, std::string search_str) { | |
|
huangs
2017/06/28 22:30:35
Make this take const ustring&, and update all call
etiennep1
2017/06/28 23:03:48
Probably more clean like that.
| |
| 231 ustring base_ustr = MakeUnsignedString(base_str); | |
| 232 ustring search_ustr = MakeUnsignedString(search_str); | |
| 233 | |
| 234 std::vector<size_t> suffix_array = | |
| 235 MakeSuffixArray<NaiveSuffixSort>(base_ustr, 256); | |
| 236 | |
| 237 auto pos = SearchSuffixArray(suffix_array, base_ustr.begin(), | |
| 238 search_ustr.begin(), search_ustr.end()); | |
| 239 | |
| 240 auto end = base_ustr.end(); | |
| 241 if (pos != suffix_array.begin()) { | |
| 242 // Previous suffix is less than |search_str|. | |
| 243 auto suf = base_ustr.begin() + pos[-1]; | |
| 244 bool is_less = std::lexicographical_compare(suf, end, search_ustr.begin(), | |
| 245 search_ustr.end()); | |
| 246 EXPECT_TRUE(is_less); | |
| 247 } | |
| 248 if (pos != suffix_array.end()) { | |
| 249 // Current suffix is greater of equal to |search_str|. | |
| 250 auto suf = base_ustr.begin() + *pos; | |
| 251 bool is_less = std::lexicographical_compare(suf, end, search_ustr.begin(), | |
| 252 search_ustr.end()); | |
| 253 EXPECT_FALSE(is_less); | |
| 254 } | |
| 255 } | |
| 256 | |
| 257 TEST(SuffixArrayTest, Search) { | |
| 258 TestSearchSuffixArray("", ""); | |
|
huangs
2017/06/28 22:30:35
using us = MakeUnsignedString;
TestSearchSuffixAr
| |
| 259 TestSearchSuffixArray("", "a"); | |
| 260 TestSearchSuffixArray("b", ""); | |
| 261 TestSearchSuffixArray("b", "a"); | |
| 262 TestSearchSuffixArray("b", "c"); | |
| 263 TestSearchSuffixArray("b", "bc"); | |
| 264 TestSearchSuffixArray("aa", "a"); | |
| 265 TestSearchSuffixArray("aa", "aa"); | |
| 266 | |
| 267 const char* sentence = "the quick brown fox jumps over the lazy dog."; | |
|
huangs
2017/06/28 22:30:35
ustring sentence = us("...");
etiennep1
2017/06/28 23:03:48
Done.
| |
| 268 // Entire string: exact and unique. | |
| 269 TestSearchSuffixArray(sentence, sentence); | |
| 270 // Empty string: exact and non-unique. | |
| 271 TestSearchSuffixArray(sentence, ""); | |
| 272 // Exact and unique suffix matches. | |
| 273 TestSearchSuffixArray(sentence, "."); | |
| 274 TestSearchSuffixArray(sentence, "the lazy dog."); | |
| 275 // Exact and unique non-suffix matches. | |
| 276 TestSearchSuffixArray(sentence, "quick"); | |
| 277 TestSearchSuffixArray(sentence, "the quick"); | |
| 278 // Partial and unique matches. | |
| 279 TestSearchSuffixArray(sentence, "fox jumps with the hosps"); | |
| 280 TestSearchSuffixArray(sentence, "xyz"); | |
| 281 // Exact and non-unique match: take lexicographical first. | |
| 282 TestSearchSuffixArray(sentence, "the"); | |
| 283 TestSearchSuffixArray(sentence, " "); | |
| 284 // Partial and non-unique match. | |
| 285 // query < "the l"... < "the q"... | |
| 286 TestSearchSuffixArray(sentence, "the apple"); | |
| 287 // "the l"... < query < "the q"... | |
| 288 TestSearchSuffixArray(sentence, "the opera"); | |
| 289 // "the l"... < "the q"... < query | |
| 290 TestSearchSuffixArray(sentence, "the zebra"); | |
| 291 // Prefix match dominates suffix match (unique). | |
| 292 TestSearchSuffixArray(sentence, "over quick brown fox"); | |
| 293 // Empty matchs. | |
| 294 TestSearchSuffixArray(sentence, ","); | |
| 295 TestSearchSuffixArray(sentence, "1234"); | |
| 296 TestSearchSuffixArray(sentence, "THE QUICK BROWN FOX"); | |
| 297 TestSearchSuffixArray(sentence, "(the"); | |
| 298 } | |
| 299 | |
| 300 TEST(SuffixArrayTest, SearchExact) { | |
| 301 for (const std::string& test_str : test_strs) { | |
| 302 ustring test_ustr = MakeUnsignedString(test_str); | |
| 303 | |
| 304 std::vector<size_t> suffix_array = MakeSuffixArray<Sais>(test_ustr, 256); | |
| 305 | |
| 306 for (int lo = 0; lo < test_str.size(); ++lo) { | |
| 307 for (int hi = lo + 1; hi <= test_str.size(); ++hi) { | |
| 308 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi); | |
| 309 ASSERT_EQ(query.size(), hi - lo); | |
| 310 auto pos = SearchSuffixArray(suffix_array, test_ustr.begin(), | |
| 311 query.begin(), query.end()); | |
| 312 EXPECT_TRUE( | |
| 313 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos)); | |
| 314 } | |
| 315 } | |
| 316 } | |
| 317 } | |
| 318 | |
| 319 } // namespace zucchini | |
| OLD | NEW |