Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(906)

Side by Side Diff: chrome/installer/zucchini/suffix_array_unittest.cc

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)
Patch Set: Make ustring conversions pretty Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/installer/zucchini/suffix_array.h"
6
7 #include <algorithm>
8 #include <cstddef>
9 #include <initializer_list>
10 #include <string>
11 #include <vector>
12
13 #include "testing/gtest/include/gtest/gtest.h"
14
15 namespace zucchini {
16
17 using ustring = std::basic_string<unsigned char>;
18
19 ustring MakeUnsignedString(const std::string& str) {
20 return {str.begin(), str.end()};
21 }
22
23 template <class T>
24 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) {
25 return {ilist.begin(), ilist.end()};
26 }
27
28 void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition,
29 std::initializer_list<size_t> expected_lms_indices,
30 std::string str) {
31 using SaisImpl = Sais::Implementation<size_t, size_t>;
32
33 std::vector<Sais::SLType> sl_partition(str.size());
34 EXPECT_EQ(expected_lms_indices.size(),
35 SaisImpl::BuildSLPartition(str.begin(), str.size(), 255,
36 sl_partition.rbegin()));
37 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition);
38
39 std::vector<size_t> lms_indices(expected_lms_indices.size());
40 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin());
41 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices);
42 }
43
44 TEST(SaisTest, BuildSLPartition) {
45 TestSlPartition({}, {}, "");
46 TestSlPartition(
47 {
48 Sais::LType,
49 },
50 {}, "a");
51 TestSlPartition(
52 {
53 Sais::LType, Sais::LType,
54 },
55 {}, "ba");
56 TestSlPartition(
57 {
58 Sais::SType, Sais::LType,
59 },
60 {}, "ab");
61 TestSlPartition(
62 {
63 Sais::SType, Sais::SType, Sais::LType,
64 },
65 {}, "aab");
66 TestSlPartition(
67 {
68 Sais::LType, Sais::LType, Sais::LType,
69 },
70 {}, "bba");
71 TestSlPartition(
72 {
73 Sais::LType, Sais::SType, Sais::LType,
74 },
75 {1}, "bab");
76 TestSlPartition(
77 {
78 Sais::LType, Sais::SType, Sais::SType, Sais::LType,
79 },
80 {1}, "baab");
81
82 TestSlPartition(
83 {
84 Sais::LType, // zucchini
85 Sais::LType, // ucchini
86 Sais::SType, // cchini
87 Sais::SType, // chini
88 Sais::SType, // hini
89 Sais::SType, // ini
90 Sais::LType, // ni
91 Sais::LType, // i
92 },
93 {2}, "zucchini");
94 }
95
96 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str,
97 size_t max_key) {
98 using SaisImpl = Sais::Implementation<size_t, size_t>;
99 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key);
100 }
101
102 TEST(SaisTest, BucketCount) {
103 using vec = std::vector<size_t>;
104
105 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4));
106 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4));
107 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4));
108 }
109
110 std::vector<size_t> InducedSortSubstring(ustring str) {
111 using SaisImpl = Sais::Implementation<size_t, size_t>;
112 std::vector<Sais::SLType> sl_partition(str.size());
113 size_t lms_count = SaisImpl::BuildSLPartition(str.begin(), str.size(), 255,
114 sl_partition.rbegin());
115 std::vector<size_t> lms_indices(lms_count);
116 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin());
117 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), 255);
118
119 std::vector<size_t> suffix_array(str.size());
120 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets,
121 suffix_array.begin());
122
123 return suffix_array;
124 }
125
126 TEST(SaisTest, InducedSortSubstring) {
127 using vec = std::vector<size_t>;
128
129 auto us = MakeUnsignedString;
130
131 // L; a$
132 EXPECT_EQ(vec({0}), InducedSortSubstring(us("a")));
133
134 // SL; ab$, b$
135 EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab")));
136
137 // LL; a$, ba$
138 EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba")));
139
140 // SLL; a$, aba$, ba$
141 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba")));
142
143 // LSL; ab$, b$, ba
144 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab")));
145
146 // SSL; aab$, ab$, b$
147 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab")));
148
149 // LSSL; aab$, ab$, b$, ba
150 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab")));
151 }
152
153 template <class Algorithm>
154 void TestSuffixSort(ustring test_str) {
155 std::vector<size_t> suffix_array = MakeSuffixArray<Algorithm>(test_str, 256);
156 EXPECT_EQ(test_str.size(), suffix_array.size());
157
158 // Expect that I[] is a permutation of [0, len].
159 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end());
160 std::sort(sorted_suffix.begin(), sorted_suffix.end());
161 for (size_t i = 0; i < test_str.size(); ++i)
162 EXPECT_EQ(i, sorted_suffix[i]);
163
164 // Expect that all suffixes are strictly ordered.
165 auto end = test_str.end();
166 for (size_t i = 1; i < test_str.size(); ++i) {
167 auto suf1 = test_str.begin() + suffix_array[i - 1];
168 auto suf2 = test_str.begin() + suffix_array[i];
169 bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
170 EXPECT_TRUE(is_less);
171 }
172 }
173
174 constexpr const char* test_strs[] = {
175 "",
176 "a",
177 "aa",
178 "za",
179 "CACAO",
180 "aaaaa",
181 "banana",
182 "tobeornottobe",
183 "The quick brown fox jumps over the lazy dog.",
184 "elephantelephantelephantelephantelephant",
185 "walawalawashington",
186 "-------------------------",
187 "011010011001011010010110011010010",
188 "3141592653589793238462643383279502884197169399375105",
189 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
190 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba",
191 "0123456789876543210",
192 "9876543210123456789",
193 "aababcabcdabcdeabcdefabcdefg",
194 "asdhklgalksdjghalksdjghalksdjgh",
195 };
196
197 TEST(SuffixSortTest, NaiveSuffixSort) {
198 for (const std::string& test_str : test_strs) {
199 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str));
200 }
201 }
202
203 TEST(SuffixSortTest, SaisSort) {
204 for (const std::string& test_str : test_strs) {
205 TestSuffixSort<Sais>(MakeUnsignedString(test_str));
206 }
207 }
208
209 // Test with sequence that has every character.
210 TEST(SuffixSortTest, AllChar) {
211 const int kNumChar = 256;
212 std::vector<unsigned char> all_char(kNumChar);
213 std::iota(all_char.begin(), all_char.end(), 0);
214
215 {
216 std::vector<size_t> suffix_array =
217 MakeSuffixArray<Sais>(all_char, kNumChar);
218 for (int i = 0; i < kNumChar; ++i)
219 EXPECT_EQ(i, suffix_array[i]);
220 }
221
222 std::vector<unsigned char> all_char_reverse(all_char.rbegin(),
223 all_char.rend());
224 {
225 std::vector<size_t> suffix_array =
226 MakeSuffixArray<Sais>(all_char_reverse, kNumChar);
227 for (int i = 0; i < kNumChar; ++i)
228 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]);
229 }
230 }
231
232 void TestSearchSuffixArray(ustring base_str, ustring search_str) {
233 std::vector<size_t> suffix_array =
234 MakeSuffixArray<NaiveSuffixSort>(base_str, 256);
235
236 auto pos = SearchSuffixArray(suffix_array, base_str.begin(),
237 search_str.begin(), search_str.end());
238
239 auto end = base_str.end();
240 if (pos != suffix_array.begin()) {
241 // Previous suffix is less than |search_str|.
242 auto suf = base_str.begin() + pos[-1];
243 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
244 search_str.end());
245 EXPECT_TRUE(is_less);
246 }
247 if (pos != suffix_array.end()) {
248 // Current suffix is greater of equal to |search_str|.
249 auto suf = base_str.begin() + *pos;
250 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
251 search_str.end());
252 EXPECT_FALSE(is_less);
253 }
254 }
255
256 TEST(SuffixArrayTest, Search) {
257 auto us = MakeUnsignedString;
258
259 TestSearchSuffixArray(us(""), us(""));
260 TestSearchSuffixArray(us(""), us("a"));
261 TestSearchSuffixArray(us("b"), us(""));
262 TestSearchSuffixArray(us("b"), us("a"));
263 TestSearchSuffixArray(us("b"), us("c"));
264 TestSearchSuffixArray(us("b"), us("bc"));
265 TestSearchSuffixArray(us("aa"), us("a"));
266 TestSearchSuffixArray(us("aa"), us("aa"));
267
268 ustring sentence = us("the quick brown fox jumps over the lazy dog.");
269 // Entire string: exact and unique.
270 TestSearchSuffixArray(sentence, sentence);
271 // Empty string: exact and non-unique.
272 TestSearchSuffixArray(sentence, us(""));
273 // Exact and unique suffix matches.
274 TestSearchSuffixArray(sentence, us("."));
275 TestSearchSuffixArray(sentence, us("the lazy dog."));
276 // Exact and unique non-suffix matches.
277 TestSearchSuffixArray(sentence, us("quick"));
278 TestSearchSuffixArray(sentence, us("the quick"));
279 // Partial and unique matches.
280 TestSearchSuffixArray(sentence, us("fox jumps with the hosps"));
281 TestSearchSuffixArray(sentence, us("xyz"));
282 // Exact and non-unique match: take lexicographical first.
283 TestSearchSuffixArray(sentence, us("the"));
284 TestSearchSuffixArray(sentence, us(" "));
285 // Partial and non-unique match.
286 // query < "the l"... < "the q"...
287 TestSearchSuffixArray(sentence, us("the apple"));
288 // "the l"... < query < "the q"...
289 TestSearchSuffixArray(sentence, us("the opera"));
290 // "the l"... < "the q"... < query
291 TestSearchSuffixArray(sentence, us("the zebra"));
292 // Prefix match dominates suffix match (unique).
293 TestSearchSuffixArray(sentence, us("over quick brown fox"));
294 // Empty matchs.
295 TestSearchSuffixArray(sentence, us(","));
296 TestSearchSuffixArray(sentence, us("1234"));
297 TestSearchSuffixArray(sentence, us("THE QUICK BROWN FOX"));
298 TestSearchSuffixArray(sentence, us("(the"));
299 }
300
301 TEST(SuffixArrayTest, SearchExact) {
302 for (const std::string& test_str : test_strs) {
303 ustring test_ustr = MakeUnsignedString(test_str);
304
305 std::vector<size_t> suffix_array = MakeSuffixArray<Sais>(test_ustr, 256);
306
307 for (int lo = 0; lo < test_str.size(); ++lo) {
308 for (int hi = lo + 1; hi <= test_str.size(); ++hi) {
309 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi);
310 ASSERT_EQ(query.size(), hi - lo);
311 auto pos = SearchSuffixArray(suffix_array, test_ustr.begin(),
312 query.begin(), query.end());
313 EXPECT_TRUE(
314 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos));
315 }
316 }
317 }
318 }
319
320 } // namespace zucchini
OLDNEW
« chrome/installer/zucchini/suffix_array.h ('K') | « chrome/installer/zucchini/suffix_array.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698