Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(830)

Side by Side Diff: chrome/installer/zucchini/suffix_array_unittest.cc

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)
Patch Set: Fix SearchExact performance Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/installer/zucchini/suffix_array.h"
6
7 #include <algorithm>
8 #include <cstddef>
9 #include <initializer_list>
10 #include <string>
11 #include <vector>
12
13 #include "testing/gtest/include/gtest/gtest.h"
14
15 namespace zucchini {
16
17 using ustring = std::basic_string<unsigned char>;
18
19 ustring MakeUnsignedString(const std::string& str) {
20 return {str.begin(), str.end()};
21 }
22
23 template <class T>
24 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) {
25 return {ilist.begin(), ilist.end()};
26 }
27
28 void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition,
29 std::initializer_list<size_t> expected_lms_indices,
30 std::string str) {
31 using SaisImpl = Sais::Implementation<size_t, size_t>;
32
33 std::vector<Sais::SLType> sl_partition(str.size());
34 EXPECT_EQ(expected_lms_indices.size(),
35 SaisImpl::BuildSLPartition(str.begin(), str.size(), 255,
36 sl_partition.rbegin()));
37 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition);
38
39 std::vector<size_t> lms_indices(expected_lms_indices.size());
40 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin());
41 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices);
42 }
43
44 TEST(SaisTest, BuildSLPartition) {
45 TestSlPartition({}, {}, "");
46 TestSlPartition(
47 {
48 Sais::LType,
49 },
50 {}, "a");
51 TestSlPartition(
52 {
53 Sais::LType, Sais::LType,
54 },
55 {}, "ba");
56 TestSlPartition(
57 {
58 Sais::SType, Sais::LType,
59 },
60 {}, "ab");
61 TestSlPartition(
62 {
63 Sais::SType, Sais::SType, Sais::LType,
64 },
65 {}, "aab");
66 TestSlPartition(
67 {
68 Sais::LType, Sais::LType, Sais::LType,
69 },
70 {}, "bba");
71 TestSlPartition(
72 {
73 Sais::LType, Sais::SType, Sais::LType,
74 },
75 {1}, "bab");
76 TestSlPartition(
77 {
78 Sais::LType, Sais::SType, Sais::SType, Sais::LType,
79 },
80 {1}, "baab");
81
82 TestSlPartition(
83 {
84 Sais::LType, // zucchini
85 Sais::LType, // ucchini
86 Sais::SType, // cchini
87 Sais::SType, // chini
88 Sais::SType, // hini
89 Sais::SType, // ini
90 Sais::LType, // ni
91 Sais::LType, // i
92 },
93 {2}, "zucchini");
94 }
95
96 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str,
97 size_t max_key) {
98 using SaisImpl = Sais::Implementation<size_t, size_t>;
99 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key);
100 }
101
102 TEST(SaisTest, BucketCount) {
103 using vec = std::vector<size_t>;
104
105 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4));
106 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4));
107 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4));
108 }
109
110 std::vector<size_t> InducedSortSubstring(std::string str) {
111 using SaisImpl = Sais::Implementation<size_t, size_t>;
112 std::vector<Sais::SLType> sl_partition(str.size());
113 size_t lms_count = SaisImpl::BuildSLPartition(str.begin(), str.size(), 255,
114 sl_partition.rbegin());
115 std::vector<size_t> lms_indices(lms_count);
116 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin());
117 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), 255);
118
119 std::vector<size_t> suffix_array(str.size());
120 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets,
121 suffix_array.begin());
122
123 return suffix_array;
124 }
125
126 TEST(SaisTest, InducedSortSubstring) {
127 using vec = std::vector<size_t>;
128
129 // L; a$
130 EXPECT_EQ(vec({0}), InducedSortSubstring("a"));
131
132 // SL; ab$, b$
133 EXPECT_EQ(vec({0, 1}), InducedSortSubstring("ab"));
134
135 // LL; a$, ba$
136 EXPECT_EQ(vec({1, 0}), InducedSortSubstring("ba"));
137
138 // SLL; a$, aba$, ba$
139 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring("aba"));
140
141 // LSL; ab$, b$, ba
142 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring("bab"));
143
144 // SSL; aab$, ab$, b$
145 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring("aab"));
146
147 // LSSL; aab$, ab$, b$, ba
148 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring("baab"));
149 }
150
151 template <class Algorithm>
152 void TestSuffixSort(ustring test_str) {
153 std::vector<size_t> suffix_array = MakeSuffixArray<Algorithm>(test_str, 256);
154 EXPECT_EQ(test_str.size(), suffix_array.size());
155
156 // Expect that I[] is a permutation of [0, len].
157 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end());
158 std::sort(sorted_suffix.begin(), sorted_suffix.end());
159 for (size_t i = 0; i < test_str.size(); ++i)
160 EXPECT_EQ(i, sorted_suffix[i]);
161
162 // Expect that all suffixes are strictly ordered.
163 auto end = test_str.end();
164 for (size_t i = 1; i < test_str.size(); ++i) {
165 auto suf1 = test_str.begin() + suffix_array[i - 1];
166 auto suf2 = test_str.begin() + suffix_array[i];
167 bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
168 EXPECT_TRUE(is_less);
169 }
170 }
171
172 constexpr const char* test_strs[] = {
173 "",
174 "a",
175 "aa",
176 "za",
177 "CACAO",
178 "aaaaa",
179 "banana",
180 "tobeornottobe",
181 "The quick brown fox jumps over the lazy dog.",
182 "elephantelephantelephantelephantelephant",
183 "walawalawashington",
184 "-------------------------",
185 "011010011001011010010110011010010",
186 "3141592653589793238462643383279502884197169399375105",
187 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
188 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba",
189 "0123456789876543210",
190 "9876543210123456789",
191 "aababcabcdabcdeabcdefabcdefg",
192 "asdhklgalksdjghalksdjghalksdjgh",
193 };
194
195 TEST(SuffixSortTest, NaiveSuffixSort) {
196 for (const std::string& test_str : test_strs) {
197 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str));
198 }
199 }
200
201 TEST(SuffixSortTest, SaisSort) {
202 for (const std::string& test_str : test_strs) {
203 TestSuffixSort<Sais>(MakeUnsignedString(test_str));
204 }
205 }
206
207 // Test with sequence that has every character.
208 TEST(SuffixSortTest, AllChar) {
209 const int kNumChar = 256;
210 std::vector<unsigned char> all_char(kNumChar);
211 std::iota(all_char.begin(), all_char.end(), 0);
212
213 {
214 std::vector<size_t> suffix_array =
215 MakeSuffixArray<Sais>(all_char, kNumChar);
216 for (int i = 0; i < kNumChar; ++i)
217 EXPECT_EQ(i, suffix_array[i]);
218 }
219
220 std::vector<unsigned char> all_char_reverse(all_char.rbegin(),
221 all_char.rend());
222 {
223 std::vector<size_t> suffix_array =
224 MakeSuffixArray<Sais>(all_char_reverse, kNumChar);
225 for (int i = 0; i < kNumChar; ++i)
226 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]);
227 }
228 }
229
230 void TestSearchSuffixArray(std::string base_str, std::string search_str) {
huangs 2017/06/28 22:30:35 Make this take const ustring&, and update all call
etiennep1 2017/06/28 23:03:48 Probably more clean like that.
231 ustring base_ustr = MakeUnsignedString(base_str);
232 ustring search_ustr = MakeUnsignedString(search_str);
233
234 std::vector<size_t> suffix_array =
235 MakeSuffixArray<NaiveSuffixSort>(base_ustr, 256);
236
237 auto pos = SearchSuffixArray(suffix_array, base_ustr.begin(),
238 search_ustr.begin(), search_ustr.end());
239
240 auto end = base_ustr.end();
241 if (pos != suffix_array.begin()) {
242 // Previous suffix is less than |search_str|.
243 auto suf = base_ustr.begin() + pos[-1];
244 bool is_less = std::lexicographical_compare(suf, end, search_ustr.begin(),
245 search_ustr.end());
246 EXPECT_TRUE(is_less);
247 }
248 if (pos != suffix_array.end()) {
249 // Current suffix is greater of equal to |search_str|.
250 auto suf = base_ustr.begin() + *pos;
251 bool is_less = std::lexicographical_compare(suf, end, search_ustr.begin(),
252 search_ustr.end());
253 EXPECT_FALSE(is_less);
254 }
255 }
256
257 TEST(SuffixArrayTest, Search) {
258 TestSearchSuffixArray("", "");
huangs 2017/06/28 22:30:35 using us = MakeUnsignedString; TestSearchSuffixAr
259 TestSearchSuffixArray("", "a");
260 TestSearchSuffixArray("b", "");
261 TestSearchSuffixArray("b", "a");
262 TestSearchSuffixArray("b", "c");
263 TestSearchSuffixArray("b", "bc");
264 TestSearchSuffixArray("aa", "a");
265 TestSearchSuffixArray("aa", "aa");
266
267 const char* sentence = "the quick brown fox jumps over the lazy dog.";
huangs 2017/06/28 22:30:35 ustring sentence = us("...");
etiennep1 2017/06/28 23:03:48 Done.
268 // Entire string: exact and unique.
269 TestSearchSuffixArray(sentence, sentence);
270 // Empty string: exact and non-unique.
271 TestSearchSuffixArray(sentence, "");
272 // Exact and unique suffix matches.
273 TestSearchSuffixArray(sentence, ".");
274 TestSearchSuffixArray(sentence, "the lazy dog.");
275 // Exact and unique non-suffix matches.
276 TestSearchSuffixArray(sentence, "quick");
277 TestSearchSuffixArray(sentence, "the quick");
278 // Partial and unique matches.
279 TestSearchSuffixArray(sentence, "fox jumps with the hosps");
280 TestSearchSuffixArray(sentence, "xyz");
281 // Exact and non-unique match: take lexicographical first.
282 TestSearchSuffixArray(sentence, "the");
283 TestSearchSuffixArray(sentence, " ");
284 // Partial and non-unique match.
285 // query < "the l"... < "the q"...
286 TestSearchSuffixArray(sentence, "the apple");
287 // "the l"... < query < "the q"...
288 TestSearchSuffixArray(sentence, "the opera");
289 // "the l"... < "the q"... < query
290 TestSearchSuffixArray(sentence, "the zebra");
291 // Prefix match dominates suffix match (unique).
292 TestSearchSuffixArray(sentence, "over quick brown fox");
293 // Empty matchs.
294 TestSearchSuffixArray(sentence, ",");
295 TestSearchSuffixArray(sentence, "1234");
296 TestSearchSuffixArray(sentence, "THE QUICK BROWN FOX");
297 TestSearchSuffixArray(sentence, "(the");
298 }
299
300 TEST(SuffixArrayTest, SearchExact) {
301 for (const std::string& test_str : test_strs) {
302 ustring test_ustr = MakeUnsignedString(test_str);
303
304 std::vector<size_t> suffix_array = MakeSuffixArray<Sais>(test_ustr, 256);
305
306 for (int lo = 0; lo < test_str.size(); ++lo) {
307 for (int hi = lo + 1; hi <= test_str.size(); ++hi) {
308 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi);
309 ASSERT_EQ(query.size(), hi - lo);
310 auto pos = SearchSuffixArray(suffix_array, test_ustr.begin(),
311 query.begin(), query.end());
312 EXPECT_TRUE(
313 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos));
314 }
315 }
316 }
317 }
318
319 } // namespace zucchini
OLDNEW
« no previous file with comments | « chrome/installer/zucchini/suffix_array.h ('k') | courgette/BUILD.gn » ('j') | courgette/BUILD.gn » ('J')

Powered by Google App Engine
This is Rietveld 408576698