Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(673)

Side by Side Diff: chrome/installer/zucchini/suffix_array_unittest.cc

Issue 2963463002: [Zucchini] Generic suffix array algorithms. (Closed)
Patch Set: NIT corrections in comments Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/installer/zucchini/suffix_array.h"
6
7 #include <algorithm>
8 #include <cstddef>
9 #include <initializer_list>
10 #include <string>
11 #include <vector>
12
13 #include "testing/gtest/include/gtest/gtest.h"
14
15 namespace zucchini {
16
17 using ustring = std::basic_string<unsigned char>;
18
19 constexpr uint16_t kNumChar = 256;
20
21 ustring MakeUnsignedString(const std::string& str) {
22 return {str.begin(), str.end()};
23 }
24
25 template <class T>
26 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) {
27 return {ilist.begin(), ilist.end()};
28 }
29
30 void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition,
31 std::initializer_list<size_t> expected_lms_indices,
32 std::string str) {
33 using SaisImpl = Sais::Implementation<size_t, uint16_t>;
34
35 std::vector<Sais::SLType> sl_partition(str.size());
36 EXPECT_EQ(expected_lms_indices.size(),
37 SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar,
38 sl_partition.rbegin()));
39 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition);
40
41 std::vector<size_t> lms_indices(expected_lms_indices.size());
42 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin());
43 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices);
44 }
45
46 TEST(SaisTest, BuildSLPartition) {
47 TestSlPartition({}, {}, "");
48 TestSlPartition(
49 {
50 Sais::LType,
51 },
52 {}, "a");
53 TestSlPartition(
54 {
55 Sais::LType, Sais::LType,
56 },
57 {}, "ba");
58 TestSlPartition(
59 {
60 Sais::SType, Sais::LType,
61 },
62 {}, "ab");
63 TestSlPartition(
64 {
65 Sais::SType, Sais::SType, Sais::LType,
66 },
67 {}, "aab");
68 TestSlPartition(
69 {
70 Sais::LType, Sais::LType, Sais::LType,
71 },
72 {}, "bba");
73 TestSlPartition(
74 {
75 Sais::LType, Sais::SType, Sais::LType,
76 },
77 {1}, "bab");
78 TestSlPartition(
79 {
80 Sais::LType, Sais::SType, Sais::SType, Sais::LType,
81 },
82 {1}, "baab");
83
84 TestSlPartition(
85 {
86 Sais::LType, // zucchini
87 Sais::LType, // ucchini
88 Sais::SType, // cchini
89 Sais::SType, // chini
90 Sais::SType, // hini
91 Sais::SType, // ini
92 Sais::LType, // ni
93 Sais::LType, // i
94 },
95 {2}, "zucchini");
96 }
97
98 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str,
99 uint16_t max_key) {
100 using SaisImpl = Sais::Implementation<size_t, uint16_t>;
101 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key);
102 }
103
104 TEST(SaisTest, BucketCount) {
105 using vec = std::vector<size_t>;
106
107 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4));
108 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4));
109 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4));
110 }
111
112 std::vector<size_t> InducedSortSubstring(ustring str) {
113 using SaisImpl = Sais::Implementation<size_t, uint16_t>;
114 std::vector<Sais::SLType> sl_partition(str.size());
115 size_t lms_count = SaisImpl::BuildSLPartition(
116 str.begin(), str.size(), kNumChar, sl_partition.rbegin());
117 std::vector<size_t> lms_indices(lms_count);
118 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin());
119 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar);
120
121 std::vector<size_t> suffix_array(str.size());
122 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets,
123 suffix_array.begin());
124
125 return suffix_array;
126 }
127
128 TEST(SaisTest, InducedSortSubstring) {
129 using vec = std::vector<size_t>;
130
131 auto us = MakeUnsignedString;
132
133 // L; a$
134 EXPECT_EQ(vec({0}), InducedSortSubstring(us("a")));
135
136 // SL; ab$, b$
137 EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab")));
138
139 // LL; a$, ba$
140 EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba")));
141
142 // SLL; a$, aba$, ba$
143 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba")));
144
145 // LSL; ab$, b$, ba
146 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab")));
147
148 // SSL; aab$, ab$, b$
149 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab")));
150
151 // LSSL; aab$, ab$, b$, ba
152 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab")));
153 }
154
155 template <class Algorithm>
156 void TestSuffixSort(ustring test_str) {
157 std::vector<size_t> suffix_array =
158 MakeSuffixArray<Algorithm>(test_str, kNumChar);
159 EXPECT_EQ(test_str.size(), suffix_array.size());
160
161 // Expect that I[] is a permutation of [0, len].
162 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end());
163 std::sort(sorted_suffix.begin(), sorted_suffix.end());
164 for (size_t i = 0; i < test_str.size(); ++i)
165 EXPECT_EQ(i, sorted_suffix[i]);
166
167 // Expect that all suffixes are strictly ordered.
168 auto end = test_str.end();
169 for (size_t i = 1; i < test_str.size(); ++i) {
170 auto suf1 = test_str.begin() + suffix_array[i - 1];
171 auto suf2 = test_str.begin() + suffix_array[i];
172 bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
173 EXPECT_TRUE(is_less);
174 }
175 }
176
177 constexpr const char* test_strs[] = {
178 "",
179 "a",
180 "aa",
181 "za",
182 "CACAO",
183 "aaaaa",
184 "banana",
185 "tobeornottobe",
186 "The quick brown fox jumps over the lazy dog.",
187 "elephantelephantelephantelephantelephant",
188 "walawalawashington",
189 "-------------------------",
190 "011010011001011010010110011010010",
191 "3141592653589793238462643383279502884197169399375105",
192 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
193 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba",
194 "0123456789876543210",
195 "9876543210123456789",
196 "aababcabcdabcdeabcdefabcdefg",
197 "asdhklgalksdjghalksdjghalksdjgh",
198 };
199
200 TEST(SuffixSortTest, NaiveSuffixSort) {
201 for (const std::string& test_str : test_strs) {
202 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str));
203 }
204 }
205
206 TEST(SuffixSortTest, SaisSort) {
207 for (const std::string& test_str : test_strs) {
208 TestSuffixSort<Sais>(MakeUnsignedString(test_str));
209 }
210 }
211
212 // Test with sequence that has every character.
213 TEST(SuffixSortTest, AllChar) {
214 std::vector<unsigned char> all_char(kNumChar);
215 std::iota(all_char.begin(), all_char.end(), 0);
216
217 {
218 std::vector<size_t> suffix_array =
219 MakeSuffixArray<Sais>(all_char, kNumChar);
220 for (int i = 0; i < kNumChar; ++i)
221 EXPECT_EQ(i, suffix_array[i]);
222 }
223
224 std::vector<unsigned char> all_char_reverse(all_char.rbegin(),
225 all_char.rend());
226 {
227 std::vector<size_t> suffix_array =
228 MakeSuffixArray<Sais>(all_char_reverse, kNumChar);
229 for (int i = 0; i < kNumChar; ++i)
230 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]);
231 }
232 }
233
234 void TestSearchSuffixArray(ustring base_str, ustring search_str) {
235 std::vector<size_t> suffix_array =
236 MakeSuffixArray<NaiveSuffixSort>(base_str, kNumChar);
237
238 auto pos = SearchSuffixArray(suffix_array, base_str.begin(),
239 search_str.begin(), search_str.end());
240
241 auto end = base_str.end();
242 if (pos != suffix_array.begin()) {
huangs 2017/07/05 05:51:59 This code looks like hack to make test work? Pleas
etiennep1 2017/07/10 17:51:09 It tests lower bound. I updated the name.
243 // Previous suffix is less than |search_str|.
244 auto suf = base_str.begin() + pos[-1];
245 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
246 search_str.end());
247 EXPECT_TRUE(is_less);
248 }
249 if (pos != suffix_array.end()) {
250 // Current suffix is greater of equal to |search_str|.
251 auto suf = base_str.begin() + *pos;
252 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(),
253 search_str.end());
254 EXPECT_FALSE(is_less);
255 }
256 }
257
258 TEST(SuffixArrayTest, Search) {
259 auto us = MakeUnsignedString;
260
261 TestSearchSuffixArray(us(""), us(""));
262 TestSearchSuffixArray(us(""), us("a"));
263 TestSearchSuffixArray(us("b"), us(""));
264 TestSearchSuffixArray(us("b"), us("a"));
265 TestSearchSuffixArray(us("b"), us("c"));
266 TestSearchSuffixArray(us("b"), us("bc"));
267 TestSearchSuffixArray(us("aa"), us("a"));
268 TestSearchSuffixArray(us("aa"), us("aa"));
269
270 ustring sentence = us("the quick brown fox jumps over the lazy dog.");
271 // Entire string: exact and unique.
272 TestSearchSuffixArray(sentence, sentence);
273 // Empty string: exact and non-unique.
274 TestSearchSuffixArray(sentence, us(""));
275 // Exact and unique suffix matches.
276 TestSearchSuffixArray(sentence, us("."));
277 TestSearchSuffixArray(sentence, us("the lazy dog."));
278 // Exact and unique non-suffix matches.
279 TestSearchSuffixArray(sentence, us("quick"));
280 TestSearchSuffixArray(sentence, us("the quick"));
281 // Partial and unique matches.
282 TestSearchSuffixArray(sentence, us("fox jumps with the hosps"));
283 TestSearchSuffixArray(sentence, us("xyz"));
284 // Exact and non-unique match: take lexicographical first.
285 TestSearchSuffixArray(sentence, us("the"));
286 TestSearchSuffixArray(sentence, us(" "));
287 // Partial and non-unique match.
288 // query < "the l"... < "the q"...
289 TestSearchSuffixArray(sentence, us("the apple"));
290 // "the l"... < query < "the q"...
291 TestSearchSuffixArray(sentence, us("the opera"));
292 // "the l"... < "the q"... < query
293 TestSearchSuffixArray(sentence, us("the zebra"));
294 // Prefix match dominates suffix match (unique).
295 TestSearchSuffixArray(sentence, us("over quick brown fox"));
296 // Empty matchs.
297 TestSearchSuffixArray(sentence, us(","));
298 TestSearchSuffixArray(sentence, us("1234"));
299 TestSearchSuffixArray(sentence, us("THE QUICK BROWN FOX"));
300 TestSearchSuffixArray(sentence, us("(the"));
301 }
302
303 TEST(SuffixArrayTest, SearchExact) {
304 for (const std::string& test_str : test_strs) {
305 ustring test_ustr = MakeUnsignedString(test_str);
306
307 std::vector<size_t> suffix_array =
308 MakeSuffixArray<Sais>(test_ustr, kNumChar);
309
310 for (int lo = 0; lo < test_str.size(); ++lo) {
huangs 2017/07/05 05:51:58 |lo| and |hi| should be size_t?
etiennep1 2017/07/10 17:51:09 Done.
311 for (int hi = lo + 1; hi <= test_str.size(); ++hi) {
312 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi);
313 ASSERT_EQ(query.size(), hi - lo);
314 auto pos = SearchSuffixArray(suffix_array, test_ustr.begin(),
315 query.begin(), query.end());
316 EXPECT_TRUE(
317 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos));
318 }
319 }
320 }
321 }
322
323 } // namespace zucchini
OLDNEW
« chrome/installer/zucchini/suffix_array.h ('K') | « chrome/installer/zucchini/suffix_array.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698