OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2017 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/installer/zucchini/suffix_array.h" | |
6 | |
7 #include <algorithm> | |
8 #include <cstddef> | |
9 #include <initializer_list> | |
10 #include <string> | |
11 #include <vector> | |
12 | |
13 #include "testing/gtest/include/gtest/gtest.h" | |
14 | |
15 namespace zucchini { | |
16 | |
17 using ustring = std::basic_string<unsigned char>; | |
18 | |
19 ustring MakeUnsignedString(const std::string& str) { | |
20 return {str.begin(), str.end()}; | |
21 } | |
22 | |
23 template <class T> | |
24 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) { | |
25 return {ilist.begin(), ilist.end()}; | |
26 } | |
27 | |
28 void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition, | |
29 std::initializer_list<size_t> expected_lms_indices, | |
30 std::string str) { | |
31 using SaisImpl = Sais::Implementation<size_t, size_t>; | |
32 | |
33 std::vector<Sais::SLType> sl_partition(str.size()); | |
34 EXPECT_EQ(expected_lms_indices.size(), | |
35 SaisImpl::BuildSLPartition(str.begin(), str.size(), 255, | |
36 sl_partition.rbegin())); | |
37 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition); | |
38 | |
39 std::vector<size_t> lms_indices(expected_lms_indices.size()); | |
40 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin()); | |
41 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices); | |
42 } | |
43 | |
44 TEST(SaisTest, BuildSLPartition) { | |
45 TestSlPartition({}, {}, ""); | |
46 TestSlPartition( | |
47 { | |
48 Sais::LType, | |
49 }, | |
50 {}, "a"); | |
51 TestSlPartition( | |
52 { | |
53 Sais::LType, Sais::LType, | |
54 }, | |
55 {}, "ba"); | |
56 TestSlPartition( | |
57 { | |
58 Sais::SType, Sais::LType, | |
59 }, | |
60 {}, "ab"); | |
61 TestSlPartition( | |
62 { | |
63 Sais::SType, Sais::SType, Sais::LType, | |
64 }, | |
65 {}, "aab"); | |
66 TestSlPartition( | |
67 { | |
68 Sais::LType, Sais::LType, Sais::LType, | |
69 }, | |
70 {}, "bba"); | |
71 TestSlPartition( | |
72 { | |
73 Sais::LType, Sais::SType, Sais::LType, | |
74 }, | |
75 {1}, "bab"); | |
76 TestSlPartition( | |
77 { | |
78 Sais::LType, Sais::SType, Sais::SType, Sais::LType, | |
79 }, | |
80 {1}, "baab"); | |
81 | |
82 TestSlPartition( | |
83 { | |
84 Sais::LType, // zucchini | |
85 Sais::LType, // ucchini | |
86 Sais::SType, // cchini | |
87 Sais::SType, // chini | |
88 Sais::SType, // hini | |
89 Sais::SType, // ini | |
90 Sais::LType, // ni | |
91 Sais::LType, // i | |
92 }, | |
93 {2}, "zucchini"); | |
94 } | |
95 | |
96 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str, | |
97 size_t max_key) { | |
98 using SaisImpl = Sais::Implementation<size_t, size_t>; | |
99 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key); | |
100 } | |
101 | |
102 TEST(SaisTest, BucketCount) { | |
103 using vec = std::vector<size_t>; | |
104 | |
105 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4)); | |
106 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4)); | |
107 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4)); | |
108 } | |
109 | |
110 std::vector<size_t> InducedSortSubstring(std::string str) { | |
111 using SaisImpl = Sais::Implementation<size_t, size_t>; | |
112 std::vector<Sais::SLType> sl_partition(str.size()); | |
113 size_t lms_count = SaisImpl::BuildSLPartition(str.begin(), str.size(), 255, | |
114 sl_partition.rbegin()); | |
115 std::vector<size_t> lms_indices(lms_count); | |
116 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin()); | |
117 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), 255); | |
118 | |
119 std::vector<size_t> suffix_array(str.size()); | |
120 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets, | |
121 suffix_array.begin()); | |
122 | |
123 return suffix_array; | |
124 } | |
125 | |
126 TEST(SaisTest, InducedSortSubstring) { | |
127 using vec = std::vector<size_t>; | |
128 | |
129 // L; a$ | |
130 EXPECT_EQ(vec({0}), InducedSortSubstring("a")); | |
131 | |
132 // SL; ab$, b$ | |
133 EXPECT_EQ(vec({0, 1}), InducedSortSubstring("ab")); | |
134 | |
135 // LL; a$, ba$ | |
136 EXPECT_EQ(vec({1, 0}), InducedSortSubstring("ba")); | |
137 | |
138 // SLL; a$, aba$, ba$ | |
139 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring("aba")); | |
140 | |
141 // LSL; ab$, b$, ba | |
142 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring("bab")); | |
143 | |
144 // SSL; aab$, ab$, b$ | |
145 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring("aab")); | |
146 | |
147 // LSSL; aab$, ab$, b$, ba | |
148 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring("baab")); | |
149 } | |
150 | |
151 template <class Algorithm> | |
152 void TestSuffixSort(ustring test_str) { | |
153 std::vector<size_t> suffix_array = MakeSuffixArray<Algorithm>(test_str, 256); | |
154 EXPECT_EQ(test_str.size(), suffix_array.size()); | |
155 | |
156 // Expect that I[] is a permutation of [0, len]. | |
157 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end()); | |
158 std::sort(sorted_suffix.begin(), sorted_suffix.end()); | |
159 for (size_t i = 0; i < test_str.size(); ++i) | |
160 EXPECT_EQ(i, sorted_suffix[i]); | |
161 | |
162 // Expect that all suffixes are strictly ordered. | |
163 auto end = test_str.end(); | |
164 for (size_t i = 1; i < test_str.size(); ++i) { | |
165 auto suf1 = test_str.begin() + suffix_array[i - 1]; | |
166 auto suf2 = test_str.begin() + suffix_array[i]; | |
167 bool is_less = std::lexicographical_compare(suf1, end, suf2, end); | |
168 EXPECT_TRUE(is_less); | |
169 } | |
170 } | |
171 | |
172 constexpr const char* test_strs[] = { | |
173 "", | |
174 "a", | |
175 "aa", | |
176 "za", | |
177 "CACAO", | |
178 "aaaaa", | |
179 "banana", | |
180 "tobeornottobe", | |
181 "The quick brown fox jumps over the lazy dog.", | |
182 "elephantelephantelephantelephantelephant", | |
183 "walawalawashington", | |
184 "-------------------------", | |
185 "011010011001011010010110011010010", | |
186 "3141592653589793238462643383279502884197169399375105", | |
187 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", | |
188 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba", | |
189 "0123456789876543210", | |
190 "9876543210123456789", | |
191 "aababcabcdabcdeabcdefabcdefg", | |
192 "asdhklgalksdjghalksdjghalksdjgh", | |
193 }; | |
194 | |
195 TEST(SuffixSortTest, NaiveSuffixSort) { | |
196 for (const std::string& test_str : test_strs) { | |
197 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str)); | |
198 } | |
199 } | |
200 | |
201 TEST(SuffixSortTest, SaisSort) { | |
202 for (const std::string& test_str : test_strs) { | |
203 TestSuffixSort<Sais>(MakeUnsignedString(test_str)); | |
204 } | |
205 } | |
206 | |
207 // Test with sequence that has every character. | |
208 TEST(SuffixSortTest, AllChar) { | |
209 const int kNumChar = 256; | |
210 std::vector<unsigned char> all_char(kNumChar); | |
211 std::iota(all_char.begin(), all_char.end(), 0); | |
212 | |
213 { | |
214 std::vector<size_t> suffix_array = | |
215 MakeSuffixArray<Sais>(all_char, kNumChar); | |
216 for (int i = 0; i < kNumChar; ++i) | |
217 EXPECT_EQ(i, suffix_array[i]); | |
218 } | |
219 | |
220 std::vector<unsigned char> all_char_reverse(all_char.rbegin(), | |
221 all_char.rend()); | |
222 { | |
223 std::vector<size_t> suffix_array = | |
224 MakeSuffixArray<Sais>(all_char_reverse, kNumChar); | |
225 for (int i = 0; i < kNumChar; ++i) | |
226 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]); | |
227 } | |
228 } | |
229 | |
230 void TestSearchSuffixArray(std::string base_str, std::string search_str) { | |
huangs
2017/06/28 22:30:35
Make this take const ustring&, and update all call
etiennep1
2017/06/28 23:03:48
Probably more clean like that.
| |
231 ustring base_ustr = MakeUnsignedString(base_str); | |
232 ustring search_ustr = MakeUnsignedString(search_str); | |
233 | |
234 std::vector<size_t> suffix_array = | |
235 MakeSuffixArray<NaiveSuffixSort>(base_ustr, 256); | |
236 | |
237 auto pos = SearchSuffixArray(suffix_array, base_ustr.begin(), | |
238 search_ustr.begin(), search_ustr.end()); | |
239 | |
240 auto end = base_ustr.end(); | |
241 if (pos != suffix_array.begin()) { | |
242 // Previous suffix is less than |search_str|. | |
243 auto suf = base_ustr.begin() + pos[-1]; | |
244 bool is_less = std::lexicographical_compare(suf, end, search_ustr.begin(), | |
245 search_ustr.end()); | |
246 EXPECT_TRUE(is_less); | |
247 } | |
248 if (pos != suffix_array.end()) { | |
249 // Current suffix is greater of equal to |search_str|. | |
250 auto suf = base_ustr.begin() + *pos; | |
251 bool is_less = std::lexicographical_compare(suf, end, search_ustr.begin(), | |
252 search_ustr.end()); | |
253 EXPECT_FALSE(is_less); | |
254 } | |
255 } | |
256 | |
257 TEST(SuffixArrayTest, Search) { | |
258 TestSearchSuffixArray("", ""); | |
huangs
2017/06/28 22:30:35
using us = MakeUnsignedString;
TestSearchSuffixAr
| |
259 TestSearchSuffixArray("", "a"); | |
260 TestSearchSuffixArray("b", ""); | |
261 TestSearchSuffixArray("b", "a"); | |
262 TestSearchSuffixArray("b", "c"); | |
263 TestSearchSuffixArray("b", "bc"); | |
264 TestSearchSuffixArray("aa", "a"); | |
265 TestSearchSuffixArray("aa", "aa"); | |
266 | |
267 const char* sentence = "the quick brown fox jumps over the lazy dog."; | |
huangs
2017/06/28 22:30:35
ustring sentence = us("...");
etiennep1
2017/06/28 23:03:48
Done.
| |
268 // Entire string: exact and unique. | |
269 TestSearchSuffixArray(sentence, sentence); | |
270 // Empty string: exact and non-unique. | |
271 TestSearchSuffixArray(sentence, ""); | |
272 // Exact and unique suffix matches. | |
273 TestSearchSuffixArray(sentence, "."); | |
274 TestSearchSuffixArray(sentence, "the lazy dog."); | |
275 // Exact and unique non-suffix matches. | |
276 TestSearchSuffixArray(sentence, "quick"); | |
277 TestSearchSuffixArray(sentence, "the quick"); | |
278 // Partial and unique matches. | |
279 TestSearchSuffixArray(sentence, "fox jumps with the hosps"); | |
280 TestSearchSuffixArray(sentence, "xyz"); | |
281 // Exact and non-unique match: take lexicographical first. | |
282 TestSearchSuffixArray(sentence, "the"); | |
283 TestSearchSuffixArray(sentence, " "); | |
284 // Partial and non-unique match. | |
285 // query < "the l"... < "the q"... | |
286 TestSearchSuffixArray(sentence, "the apple"); | |
287 // "the l"... < query < "the q"... | |
288 TestSearchSuffixArray(sentence, "the opera"); | |
289 // "the l"... < "the q"... < query | |
290 TestSearchSuffixArray(sentence, "the zebra"); | |
291 // Prefix match dominates suffix match (unique). | |
292 TestSearchSuffixArray(sentence, "over quick brown fox"); | |
293 // Empty matchs. | |
294 TestSearchSuffixArray(sentence, ","); | |
295 TestSearchSuffixArray(sentence, "1234"); | |
296 TestSearchSuffixArray(sentence, "THE QUICK BROWN FOX"); | |
297 TestSearchSuffixArray(sentence, "(the"); | |
298 } | |
299 | |
300 TEST(SuffixArrayTest, SearchExact) { | |
301 for (const std::string& test_str : test_strs) { | |
302 ustring test_ustr = MakeUnsignedString(test_str); | |
303 | |
304 std::vector<size_t> suffix_array = MakeSuffixArray<Sais>(test_ustr, 256); | |
305 | |
306 for (int lo = 0; lo < test_str.size(); ++lo) { | |
307 for (int hi = lo + 1; hi <= test_str.size(); ++hi) { | |
308 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi); | |
309 ASSERT_EQ(query.size(), hi - lo); | |
310 auto pos = SearchSuffixArray(suffix_array, test_ustr.begin(), | |
311 query.begin(), query.end()); | |
312 EXPECT_TRUE( | |
313 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos)); | |
314 } | |
315 } | |
316 } | |
317 } | |
318 | |
319 } // namespace zucchini | |
OLD | NEW |