OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2017 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/installer/zucchini/suffix_array.h" | |
6 | |
7 #include <algorithm> | |
8 #include <cstddef> | |
9 #include <initializer_list> | |
10 #include <string> | |
11 #include <vector> | |
12 | |
13 #include "testing/gtest/include/gtest/gtest.h" | |
14 | |
15 namespace zucchini { | |
16 | |
17 using ustring = std::basic_string<unsigned char>; | |
18 | |
19 constexpr uint16_t kNumChar = 256; | |
20 | |
21 ustring MakeUnsignedString(const std::string& str) { | |
22 return {str.begin(), str.end()}; | |
23 } | |
24 | |
25 template <class T> | |
26 std::vector<T> MakeVector(const std::initializer_list<T>& ilist) { | |
27 return {ilist.begin(), ilist.end()}; | |
28 } | |
29 | |
30 void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition, | |
31 std::initializer_list<size_t> expected_lms_indices, | |
32 std::string str) { | |
33 using SaisImpl = Sais::Implementation<size_t, uint16_t>; | |
34 | |
35 std::vector<Sais::SLType> sl_partition(str.size()); | |
36 EXPECT_EQ(expected_lms_indices.size(), | |
37 SaisImpl::BuildSLPartition(str.begin(), str.size(), kNumChar, | |
38 sl_partition.rbegin())); | |
39 EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition); | |
40 | |
41 std::vector<size_t> lms_indices(expected_lms_indices.size()); | |
42 SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin()); | |
43 EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices); | |
44 } | |
45 | |
46 TEST(SaisTest, BuildSLPartition) { | |
47 TestSlPartition({}, {}, ""); | |
48 TestSlPartition( | |
49 { | |
50 Sais::LType, | |
51 }, | |
52 {}, "a"); | |
53 TestSlPartition( | |
54 { | |
55 Sais::LType, Sais::LType, | |
56 }, | |
57 {}, "ba"); | |
58 TestSlPartition( | |
59 { | |
60 Sais::SType, Sais::LType, | |
61 }, | |
62 {}, "ab"); | |
63 TestSlPartition( | |
64 { | |
65 Sais::SType, Sais::SType, Sais::LType, | |
66 }, | |
67 {}, "aab"); | |
68 TestSlPartition( | |
69 { | |
70 Sais::LType, Sais::LType, Sais::LType, | |
71 }, | |
72 {}, "bba"); | |
73 TestSlPartition( | |
74 { | |
75 Sais::LType, Sais::SType, Sais::LType, | |
76 }, | |
77 {1}, "bab"); | |
78 TestSlPartition( | |
79 { | |
80 Sais::LType, Sais::SType, Sais::SType, Sais::LType, | |
81 }, | |
82 {1}, "baab"); | |
83 | |
84 TestSlPartition( | |
85 { | |
86 Sais::LType, // zucchini | |
87 Sais::LType, // ucchini | |
88 Sais::SType, // cchini | |
89 Sais::SType, // chini | |
90 Sais::SType, // hini | |
91 Sais::SType, // ini | |
92 Sais::LType, // ni | |
93 Sais::LType, // i | |
94 }, | |
95 {2}, "zucchini"); | |
96 } | |
97 | |
98 std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str, | |
99 uint16_t max_key) { | |
100 using SaisImpl = Sais::Implementation<size_t, uint16_t>; | |
101 return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key); | |
102 } | |
103 | |
104 TEST(SaisTest, BucketCount) { | |
105 using vec = std::vector<size_t>; | |
106 | |
107 EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4)); | |
108 EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4)); | |
109 EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4)); | |
110 } | |
111 | |
112 std::vector<size_t> InducedSortSubstring(ustring str) { | |
113 using SaisImpl = Sais::Implementation<size_t, uint16_t>; | |
114 std::vector<Sais::SLType> sl_partition(str.size()); | |
115 size_t lms_count = SaisImpl::BuildSLPartition( | |
116 str.begin(), str.size(), kNumChar, sl_partition.rbegin()); | |
117 std::vector<size_t> lms_indices(lms_count); | |
118 SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin()); | |
119 auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), kNumChar); | |
120 | |
121 std::vector<size_t> suffix_array(str.size()); | |
122 SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets, | |
123 suffix_array.begin()); | |
124 | |
125 return suffix_array; | |
126 } | |
127 | |
128 TEST(SaisTest, InducedSortSubstring) { | |
129 using vec = std::vector<size_t>; | |
130 | |
131 auto us = MakeUnsignedString; | |
132 | |
133 // L; a$ | |
134 EXPECT_EQ(vec({0}), InducedSortSubstring(us("a"))); | |
135 | |
136 // SL; ab$, b$ | |
137 EXPECT_EQ(vec({0, 1}), InducedSortSubstring(us("ab"))); | |
138 | |
139 // LL; a$, ba$ | |
140 EXPECT_EQ(vec({1, 0}), InducedSortSubstring(us("ba"))); | |
141 | |
142 // SLL; a$, aba$, ba$ | |
143 EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring(us("aba"))); | |
144 | |
145 // LSL; ab$, b$, ba | |
146 EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring(us("bab"))); | |
147 | |
148 // SSL; aab$, ab$, b$ | |
149 EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring(us("aab"))); | |
150 | |
151 // LSSL; aab$, ab$, b$, ba | |
152 EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring(us("baab"))); | |
153 } | |
154 | |
155 template <class Algorithm> | |
156 void TestSuffixSort(ustring test_str) { | |
157 std::vector<size_t> suffix_array = | |
158 MakeSuffixArray<Algorithm>(test_str, kNumChar); | |
159 EXPECT_EQ(test_str.size(), suffix_array.size()); | |
160 | |
161 // Expect that I[] is a permutation of [0, len]. | |
162 std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end()); | |
163 std::sort(sorted_suffix.begin(), sorted_suffix.end()); | |
164 for (size_t i = 0; i < test_str.size(); ++i) | |
165 EXPECT_EQ(i, sorted_suffix[i]); | |
166 | |
167 // Expect that all suffixes are strictly ordered. | |
168 auto end = test_str.end(); | |
169 for (size_t i = 1; i < test_str.size(); ++i) { | |
170 auto suf1 = test_str.begin() + suffix_array[i - 1]; | |
171 auto suf2 = test_str.begin() + suffix_array[i]; | |
172 bool is_less = std::lexicographical_compare(suf1, end, suf2, end); | |
173 EXPECT_TRUE(is_less); | |
174 } | |
175 } | |
176 | |
177 constexpr const char* test_strs[] = { | |
178 "", | |
179 "a", | |
180 "aa", | |
181 "za", | |
182 "CACAO", | |
183 "aaaaa", | |
184 "banana", | |
185 "tobeornottobe", | |
186 "The quick brown fox jumps over the lazy dog.", | |
187 "elephantelephantelephantelephantelephant", | |
188 "walawalawashington", | |
189 "-------------------------", | |
190 "011010011001011010010110011010010", | |
191 "3141592653589793238462643383279502884197169399375105", | |
192 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", | |
193 "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba", | |
194 "0123456789876543210", | |
195 "9876543210123456789", | |
196 "aababcabcdabcdeabcdefabcdefg", | |
197 "asdhklgalksdjghalksdjghalksdjgh", | |
198 }; | |
199 | |
200 TEST(SuffixSortTest, NaiveSuffixSort) { | |
201 for (const std::string& test_str : test_strs) { | |
202 TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str)); | |
203 } | |
204 } | |
205 | |
206 TEST(SuffixSortTest, SaisSort) { | |
207 for (const std::string& test_str : test_strs) { | |
208 TestSuffixSort<Sais>(MakeUnsignedString(test_str)); | |
209 } | |
210 } | |
211 | |
212 // Test with sequence that has every character. | |
213 TEST(SuffixSortTest, AllChar) { | |
214 std::vector<unsigned char> all_char(kNumChar); | |
215 std::iota(all_char.begin(), all_char.end(), 0); | |
216 | |
217 { | |
218 std::vector<size_t> suffix_array = | |
219 MakeSuffixArray<Sais>(all_char, kNumChar); | |
220 for (int i = 0; i < kNumChar; ++i) | |
221 EXPECT_EQ(i, suffix_array[i]); | |
222 } | |
223 | |
224 std::vector<unsigned char> all_char_reverse(all_char.rbegin(), | |
225 all_char.rend()); | |
226 { | |
227 std::vector<size_t> suffix_array = | |
228 MakeSuffixArray<Sais>(all_char_reverse, kNumChar); | |
229 for (int i = 0; i < kNumChar; ++i) | |
230 EXPECT_EQ(kNumChar - i - 1, suffix_array[i]); | |
231 } | |
232 } | |
233 | |
234 void TestSearchSuffixArray(ustring base_str, ustring search_str) { | |
235 std::vector<size_t> suffix_array = | |
236 MakeSuffixArray<NaiveSuffixSort>(base_str, kNumChar); | |
237 | |
238 auto pos = SearchSuffixArray(suffix_array, base_str.begin(), | |
239 search_str.begin(), search_str.end()); | |
240 | |
241 auto end = base_str.end(); | |
242 if (pos != suffix_array.begin()) { | |
huangs
2017/07/05 05:51:59
This code looks like hack to make test work? Pleas
etiennep1
2017/07/10 17:51:09
It tests lower bound. I updated the name.
| |
243 // Previous suffix is less than |search_str|. | |
244 auto suf = base_str.begin() + pos[-1]; | |
245 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(), | |
246 search_str.end()); | |
247 EXPECT_TRUE(is_less); | |
248 } | |
249 if (pos != suffix_array.end()) { | |
250 // Current suffix is greater of equal to |search_str|. | |
251 auto suf = base_str.begin() + *pos; | |
252 bool is_less = std::lexicographical_compare(suf, end, search_str.begin(), | |
253 search_str.end()); | |
254 EXPECT_FALSE(is_less); | |
255 } | |
256 } | |
257 | |
258 TEST(SuffixArrayTest, Search) { | |
259 auto us = MakeUnsignedString; | |
260 | |
261 TestSearchSuffixArray(us(""), us("")); | |
262 TestSearchSuffixArray(us(""), us("a")); | |
263 TestSearchSuffixArray(us("b"), us("")); | |
264 TestSearchSuffixArray(us("b"), us("a")); | |
265 TestSearchSuffixArray(us("b"), us("c")); | |
266 TestSearchSuffixArray(us("b"), us("bc")); | |
267 TestSearchSuffixArray(us("aa"), us("a")); | |
268 TestSearchSuffixArray(us("aa"), us("aa")); | |
269 | |
270 ustring sentence = us("the quick brown fox jumps over the lazy dog."); | |
271 // Entire string: exact and unique. | |
272 TestSearchSuffixArray(sentence, sentence); | |
273 // Empty string: exact and non-unique. | |
274 TestSearchSuffixArray(sentence, us("")); | |
275 // Exact and unique suffix matches. | |
276 TestSearchSuffixArray(sentence, us(".")); | |
277 TestSearchSuffixArray(sentence, us("the lazy dog.")); | |
278 // Exact and unique non-suffix matches. | |
279 TestSearchSuffixArray(sentence, us("quick")); | |
280 TestSearchSuffixArray(sentence, us("the quick")); | |
281 // Partial and unique matches. | |
282 TestSearchSuffixArray(sentence, us("fox jumps with the hosps")); | |
283 TestSearchSuffixArray(sentence, us("xyz")); | |
284 // Exact and non-unique match: take lexicographical first. | |
285 TestSearchSuffixArray(sentence, us("the")); | |
286 TestSearchSuffixArray(sentence, us(" ")); | |
287 // Partial and non-unique match. | |
288 // query < "the l"... < "the q"... | |
289 TestSearchSuffixArray(sentence, us("the apple")); | |
290 // "the l"... < query < "the q"... | |
291 TestSearchSuffixArray(sentence, us("the opera")); | |
292 // "the l"... < "the q"... < query | |
293 TestSearchSuffixArray(sentence, us("the zebra")); | |
294 // Prefix match dominates suffix match (unique). | |
295 TestSearchSuffixArray(sentence, us("over quick brown fox")); | |
296 // Empty matchs. | |
297 TestSearchSuffixArray(sentence, us(",")); | |
298 TestSearchSuffixArray(sentence, us("1234")); | |
299 TestSearchSuffixArray(sentence, us("THE QUICK BROWN FOX")); | |
300 TestSearchSuffixArray(sentence, us("(the")); | |
301 } | |
302 | |
303 TEST(SuffixArrayTest, SearchExact) { | |
304 for (const std::string& test_str : test_strs) { | |
305 ustring test_ustr = MakeUnsignedString(test_str); | |
306 | |
307 std::vector<size_t> suffix_array = | |
308 MakeSuffixArray<Sais>(test_ustr, kNumChar); | |
309 | |
310 for (int lo = 0; lo < test_str.size(); ++lo) { | |
huangs
2017/07/05 05:51:58
|lo| and |hi| should be size_t?
etiennep1
2017/07/10 17:51:09
Done.
| |
311 for (int hi = lo + 1; hi <= test_str.size(); ++hi) { | |
312 ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi); | |
313 ASSERT_EQ(query.size(), hi - lo); | |
314 auto pos = SearchSuffixArray(suffix_array, test_ustr.begin(), | |
315 query.begin(), query.end()); | |
316 EXPECT_TRUE( | |
317 std::equal(query.begin(), query.end(), test_ustr.begin() + *pos)); | |
318 } | |
319 } | |
320 } | |
321 } | |
322 | |
323 } // namespace zucchini | |
OLD | NEW |