Chromium Code Reviews| Index: chrome/installer/zucchini/suffix_array_unittest.cc |
| diff --git a/chrome/installer/zucchini/suffix_array_unittest.cc b/chrome/installer/zucchini/suffix_array_unittest.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..fca19a4957b38af8fee75cca4f651a83dd982225 |
| --- /dev/null |
| +++ b/chrome/installer/zucchini/suffix_array_unittest.cc |
| @@ -0,0 +1,319 @@ |
| +// Copyright 2017 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "chrome/installer/zucchini/suffix_array.h" |
| + |
| +#include <algorithm> |
| +#include <cstddef> |
| +#include <initializer_list> |
| +#include <string> |
| +#include <vector> |
| + |
| +#include "testing/gtest/include/gtest/gtest.h" |
| + |
| +namespace zucchini { |
| + |
| +using ustring = std::basic_string<unsigned char>; |
| + |
| +ustring MakeUnsignedString(const std::string& str) { |
| + return {str.begin(), str.end()}; |
| +} |
| + |
| +template <class T> |
| +std::vector<T> MakeVector(const std::initializer_list<T>& ilist) { |
| + return {ilist.begin(), ilist.end()}; |
| +} |
| + |
| +void TestSlPartition(std::initializer_list<Sais::SLType> expected_sl_partition, |
| + std::initializer_list<size_t> expected_lms_indices, |
| + std::string str) { |
| + using SaisImpl = Sais::Implementation<size_t, size_t>; |
| + |
| + std::vector<Sais::SLType> sl_partition(str.size()); |
| + EXPECT_EQ(expected_lms_indices.size(), |
| + SaisImpl::BuildSLPartition(str.begin(), str.size(), 255, |
| + sl_partition.rbegin())); |
| + EXPECT_EQ(MakeVector(expected_sl_partition), sl_partition); |
| + |
| + std::vector<size_t> lms_indices(expected_lms_indices.size()); |
| + SaisImpl::FindLmsSuffixes(expected_sl_partition, lms_indices.begin()); |
| + EXPECT_EQ(MakeVector(expected_lms_indices), lms_indices); |
| +} |
| + |
| +TEST(SaisTest, BuildSLPartition) { |
| + TestSlPartition({}, {}, ""); |
| + TestSlPartition( |
| + { |
| + Sais::LType, |
| + }, |
| + {}, "a"); |
| + TestSlPartition( |
| + { |
| + Sais::LType, Sais::LType, |
| + }, |
| + {}, "ba"); |
| + TestSlPartition( |
| + { |
| + Sais::SType, Sais::LType, |
| + }, |
| + {}, "ab"); |
| + TestSlPartition( |
| + { |
| + Sais::SType, Sais::SType, Sais::LType, |
| + }, |
| + {}, "aab"); |
| + TestSlPartition( |
| + { |
| + Sais::LType, Sais::LType, Sais::LType, |
| + }, |
| + {}, "bba"); |
| + TestSlPartition( |
| + { |
| + Sais::LType, Sais::SType, Sais::LType, |
| + }, |
| + {1}, "bab"); |
| + TestSlPartition( |
| + { |
| + Sais::LType, Sais::SType, Sais::SType, Sais::LType, |
| + }, |
| + {1}, "baab"); |
| + |
| + TestSlPartition( |
| + { |
| + Sais::LType, // zucchini |
| + Sais::LType, // ucchini |
| + Sais::SType, // cchini |
| + Sais::SType, // chini |
| + Sais::SType, // hini |
| + Sais::SType, // ini |
| + Sais::LType, // ni |
| + Sais::LType, // i |
| + }, |
| + {2}, "zucchini"); |
| +} |
| + |
| +std::vector<size_t> BucketCount(const std::initializer_list<unsigned char> str, |
| + size_t max_key) { |
| + using SaisImpl = Sais::Implementation<size_t, size_t>; |
| + return SaisImpl::MakeBucketCount(str.begin(), str.size(), max_key); |
| +} |
| + |
| +TEST(SaisTest, BucketCount) { |
| + using vec = std::vector<size_t>; |
| + |
| + EXPECT_EQ(vec({0, 0, 0, 0}), BucketCount({}, 4)); |
| + EXPECT_EQ(vec({1, 0, 0, 0}), BucketCount({0}, 4)); |
| + EXPECT_EQ(vec({0, 2, 0, 1}), BucketCount({1, 1, 3}, 4)); |
| +} |
| + |
| +std::vector<size_t> InducedSortSubstring(std::string str) { |
| + using SaisImpl = Sais::Implementation<size_t, size_t>; |
| + std::vector<Sais::SLType> sl_partition(str.size()); |
| + size_t lms_count = SaisImpl::BuildSLPartition(str.begin(), str.size(), 255, |
| + sl_partition.rbegin()); |
| + std::vector<size_t> lms_indices(lms_count); |
| + SaisImpl::FindLmsSuffixes(sl_partition, lms_indices.begin()); |
| + auto buckets = SaisImpl::MakeBucketCount(str.begin(), str.size(), 255); |
| + |
| + std::vector<size_t> suffix_array(str.size()); |
| + SaisImpl::InducedSort(str, str.size(), sl_partition, lms_indices, buckets, |
| + suffix_array.begin()); |
| + |
| + return suffix_array; |
| +} |
| + |
| +TEST(SaisTest, InducedSortSubstring) { |
| + using vec = std::vector<size_t>; |
| + |
| + // L; a$ |
| + EXPECT_EQ(vec({0}), InducedSortSubstring("a")); |
| + |
| + // SL; ab$, b$ |
| + EXPECT_EQ(vec({0, 1}), InducedSortSubstring("ab")); |
| + |
| + // LL; a$, ba$ |
| + EXPECT_EQ(vec({1, 0}), InducedSortSubstring("ba")); |
| + |
| + // SLL; a$, aba$, ba$ |
| + EXPECT_EQ(vec({2, 0, 1}), InducedSortSubstring("aba")); |
| + |
| + // LSL; ab$, b$, ba |
| + EXPECT_EQ(vec({1, 2, 0}), InducedSortSubstring("bab")); |
| + |
| + // SSL; aab$, ab$, b$ |
| + EXPECT_EQ(vec({0, 1, 2}), InducedSortSubstring("aab")); |
| + |
| + // LSSL; aab$, ab$, b$, ba |
| + EXPECT_EQ(vec({1, 2, 3, 0}), InducedSortSubstring("baab")); |
| +} |
| + |
| +template <class Algorithm> |
| +void TestSuffixSort(ustring test_str) { |
| + std::vector<size_t> suffix_array = MakeSuffixArray<Algorithm>(test_str, 256); |
| + EXPECT_EQ(test_str.size(), suffix_array.size()); |
| + |
| + // Expect that I[] is a permutation of [0, len]. |
| + std::vector<size_t> sorted_suffix(suffix_array.begin(), suffix_array.end()); |
| + std::sort(sorted_suffix.begin(), sorted_suffix.end()); |
| + for (size_t i = 0; i < test_str.size(); ++i) |
| + EXPECT_EQ(i, sorted_suffix[i]); |
| + |
| + // Expect that all suffixes are strictly ordered. |
| + auto end = test_str.end(); |
| + for (size_t i = 1; i < test_str.size(); ++i) { |
| + auto suf1 = test_str.begin() + suffix_array[i - 1]; |
| + auto suf2 = test_str.begin() + suffix_array[i]; |
| + bool is_less = std::lexicographical_compare(suf1, end, suf2, end); |
| + EXPECT_TRUE(is_less); |
| + } |
| +} |
| + |
| +constexpr const char* test_strs[] = { |
| + "", |
| + "a", |
| + "aa", |
| + "za", |
| + "CACAO", |
| + "aaaaa", |
| + "banana", |
| + "tobeornottobe", |
| + "The quick brown fox jumps over the lazy dog.", |
| + "elephantelephantelephantelephantelephant", |
| + "walawalawashington", |
| + "-------------------------", |
| + "011010011001011010010110011010010", |
| + "3141592653589793238462643383279502884197169399375105", |
| + "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", |
| + "abccbaabccbaabccbaabccbaabccbaabccbaabccbaabccba", |
| + "0123456789876543210", |
| + "9876543210123456789", |
| + "aababcabcdabcdeabcdefabcdefg", |
| + "asdhklgalksdjghalksdjghalksdjgh", |
| +}; |
| + |
| +TEST(SuffixSortTest, NaiveSuffixSort) { |
| + for (const std::string& test_str : test_strs) { |
| + TestSuffixSort<NaiveSuffixSort>(MakeUnsignedString(test_str)); |
| + } |
| +} |
| + |
| +TEST(SuffixSortTest, SaisSort) { |
| + for (const std::string& test_str : test_strs) { |
| + TestSuffixSort<Sais>(MakeUnsignedString(test_str)); |
| + } |
| +} |
| + |
| +// Test with sequence that has every character. |
| +TEST(SuffixSortTest, AllChar) { |
| + const int kNumChar = 256; |
| + std::vector<unsigned char> all_char(kNumChar); |
| + std::iota(all_char.begin(), all_char.end(), 0); |
| + |
| + { |
| + std::vector<size_t> suffix_array = |
| + MakeSuffixArray<Sais>(all_char, kNumChar); |
| + for (int i = 0; i < kNumChar; ++i) |
| + EXPECT_EQ(i, suffix_array[i]); |
| + } |
| + |
| + std::vector<unsigned char> all_char_reverse(all_char.rbegin(), |
| + all_char.rend()); |
| + { |
| + std::vector<size_t> suffix_array = |
| + MakeSuffixArray<Sais>(all_char_reverse, kNumChar); |
| + for (int i = 0; i < kNumChar; ++i) |
| + EXPECT_EQ(kNumChar - i - 1, suffix_array[i]); |
| + } |
| +} |
| + |
| +void TestSearchSuffixArray(std::string base_str, std::string search_str) { |
|
huangs
2017/06/28 22:30:35
Make this take const ustring&, and update all call
etiennep1
2017/06/28 23:03:48
Probably more clean like that.
|
| + ustring base_ustr = MakeUnsignedString(base_str); |
| + ustring search_ustr = MakeUnsignedString(search_str); |
| + |
| + std::vector<size_t> suffix_array = |
| + MakeSuffixArray<NaiveSuffixSort>(base_ustr, 256); |
| + |
| + auto pos = SearchSuffixArray(suffix_array, base_ustr.begin(), |
| + search_ustr.begin(), search_ustr.end()); |
| + |
| + auto end = base_ustr.end(); |
| + if (pos != suffix_array.begin()) { |
| + // Previous suffix is less than |search_str|. |
| + auto suf = base_ustr.begin() + pos[-1]; |
| + bool is_less = std::lexicographical_compare(suf, end, search_ustr.begin(), |
| + search_ustr.end()); |
| + EXPECT_TRUE(is_less); |
| + } |
| + if (pos != suffix_array.end()) { |
| + // Current suffix is greater of equal to |search_str|. |
| + auto suf = base_ustr.begin() + *pos; |
| + bool is_less = std::lexicographical_compare(suf, end, search_ustr.begin(), |
| + search_ustr.end()); |
| + EXPECT_FALSE(is_less); |
| + } |
| +} |
| + |
| +TEST(SuffixArrayTest, Search) { |
| + TestSearchSuffixArray("", ""); |
|
huangs
2017/06/28 22:30:35
using us = MakeUnsignedString;
TestSearchSuffixAr
|
| + TestSearchSuffixArray("", "a"); |
| + TestSearchSuffixArray("b", ""); |
| + TestSearchSuffixArray("b", "a"); |
| + TestSearchSuffixArray("b", "c"); |
| + TestSearchSuffixArray("b", "bc"); |
| + TestSearchSuffixArray("aa", "a"); |
| + TestSearchSuffixArray("aa", "aa"); |
| + |
| + const char* sentence = "the quick brown fox jumps over the lazy dog."; |
|
huangs
2017/06/28 22:30:35
ustring sentence = us("...");
etiennep1
2017/06/28 23:03:48
Done.
|
| + // Entire string: exact and unique. |
| + TestSearchSuffixArray(sentence, sentence); |
| + // Empty string: exact and non-unique. |
| + TestSearchSuffixArray(sentence, ""); |
| + // Exact and unique suffix matches. |
| + TestSearchSuffixArray(sentence, "."); |
| + TestSearchSuffixArray(sentence, "the lazy dog."); |
| + // Exact and unique non-suffix matches. |
| + TestSearchSuffixArray(sentence, "quick"); |
| + TestSearchSuffixArray(sentence, "the quick"); |
| + // Partial and unique matches. |
| + TestSearchSuffixArray(sentence, "fox jumps with the hosps"); |
| + TestSearchSuffixArray(sentence, "xyz"); |
| + // Exact and non-unique match: take lexicographical first. |
| + TestSearchSuffixArray(sentence, "the"); |
| + TestSearchSuffixArray(sentence, " "); |
| + // Partial and non-unique match. |
| + // query < "the l"... < "the q"... |
| + TestSearchSuffixArray(sentence, "the apple"); |
| + // "the l"... < query < "the q"... |
| + TestSearchSuffixArray(sentence, "the opera"); |
| + // "the l"... < "the q"... < query |
| + TestSearchSuffixArray(sentence, "the zebra"); |
| + // Prefix match dominates suffix match (unique). |
| + TestSearchSuffixArray(sentence, "over quick brown fox"); |
| + // Empty matchs. |
| + TestSearchSuffixArray(sentence, ","); |
| + TestSearchSuffixArray(sentence, "1234"); |
| + TestSearchSuffixArray(sentence, "THE QUICK BROWN FOX"); |
| + TestSearchSuffixArray(sentence, "(the"); |
| +} |
| + |
| +TEST(SuffixArrayTest, SearchExact) { |
| + for (const std::string& test_str : test_strs) { |
| + ustring test_ustr = MakeUnsignedString(test_str); |
| + |
| + std::vector<size_t> suffix_array = MakeSuffixArray<Sais>(test_ustr, 256); |
| + |
| + for (int lo = 0; lo < test_str.size(); ++lo) { |
| + for (int hi = lo + 1; hi <= test_str.size(); ++hi) { |
| + ustring query(test_ustr.begin() + lo, test_ustr.begin() + hi); |
| + ASSERT_EQ(query.size(), hi - lo); |
| + auto pos = SearchSuffixArray(suffix_array, test_ustr.begin(), |
| + query.begin(), query.end()); |
| + EXPECT_TRUE( |
| + std::equal(query.begin(), query.end(), test_ustr.begin() + *pos)); |
| + } |
| + } |
| + } |
| +} |
| + |
| +} // namespace zucchini |