| OLD | NEW |
| (Empty) | |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "courgette/third_party/divsufsort/divsufsort.h" |
| 6 |
| 7 #include <stddef.h> |
| 8 |
| 9 #include <algorithm> |
| 10 #include <cstring> |
| 11 #include <memory> |
| 12 #include <random> |
| 13 #include <string> |
| 14 #include <vector> |
| 15 |
| 16 #include "base/macros.h" |
| 17 #include "base/time/time.h" |
| 18 #include "courgette/third_party/bsdiff/bsdiff_search.h" |
| 19 #include "courgette/third_party/bsdiff/paged_array.h" |
| 20 #include "testing/gtest/include/gtest/gtest.h" |
| 21 |
| 22 namespace courgette { |
| 23 |
| 24 TEST(DivSufSortTest, Sort) { |
| 25 const char* test_cases[] = { |
| 26 "", |
| 27 "a", |
| 28 "za", |
| 29 "CACAO", |
| 30 "banana", |
| 31 "tobeornottobe", |
| 32 "The quick brown fox jumps over the lazy dog.", |
| 33 "elephantelephantelephantelephantelephant", |
| 34 "-------------------------", |
| 35 "011010011001011010010110011010010", |
| 36 "3141592653589793238462643383279502884197169399375105", |
| 37 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", |
| 38 }; |
| 39 |
| 40 for (size_t idx = 0; idx < arraysize(test_cases); ++idx) { |
| 41 int len = static_cast<int>(::strlen(test_cases[idx])); |
| 42 const unsigned char* s = |
| 43 reinterpret_cast<const unsigned char*>(test_cases[idx]); |
| 44 |
| 45 // Generate the suffix array as I. |
| 46 PagedArray<divsuf::saidx_t> I; |
| 47 ASSERT_TRUE(I.Allocate(len + 1)); |
| 48 I[0] = len; |
| 49 divsuf::divsufsort(s, I.begin() + 1, len); |
| 50 |
| 51 // Expect that I[] is a permutation of [0, len]. |
| 52 std::vector<divsuf::saidx_t> I_sorted(I.begin(), I.end()); |
| 53 std::sort(I_sorted.begin(), I_sorted.end()); |
| 54 |
| 55 for (divsuf::saidx_t i = 0; i < len; ++i) |
| 56 EXPECT_EQ(i, I_sorted[i]) << "test_case[" << idx << "]"; |
| 57 |
| 58 // Expect that the |len| non-empty suffixes are strictly ordered. |
| 59 const unsigned char* end = s + len; |
| 60 for (divsuf::saidx_t i = 1; i < len; ++i) { |
| 61 const unsigned char* suf1 = s + I[i - 1]; |
| 62 const unsigned char* suf2 = s + I[i]; |
| 63 bool is_less = std::lexicographical_compare(suf1, end, suf2, end); |
| 64 EXPECT_TRUE(is_less) << "test_case[" << idx << "]"; |
| 65 } |
| 66 } |
| 67 } |
| 68 |
| 69 TEST(DivSufSortTest, Search) { |
| 70 // Initialize main string and the suffix array. |
| 71 // Positions: 00000000001111111111122222222233333333334444 |
| 72 // 01234567890123456789012345678901234567890123 |
| 73 const char* old_str = "the quick brown fox jumps over the lazy dog."; |
| 74 int old_size = static_cast<int>(::strlen(old_str)); |
| 75 const unsigned char* old_buf = |
| 76 reinterpret_cast<const unsigned char*>(old_str); |
| 77 PagedArray<divsuf::saidx_t> I; |
| 78 ASSERT_TRUE(I.Allocate(old_size + 1)); |
| 79 I[0] = old_size; |
| 80 divsuf::divsufsort(old_buf, I.begin() + 1, old_size); |
| 81 |
| 82 // Test queries. |
| 83 const struct { |
| 84 int exp_pos; // -1 means "don't care". |
| 85 int exp_match_len; |
| 86 const char* query_str; |
| 87 } test_cases[] = { |
| 88 // Entire string. |
| 89 {0, 44, "the quick brown fox jumps over the lazy dog."}, |
| 90 // Empty string. |
| 91 {-1, 0, ""}, // Current algorithm does not enforce |pos| == 0. |
| 92 // Exact and unique suffix match. |
| 93 {43, 1, "."}, |
| 94 {31, 13, "the lazy dog."}, |
| 95 // Exact and unique non-suffix match. |
| 96 {4, 5, "quick"}, |
| 97 {0, 9, "the quick"}, // Unique prefix. |
| 98 // Entire word match with mutiple results: take lexicographical first. |
| 99 {31, 3, "the"}, // Non-unique prefix: "the l"... < "the q"... |
| 100 {9, 1, " "}, // " brown"... wins. |
| 101 // Partial and unique match of query prefix. |
| 102 {16, 10, "fox jumps with the hosps"}, |
| 103 {16, 10, "fox jumps "}, |
| 104 {16, 10, "fox jumps w"}, |
| 105 // Partial and multiple match of query prefix: no guarantees on |pos|. |
| 106 // Take lexicographical first for matching portion *only*, so same results: |
| 107 {-1, 4, "the apple"}, // query < "the l"... < "the q"... |
| 108 {-1, 4, "the opera"}, // "the l"... < query < "the q"... |
| 109 {-1, 4, "the zebra"}, // "the l"... < "the q"... < query |
| 110 // Prefix match dominates suffix match. |
| 111 {26, 5, "over quick brown fox"}, |
| 112 // No match. |
| 113 {-1, 0, ","}, |
| 114 {-1, 0, "1234"}, |
| 115 {-1, 0, "THE QUICK BROWN FOX"}, |
| 116 {-1, 0, "(the"}, |
| 117 }; |
| 118 |
| 119 for (size_t idx = 0; idx < arraysize(test_cases); ++idx) { |
| 120 const auto& test_case = test_cases[idx]; |
| 121 int new_size = static_cast<int>(::strlen(test_case.query_str)); |
| 122 const unsigned char* new_buf = |
| 123 reinterpret_cast<const unsigned char*>(test_case.query_str); |
| 124 |
| 125 // Perform the search. |
| 126 int pos = 0; |
| 127 int match_len = bsdiff::search<PagedArray<divsuf::saidx_t>&>( |
| 128 I, old_buf, old_size, new_buf, new_size, &pos); |
| 129 |
| 130 // Check basic properties and match with expected values. |
| 131 EXPECT_GE(match_len, 0) << "test_case[" << idx << "]"; |
| 132 EXPECT_LE(match_len, new_size) << "test_case[" << idx << "]"; |
| 133 if (match_len > 0) { |
| 134 EXPECT_GE(pos, 0) << "test_case[" << idx << "]"; |
| 135 EXPECT_LE(pos, old_size - match_len) << "test_case[" << idx << "]"; |
| 136 EXPECT_EQ(0, ::memcmp(old_buf + pos, new_buf, match_len)) |
| 137 << "test_case[" << idx << "]"; |
| 138 } |
| 139 if (test_case.exp_pos >= 0) { |
| 140 EXPECT_EQ(test_case.exp_pos, pos) << "test_case[" << idx << "]"; |
| 141 } |
| 142 EXPECT_EQ(test_case.exp_match_len, match_len) << "test_case[" << idx << "]"; |
| 143 } |
| 144 } |
| 145 |
| 146 // "Big" test case on pseudorandom data, mainly to measure timing. |
| 147 TEST(DivSufSortTest, Big) { |
| 148 const int kSize = 1 << 24; |
| 149 std::minstd_rand rand_gen; |
| 150 std::unique_ptr<divsuf::sauchar_t> buf(new divsuf::sauchar_t[kSize]); |
| 151 base::Time::EnableHighResolutionTimer(true); |
| 152 base::Time::ActivateHighResolutionTimer(true); |
| 153 |
| 154 std::fill(buf.get(), buf.get() + kSize, 0); |
| 155 const int kBound = kSize / 3; |
| 156 for (int i = 0; i < kBound; ++i) |
| 157 buf.get()[i] = i & 0xFF; |
| 158 std::shuffle(buf.get(), buf.get() + kSize, rand_gen); |
| 159 |
| 160 PagedArray<divsuf::saidx_t> I; |
| 161 ASSERT_TRUE(I.Allocate(kSize + 1)); |
| 162 |
| 163 base::Time t0 = base::Time::Now(); |
| 164 I[0] = kSize; |
| 165 divsuf::divsufsort(buf.get(), I.begin() + 1, kSize); |
| 166 base::TimeDelta dt = base::Time::Now() - t0; |
| 167 fprintf(stderr, "%.3f s\n", dt.InSecondsF()); |
| 168 |
| 169 base::Time::ActivateHighResolutionTimer(false); |
| 170 } |
| 171 |
| 172 } // namespace courgette |
| OLD | NEW |