| Index: courgette/third_party/divsufsort/divsufsort_unittest.cc
|
| diff --git a/courgette/third_party/divsufsort/divsufsort_unittest.cc b/courgette/third_party/divsufsort/divsufsort_unittest.cc
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..4adcc9b122ea79943e4422e19f60e6993118db99
|
| --- /dev/null
|
| +++ b/courgette/third_party/divsufsort/divsufsort_unittest.cc
|
| @@ -0,0 +1,172 @@
|
| +// Copyright 2016 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include "courgette/third_party/divsufsort/divsufsort.h"
|
| +
|
| +#include <stddef.h>
|
| +
|
| +#include <algorithm>
|
| +#include <cstring>
|
| +#include <memory>
|
| +#include <random>
|
| +#include <string>
|
| +#include <vector>
|
| +
|
| +#include "base/macros.h"
|
| +#include "base/time/time.h"
|
| +#include "courgette/third_party/bsdiff/bsdiff_search.h"
|
| +#include "courgette/third_party/bsdiff/paged_array.h"
|
| +#include "testing/gtest/include/gtest/gtest.h"
|
| +
|
| +namespace courgette {
|
| +
|
| +// Checks divsufsort() on a set of short strings. With the empty suffix
|
| +// (offset |len|) stored by hand in I[0], the resulting I[] must be a
|
| +// permutation of [0, len] that lists the suffixes in strictly increasing
|
| +// lexicographical order.
|
| +TEST(DivSufSortTest, Sort) {
|
| +  const char* test_cases[] = {
|
| +      "",
|
| +      "a",
|
| +      "za",
|
| +      "CACAO",
|
| +      "banana",
|
| +      "tobeornottobe",
|
| +      "The quick brown fox jumps over the lazy dog.",
|
| +      "elephantelephantelephantelephantelephant",
|
| +      "-------------------------",
|
| +      "011010011001011010010110011010010",
|
| +      "3141592653589793238462643383279502884197169399375105",
|
| +      "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD",
|
| +  };
|
| +
|
| +  for (size_t idx = 0; idx < arraysize(test_cases); ++idx) {
|
| +    int len = static_cast<int>(::strlen(test_cases[idx]));
|
| +    const unsigned char* s =
|
| +        reinterpret_cast<const unsigned char*>(test_cases[idx]);
|
| +
|
| +    // Generate the suffix array as I. I[0] is set by hand to |len|, the
|
| +    // offset of the empty suffix; divsufsort() is handed the |len| entries
|
| +    // that follow it.
|
| +    PagedArray<divsuf::saidx_t> I;
|
| +    ASSERT_TRUE(I.Allocate(len + 1));
|
| +    I[0] = len;
|
| +    divsuf::divsufsort(s, I.begin() + 1, len);
|
| +
|
| +    // Expect that I[] is a permutation of [0, len]. The bound is inclusive:
|
| +    // I[] was allocated with |len| + 1 entries and each one must be checked.
|
| +    std::vector<divsuf::saidx_t> I_sorted(I.begin(), I.end());
|
| +    std::sort(I_sorted.begin(), I_sorted.end());
|
| +    for (divsuf::saidx_t i = 0; i <= len; ++i)
|
| +      EXPECT_EQ(i, I_sorted[i]) << "test_case[" << idx << "]";
|
| +
|
| +    // Expect that all |len| + 1 suffixes, starting with the empty suffix in
|
| +    // I[0], are strictly ordered. The loop must reach i == len so that the
|
| +    // last adjacent pair (I[len - 1], I[len]) is compared as well.
|
| +    const unsigned char* end = s + len;
|
| +    for (divsuf::saidx_t i = 1; i <= len; ++i) {
|
| +      const unsigned char* suf1 = s + I[i - 1];
|
| +      const unsigned char* suf2 = s + I[i];
|
| +      bool is_less = std::lexicographical_compare(suf1, end, suf2, end);
|
| +      EXPECT_TRUE(is_less) << "test_case[" << idx << "]";
|
| +    }
|
| +  }
|
| +}
|
| +
|
| +// Runs bsdiff::search() over a suffix array produced by divsufsort() and
|
| +// compares the reported match position and match length with a table of
|
| +// expected values.
|
| +TEST(DivSufSortTest, Search) {
|
| +  // Build the suffix array of the text that will be searched.
|
| +  // Positions: 00000000001111111111222222222233333333334444
|
| +  //            01234567890123456789012345678901234567890123
|
| +  const char* old_str = "the quick brown fox jumps over the lazy dog.";
|
| +  const unsigned char* old_buf =
|
| +      reinterpret_cast<const unsigned char*>(old_str);
|
| +  int old_size = static_cast<int>(::strlen(old_str));
|
| +
|
| +  PagedArray<divsuf::saidx_t> I;
|
| +  ASSERT_TRUE(I.Allocate(old_size + 1));
|
| +  I[0] = old_size;
|
| +  divsuf::divsufsort(old_buf, I.begin() + 1, old_size);
|
| +
|
| +  // One query together with the result expected from the search.
|
| +  struct SearchCase {
|
| +    int exp_pos;  // -1 means "don't care".
|
| +    int exp_match_len;
|
| +    const char* query_str;
|
| +  };
|
| +  const SearchCase test_cases[] = {
|
| +      // Entire string.
|
| +      {0, 44, "the quick brown fox jumps over the lazy dog."},
|
| +      // Empty string.
|
| +      {-1, 0, ""},  // Current algorithm does not enforce |pos| == 0.
|
| +      // Exact and unique suffix match.
|
| +      {43, 1, "."},
|
| +      {31, 13, "the lazy dog."},
|
| +      // Exact and unique non-suffix match.
|
| +      {4, 5, "quick"},
|
| +      {0, 9, "the quick"},  // Unique prefix.
|
| +      // Entire word match with multiple results: take lexicographical first.
|
| +      {31, 3, "the"},  // Non-unique prefix: "the l"... < "the q"...
|
| +      {9, 1, " "},     // " brown"... wins.
|
| +      // Partial and unique match of query prefix.
|
| +      {16, 10, "fox jumps with the hosps"},
|
| +      {16, 10, "fox jumps "},
|
| +      {16, 10, "fox jumps w"},
|
| +      // Partial and multiple match of query prefix: no guarantees on |pos|.
|
| +      // Take lexicographical first for matching portion *only*, so same
|
| +      // results:
|
| +      {-1, 4, "the apple"},  // query < "the l"... < "the q"...
|
| +      {-1, 4, "the opera"},  // "the l"... < query < "the q"...
|
| +      {-1, 4, "the zebra"},  // "the l"... < "the q"... < query
|
| +      // Prefix match dominates suffix match.
|
| +      {26, 5, "over quick brown fox"},
|
| +      // No match.
|
| +      {-1, 0, ","},
|
| +      {-1, 0, "1234"},
|
| +      {-1, 0, "THE QUICK BROWN FOX"},
|
| +      {-1, 0, "(the"},
|
| +  };
|
| +
|
| +  size_t idx = 0;
|
| +  for (const auto& test_case : test_cases) {
|
| +    // Tag attached to every failure so the offending table row is visible.
|
| +    const std::string label = "test_case[" + std::to_string(idx) + "]";
|
| +    ++idx;
|
| +
|
| +    const unsigned char* new_buf =
|
| +        reinterpret_cast<const unsigned char*>(test_case.query_str);
|
| +    int new_size = static_cast<int>(::strlen(test_case.query_str));
|
| +
|
| +    // Perform the search.
|
| +    int pos = 0;
|
| +    int match_len = bsdiff::search<PagedArray<divsuf::saidx_t>&>(
|
| +        I, old_buf, old_size, new_buf, new_size, &pos);
|
| +
|
| +    // The match length can never be negative or exceed the query length.
|
| +    EXPECT_GE(match_len, 0) << label;
|
| +    EXPECT_LE(match_len, new_size) << label;
|
| +
|
| +    // A non-empty match must lie inside the searched text and agree with
|
| +    // the query byte for byte.
|
| +    if (match_len > 0) {
|
| +      EXPECT_GE(pos, 0) << label;
|
| +      EXPECT_LE(pos, old_size - match_len) << label;
|
| +      EXPECT_EQ(0, ::memcmp(old_buf + pos, new_buf, match_len)) << label;
|
| +    }
|
| +
|
| +    // Compare against the table; a negative |exp_pos| skips the position
|
| +    // check.
|
| +    if (test_case.exp_pos >= 0) {
|
| +      EXPECT_EQ(test_case.exp_pos, pos) << label;
|
| +    }
|
| +    EXPECT_EQ(test_case.exp_match_len, match_len) << label;
|
| +  }
|
| +}
|
| +
|
| +// "Big" test case on pseudorandom data, mainly to measure timing. Nothing is
|
| +// asserted about the resulting suffix array; the elapsed time of the
|
| +// divsufsort() call is printed to stderr.
|
| +TEST(DivSufSortTest, Big) {
|
| +  const int kSize = 1 << 24;
|
| +  std::minstd_rand rand_gen;
|
| +  // The array form of unique_ptr is required here: the buffer comes from
|
| +  // new[], so it has to be released with delete[] rather than delete.
|
| +  std::unique_ptr<divsuf::sauchar_t[]> buf(new divsuf::sauchar_t[kSize]);
|
| +
|
| +  // Zero the buffer, overwrite its first third with the low byte of the
|
| +  // index, then shuffle everything with the default-seeded generator.
|
| +  std::fill(buf.get(), buf.get() + kSize, 0);
|
| +  const int kBound = kSize / 3;
|
| +  for (int i = 0; i < kBound; ++i)
|
| +    buf[i] = i & 0xFF;
|
| +  std::shuffle(buf.get(), buf.get() + kSize, rand_gen);
|
| +
|
| +  PagedArray<divsuf::saidx_t> I;
|
| +  ASSERT_TRUE(I.Allocate(kSize + 1));
|
| +
|
| +  // Switch the high resolution timer on only after the last ASSERT, so that
|
| +  // an early return cannot skip the matching deactivation below.
|
| +  // NOTE(review): these base::Time timer calls look platform-specific;
|
| +  // confirm that they are available on every platform this test builds on.
|
| +  base::Time::EnableHighResolutionTimer(true);
|
| +  base::Time::ActivateHighResolutionTimer(true);
|
| +
|
| +  // Timed region: store the empty suffix and build the suffix array.
|
| +  base::Time t0 = base::Time::Now();
|
| +  I[0] = kSize;
|
| +  divsuf::divsufsort(buf.get(), I.begin() + 1, kSize);
|
| +  base::TimeDelta dt = base::Time::Now() - t0;
|
| +  fprintf(stderr, "%.3f s\n", dt.InSecondsF());
|
| +
|
| +  base::Time::ActivateHighResolutionTimer(false);
|
| +}
|
| +
|
| +} // namespace courgette
|
|
|