OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "courgette/third_party/qsufsort.h" | |
6 | |
7 #include <algorithm> | |
8 #include <cstring> | |
9 #include <string> | |
10 #include <vector> | |
11 | |
12 #include "base/macros.h" | |
13 #include "base/memory/scoped_ptr.h" | |
14 #include "testing/gtest/include/gtest/gtest.h" | |
15 | |
16 TEST(QSufSortTest, Sort) { | |
17 const char* test_cases[] = { | |
18 "", | |
19 "a", | |
20 "za", | |
21 "CACAO", | |
22 "banana", | |
23 "tobeornottobe", | |
24 "The quick brown fox jumps over the lazy dog.", | |
25 "elephantelephantelephantelephantelephant", | |
26 "-------------------------", | |
27 "011010011001011010010110011010010", | |
28 "3141592653589793238462643383279502884197169399375105", | |
29 "\xFF\xFE\xFF\xFE\xFD\x80\x30\x31\x32\x80\x30\xFF\x01\xAB\xCD", | |
30 }; | |
31 | |
32 for (size_t idx = 0; idx < arraysize(test_cases); ++idx) { | |
33 int len = ::strlen(test_cases[idx]); | |
Will Harris
2015/08/11 17:31:36
strlen returns size_t, should int mostly be replac
huangs
2015/08/11 17:46:03
Done. Gonna try if this causes signed-unsigned com
Will Harris
2015/08/11 17:58:42
sorry if I wasn't clear; I meant change all the in
| |
34 const unsigned char* s = | |
35 reinterpret_cast<const unsigned char*>(test_cases[idx]); | |
36 | |
37 // Generate the suffix array as I. | |
38 std::vector<int> I(len + 1); | |
39 std::vector<int> V(len + 1); | |
40 courgette::qsuf::qsufsort<int*>(&I[0], &V[0], s, len); | |
41 | |
42 // Expect that I[] is a permutation of [0, len]. | |
43 std::vector<int> I_sorted(I); | |
44 std::sort(I_sorted.begin(), I_sorted.end()); | |
45 for (int i = 0; i < len + 1; ++i) { | |
46 EXPECT_EQ(i, I_sorted[i]) << "test_case[" << idx << "]"; | |
47 } | |
48 | |
49 // First string must be empty string. | |
50 EXPECT_EQ(len, I[0]) << "test_case[" << idx << "]"; | |
51 | |
52 // Expect that the |len + 1| suffixes are strictly ordered. | |
53 const unsigned char* end = s + len; | |
54 for (int i = 0; i < len; ++i) { | |
55 const unsigned char* suf1 = s + I[i]; | |
56 const unsigned char* suf2 = s + I[i + 1]; | |
57 bool is_less = std::lexicographical_compare(suf1, end, suf2, end); | |
58 EXPECT_TRUE(is_less) << "test_case[" << idx << "]"; | |
59 } | |
60 } | |
61 } | |
62 | |
63 TEST(QSufSortTest, Search) { | |
64 // Initialize main string and the suffix array. | |
65 // Positions: 00000000001111111111122222222233333333334444 | |
66 // 01234567890123456789012345678901234567890123 | |
67 const char* old_str = "the quick brown fox jumps over the lazy dog."; | |
68 int old_size = ::strlen(old_str); | |
Will Harris
2015/08/11 17:31:36
same as above
huangs
2015/08/11 17:46:04
Done.
| |
69 const unsigned char* old_buf = | |
70 reinterpret_cast<const unsigned char*>(old_str); | |
71 std::vector<int> I(old_size + 1); | |
72 std::vector<int> V(old_size + 1); | |
73 courgette::qsuf::qsufsort<int*>(&I[0], &V[0], old_buf, old_size); | |
74 | |
75 // Test queries. | |
76 const struct { | |
77 int exp_pos; // -1 means "don't care". | |
78 int exp_match_len; | |
79 const char* query_str; | |
80 } test_cases[] = { | |
81 // Entire string. | |
82 {0, 44, "the quick brown fox jumps over the lazy dog."}, | |
83 // Empty string. | |
84 {-1, 0, ""}, // Current algorithm does not enforce |pos| == 0. | |
85 // Exact and unique suffix match. | |
86 {43, 1, "."}, | |
87 {31, 13, "the lazy dog."}, | |
88 // Exact and unique non-suffix match. | |
89 {4, 5, "quick"}, | |
90 {0, 9, "the quick"}, // Unique prefix. | |
91 // Entire word match with mutiple results: take lexicographical first. | |
92 {31, 3, "the"}, // Non-unique prefix: "the l"... < "the q"... | |
93 {9, 1, " "}, // " brown"... wins. | |
94 // Partial and unique match of query prefix. | |
95 {16, 10, "fox jumps with the hosps"}, | |
96 // Partial and multiple match of query prefix: no guarantees on |pos|. | |
97 // Take lexicographical first for matching portion *only*, so same results: | |
98 {-1, 4, "the apple"}, // query < "the l"... < "the q"... | |
99 {-1, 4, "the opera"}, // "the l"... < query < "the q"... | |
100 {-1, 4, "the zebra"}, // "the l"... < "the q"... < query | |
101 // Prefix match dominates suffix match. | |
102 {26, 5, "over quick brown fox"}, | |
103 // No match. | |
104 {-1, 0, ","}, | |
105 {-1, 0, "1234"}, | |
106 {-1, 0, "THE QUICK BROWN FOX"}, | |
107 {-1, 0, "(the"}, | |
108 }; | |
109 | |
110 for (size_t idx = 0; idx < arraysize(test_cases); ++idx) { | |
111 const auto& test_case = test_cases[idx]; | |
112 int new_size = ::strlen(test_case.query_str); | |
113 const unsigned char* new_buf = | |
114 reinterpret_cast<const unsigned char*>(test_case.query_str); | |
115 | |
116 // Perform the search. | |
117 int pos = 0; | |
118 int match_len = courgette::qsuf::search( | |
119 &I[0], old_buf, old_size, new_buf, new_size, 0, old_size, &pos); | |
120 | |
121 // Check basic properties and match with expected values. | |
122 EXPECT_GE(match_len, 0) << "test_case[" << idx << "]"; | |
123 EXPECT_LE(match_len, new_size) << "test_case[" << idx << "]"; | |
124 if (match_len > 0) { | |
125 EXPECT_GE(pos, 0) << "test_case[" << idx << "]"; | |
126 EXPECT_LE(pos, old_size - match_len) << "test_case[" << idx << "]"; | |
127 EXPECT_EQ(0, ::memcmp(old_buf + pos, new_buf, match_len)) | |
128 << "test_case[" << idx << "]"; | |
129 } | |
130 if (test_case.exp_pos >= 0) { | |
131 EXPECT_EQ(test_case.exp_pos, pos) << "test_case[" << idx << "]"; | |
132 } | |
133 EXPECT_EQ(test_case.exp_match_len, match_len) << "test_case[" << idx << "]"; | |
134 } | |
135 } | |
OLD | NEW |