OLD | NEW |
| (Empty) |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 // This file implements the "failure function" notion used by the | |
6 // Knuth-Morris-Pratt algorithm for solving the string matching problem, i.e. | |
7 // finding all occurences of some pattern |P| in arbitrary text. It also | |
8 // provides tools for traversing these occurences. | |
9 // | |
10 // Consider a character string |P| of length |n|. The failure function of this | |
11 // string is an integer array |F| of length |n|, defined in the following way: | |
12 // F[i] = max{0 <= j <= i : P[0..j-1] == P[i-j+1..i]}, | |
13 // i.e. F[i] is the length of the longest proper suffix of P[0..i] that is also | |
14 // a prefix of |P|. | |
15 // | |
16 // For example, the "abacaba" string has the following failure function: | |
17 // [0, 0, 1, 0, 1, 2, 3]. | |
18 // For further examples see the "knuth_morris_pratt_unittest.cc" file. | |
19 | |
20 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_ | |
21 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_ | |
22 | |
23 #include <stddef.h> | |
24 | |
25 #include <functional> | |
26 #include <iterator> | |
27 #include <limits> | |
28 #include <type_traits> | |
29 #include <vector> | |
30 | |
31 #include "base/logging.h" | |
32 #include "base/numerics/safe_conversions.h" | |
33 #include "base/strings/string_piece.h" | |
34 | |
35 namespace subresource_filter { | |
36 | |
37 // Appends elements of the Knuth-Morris-Pratt failure function corresponding to | |
38 // |pattern| at the end of the |failure| vector. The EquivalentTo should be a | |
39 // bool(char, char) functor returning true iff the two characters are | |
40 // equivalent, and is required to be reflexive, symmetric and transitive. | |
41 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
42 void BuildFailureFunction(base::StringPiece pattern, | |
43 std::vector<IntType>* failure, | |
44 EquivalentTo equivalent_to = EquivalentTo()) { | |
45 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned."); | |
46 DCHECK_LE(pattern.size(), std::numeric_limits<IntType>::max()); | |
47 | |
48 if (pattern.empty()) | |
49 return; | |
50 | |
51 const size_t base_index = failure->size(); | |
52 failure->push_back(0); | |
53 for (size_t i = 1; i != pattern.size(); ++i) { | |
54 const char c = pattern[i]; | |
55 | |
56 IntType prefix_match_size = failure->back(); | |
57 DCHECK_LT(prefix_match_size, i); | |
58 while (prefix_match_size && !equivalent_to(pattern[prefix_match_size], c)) | |
59 prefix_match_size = (*failure)[base_index + prefix_match_size - 1]; | |
60 | |
61 if (prefix_match_size || equivalent_to(pattern[0], c)) | |
62 ++prefix_match_size; | |
63 failure->push_back(prefix_match_size); | |
64 } | |
65 } | |
66 | |
67 // Returns the position of the leftmost occurrence of the |pattern| in the | |
68 // |text|, i.e. the index of the character coming *after* the occurrence, within | |
69 // the [0, text.size()] range. If the pattern does not occur in the |text|, the | |
70 // returned value is base::StringPiece::npos. | |
71 // | |
72 // The method supports the use-case when the |text| is assumed to be the | |
73 // continuation of some imaginary string matching the initial | |
74 // |prefix_match_size| characters of the |pattern|. Hence, the |pattern| can be | |
75 // partially matched by this imaginary string and partially by the actual prefix | |
76 // of the |text|. | |
77 // | |
78 // The |failure| array is the failure function created by BuildFailureFunction | |
79 // with the same EquivalentTo and |pattern|, and containing at least | |
80 // pattern.size() values. | |
81 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
82 size_t FindOccurrence(base::StringPiece text, | |
83 base::StringPiece pattern, | |
84 const IntType* failure, | |
85 IntType prefix_match_size, | |
86 EquivalentTo equivalent_to = EquivalentTo()) { | |
87 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned."); | |
88 DCHECK_LE(prefix_match_size, pattern.size()); | |
89 DCHECK_LE(pattern.size(), std::numeric_limits<IntType>::max()); | |
90 | |
91 if (prefix_match_size == pattern.size()) | |
92 return 0; | |
93 | |
94 for (size_t i = 0; i != text.size(); ++i) { | |
95 const char c = text[i]; | |
96 while (prefix_match_size && !equivalent_to(pattern[prefix_match_size], c)) | |
97 prefix_match_size = failure[prefix_match_size - 1]; | |
98 if (prefix_match_size || equivalent_to(pattern[0], c)) | |
99 ++prefix_match_size; | |
100 if (prefix_match_size == pattern.size()) | |
101 return i + 1; | |
102 } | |
103 return base::StringPiece::npos; | |
104 } | |
105 | |
106 // Same as FindOccurrence, but starts the search from scratch, i.e. with | |
107 // prefix_match_size == 0. | |
108 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
109 size_t FindFirstOccurrence(base::StringPiece text, | |
110 base::StringPiece pattern, | |
111 const IntType* failure, | |
112 EquivalentTo equivalent_to = EquivalentTo()) { | |
113 return FindOccurrence(text, pattern, failure, static_cast<IntType>(0), | |
114 equivalent_to); | |
115 } | |
116 | |
117 // Same as FindOccurrence, but preforms the search assuming the |text| starts | |
118 // right after a full match of the |pattern|. The returned value is either | |
119 // base::StringPiece::npos if no more occurrences found, or is within the range | |
120 // [1, text.size()] if there are occurrences apart from the virtual prefix. | |
121 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
122 size_t FindNextOccurrence(base::StringPiece text, | |
123 base::StringPiece pattern, | |
124 const IntType* failure, | |
125 EquivalentTo equivalent_to = EquivalentTo()) { | |
126 if (pattern.empty()) | |
127 return text.empty() ? base::StringPiece::npos : 1; | |
128 | |
129 const IntType prefix_match_size = failure[pattern.size() - 1]; | |
130 return FindOccurrence(text, pattern, failure, prefix_match_size, | |
131 equivalent_to); | |
132 } | |
133 | |
134 // AllOccurrences can be used to find all occurrences of a |pattern| in a |text| | |
135 // with respect to EquivalentTo relation between characters, and iterate over | |
136 // them using an input iterator. The values pointed to by an iterator indicate | |
137 // positions of occurrences, i.e. indices of |text| characters coming *after* | |
138 // the occurrences. Out-of-range iterators are dereferenced to | |
139 // base::StringPiece::npos. | |
140 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
141 class AllOccurrences { | |
142 public: | |
143 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned."); | |
144 | |
145 class Iterator : public std::iterator<std::input_iterator_tag, size_t> { | |
146 public: | |
147 bool operator==(const Iterator& rhs) const { | |
148 return match_end_ == rhs.match_end_; | |
149 } | |
150 | |
151 bool operator!=(const Iterator& rhs) const { return !operator==(rhs); } | |
152 | |
153 size_t operator*() const { return match_end_; } | |
154 const size_t* operator->() const { return &match_end_; } | |
155 | |
156 Iterator& operator++() { | |
157 DCHECK(match_end_ == base::StringPiece::npos || | |
158 match_end_ <= owner_->text_.size()); | |
159 | |
160 const size_t relative_match_end = owner_->FindNextOccurrence(match_end_); | |
161 if (relative_match_end == base::StringPiece::npos) | |
162 match_end_ = base::StringPiece::npos; | |
163 else | |
164 match_end_ += relative_match_end; | |
165 | |
166 return *this; | |
167 } | |
168 | |
169 Iterator operator++(int) { | |
170 Iterator copy(*this); | |
171 operator++(); | |
172 return copy; | |
173 } | |
174 | |
175 protected: | |
176 friend class AllOccurrences; | |
177 | |
178 // Creates an iterator, which points to the occurrence ending just before | |
179 // the position denoted by |match_end|. If |match_end| == | |
180 // base::StringPiece::npos, the iterator is assumed to be at the end. | |
181 Iterator(const AllOccurrences* owner, size_t match_end) | |
182 : owner_(owner), match_end_(match_end) {} | |
183 | |
184 const AllOccurrences* owner_; | |
185 size_t match_end_; | |
186 }; | |
187 | |
188 AllOccurrences(base::StringPiece text, | |
189 base::StringPiece pattern, | |
190 const IntType* failure, | |
191 EquivalentTo equivalent_to = EquivalentTo()) | |
192 : text_(text), | |
193 pattern_(pattern), | |
194 failure_(failure), | |
195 equivalent_to_(equivalent_to) {} | |
196 | |
197 Iterator begin() const { | |
198 const size_t match_end = | |
199 FindFirstOccurrence(text_, pattern_, failure_, equivalent_to_); | |
200 return Iterator(this, match_end); | |
201 } | |
202 | |
203 Iterator end() const { return Iterator(this, base::StringPiece::npos); } | |
204 | |
205 protected: | |
206 // Returns the position of the next occurrence assuming the current occurrence | |
207 // ends right before the |match_end| position. | |
208 size_t FindNextOccurrence(size_t match_end) const { | |
209 return subresource_filter::FindNextOccurrence( | |
210 text_.substr(match_end), pattern_, failure_, equivalent_to_); | |
211 } | |
212 | |
213 base::StringPiece text_; | |
214 base::StringPiece pattern_; | |
215 const IntType* failure_; | |
216 | |
217 EquivalentTo equivalent_to_; | |
218 }; | |
219 | |
220 // A helper function used to create an AllOccurrences instance without knowing | |
221 // the direct type of the |equivalent_to| functor. | |
222 // | |
223 // Typical usage: | |
224 // auto occurrences = CreateAllOccurrences( | |
225 // "word, word: word-word", | |
226 // "word?", | |
227 // <failure function for "word?">, | |
228 // [](char c1, char c2) { | |
229 // return c1 == c2 || (ispunct(c1) && ispunct(c2)); | |
230 // }); | |
231 // | |
232 // for (size_t match_end : occurrences) { | |
233 // // The |pattern| matches the following substring of the |text|: | |
234 // // text[|match_end|-pattern.size()..|match_end|-1] | |
235 // ... process the |match_end| ... | |
236 // } | |
237 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
238 AllOccurrences<IntType, EquivalentTo> CreateAllOccurrences( | |
239 base::StringPiece text, | |
240 base::StringPiece pattern, | |
241 const IntType* failure, | |
242 EquivalentTo equivalent_to = EquivalentTo()) { | |
243 return AllOccurrences<IntType, EquivalentTo>(text, pattern, failure, | |
244 equivalent_to); | |
245 } | |
246 | |
247 } // namespace subresource_filter | |
248 | |
249 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_ | |
OLD | NEW |