Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(98)

Side by Side Diff: components/subresource_filter/core/common/knuth_morris_pratt.h

Issue 2793993002: [subresource_filter] Replace KMP by std::search. (Closed)
Patch Set: Fix DCHECK. Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // This file implements the "failure function" notion used by the
6 // Knuth-Morris-Pratt algorithm for solving the string matching problem, i.e.
7 // finding all occurences of some pattern |P| in arbitrary text. It also
8 // provides tools for traversing these occurences.
9 //
10 // Consider a character string |P| of length |n|. The failure function of this
11 // string is an integer array |F| of length |n|, defined in the following way:
12 // F[i] = max{0 <= j <= i : P[0..j-1] == P[i-j+1..i]},
13 // i.e. F[i] is the length of the longest proper suffix of P[0..i] that is also
14 // a prefix of |P|.
15 //
16 // For example, the "abacaba" string has the following failure function:
17 // [0, 0, 1, 0, 1, 2, 3].
18 // For further examples see the "knuth_morris_pratt_unittest.cc" file.
19
20 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_
21 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_
22
23 #include <stddef.h>
24
25 #include <functional>
26 #include <iterator>
27 #include <limits>
28 #include <type_traits>
29 #include <vector>
30
31 #include "base/logging.h"
32 #include "base/numerics/safe_conversions.h"
33 #include "base/strings/string_piece.h"
34
35 namespace subresource_filter {
36
37 // Appends elements of the Knuth-Morris-Pratt failure function corresponding to
38 // |pattern| at the end of the |failure| vector. The EquivalentTo should be a
39 // bool(char, char) functor returning true iff the two characters are
40 // equivalent, and is required to be reflexive, symmetric and transitive.
41 template <typename IntType, typename EquivalentTo = std::equal_to<char>>
42 void BuildFailureFunction(base::StringPiece pattern,
43 std::vector<IntType>* failure,
44 EquivalentTo equivalent_to = EquivalentTo()) {
45 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned.");
46 DCHECK_LE(pattern.size(), std::numeric_limits<IntType>::max());
47
48 if (pattern.empty())
49 return;
50
51 const size_t base_index = failure->size();
52 failure->push_back(0);
53 for (size_t i = 1; i != pattern.size(); ++i) {
54 const char c = pattern[i];
55
56 IntType prefix_match_size = failure->back();
57 DCHECK_LT(prefix_match_size, i);
58 while (prefix_match_size && !equivalent_to(pattern[prefix_match_size], c))
59 prefix_match_size = (*failure)[base_index + prefix_match_size - 1];
60
61 if (prefix_match_size || equivalent_to(pattern[0], c))
62 ++prefix_match_size;
63 failure->push_back(prefix_match_size);
64 }
65 }
66
67 // Returns the position of the leftmost occurrence of the |pattern| in the
68 // |text|, i.e. the index of the character coming *after* the occurrence, within
69 // the [0, text.size()] range. If the pattern does not occur in the |text|, the
70 // returned value is base::StringPiece::npos.
71 //
72 // The method supports the use-case when the |text| is assumed to be the
73 // continuation of some imaginary string matching the initial
74 // |prefix_match_size| characters of the |pattern|. Hence, the |pattern| can be
75 // partially matched by this imaginary string and partially by the actual prefix
76 // of the |text|.
77 //
78 // The |failure| array is the failure function created by BuildFailureFunction
79 // with the same EquivalentTo and |pattern|, and containing at least
80 // pattern.size() values.
81 template <typename IntType, typename EquivalentTo = std::equal_to<char>>
82 size_t FindOccurrence(base::StringPiece text,
83 base::StringPiece pattern,
84 const IntType* failure,
85 IntType prefix_match_size,
86 EquivalentTo equivalent_to = EquivalentTo()) {
87 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned.");
88 DCHECK_LE(prefix_match_size, pattern.size());
89 DCHECK_LE(pattern.size(), std::numeric_limits<IntType>::max());
90
91 if (prefix_match_size == pattern.size())
92 return 0;
93
94 for (size_t i = 0; i != text.size(); ++i) {
95 const char c = text[i];
96 while (prefix_match_size && !equivalent_to(pattern[prefix_match_size], c))
97 prefix_match_size = failure[prefix_match_size - 1];
98 if (prefix_match_size || equivalent_to(pattern[0], c))
99 ++prefix_match_size;
100 if (prefix_match_size == pattern.size())
101 return i + 1;
102 }
103 return base::StringPiece::npos;
104 }
105
106 // Same as FindOccurrence, but starts the search from scratch, i.e. with
107 // prefix_match_size == 0.
108 template <typename IntType, typename EquivalentTo = std::equal_to<char>>
109 size_t FindFirstOccurrence(base::StringPiece text,
110 base::StringPiece pattern,
111 const IntType* failure,
112 EquivalentTo equivalent_to = EquivalentTo()) {
113 return FindOccurrence(text, pattern, failure, static_cast<IntType>(0),
114 equivalent_to);
115 }
116
117 // Same as FindOccurrence, but preforms the search assuming the |text| starts
118 // right after a full match of the |pattern|. The returned value is either
119 // base::StringPiece::npos if no more occurrences found, or is within the range
120 // [1, text.size()] if there are occurrences apart from the virtual prefix.
121 template <typename IntType, typename EquivalentTo = std::equal_to<char>>
122 size_t FindNextOccurrence(base::StringPiece text,
123 base::StringPiece pattern,
124 const IntType* failure,
125 EquivalentTo equivalent_to = EquivalentTo()) {
126 if (pattern.empty())
127 return text.empty() ? base::StringPiece::npos : 1;
128
129 const IntType prefix_match_size = failure[pattern.size() - 1];
130 return FindOccurrence(text, pattern, failure, prefix_match_size,
131 equivalent_to);
132 }
133
134 // AllOccurrences can be used to find all occurrences of a |pattern| in a |text|
135 // with respect to EquivalentTo relation between characters, and iterate over
136 // them using an input iterator. The values pointed to by an iterator indicate
137 // positions of occurrences, i.e. indices of |text| characters coming *after*
138 // the occurrences. Out-of-range iterators are dereferenced to
139 // base::StringPiece::npos.
140 template <typename IntType, typename EquivalentTo = std::equal_to<char>>
141 class AllOccurrences {
142 public:
143 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned.");
144
145 class Iterator : public std::iterator<std::input_iterator_tag, size_t> {
146 public:
147 bool operator==(const Iterator& rhs) const {
148 return match_end_ == rhs.match_end_;
149 }
150
151 bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }
152
153 size_t operator*() const { return match_end_; }
154 const size_t* operator->() const { return &match_end_; }
155
156 Iterator& operator++() {
157 DCHECK(match_end_ == base::StringPiece::npos ||
158 match_end_ <= owner_->text_.size());
159
160 const size_t relative_match_end = owner_->FindNextOccurrence(match_end_);
161 if (relative_match_end == base::StringPiece::npos)
162 match_end_ = base::StringPiece::npos;
163 else
164 match_end_ += relative_match_end;
165
166 return *this;
167 }
168
169 Iterator operator++(int) {
170 Iterator copy(*this);
171 operator++();
172 return copy;
173 }
174
175 protected:
176 friend class AllOccurrences;
177
178 // Creates an iterator, which points to the occurrence ending just before
179 // the position denoted by |match_end|. If |match_end| ==
180 // base::StringPiece::npos, the iterator is assumed to be at the end.
181 Iterator(const AllOccurrences* owner, size_t match_end)
182 : owner_(owner), match_end_(match_end) {}
183
184 const AllOccurrences* owner_;
185 size_t match_end_;
186 };
187
188 AllOccurrences(base::StringPiece text,
189 base::StringPiece pattern,
190 const IntType* failure,
191 EquivalentTo equivalent_to = EquivalentTo())
192 : text_(text),
193 pattern_(pattern),
194 failure_(failure),
195 equivalent_to_(equivalent_to) {}
196
197 Iterator begin() const {
198 const size_t match_end =
199 FindFirstOccurrence(text_, pattern_, failure_, equivalent_to_);
200 return Iterator(this, match_end);
201 }
202
203 Iterator end() const { return Iterator(this, base::StringPiece::npos); }
204
205 protected:
206 // Returns the position of the next occurrence assuming the current occurrence
207 // ends right before the |match_end| position.
208 size_t FindNextOccurrence(size_t match_end) const {
209 return subresource_filter::FindNextOccurrence(
210 text_.substr(match_end), pattern_, failure_, equivalent_to_);
211 }
212
213 base::StringPiece text_;
214 base::StringPiece pattern_;
215 const IntType* failure_;
216
217 EquivalentTo equivalent_to_;
218 };
219
220 // A helper function used to create an AllOccurrences instance without knowing
221 // the direct type of the |equivalent_to| functor.
222 //
223 // Typical usage:
224 // auto occurrences = CreateAllOccurrences(
225 // "word, word: word-word",
226 // "word?",
227 // <failure function for "word?">,
228 // [](char c1, char c2) {
229 // return c1 == c2 || (ispunct(c1) && ispunct(c2));
230 // });
231 //
232 // for (size_t match_end : occurrences) {
233 // // The |pattern| matches the following substring of the |text|:
234 // // text[|match_end|-pattern.size()..|match_end|-1]
235 // ... process the |match_end| ...
236 // }
237 template <typename IntType, typename EquivalentTo = std::equal_to<char>>
238 AllOccurrences<IntType, EquivalentTo> CreateAllOccurrences(
239 base::StringPiece text,
240 base::StringPiece pattern,
241 const IntType* failure,
242 EquivalentTo equivalent_to = EquivalentTo()) {
243 return AllOccurrences<IntType, EquivalentTo>(text, pattern, failure,
244 equivalent_to);
245 }
246
247 } // namespace subresource_filter
248
249 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698