| OLD | NEW |
| (Empty) |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 // This file implements the "failure function" notion used by the | |
| 6 // Knuth-Morris-Pratt algorithm for solving the string matching problem, i.e. | |
| 7 // finding all occurences of some pattern |P| in arbitrary text. It also | |
| 8 // provides tools for traversing these occurences. | |
| 9 // | |
| 10 // Consider a character string |P| of length |n|. The failure function of this | |
| 11 // string is an integer array |F| of length |n|, defined in the following way: | |
| 12 // F[i] = max{0 <= j <= i : P[0..j-1] == P[i-j+1..i]}, | |
| 13 // i.e. F[i] is the length of the longest proper suffix of P[0..i] that is also | |
| 14 // a prefix of |P|. | |
| 15 // | |
| 16 // For example, the "abacaba" string has the following failure function: | |
| 17 // [0, 0, 1, 0, 1, 2, 3]. | |
| 18 // For further examples see the "knuth_morris_pratt_unittest.cc" file. | |
| 19 | |
| 20 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_ | |
| 21 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_ | |
| 22 | |
| 23 #include <stddef.h> | |
| 24 | |
| 25 #include <functional> | |
| 26 #include <iterator> | |
| 27 #include <limits> | |
| 28 #include <type_traits> | |
| 29 #include <vector> | |
| 30 | |
| 31 #include "base/logging.h" | |
| 32 #include "base/numerics/safe_conversions.h" | |
| 33 #include "base/strings/string_piece.h" | |
| 34 | |
| 35 namespace subresource_filter { | |
| 36 | |
| 37 // Appends elements of the Knuth-Morris-Pratt failure function corresponding to | |
| 38 // |pattern| at the end of the |failure| vector. The EquivalentTo should be a | |
| 39 // bool(char, char) functor returning true iff the two characters are | |
| 40 // equivalent, and is required to be reflexive, symmetric and transitive. | |
| 41 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
| 42 void BuildFailureFunction(base::StringPiece pattern, | |
| 43 std::vector<IntType>* failure, | |
| 44 EquivalentTo equivalent_to = EquivalentTo()) { | |
| 45 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned."); | |
| 46 DCHECK_LE(pattern.size(), std::numeric_limits<IntType>::max()); | |
| 47 | |
| 48 if (pattern.empty()) | |
| 49 return; | |
| 50 | |
| 51 const size_t base_index = failure->size(); | |
| 52 failure->push_back(0); | |
| 53 for (size_t i = 1; i != pattern.size(); ++i) { | |
| 54 const char c = pattern[i]; | |
| 55 | |
| 56 IntType prefix_match_size = failure->back(); | |
| 57 DCHECK_LT(prefix_match_size, i); | |
| 58 while (prefix_match_size && !equivalent_to(pattern[prefix_match_size], c)) | |
| 59 prefix_match_size = (*failure)[base_index + prefix_match_size - 1]; | |
| 60 | |
| 61 if (prefix_match_size || equivalent_to(pattern[0], c)) | |
| 62 ++prefix_match_size; | |
| 63 failure->push_back(prefix_match_size); | |
| 64 } | |
| 65 } | |
| 66 | |
| 67 // Returns the position of the leftmost occurrence of the |pattern| in the | |
| 68 // |text|, i.e. the index of the character coming *after* the occurrence, within | |
| 69 // the [0, text.size()] range. If the pattern does not occur in the |text|, the | |
| 70 // returned value is base::StringPiece::npos. | |
| 71 // | |
| 72 // The method supports the use-case when the |text| is assumed to be the | |
| 73 // continuation of some imaginary string matching the initial | |
| 74 // |prefix_match_size| characters of the |pattern|. Hence, the |pattern| can be | |
| 75 // partially matched by this imaginary string and partially by the actual prefix | |
| 76 // of the |text|. | |
| 77 // | |
| 78 // The |failure| array is the failure function created by BuildFailureFunction | |
| 79 // with the same EquivalentTo and |pattern|, and containing at least | |
| 80 // pattern.size() values. | |
| 81 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
| 82 size_t FindOccurrence(base::StringPiece text, | |
| 83 base::StringPiece pattern, | |
| 84 const IntType* failure, | |
| 85 IntType prefix_match_size, | |
| 86 EquivalentTo equivalent_to = EquivalentTo()) { | |
| 87 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned."); | |
| 88 DCHECK_LE(prefix_match_size, pattern.size()); | |
| 89 DCHECK_LE(pattern.size(), std::numeric_limits<IntType>::max()); | |
| 90 | |
| 91 if (prefix_match_size == pattern.size()) | |
| 92 return 0; | |
| 93 | |
| 94 for (size_t i = 0; i != text.size(); ++i) { | |
| 95 const char c = text[i]; | |
| 96 while (prefix_match_size && !equivalent_to(pattern[prefix_match_size], c)) | |
| 97 prefix_match_size = failure[prefix_match_size - 1]; | |
| 98 if (prefix_match_size || equivalent_to(pattern[0], c)) | |
| 99 ++prefix_match_size; | |
| 100 if (prefix_match_size == pattern.size()) | |
| 101 return i + 1; | |
| 102 } | |
| 103 return base::StringPiece::npos; | |
| 104 } | |
| 105 | |
| 106 // Same as FindOccurrence, but starts the search from scratch, i.e. with | |
| 107 // prefix_match_size == 0. | |
| 108 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
| 109 size_t FindFirstOccurrence(base::StringPiece text, | |
| 110 base::StringPiece pattern, | |
| 111 const IntType* failure, | |
| 112 EquivalentTo equivalent_to = EquivalentTo()) { | |
| 113 return FindOccurrence(text, pattern, failure, static_cast<IntType>(0), | |
| 114 equivalent_to); | |
| 115 } | |
| 116 | |
| 117 // Same as FindOccurrence, but preforms the search assuming the |text| starts | |
| 118 // right after a full match of the |pattern|. The returned value is either | |
| 119 // base::StringPiece::npos if no more occurrences found, or is within the range | |
| 120 // [1, text.size()] if there are occurrences apart from the virtual prefix. | |
| 121 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
| 122 size_t FindNextOccurrence(base::StringPiece text, | |
| 123 base::StringPiece pattern, | |
| 124 const IntType* failure, | |
| 125 EquivalentTo equivalent_to = EquivalentTo()) { | |
| 126 if (pattern.empty()) | |
| 127 return text.empty() ? base::StringPiece::npos : 1; | |
| 128 | |
| 129 const IntType prefix_match_size = failure[pattern.size() - 1]; | |
| 130 return FindOccurrence(text, pattern, failure, prefix_match_size, | |
| 131 equivalent_to); | |
| 132 } | |
| 133 | |
| 134 // AllOccurrences can be used to find all occurrences of a |pattern| in a |text| | |
| 135 // with respect to EquivalentTo relation between characters, and iterate over | |
| 136 // them using an input iterator. The values pointed to by an iterator indicate | |
| 137 // positions of occurrences, i.e. indices of |text| characters coming *after* | |
| 138 // the occurrences. Out-of-range iterators are dereferenced to | |
| 139 // base::StringPiece::npos. | |
| 140 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
| 141 class AllOccurrences { | |
| 142 public: | |
| 143 static_assert(std::is_unsigned<IntType>::value, "IntType is not unsigned."); | |
| 144 | |
| 145 class Iterator : public std::iterator<std::input_iterator_tag, size_t> { | |
| 146 public: | |
| 147 bool operator==(const Iterator& rhs) const { | |
| 148 return match_end_ == rhs.match_end_; | |
| 149 } | |
| 150 | |
| 151 bool operator!=(const Iterator& rhs) const { return !operator==(rhs); } | |
| 152 | |
| 153 size_t operator*() const { return match_end_; } | |
| 154 const size_t* operator->() const { return &match_end_; } | |
| 155 | |
| 156 Iterator& operator++() { | |
| 157 DCHECK(match_end_ == base::StringPiece::npos || | |
| 158 match_end_ <= owner_->text_.size()); | |
| 159 | |
| 160 const size_t relative_match_end = owner_->FindNextOccurrence(match_end_); | |
| 161 if (relative_match_end == base::StringPiece::npos) | |
| 162 match_end_ = base::StringPiece::npos; | |
| 163 else | |
| 164 match_end_ += relative_match_end; | |
| 165 | |
| 166 return *this; | |
| 167 } | |
| 168 | |
| 169 Iterator operator++(int) { | |
| 170 Iterator copy(*this); | |
| 171 operator++(); | |
| 172 return copy; | |
| 173 } | |
| 174 | |
| 175 protected: | |
| 176 friend class AllOccurrences; | |
| 177 | |
| 178 // Creates an iterator, which points to the occurrence ending just before | |
| 179 // the position denoted by |match_end|. If |match_end| == | |
| 180 // base::StringPiece::npos, the iterator is assumed to be at the end. | |
| 181 Iterator(const AllOccurrences* owner, size_t match_end) | |
| 182 : owner_(owner), match_end_(match_end) {} | |
| 183 | |
| 184 const AllOccurrences* owner_; | |
| 185 size_t match_end_; | |
| 186 }; | |
| 187 | |
| 188 AllOccurrences(base::StringPiece text, | |
| 189 base::StringPiece pattern, | |
| 190 const IntType* failure, | |
| 191 EquivalentTo equivalent_to = EquivalentTo()) | |
| 192 : text_(text), | |
| 193 pattern_(pattern), | |
| 194 failure_(failure), | |
| 195 equivalent_to_(equivalent_to) {} | |
| 196 | |
| 197 Iterator begin() const { | |
| 198 const size_t match_end = | |
| 199 FindFirstOccurrence(text_, pattern_, failure_, equivalent_to_); | |
| 200 return Iterator(this, match_end); | |
| 201 } | |
| 202 | |
| 203 Iterator end() const { return Iterator(this, base::StringPiece::npos); } | |
| 204 | |
| 205 protected: | |
| 206 // Returns the position of the next occurrence assuming the current occurrence | |
| 207 // ends right before the |match_end| position. | |
| 208 size_t FindNextOccurrence(size_t match_end) const { | |
| 209 return subresource_filter::FindNextOccurrence( | |
| 210 text_.substr(match_end), pattern_, failure_, equivalent_to_); | |
| 211 } | |
| 212 | |
| 213 base::StringPiece text_; | |
| 214 base::StringPiece pattern_; | |
| 215 const IntType* failure_; | |
| 216 | |
| 217 EquivalentTo equivalent_to_; | |
| 218 }; | |
| 219 | |
| 220 // A helper function used to create an AllOccurrences instance without knowing | |
| 221 // the direct type of the |equivalent_to| functor. | |
| 222 // | |
| 223 // Typical usage: | |
| 224 // auto occurrences = CreateAllOccurrences( | |
| 225 // "word, word: word-word", | |
| 226 // "word?", | |
| 227 // <failure function for "word?">, | |
| 228 // [](char c1, char c2) { | |
| 229 // return c1 == c2 || (ispunct(c1) && ispunct(c2)); | |
| 230 // }); | |
| 231 // | |
| 232 // for (size_t match_end : occurrences) { | |
| 233 // // The |pattern| matches the following substring of the |text|: | |
| 234 // // text[|match_end|-pattern.size()..|match_end|-1] | |
| 235 // ... process the |match_end| ... | |
| 236 // } | |
| 237 template <typename IntType, typename EquivalentTo = std::equal_to<char>> | |
| 238 AllOccurrences<IntType, EquivalentTo> CreateAllOccurrences( | |
| 239 base::StringPiece text, | |
| 240 base::StringPiece pattern, | |
| 241 const IntType* failure, | |
| 242 EquivalentTo equivalent_to = EquivalentTo()) { | |
| 243 return AllOccurrences<IntType, EquivalentTo>(text, pattern, failure, | |
| 244 equivalent_to); | |
| 245 } | |
| 246 | |
| 247 } // namespace subresource_filter | |
| 248 | |
| 249 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_KNUTH_MORRIS_PRATT_H_ | |
| OLD | NEW |