Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // The separator placeholder '^' symbol is used in subpatterns to match any | 5 // The separator placeholder '^' symbol is used in subpatterns to match any |
| 6 // separator character, which is any ASCII symbol except letters, digits, and | 6 // separator character, which is any ASCII symbol except letters, digits, and |
| 7 // the following: '_', '-', '.', '%'. Note that the separator placeholder | 7 // the following: '_', '-', '.', '%'. Note that the separator placeholder |
| 8 // character '^' is itself a separator, as well as '\0'. | 8 // character '^' is itself a separator, as well as '\0'. |
| 9 // TODO(pkalinnikov): In addition, a separator placeholder at the end of the | 9 // TODO(pkalinnikov): In addition, a separator placeholder at the end of the |
| 10 // pattern can be matched by the end of |text|. | 10 // pattern can be matched by the end of |text|. |
| 11 // | 11 // |
| 12 // We define a fuzzy occurrence as an occurrence of a |subpattern| in |text| | 12 // We define a fuzzy occurrence as an occurrence of a |subpattern| in |text| |
| 13 // such that all its non-placeholder characters are equal to the corresponding | 13 // such that all its non-placeholder characters are equal to the corresponding |
| 14 // characters of the |text|, whereas each '^' placeholder can correspond to any | 14 // characters of the |text|, whereas each '^' placeholder can correspond to any |
| 15 // type of separator in |text|. | 15 // type of separator in |text|. |
| 16 | 16 |
| 17 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ | 17 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ |
| 18 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ | 18 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ |
| 19 | 19 |
| 20 #include <stddef.h> | 20 #include <stddef.h> |
| 21 | 21 |
| 22 #include <type_traits> | |
| 23 #include <vector> | |
| 24 | |
| 25 #include "base/logging.h" | |
| 26 #include "base/strings/string_piece.h" | 22 #include "base/strings/string_piece.h" |
| 27 #include "components/subresource_filter/core/common/knuth_morris_pratt.h" | |
| 28 | 23 |
| 29 namespace subresource_filter { | 24 namespace subresource_filter { |
| 30 | 25 |
| 31 constexpr char kSeparatorPlaceholder = '^'; | 26 constexpr char kSeparatorPlaceholder = '^'; |
| 32 | 27 |
| 33 inline bool IsAscii(char c) { | 28 inline bool IsAscii(char c) { |
| 34 return !(c & ~0x7F); | 29 return !(c & ~0x7F); |
| 35 } | 30 } |
| 36 | 31 |
| 37 inline bool IsAlphaNumericAscii(char c) { | 32 inline bool IsAlphaNumericAscii(char c) { |
| (...skipping 11 matching lines...) Expand all Loading... | |
| 49 case '.': | 44 case '.': |
| 50 case '%': | 45 case '%': |
| 51 return false; | 46 return false; |
| 52 case kSeparatorPlaceholder: | 47 case kSeparatorPlaceholder: |
| 53 return true; | 48 return true; |
| 54 default: | 49 default: |
| 55 return !IsAlphaNumericAscii(c) && IsAscii(c); | 50 return !IsAlphaNumericAscii(c) && IsAscii(c); |
| 56 } | 51 } |
| 57 } | 52 } |
| 58 | 53 |
| 59 class IsEqualOrBothSeparators { | |
| 60 public: | |
| 61 bool operator()(char first, char second) const { | |
| 62 return first == second || (IsSeparator(first) && IsSeparator(second)); | |
| 63 } | |
| 64 }; | |
| 65 | |
| 66 // Returns whether |text| starts with a fuzzy occurrence of |subpattern|. | 54 // Returns whether |text| starts with a fuzzy occurrence of |subpattern|. |
| 67 bool StartsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); | 55 bool StartsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); |
| 68 | 56 |
| 69 // Returns whether |text| ends with a fuzzy occurrence of |subpattern|. | 57 // Returns whether |text| ends with a fuzzy occurrence of |subpattern|. |
| 70 bool EndsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); | 58 bool EndsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); |
| 71 | 59 |
| 72 // Appends elements of the Knuth-Morris-Pratt failure function corresponding to | 60 // Returns the position of the leftmost fuzzy occurrence of a |subpattern| in |
| 73 // |subpattern| at the end of the |failure| vector. All separator characters, | 61 // the |text|, starting |from| the specified position. |
|
engedy
2017/04/05 13:41:33
nit: but starting no earlier than |from| ...
pkalinnikov
2017/04/05 13:48:46
Done.
| |
| 74 // including the separator placeholder, are considered equivalent. This is | 62 size_t FindFuzzy(base::StringPiece text, |
| 75 // necessary in order to support fuzzy separator matching while also keeping the | 63 base::StringPiece subpattern, |
| 76 // equality predicate transitive and symmetric which is needed for KMP. However, | 64 size_t from = 0); |
| 77 // the separators will need to be checked in a second pass for KMP match | |
| 78 // candidates, which is implemented by AllOccurrencesFuzzy. | |
| 79 template <typename FailureFunctionElement> | |
| 80 void BuildFailureFunctionFuzzy(base::StringPiece subpattern, | |
| 81 std::vector<FailureFunctionElement>* failure) { | |
| 82 static_assert(std::is_unsigned<FailureFunctionElement>::value, | |
| 83 "The type is not unsigned."); | |
| 84 BuildFailureFunction<FailureFunctionElement, IsEqualOrBothSeparators>( | |
| 85 subpattern, failure); | |
| 86 } | |
| 87 | |
| 88 // Iterator range that yields all fuzzy occurrences of |subpattern| in |text|. | |
| 89 // The values in the range are the positions just beyond each occurrence. | |
| 90 // Out-of-range iterators are dereferenced to base::StringPiece::npos. | |
| 91 // | |
| 92 // The |failure| array is the failure function created by | |
| 93 // BuildFailureFunctionFuzzy for the same |subpattern|, and containing at least | |
| 94 // subpattern.size() values. | |
| 95 template <typename IntType> | |
| 96 class AllOccurrencesFuzzy | |
| 97 : private AllOccurrences<IntType, IsEqualOrBothSeparators> { | |
| 98 public: | |
| 99 using Base = AllOccurrences<IntType, IsEqualOrBothSeparators>; | |
| 100 using BaseIterator = typename Base::Iterator; | |
| 101 | |
| 102 class Iterator : public BaseIterator { | |
| 103 public: | |
| 104 Iterator& operator++() { | |
| 105 BaseIterator::operator++(); | |
| 106 AdvanceWhileFalsePositive(); | |
| 107 return *this; | |
| 108 } | |
| 109 | |
| 110 Iterator operator++(int) { | |
| 111 Iterator copy(*this); | |
| 112 operator++(); | |
| 113 return copy; | |
| 114 } | |
| 115 | |
| 116 private: | |
| 117 friend class AllOccurrencesFuzzy; | |
| 118 | |
| 119 explicit Iterator(const BaseIterator& base_iterator) | |
| 120 : BaseIterator(base_iterator) { | |
| 121 AdvanceWhileFalsePositive(); | |
| 122 } | |
| 123 | |
| 124 const AllOccurrencesFuzzy& owner() const { | |
| 125 return *static_cast<const AllOccurrencesFuzzy*>(BaseIterator::owner_); | |
| 126 } | |
| 127 | |
| 128 // Moves the iterator forward until it either reaches the end, or meets an | |
| 129 // occurrence exactly matching all non-placeholder separators. | |
| 130 void AdvanceWhileFalsePositive() { | |
| 131 while (BaseIterator::match_end_ != base::StringPiece::npos && !IsMatch()) | |
| 132 BaseIterator::operator++(); | |
| 133 } | |
| 134 | |
| 135 // Returns whether the currenct match meets all separators. | |
| 136 bool IsMatch() const { | |
| 137 DCHECK_LE(BaseIterator::match_end_, owner().text_.size()); | |
| 138 DCHECK_GE(BaseIterator::match_end_, owner().pattern_.size()); | |
| 139 | |
| 140 // TODO(pkalinnikov): Store a vector of separator positions externally and | |
| 141 // check if they are equal to the corresponding characters of the |text|. | |
| 142 return StartsWithFuzzy(owner().text_.substr(BaseIterator::match_end_ - | |
| 143 owner().pattern_.size()), | |
| 144 owner().pattern_); | |
| 145 } | |
| 146 }; | |
| 147 | |
| 148 AllOccurrencesFuzzy(base::StringPiece text, | |
| 149 base::StringPiece subpattern, | |
| 150 const IntType* failure) | |
| 151 : Base(text, subpattern, failure) {} | |
| 152 | |
| 153 Iterator begin() const { return Iterator(Base::begin()); } | |
| 154 Iterator end() const { return Iterator(Base::end()); } | |
| 155 }; | |
| 156 | 65 |
| 157 } // namespace subresource_filter | 66 } // namespace subresource_filter |
| 158 | 67 |
| 159 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ | 68 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ |
| OLD | NEW |