OLD | NEW |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // The separator placeholder '^' symbol is used in subpatterns to match any | 5 // The separator placeholder '^' symbol is used in subpatterns to match any |
6 // separator character, which is any ASCII symbol except letters, digits, and | 6 // separator character, which is any ASCII symbol except letters, digits, and |
7 // the following: '_', '-', '.', '%'. Note that the separator placeholder | 7 // the following: '_', '-', '.', '%'. Note that the separator placeholder |
8 // character '^' is itself a separator, as well as '\0'. | 8 // character '^' is itself a separator, as well as '\0'. |
9 // TODO(pkalinnikov): In addition, a separator placeholder at the end of the | 9 // TODO(pkalinnikov): In addition, a separator placeholder at the end of the |
10 // pattern can be matched by the end of |text|. | 10 // pattern can be matched by the end of |text|. |
11 // | 11 // |
12 // We define a fuzzy occurrence as an occurrence of a |subpattern| in |text| | 12 // We define a fuzzy occurrence as an occurrence of a |subpattern| in |text| |
13 // such that all its non-placeholder characters are equal to the corresponding | 13 // such that all its non-placeholder characters are equal to the corresponding |
14 // characters of the |text|, whereas each '^' placeholder can correspond to any | 14 // characters of the |text|, whereas each '^' placeholder can correspond to any |
15 // type of separator in |text|. | 15 // type of separator in |text|. |
16 | 16 |
17 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ | 17 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ |
18 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ | 18 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ |
19 | 19 |
20 #include <stddef.h> | 20 #include <stddef.h> |
21 | 21 |
22 #include <type_traits> | |
23 #include <vector> | |
24 | |
25 #include "base/logging.h" | |
26 #include "base/strings/string_piece.h" | 22 #include "base/strings/string_piece.h" |
27 #include "components/subresource_filter/core/common/knuth_morris_pratt.h" | |
28 | 23 |
29 namespace subresource_filter { | 24 namespace subresource_filter { |
30 | 25 |
31 constexpr char kSeparatorPlaceholder = '^'; | 26 constexpr char kSeparatorPlaceholder = '^'; |
32 | 27 |
33 inline bool IsAscii(char c) { | 28 inline bool IsAscii(char c) { |
34 return !(c & ~0x7F); | 29 return !(c & ~0x7F); |
35 } | 30 } |
36 | 31 |
37 inline bool IsAlphaNumericAscii(char c) { | 32 inline bool IsAlphaNumericAscii(char c) { |
(...skipping 11 matching lines...) Expand all Loading... |
49 case '.': | 44 case '.': |
50 case '%': | 45 case '%': |
51 return false; | 46 return false; |
52 case kSeparatorPlaceholder: | 47 case kSeparatorPlaceholder: |
53 return true; | 48 return true; |
54 default: | 49 default: |
55 return !IsAlphaNumericAscii(c) && IsAscii(c); | 50 return !IsAlphaNumericAscii(c) && IsAscii(c); |
56 } | 51 } |
57 } | 52 } |
58 | 53 |
59 class IsEqualOrBothSeparators { | |
60 public: | |
61 bool operator()(char first, char second) const { | |
62 return first == second || (IsSeparator(first) && IsSeparator(second)); | |
63 } | |
64 }; | |
65 | |
66 // Returns whether |text| starts with a fuzzy occurrence of |subpattern|. | 54 // Returns whether |text| starts with a fuzzy occurrence of |subpattern|. |
67 bool StartsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); | 55 bool StartsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); |
68 | 56 |
69 // Returns whether |text| ends with a fuzzy occurrence of |subpattern|. | 57 // Returns whether |text| ends with a fuzzy occurrence of |subpattern|. |
70 bool EndsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); | 58 bool EndsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); |
71 | 59 |
72 // Appends elements of the Knuth-Morris-Pratt failure function corresponding to | 60 // Returns the position of the leftmost fuzzy occurrence of a |subpattern| in |
73 // |subpattern| at the end of the |failure| vector. All separator characters, | 61 // the |text| starting no earlier than |from| the specified position. |
74 // including the separator placeholder, are considered equivalent. This is | 62 size_t FindFuzzy(base::StringPiece text, |
75 // necessary in order to support fuzzy separator matching while also keeping the | 63 base::StringPiece subpattern, |
76 // equality predicate transitive and symmetric which is needed for KMP. However, | 64 size_t from = 0); |
77 // the separators will need to be checked in a second pass for KMP match | |
78 // candidates, which is implemented by AllOccurrencesFuzzy. | |
79 template <typename FailureFunctionElement> | |
80 void BuildFailureFunctionFuzzy(base::StringPiece subpattern, | |
81 std::vector<FailureFunctionElement>* failure) { | |
82 static_assert(std::is_unsigned<FailureFunctionElement>::value, | |
83 "The type is not unsigned."); | |
84 BuildFailureFunction<FailureFunctionElement, IsEqualOrBothSeparators>( | |
85 subpattern, failure); | |
86 } | |
87 | |
88 // Iterator range that yields all fuzzy occurrences of |subpattern| in |text|. | |
89 // The values in the range are the positions just beyond each occurrence. | |
90 // Out-of-range iterators are dereferenced to base::StringPiece::npos. | |
91 // | |
92 // The |failure| array is the failure function created by | |
93 // BuildFailureFunctionFuzzy for the same |subpattern|, and containing at least | |
94 // subpattern.size() values. | |
95 template <typename IntType> | |
96 class AllOccurrencesFuzzy | |
97 : private AllOccurrences<IntType, IsEqualOrBothSeparators> { | |
98 public: | |
99 using Base = AllOccurrences<IntType, IsEqualOrBothSeparators>; | |
100 using BaseIterator = typename Base::Iterator; | |
101 | |
102 class Iterator : public BaseIterator { | |
103 public: | |
104 Iterator& operator++() { | |
105 BaseIterator::operator++(); | |
106 AdvanceWhileFalsePositive(); | |
107 return *this; | |
108 } | |
109 | |
110 Iterator operator++(int) { | |
111 Iterator copy(*this); | |
112 operator++(); | |
113 return copy; | |
114 } | |
115 | |
116 private: | |
117 friend class AllOccurrencesFuzzy; | |
118 | |
119 explicit Iterator(const BaseIterator& base_iterator) | |
120 : BaseIterator(base_iterator) { | |
121 AdvanceWhileFalsePositive(); | |
122 } | |
123 | |
124 const AllOccurrencesFuzzy& owner() const { | |
125 return *static_cast<const AllOccurrencesFuzzy*>(BaseIterator::owner_); | |
126 } | |
127 | |
128 // Moves the iterator forward until it either reaches the end, or meets an | |
129 // occurrence exactly matching all non-placeholder separators. | |
130 void AdvanceWhileFalsePositive() { | |
131 while (BaseIterator::match_end_ != base::StringPiece::npos && !IsMatch()) | |
132 BaseIterator::operator++(); | |
133 } | |
134 | |
135 // Returns whether the currenct match meets all separators. | |
136 bool IsMatch() const { | |
137 DCHECK_LE(BaseIterator::match_end_, owner().text_.size()); | |
138 DCHECK_GE(BaseIterator::match_end_, owner().pattern_.size()); | |
139 | |
140 // TODO(pkalinnikov): Store a vector of separator positions externally and | |
141 // check if they are equal to the corresponding characters of the |text|. | |
142 return StartsWithFuzzy(owner().text_.substr(BaseIterator::match_end_ - | |
143 owner().pattern_.size()), | |
144 owner().pattern_); | |
145 } | |
146 }; | |
147 | |
148 AllOccurrencesFuzzy(base::StringPiece text, | |
149 base::StringPiece subpattern, | |
150 const IntType* failure) | |
151 : Base(text, subpattern, failure) {} | |
152 | |
153 Iterator begin() const { return Iterator(Base::begin()); } | |
154 Iterator end() const { return Iterator(Base::end()); } | |
155 }; | |
156 | 65 |
157 } // namespace subresource_filter | 66 } // namespace subresource_filter |
158 | 67 |
159 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ | 68 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ |
OLD | NEW |