Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(275)

Side by Side Diff: components/subresource_filter/core/common/fuzzy_pattern_matching.h

Issue 2793993002: [subresource_filter] Replace KMP by std::search. (Closed)
Patch Set: Fix tests. Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // The separator placeholder '^' symbol is used in subpatterns to match any 5 // The separator placeholder '^' symbol is used in subpatterns to match any
6 // separator character, which is any ASCII symbol except letters, digits, and 6 // separator character, which is any ASCII symbol except letters, digits, and
7 // the following: '_', '-', '.', '%'. Note that the separator placeholder 7 // the following: '_', '-', '.', '%'. Note that the separator placeholder
8 // character '^' is itself a separator, as well as '\0'. 8 // character '^' is itself a separator, as well as '\0'.
9 // TODO(pkalinnikov): In addition, a separator placeholder at the end of the 9 // TODO(pkalinnikov): In addition, a separator placeholder at the end of the
10 // pattern can be matched by the end of |text|. 10 // pattern can be matched by the end of |text|.
11 // 11 //
12 // We define a fuzzy occurrence as an occurrence of a |subpattern| in |text| 12 // We define a fuzzy occurrence as an occurrence of a |subpattern| in |text|
13 // such that all its non-placeholder characters are equal to the corresponding 13 // such that all its non-placeholder characters are equal to the corresponding
14 // characters of the |text|, whereas each '^' placeholder can correspond to any 14 // characters of the |text|, whereas each '^' placeholder can correspond to any
15 // type of separator in |text|. 15 // type of separator in |text|.
16 16
17 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ 17 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_
18 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ 18 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_
19 19
20 #include <stddef.h> 20 #include <stddef.h>
21 21
22 #include <type_traits> 22 #include <type_traits>
Charlie Harrison 2017/04/04 13:58:26 I think you can remove all of these includes excep
pkalinnikov 2017/04/04 16:42:58 Done.
23 #include <vector> 23 #include <vector>
24 24
25 #include "base/logging.h" 25 #include "base/logging.h"
26 #include "base/strings/string_piece.h" 26 #include "base/strings/string_piece.h"
27 #include "components/subresource_filter/core/common/knuth_morris_pratt.h"
28 27
29 namespace subresource_filter { 28 namespace subresource_filter {
30 29
31 constexpr char kSeparatorPlaceholder = '^'; 30 constexpr char kSeparatorPlaceholder = '^';
32 31
33 inline bool IsAscii(char c) { 32 inline bool IsAscii(char c) {
34 return !(c & ~0x7F); 33 return !(c & ~0x7F);
35 } 34 }
36 35
37 inline bool IsAlphaNumericAscii(char c) { 36 inline bool IsAlphaNumericAscii(char c) {
(...skipping 11 matching lines...) Expand all
49 case '.': 48 case '.':
50 case '%': 49 case '%':
51 return false; 50 return false;
52 case kSeparatorPlaceholder: 51 case kSeparatorPlaceholder:
53 return true; 52 return true;
54 default: 53 default:
55 return !IsAlphaNumericAscii(c) && IsAscii(c); 54 return !IsAlphaNumericAscii(c) && IsAscii(c);
56 } 55 }
57 } 56 }
58 57
59 class IsEqualOrBothSeparators {
60 public:
61 bool operator()(char first, char second) const {
62 return first == second || (IsSeparator(first) && IsSeparator(second));
63 }
64 };
65
66 // Returns whether |text| starts with a fuzzy occurrence of |subpattern|. 58 // Returns whether |text| starts with a fuzzy occurrence of |subpattern|.
67 bool StartsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); 59 bool StartsWithFuzzy(base::StringPiece text, base::StringPiece subpattern);
68 60
69 // Returns whether |text| ends with a fuzzy occurrence of |subpattern|. 61 // Returns whether |text| ends with a fuzzy occurrence of |subpattern|.
70 bool EndsWithFuzzy(base::StringPiece text, base::StringPiece subpattern); 62 bool EndsWithFuzzy(base::StringPiece text, base::StringPiece subpattern);
71 63
72 // Appends elements of the Knuth-Morris-Pratt failure function corresponding to
73 // |subpattern| at the end of the |failure| vector. All separator characters,
74 // including the separator placeholder, are considered equivalent. This is
75 // necessary in order to support fuzzy separator matching while also keeping the
76 // equality predicate transitive and symmetric which is needed for KMP. However,
77 // the separators will need to be checked in a second pass for KMP match
78 // candidates, which is implemented by AllOccurrencesFuzzy.
79 template <typename FailureFunctionElement>
80 void BuildFailureFunctionFuzzy(base::StringPiece subpattern,
81 std::vector<FailureFunctionElement>* failure) {
82 static_assert(std::is_unsigned<FailureFunctionElement>::value,
83 "The type is not unsigned.");
84 BuildFailureFunction<FailureFunctionElement, IsEqualOrBothSeparators>(
85 subpattern, failure);
86 }
87
88 // Iterator range that yields all fuzzy occurrences of |subpattern| in |text|.
89 // The values in the range are the positions just beyond each occurrence.
90 // Out-of-range iterators are dereferenced to base::StringPiece::npos.
91 //
92 // The |failure| array is the failure function created by
93 // BuildFailureFunctionFuzzy for the same |subpattern|, and containing at least
94 // subpattern.size() values.
95 template <typename IntType>
96 class AllOccurrencesFuzzy
97 : private AllOccurrences<IntType, IsEqualOrBothSeparators> {
98 public:
99 using Base = AllOccurrences<IntType, IsEqualOrBothSeparators>;
100 using BaseIterator = typename Base::Iterator;
101
102 class Iterator : public BaseIterator {
103 public:
104 Iterator& operator++() {
105 BaseIterator::operator++();
106 AdvanceWhileFalsePositive();
107 return *this;
108 }
109
110 Iterator operator++(int) {
111 Iterator copy(*this);
112 operator++();
113 return copy;
114 }
115
116 private:
117 friend class AllOccurrencesFuzzy;
118
119 explicit Iterator(const BaseIterator& base_iterator)
120 : BaseIterator(base_iterator) {
121 AdvanceWhileFalsePositive();
122 }
123
124 const AllOccurrencesFuzzy& owner() const {
125 return *static_cast<const AllOccurrencesFuzzy*>(BaseIterator::owner_);
126 }
127
128 // Moves the iterator forward until it either reaches the end, or meets an
129 // occurrence exactly matching all non-placeholder separators.
130 void AdvanceWhileFalsePositive() {
131 while (BaseIterator::match_end_ != base::StringPiece::npos && !IsMatch())
132 BaseIterator::operator++();
133 }
134
135 // Returns whether the currenct match meets all separators.
136 bool IsMatch() const {
137 DCHECK_LE(BaseIterator::match_end_, owner().text_.size());
138 DCHECK_GE(BaseIterator::match_end_, owner().pattern_.size());
139
140 // TODO(pkalinnikov): Store a vector of separator positions externally and
141 // check if they are equal to the corresponding characters of the |text|.
142 return StartsWithFuzzy(owner().text_.substr(BaseIterator::match_end_ -
143 owner().pattern_.size()),
144 owner().pattern_);
145 }
146 };
147
148 AllOccurrencesFuzzy(base::StringPiece text,
149 base::StringPiece subpattern,
150 const IntType* failure)
151 : Base(text, subpattern, failure) {}
152
153 Iterator begin() const { return Iterator(Base::begin()); }
154 Iterator end() const { return Iterator(Base::end()); }
155 };
156
157 } // namespace subresource_filter 64 } // namespace subresource_filter
158 65
159 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_ 66 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_FUZZY_PATTERN_MATCHING_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698