OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ | 5 #ifndef CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ |
6 #define CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ | 6 #define CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ |
7 | 7 |
8 #include <map> | 8 #include <map> |
9 #include <set> | 9 #include <set> |
10 #include <string> | 10 #include <string> |
11 #include <vector> | 11 #include <vector> |
12 | 12 |
13 #include "base/basictypes.h" | 13 #include "base/basictypes.h" |
| 14 #include "chrome/common/extensions/matcher/string_pattern.h" |
14 | 15 |
15 namespace extensions { | 16 namespace extensions { |
16 | 17 |
17 // An individual pattern of the SubstringSetMatcher. A pattern consists of | |
18 // a string (interpreted as individual bytes, no character encoding) and an | |
19 // identifier. | |
20 // Each pattern that matches a string emits its ID to the caller of | |
21 // SubstringSetMatcher::Match(). This helps the caller to figure out what | |
22 // patterns matched a string. All patterns registered to a SubstringSetMatcher | |
23 // need to contain unique IDs. | |
24 class SubstringPattern { | |
25 public: | |
26 typedef int ID; | |
27 | |
28 SubstringPattern(const std::string& pattern, ID id); | |
29 ~SubstringPattern(); | |
30 const std::string& pattern() const { return pattern_; } | |
31 ID id() const { return id_; } | |
32 | |
33 bool operator<(const SubstringPattern& rhs) const; | |
34 | |
35 private: | |
36 std::string pattern_; | |
37 ID id_; | |
38 | |
39 DISALLOW_COPY_AND_ASSIGN(SubstringPattern); | |
40 }; | |
41 | |
42 // Class that store a set of string patterns and can find for a string S, | 18 // Class that store a set of string patterns and can find for a string S, |
43 // which string patterns occur in S. | 19 // which string patterns occur in S. |
44 class SubstringSetMatcher { | 20 class SubstringSetMatcher { |
45 public: | 21 public: |
46 SubstringSetMatcher(); | 22 SubstringSetMatcher(); |
47 ~SubstringSetMatcher(); | 23 ~SubstringSetMatcher(); |
48 | 24 |
49 // Registers all |patterns|. The ownership remains with the caller. | 25 // Registers all |patterns|. The ownership remains with the caller. |
50 // The same pattern cannot be registered twice and each pattern needs to have | 26 // The same pattern cannot be registered twice and each pattern needs to have |
51 // a unique ID. | 27 // a unique ID. |
52 // Ownership of the patterns remains with the caller. | 28 // Ownership of the patterns remains with the caller. |
53 void RegisterPatterns(const std::vector<const SubstringPattern*>& patterns); | 29 void RegisterPatterns(const std::vector<const StringPattern*>& patterns); |
54 | 30 |
55 // Unregisters the passed |patterns|. | 31 // Unregisters the passed |patterns|. |
56 void UnregisterPatterns(const std::vector<const SubstringPattern*>& patterns); | 32 void UnregisterPatterns(const std::vector<const StringPattern*>& patterns); |
57 | 33 |
58 // Analogous to RegisterPatterns and UnregisterPatterns but executes both | 34 // Analogous to RegisterPatterns and UnregisterPatterns but executes both |
59 // operations in one step, which is cheaper in the execution. | 35 // operations in one step, which is cheaper in the execution. |
60 void RegisterAndUnregisterPatterns( | 36 void RegisterAndUnregisterPatterns( |
61 const std::vector<const SubstringPattern*>& to_register, | 37 const std::vector<const StringPattern*>& to_register, |
62 const std::vector<const SubstringPattern*>& to_unregister); | 38 const std::vector<const StringPattern*>& to_unregister); |
63 | 39 |
64 // Matches |text| against all registered SubstringPatterns. Stores the IDs | 40 // Matches |text| against all registered StringPatterns. Stores the IDs |
65 // of matching patterns in |matches|. |matches| is not cleared before adding | 41 // of matching patterns in |matches|. |matches| is not cleared before adding |
66 // to it. | 42 // to it. |
67 bool Match(const std::string& text, | 43 bool Match(const std::string& text, |
68 std::set<SubstringPattern::ID>* matches) const; | 44 std::set<StringPattern::ID>* matches) const; |
69 | 45 |
70 // Returns true if this object retains no allocated data. Only for debugging. | 46 // Returns true if this object retains no allocated data. Only for debugging. |
71 bool IsEmpty() const; | 47 bool IsEmpty() const; |
72 | 48 |
73 private: | 49 private: |
74 // A node of an Aho Corasick Tree. This is implemented according to | 50 // A node of an Aho Corasick Tree. This is implemented according to |
75 // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf | 51 // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf |
76 // | 52 // |
77 // The algorithm is based on the idea of building a trie of all registered | 53 // The algorithm is based on the idea of building a trie of all registered |
78 // patterns. Each node of the tree is annotated with a set of pattern | 54 // patterns. Each node of the tree is annotated with a set of pattern |
(...skipping 19 matching lines...) Expand all Loading... |
98 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that | 74 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that |
99 // is a prefix of any registered pattern. | 75 // is a prefix of any registered pattern. |
100 // | 76 // |
101 // If your brain thinks "Forget it, let's go shopping.", don't worry. | 77 // If your brain thinks "Forget it, let's go shopping.", don't worry. |
102 // Take a nap and read an introductory text on the Aho Corasick algorithm. | 78 // Take a nap and read an introductory text on the Aho Corasick algorithm. |
103 // It will make sense. Eventually. | 79 // It will make sense. Eventually. |
104 class AhoCorasickNode { | 80 class AhoCorasickNode { |
105 public: | 81 public: |
106 // Key: label of the edge, value: node index in |tree_| of parent class. | 82 // Key: label of the edge, value: node index in |tree_| of parent class. |
107 typedef std::map<char, int> Edges; | 83 typedef std::map<char, int> Edges; |
108 typedef std::set<SubstringPattern::ID> Matches; | 84 typedef std::set<StringPattern::ID> Matches; |
109 | 85 |
110 AhoCorasickNode(); | 86 AhoCorasickNode(); |
111 ~AhoCorasickNode(); | 87 ~AhoCorasickNode(); |
112 AhoCorasickNode(const AhoCorasickNode& other); | 88 AhoCorasickNode(const AhoCorasickNode& other); |
113 AhoCorasickNode& operator=(const AhoCorasickNode& other); | 89 AhoCorasickNode& operator=(const AhoCorasickNode& other); |
114 | 90 |
115 bool HasEdge(char c) const; | 91 bool HasEdge(char c) const; |
116 int GetEdge(char c) const; | 92 int GetEdge(char c) const; |
117 void SetEdge(char c, int node); | 93 void SetEdge(char c, int node); |
118 const Edges& edges() const { return edges_; } | 94 const Edges& edges() const { return edges_; } |
119 | 95 |
120 int failure() const { return failure_; } | 96 int failure() const { return failure_; } |
121 void set_failure(int failure) { failure_ = failure; } | 97 void set_failure(int failure) { failure_ = failure; } |
122 | 98 |
123 void AddMatch(SubstringPattern::ID id); | 99 void AddMatch(StringPattern::ID id); |
124 void AddMatches(const Matches& matches); | 100 void AddMatches(const Matches& matches); |
125 const Matches& matches() const { return matches_; } | 101 const Matches& matches() const { return matches_; } |
126 | 102 |
127 private: | 103 private: |
128 // Outgoing edges of current node. | 104 // Outgoing edges of current node. |
129 Edges edges_; | 105 Edges edges_; |
130 | 106 |
131 // Node index that failure edge leads to. | 107 // Node index that failure edge leads to. |
132 int failure_; | 108 int failure_; |
133 | 109 |
134 // Identifiers of matches. | 110 // Identifiers of matches. |
135 Matches matches_; | 111 Matches matches_; |
136 }; | 112 }; |
137 | 113 |
138 void RebuildAhoCorasickTree(); | 114 void RebuildAhoCorasickTree(); |
139 | 115 |
140 // Inserts a path for |pattern->pattern()| into the tree and adds | 116 // Inserts a path for |pattern->pattern()| into the tree and adds |
141 // |pattern->id()| to the set of matches. Ownership of |pattern| remains with | 117 // |pattern->id()| to the set of matches. Ownership of |pattern| remains with |
142 // the caller. | 118 // the caller. |
143 void InsertPatternIntoAhoCorasickTree(const SubstringPattern* pattern); | 119 void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern); |
144 void CreateFailureEdges(); | 120 void CreateFailureEdges(); |
145 | 121 |
146 // Set of all registered SubstringPatterns. Used to regenerate the | 122 // Set of all registered StringPatterns. Used to regenerate the |
147 // Aho-Corasick tree in case patterns are registered or unregistered. | 123 // Aho-Corasick tree in case patterns are registered or unregistered. |
148 typedef std::map<SubstringPattern::ID, const SubstringPattern*> | 124 typedef std::map<StringPattern::ID, const StringPattern*> |
149 SubstringPatternSet; | 125 SubstringPatternSet; |
150 SubstringPatternSet patterns_; | 126 SubstringPatternSet patterns_; |
151 | 127 |
152 // The nodes of a Aho-Corasick tree. | 128 // The nodes of a Aho-Corasick tree. |
153 std::vector<AhoCorasickNode> tree_; | 129 std::vector<AhoCorasickNode> tree_; |
154 | 130 |
155 DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher); | 131 DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher); |
156 }; | 132 }; |
157 | 133 |
158 } // namespace extensions | 134 } // namespace extensions |
159 | 135 |
160 #endif // CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ | 136 #endif // CHROME_COMMON_EXTENSIONS_MATCHER_SUBSTRING_SET_MATCHER_H_ |
OLD | NEW |