OLD | NEW |
| (Empty) |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #ifndef NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ | |
6 #define NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ | |
7 | |
8 #include <stddef.h> | |
9 | |
10 #include "net/base/net_export.h" | |
11 | |
12 namespace net { | |
13 | |
14 enum { | |
15 kDafsaNotFound = -1, // key is not in set | |
16 kDafsaFound = 0, // key is in set | |
17 // The following return values are used by the implementation of | |
18 // GetDomainAndRegistry() and are probably not generally useful. | |
19 kDafsaExceptionRule = 1, // key excluded from set via exception | |
20 kDafsaWildcardRule = 2, // key matched a wildcard rule | |
21 kDafsaPrivateRule = 4, // key matched a private rule | |
22 }; | |
23 | |
24 // Looks up the string |key| with length |key_length| in a fixed set of | |
25 // strings. The set of strings must be known at compile time. It is converted to | |
26 // a graph structure named a DAFSA (Deterministic Acyclic Finite State | |
27 // Automaton) by the script make_dafsa.py during compilation. This permits | |
28 // efficient (in time and space) lookup. The graph generated by make_dafsa.py | |
29 // takes the form of a constant byte array which should be supplied via the | |
30 // |graph| and |length| parameters. The return value is kDafsaNotFound, | |
31 // kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule, | |
32 // kDafsaWildcardRule and kDafsaPrivateRule ORed together. | |
33 // | |
34 // TODO(nick): Replace this with FixedSetIncrementalLookup everywhere. | |
35 NET_EXPORT int LookupStringInFixedSet(const unsigned char* graph, | |
36 size_t length, | |
37 const char* key, | |
38 size_t key_length); | |
39 | |
40 // FixedSetIncrementalLookup provides efficient membership and prefix queries | |
41 // against a fixed set of strings. The set of strings must be known at compile | |
42 // time. The set is converted to a graph structure named a DAFSA (Deterministic | |
43 // Acyclic Finite State Automaton) by the script //net/tools/dafsa/make_dafsa.py | |
44 // during compilation. The conversion generates a C++ header file defining the | |
45 // encoded graph as a constant byte array. This class provides a fast, constant- | |
46 // space lookup operation against such byte arrays. | |
47 // | |
48 // The lookup proceeds incrementally, with input characters provided one at a | |
49 // time. This approach allow queries of the form: "given an input string, which | |
50 // prefixes of that string that appear in the fixed set?" As the matching | |
51 // prefixes (and their result codes) are enumerated, the most suitable match | |
52 // among them can be selected in a single pass. | |
53 // | |
54 // This class can also be used to perform suffix queries (instead of prefix | |
55 // queries) against a fixed set, so long as the DAFSA is constructed on reversed | |
56 // values, and the input is provided in reverse order. | |
57 // | |
58 // Example usage for simple membership query; |input| is a std::string: | |
59 // | |
60 // FixedSetIncrementalLookup lookup(kDafsa, sizeof(kDafsa)); | |
61 // for (char c : input) { | |
62 // if (!lookup.Advance(c)) | |
63 // return false; | |
64 // } | |
65 // return lookup.GetResultForCurrentSequence() != kDafsaNotFound; | |
66 // | |
67 // Example usage for 'find longest prefix in set with result code == 3' query: | |
68 // | |
69 // FixedSetIncrementalLookup prefix_lookup(kDafsa, sizeof(kDafsa)); | |
70 // size_t longest_match_end = 0; | |
71 // for (size_t i = 0; i < input.length(); ++i) { | |
72 // if (!prefix_lookup.Advance(input[i])) | |
73 // break; | |
74 // if (prefix_lookup.GetResultForCurrentSequence() == 3) | |
75 // longest_match_end = (i + 1); | |
76 // } | |
77 // return input.substr(0, longest_match_end); | |
78 // | |
79 class NET_EXPORT FixedSetIncrementalLookup { | |
80 public: | |
81 // Begin a lookup against the provided fixed set. |graph| and |length| | |
82 // describe a byte buffer generated by the make_dafsa.py script, as described | |
83 // in the class comment. | |
84 // | |
85 // FixedSetIncrementalLookup is initialized to a state corresponding to the | |
86 // empty input sequence. Calling GetResultForCurrentSequence() in the initial | |
87 // state would indicate whether the empty string appears in the fixed set. | |
88 // Characters can be added to the sequence by calling Advance(), and the | |
89 // lookup result can be checked after each addition by calling | |
90 // GetResultForCurrentSequence(). | |
91 FixedSetIncrementalLookup(const unsigned char* graph, size_t length); | |
92 | |
93 // FixedSetIncrementalLookup is copyable so that callers can save/restore | |
94 // their position in the search. This is for cases where branching or | |
95 // backtracking might be required (e.g. to probe for the presence of a | |
96 // designated wildcard character). | |
97 FixedSetIncrementalLookup(const FixedSetIncrementalLookup&); | |
98 FixedSetIncrementalLookup& operator=(const FixedSetIncrementalLookup&); | |
99 | |
100 ~FixedSetIncrementalLookup(); | |
101 | |
102 // Advance the query by adding a character to the input sequence. |input| can | |
103 // be any char value, but only ASCII characters will ever result in matches, | |
104 // since the fixed set itself is limited to ASCII strings. | |
105 // | |
106 // Returns true if the resulting input sequence either appears in the fixed | |
107 // set itself, or is a prefix of some longer string in the fixed set. Returns | |
108 // false otherwise, implying that the graph is exhausted and | |
109 // GetResultForCurrentSequence() will return kDafsaNotFound. | |
110 // | |
111 // Once Advance() has returned false, the caller can safely stop feeding more | |
112 // characters, as subsequent calls to Advance() will return false and have no | |
113 // effect. | |
114 bool Advance(char input); | |
115 | |
116 // Returns the result code corresponding to the input sequence provided thus | |
117 // far to Advance(). | |
118 // | |
119 // If the sequence does not appear in the fixed set, the return value is | |
120 // kDafsaNotFound. Otherwise, the value is a non-negative integer (currently | |
121 // limited to 0-7) corresponding to the result code for that string, as listed | |
122 // in the .gperf file from which the DAFSA was generated. For | |
123 // GetDomainAndRegistry DAFSAs, these values should be interpreted as a | |
124 // bitmask of kDafsaExceptionRule, kDafsaWildcardRule, and kDafsaPrivateRule. | |
125 // | |
126 // It is okay to call this function, and then extend the sequence further by | |
127 // calling Advance(). | |
128 int GetResultForCurrentSequence() const; | |
129 | |
130 private: | |
131 // Pointer to the current position in the graph indicating the current state | |
132 // of the automaton, or nullptr if the graph is exhausted. | |
133 const unsigned char* pos_; | |
134 | |
135 // Pointer just past the end of the graph. |pos_| should never get here. This | |
136 // is used only in DCHECKs. | |
137 const unsigned char* end_; | |
138 | |
139 // Contains the current decoder state. If true, |pos_| points to a label | |
140 // character or a return code. If false, |pos_| points to a sequence of | |
141 // offsets that indicate the child nodes of the current state. | |
142 bool pos_is_label_character_; | |
143 }; | |
144 | |
145 } // namespace net | |
146 | |
147 #endif // NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ | |
OLD | NEW |