OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ | 5 #ifndef NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ |
6 #define NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ | 6 #define NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ |
7 | 7 |
8 #include <stddef.h> | 8 #include <stddef.h> |
9 | 9 |
10 #include "net/base/net_export.h" | 10 #include "net/base/net_export.h" |
(...skipping 12 matching lines...) Expand all Loading... |
23 | 23 |
24 // Looks up the string |key| with length |key_length| in a fixed set of | 24 // Looks up the string |key| with length |key_length| in a fixed set of |
25 // strings. The set of strings must be known at compile time. It is converted to | 25 // strings. The set of strings must be known at compile time. It is converted to |
26 // a graph structure named a DAFSA (Deterministic Acyclic Finite State | 26 // a graph structure named a DAFSA (Deterministic Acyclic Finite State |
27 // Automaton) by the script make_dafsa.py during compilation. This permits | 27 // Automaton) by the script make_dafsa.py during compilation. This permits |
28 // efficient (in time and space) lookup. The graph generated by make_dafsa.py | 28 // efficient (in time and space) lookup. The graph generated by make_dafsa.py |
29 // takes the form of a constant byte array which should be supplied via the | 29 // takes the form of a constant byte array which should be supplied via the |
30 // |graph| and |length| parameters. The return value is kDafsaNotFound, | 30 // |graph| and |length| parameters. The return value is kDafsaNotFound, |
31 // kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule, | 31 // kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule, |
32 // kDafsaWildcardRule and kDafsaPrivateRule ORed together. | 32 // kDafsaWildcardRule and kDafsaPrivateRule ORed together. |
| 33 // |
| 34 // TODO(nick): Replace this with FixedSetIncrementalLookup everywhere. |
33 NET_EXPORT int LookupStringInFixedSet(const unsigned char* graph, | 35 NET_EXPORT int LookupStringInFixedSet(const unsigned char* graph, |
34 size_t length, | 36 size_t length, |
35 const char* key, | 37 const char* key, |
36 size_t key_length); | 38 size_t key_length); |
37 | 39 |
| 40 // FixedSetIncrementalLookup provides efficient membership and prefix queries |
| 41 // against a fixed set of strings. The set of strings must be known at compile |
| 42 // time. The set is converted to a graph structure named a DAFSA (Deterministic |
| 43 // Acyclic Finite State Automaton) by the script //net/tools/dafsa/make_dafsa.py |
| 44 // during compilation. The conversion generates a C++ header file defining the |
| 45 // encoded graph as a constant byte array. This class provides a fast, constant- |
| 46 // space lookup operation against such byte arrays. |
| 47 // |
| 48 // The lookup proceeds incrementally, with input characters provided one at a |
| 49 // time. This approach allow queries of the form: "given an input string, which |
| 50 // prefixes of that string that appear in the fixed set?" As the matching |
| 51 // prefixes (and their result codes) are enumerated, the most suitable match |
| 52 // among them can be selected in a single pass. |
| 53 // |
| 54 // This class can also be used to perform suffix queries (instead of prefix |
| 55 // queries) against a fixed set, so long as the DAFSA is constructed on reversed |
| 56 // values, and the input is provided in reverse order. |
| 57 // |
| 58 // Example usage for simple membership query; |input| is a std::string: |
| 59 // |
| 60 // FixedSetIncrementalLookup lookup(kDafsa, sizeof(kDafsa)); |
| 61 // for (char c : input) { |
| 62 // if (!lookup.Advance(c)) |
| 63 // return false; |
| 64 // } |
| 65 // return lookup.GetResultForCurrentSequence() != kDafsaNotFound; |
| 66 // |
| 67 // Example usage for 'find longest prefix in set with result code == 3' query: |
| 68 // |
| 69 // FixedSetIncrementalLookup prefix_lookup(kDafsa, sizeof(kDafsa)); |
| 70 // size_t longest_match_end = 0; |
| 71 // for (size_t i = 0; i < input.length(); ++i) { |
| 72 // if (!prefix_lookup.Advance(input[i])) |
| 73 // break; |
| 74 // if (prefix_lookup.GetResultForCurrentSequence() == 3) |
| 75 // longest_match_end = (i + 1); |
| 76 // } |
| 77 // return input.substr(0, longest_match_end); |
| 78 // |
| 79 class NET_EXPORT FixedSetIncrementalLookup { |
| 80 public: |
| 81 // Begin a lookup against the provided fixed set. |graph| and |length| |
| 82 // describe a byte buffer generated by the make_dafsa.py script, as described |
| 83 // in the class comment. |
| 84 // |
| 85 // FixedSetIncrementalLookup is initialized to a state corresponding to the |
| 86 // empty input sequence. Calling GetResultForCurrentSequence() in the initial |
| 87 // state would indicate whether the empty string appears in the fixed set. |
| 88 // Characters can be added to the sequence by calling Advance(), and the |
| 89 // lookup result can be checked after each addition by calling |
| 90 // GetResultForCurrentSequence(). |
| 91 FixedSetIncrementalLookup(const unsigned char* graph, size_t length); |
| 92 |
| 93 // FixedSetIncrementalLookup is copyable so that callers can save/restore |
| 94 // their position in the search. This is for cases where branching or |
| 95 // backtracking might be required (e.g. to probe for the presence of a |
| 96 // designated wildcard character). |
| 97 FixedSetIncrementalLookup(const FixedSetIncrementalLookup&); |
| 98 FixedSetIncrementalLookup& operator=(const FixedSetIncrementalLookup&); |
| 99 |
| 100 ~FixedSetIncrementalLookup(); |
| 101 |
| 102 // Advance the query by adding a character to the input sequence. |input| can |
| 103 // be any char value, but only ASCII characters will ever result in matches, |
| 104 // since the fixed set itself is limited to ASCII strings. |
| 105 // |
| 106 // Returns true if the resulting input sequence either appears in the fixed |
| 107 // set itself, or is a prefix of some longer string in the fixed set. Returns |
| 108 // false otherwise, implying that the graph is exhausted and |
| 109 // GetResultForCurrentSequence() will return kDafsaNotFound. |
| 110 // |
| 111 // Once Advance() has returned false, the caller can safely stop feeding more |
| 112 // characters, as subsequent calls to Advance() will return false and have no |
| 113 // effect. |
| 114 bool Advance(char input); |
| 115 |
| 116 // Returns the result code corresponding to the input sequence provided thus |
| 117 // far to Advance(). |
| 118 // |
| 119 // If the sequence does not appear in the fixed set, the return value is |
| 120 // kDafsaNotFound. Otherwise, the value is a non-negative integer (currently |
| 121 // limited to 0-7) corresponding to the result code for that string, as listed |
| 122 // in the .gperf file from which the DAFSA was generated. For |
| 123 // GetDomainAndRegistry DAFSAs, these values should be interpreted as a |
| 124 // bitmask of kDafsaExceptionRule, kDafsaWildcardRule, and kDafsaPrivateRule. |
| 125 // |
| 126 // It is okay to call this function, and then extend the sequence further by |
| 127 // calling Advance(). |
| 128 int GetResultForCurrentSequence() const; |
| 129 |
| 130 private: |
| 131 // Pointer to the current position in the graph indicating the current state |
| 132 // of the automaton, or nullptr if the graph is exhausted. |
| 133 const unsigned char* pos_; |
| 134 |
| 135 // Pointer just past the end of the graph. |pos_| should never get here. This |
| 136 // is used only in DCHECKs. |
| 137 const unsigned char* end_; |
| 138 |
| 139 // Contains the current decoder state. If true, |pos_| points to a label |
| 140 // character or a return code. If false, |pos_| points to a sequence of |
| 141 // offsets that indicate the child nodes of the current state. |
| 142 bool pos_is_label_character_; |
| 143 }; |
| 144 |
38 } // namespace net | 145 } // namespace net |
39 | 146 |
40 #endif // NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ | 147 #endif // NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ |
OLD | NEW |