Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(160)

Side by Side Diff: net/base/lookup_string_in_fixed_set.cc

Issue 2641953009: [1 of 4] Support prefix queries against the effective_tld_names DAFSA (Closed)
Patch Set: Fixes. Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "net/base/lookup_string_in_fixed_set.h" 5 #include "net/base/lookup_string_in_fixed_set.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 8
9 namespace net { 9 namespace net {
10 10
11 namespace { 11 namespace {
12 12
13 // Read next offset from pos. 13 // Read next offset from |pos|, increment |offset| by that amount, and increment
14 // Returns true if an offset could be read, false otherwise. 14 // |pos| either to point to the start of the next encoded offset in its node, or
15 bool GetNextOffset(const unsigned char** pos, 15 // nullptr, if there are no remaining offsets.
16 const unsigned char* end, 16 //
17 const unsigned char** offset) { 17 // Returns true if an offset could be read; false otherwise.
18 if (*pos == end) 18 inline bool GetNextOffset(const unsigned char** pos,
19 const unsigned char** offset) {
20 if (*pos == nullptr)
19 return false; 21 return false;
20 22
21 // When reading an offset the byte array must always contain at least
22 // three more bytes to consume. First the offset to read, then a node
23 // to skip over and finally a destination node. No object can be smaller
24 // than one byte.
25 CHECK_LT(*pos + 2, end);
26 size_t bytes_consumed; 23 size_t bytes_consumed;
27 switch (**pos & 0x60) { 24 switch (**pos & 0x60) {
28 case 0x60: // Read three byte offset 25 case 0x60: // Read three byte offset
29 *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2]; 26 *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
30 bytes_consumed = 3; 27 bytes_consumed = 3;
31 break; 28 break;
32 case 0x40: // Read two byte offset 29 case 0x40: // Read two byte offset
33 *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1]; 30 *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
34 bytes_consumed = 2; 31 bytes_consumed = 2;
35 break; 32 break;
36 default: 33 default:
37 *offset += (*pos)[0] & 0x3F; 34 *offset += (*pos)[0] & 0x3F;
38 bytes_consumed = 1; 35 bytes_consumed = 1;
39 } 36 }
40 if ((**pos & 0x80) != 0) { 37 if ((**pos & 0x80) != 0) {
41 *pos = end; 38 *pos = nullptr;
42 } else { 39 } else {
43 *pos += bytes_consumed; 40 *pos += bytes_consumed;
44 } 41 }
45 return true; 42 return true;
46 } 43 }
47 44
48 // Check if byte at offset is last in label. 45 // Check if byte at |offset| is last in label.
49 bool IsEOL(const unsigned char* offset, const unsigned char* end) { 46 bool IsEOL(const unsigned char* offset) {
50 CHECK_LT(offset, end);
51 return (*offset & 0x80) != 0; 47 return (*offset & 0x80) != 0;
52 } 48 }
53 49
54 // Check if byte at offset matches first character in key. 50 // Check if byte at |offset| matches key. This version matches both end-of-label
55 // This version matches characters not last in label. 51 // chars and not-end-of-label chars.
56 bool IsMatch(const unsigned char* offset, 52 bool IsMatch(const unsigned char* offset, char key) {
57 const unsigned char* end, 53 return (*offset & 0x7F) == key;
58 const char* key) {
59 CHECK_LT(offset, end);
60 return *offset == *key;
61 } 54 }
62 55
63 // Check if byte at offset matches first character in key. 56 // Read return value at |offset|, if it is a return value. Returns true if a
64 // This version matches characters last in label. 57 // return value could be read, false otherwise.
65 bool IsEndCharMatch(const unsigned char* offset, 58 bool GetReturnValue(const unsigned char* offset, int* return_value) {
66 const unsigned char* end, 59 // Return values are always encoded as end-of-label chars (so the high bit is
67 const char* key) { 60 // set). So byte values in the inclusive range [0x80, 0x9F] encode the return
68 CHECK_LT(offset, end); 61 // values 0 through 31 (though make_dafsa.py doesn't currently encode values
69 return *offset == (*key | 0x80); 62 // higher than 7). The following code does that translation.
70 }
71
72 // Read return value at offset.
73 // Returns true if a return value could be read, false otherwise.
74 bool GetReturnValue(const unsigned char* offset,
75 const unsigned char* end,
76 int* return_value) {
77 CHECK_LT(offset, end);
78 if ((*offset & 0xE0) == 0x80) { 63 if ((*offset & 0xE0) == 0x80) {
79 *return_value = *offset & 0x0F; 64 *return_value = *offset & 0x1F;
80 return true; 65 return true;
81 } 66 }
82 return false; 67 return false;
83 } 68 }
84 69
85 } // namespace 70 } // namespace
86 71
87 // Lookup a domain key in a byte array generated by make_dafsa.py. 72 FixedSetIncrementalLookup::FixedSetIncrementalLookup(const unsigned char* graph,
88 // The rule type is returned if key is found, otherwise kDafsaNotFound is 73 size_t length)
89 // returned. 74 : pos_(graph), end_(graph + length), pos_is_label_character_(false) {}
75
76 FixedSetIncrementalLookup::FixedSetIncrementalLookup(
77 const FixedSetIncrementalLookup& other) = default;
78
79 FixedSetIncrementalLookup& FixedSetIncrementalLookup::operator=(
80 const FixedSetIncrementalLookup& other) = default;
81
82 FixedSetIncrementalLookup::~FixedSetIncrementalLookup() {}
83
84 bool FixedSetIncrementalLookup::Advance(char input) {
85 if (!pos_) {
86 // A previous input exhausted the graph, so there are no possible matches.
87 return false;
88 }
89
90 // Only ASCII printable chars are supported by the current DAFSA format -- the
91 // high bit (values 0x80-0xFF) is reserved as a label-end signifier, and the
92 // low values (values 0x00-0x1F) are reserved to encode the return values. So
93 // values outside this range will never be in the dictionary.
94 if (input >= 0x20) {
95 if (pos_is_label_character_) {
96 // Currently processing a label, so it is only necessary to check the byte
97 // at |pos_| to see if it encodes a character matching |input|.
98 bool is_last_char_in_label = IsEOL(pos_);
99 bool is_match = IsMatch(pos_, input);
100 if (is_match) {
101 // If this is not the last character in the label, the next byte should
102 // be interpreted as a character or return value. Otherwise, the next
103 // byte should be interpreted as a list of child node offsets.
104 ++pos_;
105 DCHECK(pos_ < end_);
106 pos_is_label_character_ = !is_last_char_in_label;
107 return true;
108 }
109 } else {
110 const unsigned char* offset = pos_;
111 // Read offsets from |pos_| until the label of the child node at |offset|
112 // matches |input|, or until there are no more offsets.
113 while (GetNextOffset(&pos_, &offset)) {
114 DCHECK(offset < end_);
115 DCHECK((pos_ == nullptr) || (pos_ < end_));
116
117 // |offset| points to a DAFSA node that is a child of the original node.
118 //
119 // The low 7 bits of a node encodes a character value; the high bit
120 // indicates whether it's the last character in the label.
121 //
122 // Note that |*offset| could also be a result code value, but these are
123 // really just out-of-range ASCII values, encoded the same way as
124 // characters. Since |input| was already validated as a printable ASCII
125 // value ASCII value, IsMatch will never return true if |offset| is a
126 // result code.
127 bool is_last_char_in_label = IsEOL(offset);
128 bool is_match = IsMatch(offset, input);
129
130 if (is_match) {
131 // If this is not the last character in the label, the next byte
132 // should be interpreted as a character or return value. Otherwise,
133 // the next byte should be interpreted as a list of child node
134 // offsets.
135 pos_ = offset + 1;
136 DCHECK(pos_ < end_);
137 pos_is_label_character_ = !is_last_char_in_label;
138 return true;
139 }
140 }
141 }
142 }
143
144 // If no match was found, then end of the DAFSA has been reached.
145 pos_ = nullptr;
146 pos_is_label_character_ = false;
147 return false;
148 }
149
150 int FixedSetIncrementalLookup::GetResultForCurrentSequence() const {
151 int value = kDafsaNotFound;
152 // Look to see if there is a next character that's a return value.
153 if (pos_is_label_character_) {
154 // Currently processing a label, so it is only necessary to check the byte
155 // at |pos_| to see if encodes a return value.
156 GetReturnValue(pos_, &value);
157 } else {
158 // Otherwise, |pos_| is an offset list (or nullptr). Explore the list of
159 // child nodes (given by their offsets) to find one whose label is a result
160 // code.
161 //
162 // This search uses a temporary copy of |pos_|, since mutating |pos_| could
163 // skip over a node that would be important to a subsequent Advance() call.
164 const unsigned char* temp_pos = pos_;
165
166 // Read offsets from |temp_pos| until either |temp_pos| is nullptr or until
167 // the byte at |offset| contains a result code (encoded as an ASCII
168 // character below 0x20).
169 const unsigned char* offset = pos_;
170 while (GetNextOffset(&temp_pos, &offset)) {
171 DCHECK(offset < end_);
172 DCHECK((temp_pos == nullptr) || temp_pos < end_);
173 if (GetReturnValue(offset, &value))
174 break;
175 }
176 }
177 return value;
178 }
179
90 int LookupStringInFixedSet(const unsigned char* graph, 180 int LookupStringInFixedSet(const unsigned char* graph,
91 size_t length, 181 size_t length,
92 const char* key, 182 const char* key,
93 size_t key_length) { 183 size_t key_length) {
94 const unsigned char* pos = graph; 184 // Do an incremental lookup until either the end of the graph is reached, or
95 const unsigned char* end = graph + length; 185 // until every character in |key| is consumed.
96 const unsigned char* offset = pos; 186 FixedSetIncrementalLookup lookup(graph, length);
97 const char* key_end = key + key_length; 187 const char* key_end = key + key_length;
98 while (GetNextOffset(&pos, end, &offset)) { 188 while (key != key_end) {
99 // char <char>+ end_char offsets 189 if (!lookup.Advance(*key))
100 // char <char>+ return value 190 return kDafsaNotFound;
101 // char end_char offsets 191 key++;
102 // char return value
103 // end_char offsets
104 // return_value
105 bool did_consume = false;
106 if (key != key_end && !IsEOL(offset, end)) {
107 // Leading <char> is not a match. Don't dive into this child
108 if (!IsMatch(offset, end, key))
109 continue;
110 did_consume = true;
111 ++offset;
112 ++key;
113 // Possible matches at this point:
114 // <char>+ end_char offsets
115 // <char>+ return value
116 // end_char offsets
117 // return value
118 // Remove all remaining <char> nodes possible
119 while (!IsEOL(offset, end) && key != key_end) {
120 if (!IsMatch(offset, end, key))
121 return kDafsaNotFound;
122 ++key;
123 ++offset;
124 }
125 }
126 // Possible matches at this point:
127 // end_char offsets
128 // return_value
129 // If one or more <char> elements were consumed, a failure
130 // to match is terminal. Otherwise, try the next node.
131 if (key == key_end) {
132 int return_value;
133 if (GetReturnValue(offset, end, &return_value))
134 return return_value;
135 // The DAFSA guarantees that if the first char is a match, all
136 // remaining char elements MUST match if the key is truly present.
137 if (did_consume)
138 return kDafsaNotFound;
139 continue;
140 }
141 if (!IsEndCharMatch(offset, end, key)) {
142 if (did_consume)
143 return kDafsaNotFound; // Unexpected
144 continue;
145 }
146 ++key;
147 pos = ++offset; // Dive into child
148 } 192 }
149 return kDafsaNotFound; // No match 193 // The entire input was consumed without reaching the end of the graph. Return
194 // the result code (if present) for the current position, or kDafsaNotFound.
195 return lookup.GetResultForCurrentSequence();
150 } 196 }
151 197
152 } // namespace net 198 } // namespace net
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698