OLD | NEW |
| (Empty) |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "net/base/lookup_string_in_fixed_set.h" | |
6 | |
7 #include "base/logging.h" | |
8 | |
9 namespace net { | |
10 | |
11 namespace { | |
12 | |
13 // Read next offset from |pos|, increment |offset| by that amount, and increment | |
14 // |pos| either to point to the start of the next encoded offset in its node, or | |
15 // nullptr, if there are no remaining offsets. | |
16 // | |
17 // Returns true if an offset could be read; false otherwise. | |
18 inline bool GetNextOffset(const unsigned char** pos, | |
19 const unsigned char** offset) { | |
20 if (*pos == nullptr) | |
21 return false; | |
22 | |
23 size_t bytes_consumed; | |
24 switch (**pos & 0x60) { | |
25 case 0x60: // Read three byte offset | |
26 *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2]; | |
27 bytes_consumed = 3; | |
28 break; | |
29 case 0x40: // Read two byte offset | |
30 *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1]; | |
31 bytes_consumed = 2; | |
32 break; | |
33 default: | |
34 *offset += (*pos)[0] & 0x3F; | |
35 bytes_consumed = 1; | |
36 } | |
37 if ((**pos & 0x80) != 0) { | |
38 *pos = nullptr; | |
39 } else { | |
40 *pos += bytes_consumed; | |
41 } | |
42 return true; | |
43 } | |
44 | |
45 // Check if byte at |offset| is last in label. | |
46 bool IsEOL(const unsigned char* offset) { | |
47 return (*offset & 0x80) != 0; | |
48 } | |
49 | |
50 // Check if byte at |offset| matches key. This version matches both end-of-label | |
51 // chars and not-end-of-label chars. | |
52 bool IsMatch(const unsigned char* offset, char key) { | |
53 return (*offset & 0x7F) == key; | |
54 } | |
55 | |
56 // Read return value at |offset|, if it is a return value. Returns true if a | |
57 // return value could be read, false otherwise. | |
58 bool GetReturnValue(const unsigned char* offset, int* return_value) { | |
59 // Return values are always encoded as end-of-label chars (so the high bit is | |
60 // set). So byte values in the inclusive range [0x80, 0x9F] encode the return | |
61 // values 0 through 31 (though make_dafsa.py doesn't currently encode values | |
62 // higher than 7). The following code does that translation. | |
63 if ((*offset & 0xE0) == 0x80) { | |
64 *return_value = *offset & 0x1F; | |
65 return true; | |
66 } | |
67 return false; | |
68 } | |
69 | |
70 } // namespace | |
71 | |
72 FixedSetIncrementalLookup::FixedSetIncrementalLookup(const unsigned char* graph, | |
73 size_t length) | |
74 : pos_(graph), end_(graph + length), pos_is_label_character_(false) {} | |
75 | |
76 FixedSetIncrementalLookup::FixedSetIncrementalLookup( | |
77 const FixedSetIncrementalLookup& other) = default; | |
78 | |
79 FixedSetIncrementalLookup& FixedSetIncrementalLookup::operator=( | |
80 const FixedSetIncrementalLookup& other) = default; | |
81 | |
82 FixedSetIncrementalLookup::~FixedSetIncrementalLookup() {} | |
83 | |
84 bool FixedSetIncrementalLookup::Advance(char input) { | |
85 if (!pos_) { | |
86 // A previous input exhausted the graph, so there are no possible matches. | |
87 return false; | |
88 } | |
89 | |
90 // Only ASCII printable chars are supported by the current DAFSA format -- the | |
91 // high bit (values 0x80-0xFF) is reserved as a label-end signifier, and the | |
92 // low values (values 0x00-0x1F) are reserved to encode the return values. So | |
93 // values outside this range will never be in the dictionary. | |
94 if (input >= 0x20) { | |
95 if (pos_is_label_character_) { | |
96 // Currently processing a label, so it is only necessary to check the byte | |
97 // at |pos_| to see if it encodes a character matching |input|. | |
98 bool is_last_char_in_label = IsEOL(pos_); | |
99 bool is_match = IsMatch(pos_, input); | |
100 if (is_match) { | |
101 // If this is not the last character in the label, the next byte should | |
102 // be interpreted as a character or return value. Otherwise, the next | |
103 // byte should be interpreted as a list of child node offsets. | |
104 ++pos_; | |
105 DCHECK(pos_ < end_); | |
106 pos_is_label_character_ = !is_last_char_in_label; | |
107 return true; | |
108 } | |
109 } else { | |
110 const unsigned char* offset = pos_; | |
111 // Read offsets from |pos_| until the label of the child node at |offset| | |
112 // matches |input|, or until there are no more offsets. | |
113 while (GetNextOffset(&pos_, &offset)) { | |
114 DCHECK(offset < end_); | |
115 DCHECK((pos_ == nullptr) || (pos_ < end_)); | |
116 | |
117 // |offset| points to a DAFSA node that is a child of the original node. | |
118 // | |
119 // The low 7 bits of a node encodes a character value; the high bit | |
120 // indicates whether it's the last character in the label. | |
121 // | |
122 // Note that |*offset| could also be a result code value, but these are | |
123 // really just out-of-range ASCII values, encoded the same way as | |
124 // characters. Since |input| was already validated as a printable ASCII | |
125 // value ASCII value, IsMatch will never return true if |offset| is a | |
126 // result code. | |
127 bool is_last_char_in_label = IsEOL(offset); | |
128 bool is_match = IsMatch(offset, input); | |
129 | |
130 if (is_match) { | |
131 // If this is not the last character in the label, the next byte | |
132 // should be interpreted as a character or return value. Otherwise, | |
133 // the next byte should be interpreted as a list of child node | |
134 // offsets. | |
135 pos_ = offset + 1; | |
136 DCHECK(pos_ < end_); | |
137 pos_is_label_character_ = !is_last_char_in_label; | |
138 return true; | |
139 } | |
140 } | |
141 } | |
142 } | |
143 | |
144 // If no match was found, then end of the DAFSA has been reached. | |
145 pos_ = nullptr; | |
146 pos_is_label_character_ = false; | |
147 return false; | |
148 } | |
149 | |
150 int FixedSetIncrementalLookup::GetResultForCurrentSequence() const { | |
151 int value = kDafsaNotFound; | |
152 // Look to see if there is a next character that's a return value. | |
153 if (pos_is_label_character_) { | |
154 // Currently processing a label, so it is only necessary to check the byte | |
155 // at |pos_| to see if encodes a return value. | |
156 GetReturnValue(pos_, &value); | |
157 } else { | |
158 // Otherwise, |pos_| is an offset list (or nullptr). Explore the list of | |
159 // child nodes (given by their offsets) to find one whose label is a result | |
160 // code. | |
161 // | |
162 // This search uses a temporary copy of |pos_|, since mutating |pos_| could | |
163 // skip over a node that would be important to a subsequent Advance() call. | |
164 const unsigned char* temp_pos = pos_; | |
165 | |
166 // Read offsets from |temp_pos| until either |temp_pos| is nullptr or until | |
167 // the byte at |offset| contains a result code (encoded as an ASCII | |
168 // character below 0x20). | |
169 const unsigned char* offset = pos_; | |
170 while (GetNextOffset(&temp_pos, &offset)) { | |
171 DCHECK(offset < end_); | |
172 DCHECK((temp_pos == nullptr) || temp_pos < end_); | |
173 if (GetReturnValue(offset, &value)) | |
174 break; | |
175 } | |
176 } | |
177 return value; | |
178 } | |
179 | |
180 int LookupStringInFixedSet(const unsigned char* graph, | |
181 size_t length, | |
182 const char* key, | |
183 size_t key_length) { | |
184 // Do an incremental lookup until either the end of the graph is reached, or | |
185 // until every character in |key| is consumed. | |
186 FixedSetIncrementalLookup lookup(graph, length); | |
187 const char* key_end = key + key_length; | |
188 while (key != key_end) { | |
189 if (!lookup.Advance(*key)) | |
190 return kDafsaNotFound; | |
191 key++; | |
192 } | |
193 // The entire input was consumed without reaching the end of the graph. Return | |
194 // the result code (if present) for the current position, or kDafsaNotFound. | |
195 return lookup.GetResultForCurrentSequence(); | |
196 } | |
197 | |
198 } // namespace net | |
OLD | NEW |