Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(400)

Side by Side Diff: net/base/lookup_string_in_fixed_set.cc

Issue 2784933002: Mitigate spoofing attempt using Latin letters. (Closed)
Patch Set: add similarity check unittests Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/base/lookup_string_in_fixed_set.h"
6
7 #include "base/logging.h"
8
9 namespace net {
10
11 namespace {
12
13 // Read next offset from |pos|, increment |offset| by that amount, and increment
14 // |pos| either to point to the start of the next encoded offset in its node, or
15 // nullptr, if there are no remaining offsets.
16 //
17 // Returns true if an offset could be read; false otherwise.
18 inline bool GetNextOffset(const unsigned char** pos,
19 const unsigned char** offset) {
20 if (*pos == nullptr)
21 return false;
22
23 size_t bytes_consumed;
24 switch (**pos & 0x60) {
25 case 0x60: // Read three byte offset
26 *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
27 bytes_consumed = 3;
28 break;
29 case 0x40: // Read two byte offset
30 *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
31 bytes_consumed = 2;
32 break;
33 default:
34 *offset += (*pos)[0] & 0x3F;
35 bytes_consumed = 1;
36 }
37 if ((**pos & 0x80) != 0) {
38 *pos = nullptr;
39 } else {
40 *pos += bytes_consumed;
41 }
42 return true;
43 }
44
45 // Check if byte at |offset| is last in label.
46 bool IsEOL(const unsigned char* offset) {
47 return (*offset & 0x80) != 0;
48 }
49
50 // Check if byte at |offset| matches key. This version matches both end-of-label
51 // chars and not-end-of-label chars.
52 bool IsMatch(const unsigned char* offset, char key) {
53 return (*offset & 0x7F) == key;
54 }
55
56 // Read return value at |offset|, if it is a return value. Returns true if a
57 // return value could be read, false otherwise.
58 bool GetReturnValue(const unsigned char* offset, int* return_value) {
59 // Return values are always encoded as end-of-label chars (so the high bit is
60 // set). So byte values in the inclusive range [0x80, 0x9F] encode the return
61 // values 0 through 31 (though make_dafsa.py doesn't currently encode values
62 // higher than 7). The following code does that translation.
63 if ((*offset & 0xE0) == 0x80) {
64 *return_value = *offset & 0x1F;
65 return true;
66 }
67 return false;
68 }
69
70 } // namespace
71
72 FixedSetIncrementalLookup::FixedSetIncrementalLookup(const unsigned char* graph,
73 size_t length)
74 : pos_(graph), end_(graph + length), pos_is_label_character_(false) {}
75
76 FixedSetIncrementalLookup::FixedSetIncrementalLookup(
77 const FixedSetIncrementalLookup& other) = default;
78
79 FixedSetIncrementalLookup& FixedSetIncrementalLookup::operator=(
80 const FixedSetIncrementalLookup& other) = default;
81
82 FixedSetIncrementalLookup::~FixedSetIncrementalLookup() {}
83
84 bool FixedSetIncrementalLookup::Advance(char input) {
85 if (!pos_) {
86 // A previous input exhausted the graph, so there are no possible matches.
87 return false;
88 }
89
90 // Only ASCII printable chars are supported by the current DAFSA format -- the
91 // high bit (values 0x80-0xFF) is reserved as a label-end signifier, and the
92 // low values (values 0x00-0x1F) are reserved to encode the return values. So
93 // values outside this range will never be in the dictionary.
94 if (input >= 0x20) {
95 if (pos_is_label_character_) {
96 // Currently processing a label, so it is only necessary to check the byte
97 // at |pos_| to see if it encodes a character matching |input|.
98 bool is_last_char_in_label = IsEOL(pos_);
99 bool is_match = IsMatch(pos_, input);
100 if (is_match) {
101 // If this is not the last character in the label, the next byte should
102 // be interpreted as a character or return value. Otherwise, the next
103 // byte should be interpreted as a list of child node offsets.
104 ++pos_;
105 DCHECK(pos_ < end_);
106 pos_is_label_character_ = !is_last_char_in_label;
107 return true;
108 }
109 } else {
110 const unsigned char* offset = pos_;
111 // Read offsets from |pos_| until the label of the child node at |offset|
112 // matches |input|, or until there are no more offsets.
113 while (GetNextOffset(&pos_, &offset)) {
114 DCHECK(offset < end_);
115 DCHECK((pos_ == nullptr) || (pos_ < end_));
116
117 // |offset| points to a DAFSA node that is a child of the original node.
118 //
119 // The low 7 bits of a node encodes a character value; the high bit
120 // indicates whether it's the last character in the label.
121 //
122 // Note that |*offset| could also be a result code value, but these are
123 // really just out-of-range ASCII values, encoded the same way as
124 // characters. Since |input| was already validated as a printable ASCII
125 // value ASCII value, IsMatch will never return true if |offset| is a
126 // result code.
127 bool is_last_char_in_label = IsEOL(offset);
128 bool is_match = IsMatch(offset, input);
129
130 if (is_match) {
131 // If this is not the last character in the label, the next byte
132 // should be interpreted as a character or return value. Otherwise,
133 // the next byte should be interpreted as a list of child node
134 // offsets.
135 pos_ = offset + 1;
136 DCHECK(pos_ < end_);
137 pos_is_label_character_ = !is_last_char_in_label;
138 return true;
139 }
140 }
141 }
142 }
143
144 // If no match was found, then end of the DAFSA has been reached.
145 pos_ = nullptr;
146 pos_is_label_character_ = false;
147 return false;
148 }
149
150 int FixedSetIncrementalLookup::GetResultForCurrentSequence() const {
151 int value = kDafsaNotFound;
152 // Look to see if there is a next character that's a return value.
153 if (pos_is_label_character_) {
154 // Currently processing a label, so it is only necessary to check the byte
155 // at |pos_| to see if encodes a return value.
156 GetReturnValue(pos_, &value);
157 } else {
158 // Otherwise, |pos_| is an offset list (or nullptr). Explore the list of
159 // child nodes (given by their offsets) to find one whose label is a result
160 // code.
161 //
162 // This search uses a temporary copy of |pos_|, since mutating |pos_| could
163 // skip over a node that would be important to a subsequent Advance() call.
164 const unsigned char* temp_pos = pos_;
165
166 // Read offsets from |temp_pos| until either |temp_pos| is nullptr or until
167 // the byte at |offset| contains a result code (encoded as an ASCII
168 // character below 0x20).
169 const unsigned char* offset = pos_;
170 while (GetNextOffset(&temp_pos, &offset)) {
171 DCHECK(offset < end_);
172 DCHECK((temp_pos == nullptr) || temp_pos < end_);
173 if (GetReturnValue(offset, &value))
174 break;
175 }
176 }
177 return value;
178 }
179
180 int LookupStringInFixedSet(const unsigned char* graph,
181 size_t length,
182 const char* key,
183 size_t key_length) {
184 // Do an incremental lookup until either the end of the graph is reached, or
185 // until every character in |key| is consumed.
186 FixedSetIncrementalLookup lookup(graph, length);
187 const char* key_end = key + key_length;
188 while (key != key_end) {
189 if (!lookup.Advance(*key))
190 return kDafsaNotFound;
191 key++;
192 }
193 // The entire input was consumed without reaching the end of the graph. Return
194 // the result code (if present) for the current position, or kDafsaNotFound.
195 return lookup.GetResultForCurrentSequence();
196 }
197
198 } // namespace net
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698