OLD | NEW |
---|---|
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "base/logging.h" | 5 #include "base/logging.h" |
6 #include "url/url_canon.h" | 6 #include "url/url_canon.h" |
7 #include "url/url_canon_internal.h" | 7 #include "url/url_canon_internal.h" |
8 #include "url/url_parse_internal.h" | 8 #include "url/url_parse_internal.h" |
9 | 9 |
10 namespace url { | 10 namespace url { |
(...skipping 144 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
155 | 155 |
156 // Now back up (skipping the trailing slash) until we find another slash. | 156 // Now back up (skipping the trailing slash) until we find another slash. |
157 i--; | 157 i--; |
158 while (output->at(i) != '/' && i > path_begin_in_output) | 158 while (output->at(i) != '/' && i > path_begin_in_output) |
159 i--; | 159 i--; |
160 | 160 |
161 // Now shrink the output to just include that last slash we found. | 161 // Now shrink the output to just include that last slash we found. |
162 output->set_length(i + 1); | 162 output->set_length(i + 1); |
163 } | 163 } |
164 | 164 |
165 // Looks for problematic nested escape sequences and escapes the output as | |
166 // needed to ensure they can't be misinterpreted. | |
167 // | |
168 // Our concern is that in input escape sequence that's invalid because it | |
169 // contains nested escape sequences might look valid once those are unescaped. | |
170 // For example, "%%300" is not a valid escape sequence, but after unescaping the | |
171 // inner "%30" this becomes "%00" which is valid. Leaving this in the output | |
172 // string can result in callers re-canonicalizing the string and unescaping this | |
173 // sequence, thus resulting in something fundamentally different than the | |
174 // original input here. This can cause a variety of problems. | |
175 // | |
176 // This function is called after we've just unescaped a sequence that's within | |
177 // two output characters of a previous '%' that we know didn't begin a valid | |
178 // escape sequence in the input string. We look for whether the output is going | |
179 // to turn into a valid escape sequence, and if so, convert the initial '%' into | |
180 // an escaped "%25" so the output can't be misinterpreted. | |
181 // | |
182 // |spec| is the input string we're canonicalizing. | |
183 // |next_input_index| is the index of the next unprocessed character in |spec|. | |
184 // |input_len| is the length of |spec|. | |
185 // |last_invalid_percent_index| is the index in |output| of a previously-seen | |
186 // '%' character. The caller knows this '%' character isn't followed by a valid | |
187 // escape sequence in the input string. | |
188 // |output| is the canonicalized output thus far. The caller guarantees this | |
189 // ends with a '%' followed by one or two characters, and the '%' is the one | |
190 // pointed to by |last_invalid_percent_index|. The last character in the string | |
191 // was just unescaped. | |
192 template<typename CHAR> | |
193 void CheckForNestedEscapes(const CHAR* spec, | |
194 int next_input_index, | |
195 int input_len, | |
196 int last_invalid_percent_index, | |
197 CanonOutput* output) { | |
198 const int length = output->length(); | |
199 const char last_unescaped_char = output->at(length - 1); | |
brettw
2015/09/21 23:25:19
I agree my suggestion is also confusing. I don't a
| |
200 | |
201 // If |output| currently looks like "%c", we need to try appending the next | |
202 // input character to see if this will result in a problematic escape | |
203 // sequence. Note that this won't trigger on the first nested escape of a | |
204 // two-escape sequence like "%%30%30" -- we'll allow the conversion to | |
205 // "%0%30" -- but the second nested escape will be caught by this function | |
206 // when it's called again in that case. | |
207 const bool append_next_char = last_invalid_percent_index == length - 2; | |
208 if (append_next_char) { | |
209 // If the input doesn't contain a 7-bit character next, this case won't be a | |
210 // problem. | |
211 if ((next_input_index == input_len) || (spec[next_input_index] >= 0x80)) | |
brettw
2015/09/21 23:25:19
If this compiles, it's fine, but I was concerned y
| |
212 return; | |
213 output->push_back(static_cast<char>(spec[next_input_index])); | |
214 } | |
215 | |
216 // Now output ends like "%cc". Try to unescape this. | |
217 int begin = last_invalid_percent_index; | |
218 unsigned char temp; | |
219 if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) { | |
220 // New escape sequence found. Overwrite the characters following the '%' | |
221 // with "25", and push_back() the one or two characters that were following | |
222 // the '%' when we were called. | |
223 if (!append_next_char) | |
224 output->push_back(output->at(last_invalid_percent_index + 1)); | |
225 output->set(last_invalid_percent_index + 1, '2'); | |
226 output->set(last_invalid_percent_index + 2, '5'); | |
227 output->push_back(last_unescaped_char); | |
228 } else if (append_next_char) { | |
229 // Not a valid escape sequence, but we still need to undo appending the next | |
230 // source character so the caller can process it normally. | |
231 output->set_length(length); | |
232 } | |
233 } | |
234 | |
165 // Appends the given path to the output. It assumes that if the input path | 235 // Appends the given path to the output. It assumes that if the input path |
166 // starts with a slash, it should be copied to the output. If no path has | 236 // starts with a slash, it should be copied to the output. If no path has |
167 // already been appended to the output (the case when not resolving | 237 // already been appended to the output (the case when not resolving |
168 // relative URLs), the path should begin with a slash. | 238 // relative URLs), the path should begin with a slash. |
169 // | 239 // |
170 // If there are already path components (this mode is used when appending | 240 // If there are already path components (this mode is used when appending |
171 // relative paths for resolving), it assumes that the output already has | 241 // relative paths for resolving), it assumes that the output already has |
172 // a trailing slash and that if the input begins with a slash, it should be | 242 // a trailing slash and that if the input begins with a slash, it should be |
173 // copied to the output. | 243 // copied to the output. |
174 // | 244 // |
175 // We do not collapse multiple slashes in a row to a single slash. It seems | 245 // We do not collapse multiple slashes in a row to a single slash. It seems |
176 // no web browsers do this, and we don't want incompatibilities, even though | 246 // no web browsers do this, and we don't want incompatibilities, even though |
177 // it would be correct for most systems. | 247 // it would be correct for most systems. |
178 template<typename CHAR, typename UCHAR> | 248 template<typename CHAR, typename UCHAR> |
179 bool DoPartialPath(const CHAR* spec, | 249 bool DoPartialPath(const CHAR* spec, |
180 const Component& path, | 250 const Component& path, |
181 int path_begin_in_output, | 251 int path_begin_in_output, |
182 CanonOutput* output) { | 252 CanonOutput* output) { |
183 int end = path.end(); | 253 int end = path.end(); |
184 | 254 |
255 // We use this variable to minimize the amount of work done when unescaping -- | |
256 // we'll only call CheckForNestedEscapes() when this points at one of the last | |
257 // couple of characters in |output|. | |
258 int last_invalid_percent_index = INT_MIN; | |
259 | |
185 bool success = true; | 260 bool success = true; |
186 for (int i = path.begin; i < end; i++) { | 261 for (int i = path.begin; i < end; i++) { |
187 UCHAR uch = static_cast<UCHAR>(spec[i]); | 262 UCHAR uch = static_cast<UCHAR>(spec[i]); |
188 if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) { | 263 if (sizeof(CHAR) > 1 && uch >= 0x80) { |
189 // We only need to test wide input for having non-ASCII characters. For | 264 // We only need to test wide input for having non-ASCII characters. For |
190 // narrow input, we'll always just use the lookup table. We don't try to | 265 // narrow input, we'll always just use the lookup table. We don't try to |
191 // do anything tricky with decoding/validating UTF-8. This function will | 266 // do anything tricky with decoding/validating UTF-8. This function will |
192 // read one or two UTF-16 characters and append the output as UTF-8. This | 267 // read one or two UTF-16 characters and append the output as UTF-8. This |
193 // call will be removed in 8-bit mode. | 268 // call will be removed in 8-bit mode. |
194 success &= AppendUTF8EscapedChar(spec, &i, end, output); | 269 success &= AppendUTF8EscapedChar(spec, &i, end, output); |
195 } else { | 270 } else { |
196 // Normal ASCII character or 8-bit input, use the lookup table. | 271 // Normal ASCII character or 8-bit input, use the lookup table. |
197 unsigned char out_ch = static_cast<unsigned char>(uch); | 272 unsigned char out_ch = static_cast<unsigned char>(uch); |
198 unsigned char flags = kPathCharLookup[out_ch]; | 273 unsigned char flags = kPathCharLookup[out_ch]; |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
238 | 313 |
239 } else if (out_ch == '\\') { | 314 } else if (out_ch == '\\') { |
240 // Convert backslashes to forward slashes | 315 // Convert backslashes to forward slashes |
241 output->push_back('/'); | 316 output->push_back('/'); |
242 | 317 |
243 } else if (out_ch == '%') { | 318 } else if (out_ch == '%') { |
244 // Handle escape sequences. | 319 // Handle escape sequences. |
245 unsigned char unescaped_value; | 320 unsigned char unescaped_value; |
246 if (DecodeEscaped(spec, &i, end, &unescaped_value)) { | 321 if (DecodeEscaped(spec, &i, end, &unescaped_value)) { |
247 // Valid escape sequence, see if we keep, reject, or unescape it. | 322 // Valid escape sequence, see if we keep, reject, or unescape it. |
323 // Note that at this point DecodeEscape() will have advanced |i| to | |
324 // the last character of the escape sequence. | |
248 char unescaped_flags = kPathCharLookup[unescaped_value]; | 325 char unescaped_flags = kPathCharLookup[unescaped_value]; |
249 | 326 |
250 if (unescaped_flags & UNESCAPE) { | 327 if (unescaped_flags & UNESCAPE) { |
251 // This escaped value shouldn't be escaped, copy it. | 328 // This escaped value shouldn't be escaped. Try to copy it. |
252 output->push_back(unescaped_value); | 329 output->push_back(unescaped_value); |
253 } else if (unescaped_flags & INVALID_BIT) { | 330 // If we just unescaped a value within 2 output characters of the |
254 // Invalid escaped character, copy it and remember the error. | 331 // '%' from a previously-detected invalid escape sequence, we |
332 // might have an input string with problematic nested escape | |
333 // sequences; detect and fix them. | |
334 if (last_invalid_percent_index >= (output->length() - 3)) { | |
335 CheckForNestedEscapes(spec, i + 1, end, | |
336 last_invalid_percent_index, output); | |
337 } | |
338 } else { | |
339 // Either this is an invalid escaped character, or it's a valid | |
340 // escaped character we should keep escaped. In the first case we | |
341 // should just copy it exactly and remember the error. In the | |
342 // second we also copy exactly in case the server is sensitive to | |
343 // changing the case of any hex letters. | |
255 output->push_back('%'); | 344 output->push_back('%'); |
256 output->push_back(static_cast<char>(spec[i - 1])); | 345 output->push_back(static_cast<char>(spec[i - 1])); |
257 output->push_back(static_cast<char>(spec[i])); | 346 output->push_back(static_cast<char>(spec[i])); |
258 success = false; | 347 if (unescaped_flags & INVALID_BIT) |
259 } else { | 348 success = false; |
260 // Valid escaped character but we should keep it escaped. We | |
261 // don't want to change the case of any hex letters in case | |
262 // the server is sensitive to that, so we just copy the two | |
263 // characters without checking (DecodeEscape will have advanced | |
264 // to the last character of the pair). | |
265 output->push_back('%'); | |
266 output->push_back(static_cast<char>(spec[i - 1])); | |
267 output->push_back(static_cast<char>(spec[i])); | |
268 } | 349 } |
269 } else { | 350 } else { |
270 // Invalid escape sequence. IE7 rejects any URLs with such | 351 // Invalid escape sequence. IE7+ rejects any URLs with such |
271 // sequences, while Firefox, IE6, and Safari all pass it through | 352 // sequences, while other browsers pass them through unchanged. We |
272 // unchanged. We are more permissive unlike IE7. I don't think this | 353 // use the permissive behavior. |
273 // can cause significant problems, if it does, we should change | 354 // TODO(brettw): Consider testing IE's strict behavior, which would |
274 // to be more like IE7. | 355 // allow removing the code to handle nested escapes above. |
356 last_invalid_percent_index = output->length(); | |
275 output->push_back('%'); | 357 output->push_back('%'); |
276 } | 358 } |
277 | 359 |
278 } else if (flags & INVALID_BIT) { | 360 } else if (flags & INVALID_BIT) { |
279 // For NULLs, etc. fail. | 361 // For NULLs, etc. fail. |
280 AppendEscapedChar(out_ch, output); | 362 AppendEscapedChar(out_ch, output); |
281 success = false; | 363 success = false; |
282 | 364 |
283 } else if (flags & ESCAPE_BIT) { | 365 } else if (flags & ESCAPE_BIT) { |
284 // This character should be escaped. | 366 // This character should be escaped. |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
344 bool CanonicalizePartialPath(const base::char16* spec, | 426 bool CanonicalizePartialPath(const base::char16* spec, |
345 const Component& path, | 427 const Component& path, |
346 int path_begin_in_output, | 428 int path_begin_in_output, |
347 CanonOutput* output) { | 429 CanonOutput* output) { |
348 return DoPartialPath<base::char16, base::char16>(spec, path, | 430 return DoPartialPath<base::char16, base::char16>(spec, path, |
349 path_begin_in_output, | 431 path_begin_in_output, |
350 output); | 432 output); |
351 } | 433 } |
352 | 434 |
353 } // namespace url | 435 } // namespace url |
OLD | NEW |