 Chromium Code Reviews
 Chromium Code Reviews Issue 1358433004:
  Correctly handle problematic nested escapes in URL paths.  (Closed) 
  Base URL: https://chromium.googlesource.com/chromium/src.git@master
    
  
    Issue 1358433004:
  Correctly handle problematic nested escapes in URL paths.  (Closed) 
  Base URL: https://chromium.googlesource.com/chromium/src.git@master| OLD | NEW | 
|---|---|
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be | 
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. | 
| 4 | 4 | 
| 5 #include "base/logging.h" | 5 #include "base/logging.h" | 
| 6 #include "url/url_canon.h" | 6 #include "url/url_canon.h" | 
| 7 #include "url/url_canon_internal.h" | 7 #include "url/url_canon_internal.h" | 
| 8 #include "url/url_parse_internal.h" | 8 #include "url/url_parse_internal.h" | 
| 9 | 9 | 
| 10 namespace url { | 10 namespace url { | 
| (...skipping 144 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 155 | 155 | 
| 156 // Now back up (skipping the trailing slash) until we find another slash. | 156 // Now back up (skipping the trailing slash) until we find another slash. | 
| 157 i--; | 157 i--; | 
| 158 while (output->at(i) != '/' && i > path_begin_in_output) | 158 while (output->at(i) != '/' && i > path_begin_in_output) | 
| 159 i--; | 159 i--; | 
| 160 | 160 | 
| 161 // Now shrink the output to just include that last slash we found. | 161 // Now shrink the output to just include that last slash we found. | 
| 162 output->set_length(i + 1); | 162 output->set_length(i + 1); | 
| 163 } | 163 } | 
| 164 | 164 | 
| 165 // Looks for problematic nested escape sequences and escapes the output as | |
| 166 // needed to ensure they can't be misinterpreted. | |
| 167 // | |
| 168 // Our concern is that in input escape sequence that's invalid because it | |
| 169 // contains nested escape sequences might look valid once those are unescaped. | |
| 170 // For example, "%%300" is not a valid escape sequence, but after unescaping the | |
| 171 // inner "%30" this becomes "%00" which is valid. Leaving this in the output | |
| 172 // string can result in callers re-canonicalizing the string and unescaping this | |
| 173 // sequence, thus resulting in something fundamentally different than the | |
| 174 // original input here. This can cause a variety of problems. | |
| 175 // | |
| 176 // This function is called after we've just unescaped a sequence that's within | |
| 177 // two output characters of a previous '%' that we know didn't begin a valid | |
| 178 // escape sequence in the input string. We look for whether the output is going | |
| 179 // to turn into a valid escape sequence, and if so, convert the initial '%' into | |
| 180 // an escaped "%25" so the output can't be misinterpreted. | |
| 181 // | |
| 182 // |spec| is the input string we're canonicalizing. | |
| 183 // |next_input_index| is the index of the next unprocessed character in |spec|. | |
| 184 // |input_len| is the length of |spec|. | |
| 185 // |last_invalid_percent_index| is the index in |output| of a previously-seen | |
| 186 // '%' character. The caller knows this '%' character isn't followed by a valid | |
| 187 // escape sequence in the input string. | |
| 188 // |output| is the canonicalized output thus far. The caller guarantees this | |
| 189 // ends with a '%' followed by one or two characters, and the '%' is the one | |
| 190 // pointed to by |last_invalid_percent_index|. The last character in the string | |
| 191 // was just unescaped. | |
| 192 template<typename CHAR> | |
| 193 void CheckForNestedEscapes(const CHAR* spec, | |
| 194 int next_input_index, | |
| 195 int input_len, | |
| 196 int last_invalid_percent_index, | |
| 197 CanonOutput* output) { | |
| 198 const int length = output->length(); | |
| 199 const char last_unescaped_char = output->at(length - 1); | |
| 
brettw
2015/09/21 22:37:06
Can you rename this to last_appended_char or somet
 
Peter Kasting
2015/09/21 23:15:30
This character is guaranteed to have just been une
 | |
| 200 | |
| 201 // If |output| currently looks like "%c", we need to try appending the next | |
| 202 // input character to see if this will result in a problematic escape | |
| 203 // sequence. Note that this won't trigger on the first nested escape of a | |
| 204 // two-escape sequence like "%%30%30" -- we'll allow the conversion to | |
| 205 // "%0%30" -- but the second nested escape will be caught by this function | |
| 206 // when it's called again in that case. | |
| 207 const bool append_next_char = last_invalid_percent_index == length - 2; | |
| 208 if (append_next_char) { | |
| 209 // If the input doesn't contain an 8-bit character next, this case won't be | |
| 210 // a problem. | |
| 211 if ((next_input_index == input_len) || !Is8BitChar(spec[next_input_index])) | |
| 
brettw
2015/09/21 22:28:34
I think the Is8BitChar can actually be a 7-bit tes
 
Peter Kasting
2015/09/21 23:15:30
We can actually do better than that: just check fo
 | |
| 212 return; | |
| 213 output->push_back(static_cast<char>(spec[next_input_index])); | |
| 214 } | |
| 215 | |
| 216 // Now output ends like "%cc". Try to unescape this. | |
| 217 int begin = last_invalid_percent_index; | |
| 218 unsigned char temp; | |
| 219 if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) { | |
| 220 // New escape sequence found. Overwrite the characters following the '%' | |
| 221 // with "25", and push_back() the one or two characters that were following | |
| 222 // the '%' when we were called. | |
| 223 if (!append_next_char) | |
| 224 output->push_back(output->at(last_invalid_percent_index + 1)); | |
| 225 output->set(last_invalid_percent_index + 1, '2'); | |
| 226 output->set(last_invalid_percent_index + 2, '5'); | |
| 227 output->push_back(last_unescaped_char); | |
| 228 } else if (append_next_char) { | |
| 229 // Not a valid escape sequence, but we still need to undo appending the next | |
| 230 // source character so the caller can process it normally. | |
| 231 output->set_length(length); | |
| 232 } | |
| 233 } | |
| 234 | |
| 165 // Appends the given path to the output. It assumes that if the input path | 235 // Appends the given path to the output. It assumes that if the input path | 
| 166 // starts with a slash, it should be copied to the output. If no path has | 236 // starts with a slash, it should be copied to the output. If no path has | 
| 167 // already been appended to the output (the case when not resolving | 237 // already been appended to the output (the case when not resolving | 
| 168 // relative URLs), the path should begin with a slash. | 238 // relative URLs), the path should begin with a slash. | 
| 169 // | 239 // | 
| 170 // If there are already path components (this mode is used when appending | 240 // If there are already path components (this mode is used when appending | 
| 171 // relative paths for resolving), it assumes that the output already has | 241 // relative paths for resolving), it assumes that the output already has | 
| 172 // a trailing slash and that if the input begins with a slash, it should be | 242 // a trailing slash and that if the input begins with a slash, it should be | 
| 173 // copied to the output. | 243 // copied to the output. | 
| 174 // | 244 // | 
| 175 // We do not collapse multiple slashes in a row to a single slash. It seems | 245 // We do not collapse multiple slashes in a row to a single slash. It seems | 
| 176 // no web browsers do this, and we don't want incompatibilities, even though | 246 // no web browsers do this, and we don't want incompatibilities, even though | 
| 177 // it would be correct for most systems. | 247 // it would be correct for most systems. | 
| 178 template<typename CHAR, typename UCHAR> | 248 template<typename CHAR, typename UCHAR> | 
| 179 bool DoPartialPath(const CHAR* spec, | 249 bool DoPartialPath(const CHAR* spec, | 
| 180 const Component& path, | 250 const Component& path, | 
| 181 int path_begin_in_output, | 251 int path_begin_in_output, | 
| 182 CanonOutput* output) { | 252 CanonOutput* output) { | 
| 183 int end = path.end(); | 253 int end = path.end(); | 
| 184 | 254 | 
| 255 // We use this variable to minimize the amount of work done when unescaping -- | |
| 256 // we'll only call CheckForNestedEscapes() when this points at one of the last | |
| 257 // couple of characters in |output|. | |
| 258 int last_invalid_percent_index = INT_MIN; | |
| 259 | |
| 185 bool success = true; | 260 bool success = true; | 
| 186 for (int i = path.begin; i < end; i++) { | 261 for (int i = path.begin; i < end; i++) { | 
| 187 UCHAR uch = static_cast<UCHAR>(spec[i]); | 262 UCHAR uch = static_cast<UCHAR>(spec[i]); | 
| 188 if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) { | 263 if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) { | 
| 189 // We only need to test wide input for having non-ASCII characters. For | 264 // We only need to test wide input for having non-ASCII characters. For | 
| 190 // narrow input, we'll always just use the lookup table. We don't try to | 265 // narrow input, we'll always just use the lookup table. We don't try to | 
| 191 // do anything tricky with decoding/validating UTF-8. This function will | 266 // do anything tricky with decoding/validating UTF-8. This function will | 
| 192 // read one or two UTF-16 characters and append the output as UTF-8. This | 267 // read one or two UTF-16 characters and append the output as UTF-8. This | 
| 193 // call will be removed in 8-bit mode. | 268 // call will be removed in 8-bit mode. | 
| 194 success &= AppendUTF8EscapedChar(spec, &i, end, output); | 269 success &= AppendUTF8EscapedChar(spec, &i, end, output); | 
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 238 | 313 | 
| 239 } else if (out_ch == '\\') { | 314 } else if (out_ch == '\\') { | 
| 240 // Convert backslashes to forward slashes | 315 // Convert backslashes to forward slashes | 
| 241 output->push_back('/'); | 316 output->push_back('/'); | 
| 242 | 317 | 
| 243 } else if (out_ch == '%') { | 318 } else if (out_ch == '%') { | 
| 244 // Handle escape sequences. | 319 // Handle escape sequences. | 
| 245 unsigned char unescaped_value; | 320 unsigned char unescaped_value; | 
| 246 if (DecodeEscaped(spec, &i, end, &unescaped_value)) { | 321 if (DecodeEscaped(spec, &i, end, &unescaped_value)) { | 
| 247 // Valid escape sequence, see if we keep, reject, or unescape it. | 322 // Valid escape sequence, see if we keep, reject, or unescape it. | 
| 323 // Note that at this point DecodeEscape() will have advanced |i| to | |
| 324 // the last character of the escape sequence. | |
| 248 char unescaped_flags = kPathCharLookup[unescaped_value]; | 325 char unescaped_flags = kPathCharLookup[unescaped_value]; | 
| 249 | 326 | 
| 250 if (unescaped_flags & UNESCAPE) { | 327 if (unescaped_flags & UNESCAPE) { | 
| 251 // This escaped value shouldn't be escaped, copy it. | 328 // This escaped value shouldn't be escaped. Try to copy it. | 
| 252 output->push_back(unescaped_value); | 329 output->push_back(unescaped_value); | 
| 253 } else if (unescaped_flags & INVALID_BIT) { | 330 // If we just unescaped a value within 2 output characters of the | 
| 254 // Invalid escaped character, copy it and remember the error. | 331 // '%' from a previously-detected invalid escape sequence, we | 
| 332 // might have an input string with problematic nested escape | |
| 333 // sequences; detect and fix them. | |
| 334 if (last_invalid_percent_index >= (output->length() - 3)) { | |
| 335 CheckForNestedEscapes(spec, i + 1, end, | |
| 336 last_invalid_percent_index, output); | |
| 337 } | |
| 338 } else { | |
| 339 // Either this is an invalid escaped character, or it's a valid | |
| 340 // escaped character we should keep escaped. In the first case we | |
| 341 // should just copy it exactly and remember the error. In the | |
| 342 // second we also copy exactly in case the server is sensitive to | |
| 343 // changing the case of any hex letters. | |
| 255 output->push_back('%'); | 344 output->push_back('%'); | 
| 256 output->push_back(static_cast<char>(spec[i - 1])); | 345 output->push_back(static_cast<char>(spec[i - 1])); | 
| 257 output->push_back(static_cast<char>(spec[i])); | 346 output->push_back(static_cast<char>(spec[i])); | 
| 258 success = false; | 347 if (unescaped_flags & INVALID_BIT) | 
| 259 } else { | 348 success = false; | 
| 260 // Valid escaped character but we should keep it escaped. We | |
| 261 // don't want to change the case of any hex letters in case | |
| 262 // the server is sensitive to that, so we just copy the two | |
| 263 // characters without checking (DecodeEscape will have advanced | |
| 264 // to the last character of the pair). | |
| 265 output->push_back('%'); | |
| 266 output->push_back(static_cast<char>(spec[i - 1])); | |
| 267 output->push_back(static_cast<char>(spec[i])); | |
| 268 } | 349 } | 
| 269 } else { | 350 } else { | 
| 270 // Invalid escape sequence. IE7 rejects any URLs with such | 351 // Invalid escape sequence. IE7+ rejects any URLs with such | 
| 271 // sequences, while Firefox, IE6, and Safari all pass it through | 352 // sequences, while other browsers pass them through unchanged. We | 
| 272 // unchanged. We are more permissive unlike IE7. I don't think this | 353 // use the permissive behavior. | 
| 273 // can cause significant problems, if it does, we should change | 354 // TODO(brettw): Consider testing IE's strict behavior, which would | 
| 274 // to be more like IE7. | 355 // allow removing the code to handle nested escapes above. | 
| 356 last_invalid_percent_index = output->length(); | |
| 275 output->push_back('%'); | 357 output->push_back('%'); | 
| 276 } | 358 } | 
| 277 | 359 | 
| 278 } else if (flags & INVALID_BIT) { | 360 } else if (flags & INVALID_BIT) { | 
| 279 // For NULLs, etc. fail. | 361 // For NULLs, etc. fail. | 
| 280 AppendEscapedChar(out_ch, output); | 362 AppendEscapedChar(out_ch, output); | 
| 281 success = false; | 363 success = false; | 
| 282 | 364 | 
| 283 } else if (flags & ESCAPE_BIT) { | 365 } else if (flags & ESCAPE_BIT) { | 
| 284 // This character should be escaped. | 366 // This character should be escaped. | 
| (...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 344 bool CanonicalizePartialPath(const base::char16* spec, | 426 bool CanonicalizePartialPath(const base::char16* spec, | 
| 345 const Component& path, | 427 const Component& path, | 
| 346 int path_begin_in_output, | 428 int path_begin_in_output, | 
| 347 CanonOutput* output) { | 429 CanonOutput* output) { | 
| 348 return DoPartialPath<base::char16, base::char16>(spec, path, | 430 return DoPartialPath<base::char16, base::char16>(spec, path, | 
| 349 path_begin_in_output, | 431 path_begin_in_output, | 
| 350 output); | 432 output); | 
| 351 } | 433 } | 
| 352 | 434 | 
| 353 } // namespace url | 435 } // namespace url | 
| OLD | NEW |