Chromium Code Reviews| Index: url/url_canon_path.cc |
| diff --git a/url/url_canon_path.cc b/url/url_canon_path.cc |
| index ee1cd9626c5d39b8b466c4f32e9a719f05dee7f6..d97214e2ac271c4725b4b2901f7fce0b7a51d5b4 100644 |
| --- a/url/url_canon_path.cc |
| +++ b/url/url_canon_path.cc |
| @@ -162,6 +162,76 @@ void BackUpToPreviousSlash(int path_begin_in_output, |
| output->set_length(i + 1); |
| } |
| +// Looks for problematic nested escape sequences and escapes the output as |
| +// needed to ensure they can't be misinterpreted. |
| +// |
| +// Our concern is that in input escape sequence that's invalid because it |
| +// contains nested escape sequences might look valid once those are unescaped. |
| +// For example, "%%300" is not a valid escape sequence, but after unescaping the |
| +// inner "%30" this becomes "%00" which is valid. Leaving this in the output |
| +// string can result in callers re-canonicalizing the string and unescaping this |
| +// sequence, thus resulting in something fundamentally different than the |
| +// original input here. This can cause a variety of problems. |
| +// |
| +// This function is called after we've just unescaped a sequence that's within |
| +// two output characters of a previous '%' that we know didn't begin a valid |
| +// escape sequence in the input string. We look for whether the output is going |
| +// to turn into a valid escape sequence, and if so, convert the initial '%' into |
| +// an escaped "%25" so the output can't be misinterpreted. |
| +// |
| +// |spec| is the input string we're canonicalizing. |
| +// |next_input_index| is the index of the next unprocessed character in |spec|. |
| +// |input_len| is the length of |spec|. |
| +// |last_invalid_percent_index| is the index in |output| of a previously-seen |
| +// '%' character. The caller knows this '%' character isn't followed by a valid |
| +// escape sequence in the input string. |
| +// |output| is the canonicalized output thus far. The caller guarantees this |
| +// ends with a '%' followed by one or two characters, and the '%' is the one |
| +// pointed to by |last_invalid_percent_index|. The last character in the string |
| +// was just unescaped. |
| +template<typename CHAR> |
| +void CheckForNestedEscapes(const CHAR* spec, |
| + int next_input_index, |
| + int input_len, |
| + int last_invalid_percent_index, |
| + CanonOutput* output) { |
| + const int length = output->length(); |
| + const char last_unescaped_char = output->at(length - 1); |
|
brettw
2015/09/21 22:37:06
Can you rename this to last_appended_char or somet
Peter Kasting
2015/09/21 23:15:30
This character is guaranteed to have just been une
|
| + |
| + // If |output| currently looks like "%c", we need to try appending the next |
| + // input character to see if this will result in a problematic escape |
| + // sequence. Note that this won't trigger on the first nested escape of a |
| + // two-escape sequence like "%%30%30" -- we'll allow the conversion to |
| + // "%0%30" -- but the second nested escape will be caught by this function |
| + // when it's called again in that case. |
| + const bool append_next_char = last_invalid_percent_index == length - 2; |
| + if (append_next_char) { |
| + // If the input doesn't contain an 8-bit character next, this case won't be |
| + // a problem. |
| + if ((next_input_index == input_len) || !Is8BitChar(spec[next_input_index])) |
|
brettw
2015/09/21 22:28:34
I think the Is8BitChar can actually be a 7-bit tes
Peter Kasting
2015/09/21 23:15:30
We can actually do better than that: just check fo
|
| + return; |
| + output->push_back(static_cast<char>(spec[next_input_index])); |
| + } |
| + |
| + // Now output ends like "%cc". Try to unescape this. |
| + int begin = last_invalid_percent_index; |
| + unsigned char temp; |
| + if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) { |
| + // New escape sequence found. Overwrite the characters following the '%' |
| + // with "25", and push_back() the one or two characters that were following |
| + // the '%' when we were called. |
| + if (!append_next_char) |
| + output->push_back(output->at(last_invalid_percent_index + 1)); |
| + output->set(last_invalid_percent_index + 1, '2'); |
| + output->set(last_invalid_percent_index + 2, '5'); |
| + output->push_back(last_unescaped_char); |
| + } else if (append_next_char) { |
| + // Not a valid escape sequence, but we still need to undo appending the next |
| + // source character so the caller can process it normally. |
| + output->set_length(length); |
| + } |
| +} |
| + |
| // Appends the given path to the output. It assumes that if the input path |
| // starts with a slash, it should be copied to the output. If no path has |
| // already been appended to the output (the case when not resolving |
| @@ -182,6 +252,11 @@ bool DoPartialPath(const CHAR* spec, |
| CanonOutput* output) { |
| int end = path.end(); |
| + // We use this variable to minimize the amount of work done when unescaping -- |
| + // we'll only call CheckForNestedEscapes() when this points at one of the last |
| + // couple of characters in |output|. |
| + int last_invalid_percent_index = INT_MIN; |
| + |
| bool success = true; |
| for (int i = path.begin; i < end; i++) { |
| UCHAR uch = static_cast<UCHAR>(spec[i]); |
| @@ -245,33 +320,40 @@ bool DoPartialPath(const CHAR* spec, |
| unsigned char unescaped_value; |
| if (DecodeEscaped(spec, &i, end, &unescaped_value)) { |
| // Valid escape sequence, see if we keep, reject, or unescape it. |
| + // Note that at this point DecodeEscape() will have advanced |i| to |
| + // the last character of the escape sequence. |
| char unescaped_flags = kPathCharLookup[unescaped_value]; |
| if (unescaped_flags & UNESCAPE) { |
| - // This escaped value shouldn't be escaped, copy it. |
| + // This escaped value shouldn't be escaped. Try to copy it. |
| output->push_back(unescaped_value); |
| - } else if (unescaped_flags & INVALID_BIT) { |
| - // Invalid escaped character, copy it and remember the error. |
| - output->push_back('%'); |
| - output->push_back(static_cast<char>(spec[i - 1])); |
| - output->push_back(static_cast<char>(spec[i])); |
| - success = false; |
| + // If we just unescaped a value within 2 output characters of the |
| + // '%' from a previously-detected invalid escape sequence, we |
| + // might have an input string with problematic nested escape |
| + // sequences; detect and fix them. |
| + if (last_invalid_percent_index >= (output->length() - 3)) { |
| + CheckForNestedEscapes(spec, i + 1, end, |
| + last_invalid_percent_index, output); |
| + } |
| } else { |
| - // Valid escaped character but we should keep it escaped. We |
| - // don't want to change the case of any hex letters in case |
| - // the server is sensitive to that, so we just copy the two |
| - // characters without checking (DecodeEscape will have advanced |
| - // to the last character of the pair). |
| + // Either this is an invalid escaped character, or it's a valid |
| + // escaped character we should keep escaped. In the first case we |
| + // should just copy it exactly and remember the error. In the |
| + // second we also copy exactly in case the server is sensitive to |
| + // changing the case of any hex letters. |
| output->push_back('%'); |
| output->push_back(static_cast<char>(spec[i - 1])); |
| output->push_back(static_cast<char>(spec[i])); |
| + if (unescaped_flags & INVALID_BIT) |
| + success = false; |
| } |
| } else { |
| - // Invalid escape sequence. IE7 rejects any URLs with such |
| - // sequences, while Firefox, IE6, and Safari all pass it through |
| - // unchanged. We are more permissive unlike IE7. I don't think this |
| - // can cause significant problems, if it does, we should change |
| - // to be more like IE7. |
| + // Invalid escape sequence. IE7+ rejects any URLs with such |
| + // sequences, while other browsers pass them through unchanged. We |
| + // use the permissive behavior. |
| + // TODO(brettw): Consider testing IE's strict behavior, which would |
| + // allow removing the code to handle nested escapes above. |
| + last_invalid_percent_index = output->length(); |
| output->push_back('%'); |
| } |