Index: url/url_canon_path.cc |
diff --git a/url/url_canon_path.cc b/url/url_canon_path.cc |
index ee1cd9626c5d39b8b466c4f32e9a719f05dee7f6..6e5bfb1a245fe43a510c330d03497fe0cd7001ed 100644 |
--- a/url/url_canon_path.cc |
+++ b/url/url_canon_path.cc |
@@ -162,6 +162,76 @@ void BackUpToPreviousSlash(int path_begin_in_output, |
output->set_length(i + 1); |
} |
+// Looks for problematic nested escape sequences and escapes the output as |
+// needed to ensure they can't be misinterpreted. |
+// |
+// Our concern is that in input escape sequence that's invalid because it |
+// contains nested escape sequences might look valid once those are unescaped. |
+// For example, "%%300" is not a valid escape sequence, but after unescaping the |
+// inner "%30" this becomes "%00" which is valid. Leaving this in the output |
+// string can result in callers re-canonicalizing the string and unescaping this |
+// sequence, thus resulting in something fundamentally different than the |
+// original input here. This can cause a variety of problems. |
+// |
+// This function is called after we've just unescaped a sequence that's within |
+// two output characters of a previous '%' that we know didn't begin a valid |
+// escape sequence in the input string. We look for whether the output is going |
+// to turn into a valid escape sequence, and if so, convert the initial '%' into |
+// an escaped "%25" so the output can't be misinterpreted. |
+// |
+// |spec| is the input string we're canonicalizing. |
+// |next_input_index| is the index of the next unprocessed character in |spec|. |
+// |input_len| is the length of |spec|. |
+// |last_invalid_percent_index| is the index in |output| of a previously-seen |
+// '%' character. The caller knows this '%' character isn't followed by a valid |
+// escape sequence in the input string. |
+// |output| is the canonicalized output thus far. The caller guarantees this |
+// ends with a '%' followed by one or two characters, and the '%' is the one |
+// pointed to by |last_invalid_percent_index|. The last character in the string |
+// was just unescaped. |
+template<typename CHAR> |
+void CheckForNestedEscapes(const CHAR* spec, |
+ int next_input_index, |
+ int input_len, |
+ int last_invalid_percent_index, |
+ CanonOutput* output) { |
+ const int length = output->length(); |
+ const char last_unescaped_char = output->at(length - 1); |
brettw
2015/09/21 23:25:19
I agree my suggestion is also confusing. I don't a
|
+ |
+ // If |output| currently looks like "%c", we need to try appending the next |
+ // input character to see if this will result in a problematic escape |
+ // sequence. Note that this won't trigger on the first nested escape of a |
+ // two-escape sequence like "%%30%30" -- we'll allow the conversion to |
+ // "%0%30" -- but the second nested escape will be caught by this function |
+ // when it's called again in that case. |
+ const bool append_next_char = last_invalid_percent_index == length - 2; |
+ if (append_next_char) { |
+ // If the input doesn't contain a 7-bit character next, this case won't be a |
+ // problem. |
+ if ((next_input_index == input_len) || (spec[next_input_index] >= 0x80)) |
brettw
2015/09/21 23:25:19
If this compiles, it's fine, but I was concerned y
|
+ return; |
+ output->push_back(static_cast<char>(spec[next_input_index])); |
+ } |
+ |
+ // Now output ends like "%cc". Try to unescape this. |
+ int begin = last_invalid_percent_index; |
+ unsigned char temp; |
+ if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) { |
+ // New escape sequence found. Overwrite the characters following the '%' |
+ // with "25", and push_back() the one or two characters that were following |
+ // the '%' when we were called. |
+ if (!append_next_char) |
+ output->push_back(output->at(last_invalid_percent_index + 1)); |
+ output->set(last_invalid_percent_index + 1, '2'); |
+ output->set(last_invalid_percent_index + 2, '5'); |
+ output->push_back(last_unescaped_char); |
+ } else if (append_next_char) { |
+ // Not a valid escape sequence, but we still need to undo appending the next |
+ // source character so the caller can process it normally. |
+ output->set_length(length); |
+ } |
+} |
+ |
// Appends the given path to the output. It assumes that if the input path |
// starts with a slash, it should be copied to the output. If no path has |
// already been appended to the output (the case when not resolving |
@@ -182,10 +252,15 @@ bool DoPartialPath(const CHAR* spec, |
CanonOutput* output) { |
int end = path.end(); |
+ // We use this variable to minimize the amount of work done when unescaping -- |
+ // we'll only call CheckForNestedEscapes() when this points at one of the last |
+ // couple of characters in |output|. |
+ int last_invalid_percent_index = INT_MIN; |
+ |
bool success = true; |
for (int i = path.begin; i < end; i++) { |
UCHAR uch = static_cast<UCHAR>(spec[i]); |
- if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) { |
+ if (sizeof(CHAR) > 1 && uch >= 0x80) { |
// We only need to test wide input for having non-ASCII characters. For |
// narrow input, we'll always just use the lookup table. We don't try to |
// do anything tricky with decoding/validating UTF-8. This function will |
@@ -245,33 +320,40 @@ bool DoPartialPath(const CHAR* spec, |
unsigned char unescaped_value; |
if (DecodeEscaped(spec, &i, end, &unescaped_value)) { |
// Valid escape sequence, see if we keep, reject, or unescape it. |
+ // Note that at this point DecodeEscape() will have advanced |i| to |
+ // the last character of the escape sequence. |
char unescaped_flags = kPathCharLookup[unescaped_value]; |
if (unescaped_flags & UNESCAPE) { |
- // This escaped value shouldn't be escaped, copy it. |
+ // This escaped value shouldn't be escaped. Try to copy it. |
output->push_back(unescaped_value); |
- } else if (unescaped_flags & INVALID_BIT) { |
- // Invalid escaped character, copy it and remember the error. |
- output->push_back('%'); |
- output->push_back(static_cast<char>(spec[i - 1])); |
- output->push_back(static_cast<char>(spec[i])); |
- success = false; |
+ // If we just unescaped a value within 2 output characters of the |
+ // '%' from a previously-detected invalid escape sequence, we |
+ // might have an input string with problematic nested escape |
+ // sequences; detect and fix them. |
+ if (last_invalid_percent_index >= (output->length() - 3)) { |
+ CheckForNestedEscapes(spec, i + 1, end, |
+ last_invalid_percent_index, output); |
+ } |
} else { |
- // Valid escaped character but we should keep it escaped. We |
- // don't want to change the case of any hex letters in case |
- // the server is sensitive to that, so we just copy the two |
- // characters without checking (DecodeEscape will have advanced |
- // to the last character of the pair). |
+ // Either this is an invalid escaped character, or it's a valid |
+ // escaped character we should keep escaped. In the first case we |
+ // should just copy it exactly and remember the error. In the |
+ // second we also copy exactly in case the server is sensitive to |
+ // changing the case of any hex letters. |
output->push_back('%'); |
output->push_back(static_cast<char>(spec[i - 1])); |
output->push_back(static_cast<char>(spec[i])); |
+ if (unescaped_flags & INVALID_BIT) |
+ success = false; |
} |
} else { |
- // Invalid escape sequence. IE7 rejects any URLs with such |
- // sequences, while Firefox, IE6, and Safari all pass it through |
- // unchanged. We are more permissive unlike IE7. I don't think this |
- // can cause significant problems, if it does, we should change |
- // to be more like IE7. |
+ // Invalid escape sequence. IE7+ rejects any URLs with such |
+ // sequences, while other browsers pass them through unchanged. We |
+ // use the permissive behavior. |
+ // TODO(brettw): Consider testing IE's strict behavior, which would |
+ // allow removing the code to handle nested escapes above. |
+ last_invalid_percent_index = output->length(); |
output->push_back('%'); |
} |