Chromium Code Reviews| Index: net/base/escape.cc |
| diff --git a/net/base/escape.cc b/net/base/escape.cc |
| index 134a98652013177eaf7f52d2fa8c504f53654607..21cfe344496b484fe6fc81268d65726f2c15a5c1 100644 |
| --- a/net/base/escape.cc |
| +++ b/net/base/escape.cc |
| @@ -97,6 +97,30 @@ const char kUrlUnescape[128] = { |
| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0 |
| }; |
| +// Attempts to unescape the sequence at |index| within |escaped_text|. If |
| +// successful, sets |value| to the unescaped value. Returns whether |
| +// unescaping succeeded. |
| +template<typename STR> |
| +bool UnescapeUnsignedCharAtIndex(const STR& escaped_text, |
| + size_t index, |
| + unsigned char* value) { |
| + if (index + 2 < escaped_text.size()) { |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Use an early-return here, as well as on the I
Anuj
2014/02/27 21:55:35
Done.
|
| + char current_char = static_cast<char>(escaped_text[index]); |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Inline into the next statement.
Anuj
2014/02/27 21:55:35
Done.
|
| + if (current_char != '%') |
| + return false; |
| + const typename STR::value_type most_sig_digit( |
| + static_cast<typename STR::value_type>(escaped_text[index + 1])); |
| + const typename STR::value_type least_sig_digit( |
| + static_cast<typename STR::value_type>(escaped_text[index + 2])); |
| + if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { |
| + *value = HexDigitToInt(most_sig_digit) * 16 + |
| + HexDigitToInt(least_sig_digit); |
| + return true; |
| + } |
| + } |
| + return false; |
| +} |
| + |
| template<typename STR> |
| STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, |
| UnescapeRule::Type rules, |
| @@ -125,37 +149,81 @@ STR UnescapeURLWithOffsetsImpl(const STR& escaped_text, |
| continue; |
| } |
| - char current_char = static_cast<char>(escaped_text[i]); |
| - if (current_char == '%' && i + 2 < max) { |
| - const typename STR::value_type most_sig_digit( |
| - static_cast<typename STR::value_type>(escaped_text[i + 1])); |
| - const typename STR::value_type least_sig_digit( |
| - static_cast<typename STR::value_type>(escaped_text[i + 2])); |
| - if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) { |
| - unsigned char value = HexDigitToInt(most_sig_digit) * 16 + |
| - HexDigitToInt(least_sig_digit); |
| - if (value >= 0x80 || // Unescape all high-bit characters. |
| - // For 7-bit characters, the lookup table tells us all valid chars. |
| - (kUrlUnescape[value] || |
| - // ...and we allow some additional unescaping when flags are set. |
| - (value == ' ' && (rules & UnescapeRule::SPACES)) || |
| - // Allow any of the prohibited but non-control characters when |
| - // we're doing "special" chars. |
| - (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || |
| - // Additionally allow control characters if requested. |
| - (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { |
| - // Use the unescaped version of the character. |
| - adjustments.push_back(i); |
| - result.push_back(value); |
| - i += 2; |
| - } else { |
| - // Keep escaped. Append a percent and we'll get the following two |
| - // digits on the next loops through. |
| - result.push_back('%'); |
| + unsigned char value; |
| + if (UnescapeUnsignedCharAtIndex(escaped_text, i, &value)) { |
| + // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi |
| + // control characters are not allowed to appear unescaped in URLs: |
| + // |
| + // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E) |
| + // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F) |
| + // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA) |
| + // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB) |
| + // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC) |
| + // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD) |
| + // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE) |
| + // |
| + // Additionally, the Unicode Technical Report (TR9) as referenced by RFC |
| + // 3987 above has since added some new BiDi control characters. |
| + // http://www.unicode.org/reports/tr9 |
| + // |
| + // U+061C ARABIC LETTER MARK (%D8%9C) |
| + // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6) |
| + // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7) |
| + // U+2068 FIRST STRONG ISOLATE (%E2%81%A8) |
| + // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9) |
| + |
| + unsigned char tmp_value1; |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Just call this "temp" and use it for both tmp
Anuj
2014/02/27 21:55:35
Done.
|
| + if (value == 0xD8) { |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Combine conditionals here and hoist the comme
Anuj
2014/02/27 21:55:35
Done.
|
| + // Possible Arabic Letter Mark. |
| + if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1) && |
| + (tmp_value1 == 0x9c)) { |
| + result.append(escaped_text, i, 6); |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Indented too far
Anuj
2014/02/27 21:55:35
Done.
|
| + i += 5; |
| + continue; |
| } |
| + } |
| + |
| + if (value == 0xE2) { |
|
Peter Kasting
2014/02/27 21:37:09
Nit: Similarly:
// Check for other BiDi con
Anuj
2014/02/27 21:55:35
I will pass on this change. I think compiler shoul
Peter Kasting
2014/02/27 22:08:38
It's not a question of optimized code, it's a ques
Anuj
2014/02/27 22:24:51
I made the exact same change just as you sent this
|
| + // Possible BiDi control character. |
| + if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1)) { |
| + if ((tmp_value1 == 0x80) || (tmp_value1 == 0x81)) { |
| + unsigned char tmp_value2; |
| + if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &tmp_value2)) { |
| + // Embeddings, Overrides and Marks have second byte 0x80. |
| + bool is_bidi_control_char = ((tmp_value1 == 0x80) && |
| + ((tmp_value2 == 0x8E) || (tmp_value2 == 0x8F) || |
| + ((tmp_value2 >= 0xAA) && (tmp_value2 <= 0xAE)))); |
| + |
| + // Isolates have second byte 0x81. |
| + is_bidi_control_char |= ((tmp_value1 == 0x81) && |
| + ((tmp_value2 >= 0xA6) && (tmp_value2 <= 0xA9))); |
| + if (is_bidi_control_char) { |
| + result.append(escaped_text, i, 9); |
| + i += 8; |
| + continue; |
| + } |
| + } |
| + } |
| + } |
| + } |
| + |
| + if (value >= 0x80 || // Unescape all high-bit characters. |
| + // For 7-bit characters, the lookup table tells us all valid chars. |
| + (kUrlUnescape[value] || |
| + // ...and we allow some additional unescaping when flags are set. |
| + (value == ' ' && (rules & UnescapeRule::SPACES)) || |
| + // Allow any of the prohibited but non-control characters when |
| + // we're doing "special" chars. |
| + (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) || |
| + // Additionally allow control characters if requested. |
| + (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) { |
| + // Use the unescaped version of the character. |
| + adjustments.push_back(i); |
| + result.push_back(value); |
| + i += 2; |
| } else { |
| - // Invalid escape sequence, just pass the percent through and continue |
| - // right after it. |
| + // Keep escaped. Append a percent and we'll get the following two |
| + // digits on the next loops through. |
| result.push_back('%'); |
| } |
| } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) && |