net/base/escape.cc - Issue 181483008: Don't unescape BiDi control characters in URL components

Unified Diff: net/base/escape.cc

Issue 181483008: Don't unescape BiDi control characters in URL components (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Addressed comments Created 6 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: net/base/escape.cc

diff --git a/net/base/escape.cc b/net/base/escape.cc

index 134a98652013177eaf7f52d2fa8c504f53654607..21cfe344496b484fe6fc81268d65726f2c15a5c1 100644

--- a/net/base/escape.cc

+++ b/net/base/escape.cc

@@ -97,6 +97,30 @@ const char kUrlUnescape[128] = {

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0

};

+// Attempts to unescape the sequence at |index| within |escaped_text|. If

+// successful, sets |value| to the unescaped value. Returns whether

+// unescaping succeeded.

+template<typename STR>

+bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,

+ size_t index,

+ unsigned char* value) {

+ if (index + 2 < escaped_text.size()) {

Peter Kasting 2014/02/27 21:37:09 Nit: Use an early-return here, as well as on the I

Anuj 2014/02/27 21:55:35 Done.

+ char current_char = static_cast<char>(escaped_text[index]);

Peter Kasting 2014/02/27 21:37:09 Nit: Inline into the next statement.

Anuj 2014/02/27 21:55:35 Done.

+ if (current_char != '%')

+ return false;

+ const typename STR::value_type most_sig_digit(

+ static_cast<typename STR::value_type>(escaped_text[index + 1]));

+ const typename STR::value_type least_sig_digit(

+ static_cast<typename STR::value_type>(escaped_text[index + 2]));

+ if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {

+ *value = HexDigitToInt(most_sig_digit) * 16 +

+ HexDigitToInt(least_sig_digit);

+ return true;

+ }

+ return false;

template<typename STR>

STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,

UnescapeRule::Type rules,

@@ -125,37 +149,81 @@ STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,

continue;

}

- char current_char = static_cast<char>(escaped_text[i]);

- if (current_char == '%' && i + 2 < max) {

- const typename STR::value_type most_sig_digit(

- static_cast<typename STR::value_type>(escaped_text[i + 1]));

- const typename STR::value_type least_sig_digit(

- static_cast<typename STR::value_type>(escaped_text[i + 2]));

- if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {

- unsigned char value = HexDigitToInt(most_sig_digit) * 16 +

- HexDigitToInt(least_sig_digit);

- if (value >= 0x80 || // Unescape all high-bit characters.

- // For 7-bit characters, the lookup table tells us all valid chars.

- (kUrlUnescape[value] ||

- // ...and we allow some additional unescaping when flags are set.

- (value == ' ' && (rules & UnescapeRule::SPACES)) ||

- // Allow any of the prohibited but non-control characters when

- // we're doing "special" chars.

- (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||

- // Additionally allow control characters if requested.

- (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {

- // Use the unescaped version of the character.

- adjustments.push_back(i);

- result.push_back(value);

- i += 2;

- } else {

- // Keep escaped. Append a percent and we'll get the following two

- // digits on the next loops through.

- result.push_back('%');

+ unsigned char value;

+ if (UnescapeUnsignedCharAtIndex(escaped_text, i, &value)) {

+ // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi

+ // control characters are not allowed to appear unescaped in URLs:

+ //

+ // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E)

+ // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F)

+ // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)

+ // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)

+ // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)

+ // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)

+ // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)

+ //

+ // Additionally, the Unicode Technical Report (TR9) as referenced by RFC

+ // 3987 above has since added some new BiDi control characters.

+ // http://www.unicode.org/reports/tr9

+ //

+ // U+061C ARABIC LETTER MARK (%D8%9C)

+ // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)

+ // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)

+ // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)

+ // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)

+ unsigned char tmp_value1;

Peter Kasting 2014/02/27 21:37:09 Nit: Just call this "temp" and use it for both tmp

Anuj 2014/02/27 21:55:35 Done.

+ if (value == 0xD8) {

Peter Kasting 2014/02/27 21:37:09 Nit: Combine conditionals here and hoist the comme

Anuj 2014/02/27 21:55:35 Done.

+ // Possible Arabic Letter Mark.

+ if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1) &&

+ (tmp_value1 == 0x9c)) {

+ result.append(escaped_text, i, 6);

Peter Kasting 2014/02/27 21:37:09 Nit: Indented too far

Anuj 2014/02/27 21:55:35 Done.

+ i += 5;

+ continue;

}

+ }

+ if (value == 0xE2) {

Peter Kasting 2014/02/27 21:37:09 Nit: Similarly: // Check for other BiDi con

Anuj 2014/02/27 21:55:35 I will pass on this change. I think compiler shoul

Peter Kasting 2014/02/27 22:08:38 It's not a question of optimized code, it's a ques

Anuj 2014/02/27 22:24:51 I made the exact same change just as you sent this

+ // Possible BiDi control character.

+ if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1)) {

+ if ((tmp_value1 == 0x80) || (tmp_value1 == 0x81)) {

+ unsigned char tmp_value2;

+ if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &tmp_value2)) {

+ // Embeddings, Overrides and Marks have second byte 0x80.

+ bool is_bidi_control_char = ((tmp_value1 == 0x80) &&

+ ((tmp_value2 == 0x8E) || (tmp_value2 == 0x8F) ||

+ ((tmp_value2 >= 0xAA) && (tmp_value2 <= 0xAE))));

+ // Isolates have second byte 0x81.

+ is_bidi_control_char |= ((tmp_value1 == 0x81) &&

+ ((tmp_value2 >= 0xA6) && (tmp_value2 <= 0xA9)));

+ if (is_bidi_control_char) {

+ result.append(escaped_text, i, 9);

+ i += 8;

+ continue;

+ }

+ if (value >= 0x80 || // Unescape all high-bit characters.

+ // For 7-bit characters, the lookup table tells us all valid chars.

+ (kUrlUnescape[value] ||

+ // ...and we allow some additional unescaping when flags are set.

+ (value == ' ' && (rules & UnescapeRule::SPACES)) ||

+ // Allow any of the prohibited but non-control characters when

+ // we're doing "special" chars.

+ (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||

+ // Additionally allow control characters if requested.

+ (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {

+ // Use the unescaped version of the character.

+ adjustments.push_back(i);

+ result.push_back(value);

+ i += 2;

} else {

- // Invalid escape sequence, just pass the percent through and continue

- // right after it.

+ // Keep escaped. Append a percent and we'll get the following two

+ // digits on the next loops through.

result.push_back('%');

}

} else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&

« base/i18n/rtl.h ('K') | « base/i18n/rtl.h ('k') | net/base/escape_unittest.cc » ('j') | no next file with comments »