net/base/escape.cc - Issue 181483008: Don't unescape BiDi control characters in URL components

Side by Side Diff: net/base/escape.cc

Issue 181483008: Don't unescape BiDi control characters in URL components (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Addressed comments Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/base/escape.h"	5 #include "net/base/escape.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8	8

9 #include "base/logging.h"	9 #include "base/logging.h"

10 #include "base/memory/scoped_ptr.h"	10 #include "base/memory/scoped_ptr.h"

(...skipping 79 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
90 // @ A B C D E F G H I J K L M N O	90 // @ A B C D E F G H I J K L M N O

91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

92 // P Q R S T U V W X Y Z [ \ ] ^ _	92 // P Q R S T U V W X Y Z [ \ ] ^ _

93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,	93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,

94 // ` a b c d e f g h i j k l m n o	94 // ` a b c d e f g h i j k l m n o

95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

96 // p q r s t u v w x y z { \| } ~ <NBSP>	96 // p q r s t u v w x y z { \| } ~ <NBSP>

97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0	97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0

98 };	98 };

99	99

	100 // Attempts to unescape the sequence at \|index\| within \|escaped_text\|. If

	101 // successful, sets \|value\| to the unescaped value. Returns whether

	102 // unescaping succeeded.

	103 template<typename STR>

	104 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,

	105 size_t index,

	106 unsigned char* value) {

	107 if (index + 2 < escaped_text.size()) {
	Peter Kasting 2014/02/27 21:37:09 Nit: Use an early-return here, as well as on the IsHexDigit() checks below, to minimize indenting and braces. Parens around binary subexpr. Anuj 2014/02/27 21:55:35 On 2014/02/27 21:37:09, Peter Kasting wrote: > Nit: Use an early-return here, as well as on the IsHexDigit() checks below, to > minimize indenting and braces. > > Parens around binary subexpr. Done.
	108 char current_char = static_cast<char>(escaped_text[index]);
	Peter Kasting 2014/02/27 21:37:09 Nit: Inline into the next statement. Anuj 2014/02/27 21:55:35 On 2014/02/27 21:37:09, Peter Kasting wrote: > Nit: Inline into the next statement. Done.
	109 if (current_char != '%')

	110 return false;

	111 const typename STR::value_type most_sig_digit(

	112 static_cast<typename STR::value_type>(escaped_text[index + 1]));

	113 const typename STR::value_type least_sig_digit(

	114 static_cast<typename STR::value_type>(escaped_text[index + 2]));

	115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {

	116 *value = HexDigitToInt(most_sig_digit) * 16 +

	117 HexDigitToInt(least_sig_digit);

	118 return true;

	119 }

	120 }

	121 return false;

	122 }

	123

100 template<typename STR>	124 template<typename STR>

101 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,	125 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,

102 UnescapeRule::Type rules,	126 UnescapeRule::Type rules,

103 std::vector<size_t>* offsets_for_adjustment) {	127 std::vector<size_t>* offsets_for_adjustment) {

104 if (offsets_for_adjustment) {	128 if (offsets_for_adjustment) {

105 std::for_each(offsets_for_adjustment->begin(),	129 std::for_each(offsets_for_adjustment->begin(),

106 offsets_for_adjustment->end(),	130 offsets_for_adjustment->end(),

107 base::LimitOffset<STR>(escaped_text.length()));	131 base::LimitOffset<STR>(escaped_text.length()));

108 }	132 }

109 // Do not unescape anything, return the \|escaped_text\| text.	133 // Do not unescape anything, return the \|escaped_text\| text.

110 if (rules == UnescapeRule::NONE)	134 if (rules == UnescapeRule::NONE)

111 return escaped_text;	135 return escaped_text;

112	136

113 // The output of the unescaping is always smaller than the input, so we can	137 // The output of the unescaping is always smaller than the input, so we can

114 // reserve the input size to make sure we have enough buffer and don't have	138 // reserve the input size to make sure we have enough buffer and don't have

115 // to allocate in the loop below.	139 // to allocate in the loop below.

116 STR result;	140 STR result;

117 result.reserve(escaped_text.length());	141 result.reserve(escaped_text.length());

118	142

119 // Locations of adjusted text.	143 // Locations of adjusted text.

120 net::internal::AdjustEncodingOffset::Adjustments adjustments;	144 net::internal::AdjustEncodingOffset::Adjustments adjustments;

121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {	145 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {

122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {	146 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {

123 // Non ASCII character, append as is.	147 // Non ASCII character, append as is.

124 result.push_back(escaped_text[i]);	148 result.push_back(escaped_text[i]);

125 continue;	149 continue;

126 }	150 }

127	151

128 char current_char = static_cast<char>(escaped_text[i]);	152 unsigned char value;

129 if (current_char == '%' && i + 2 < max) {	153 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &value)) {

130 const typename STR::value_type most_sig_digit(	154 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi

131 static_cast<typename STR::value_type>(escaped_text[i + 1]));	155 // control characters are not allowed to appear unescaped in URLs:

132 const typename STR::value_type least_sig_digit(	156 //

133 static_cast<typename STR::value_type>(escaped_text[i + 2]));	157 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E)

134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {	158 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F)

135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 +	159 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)

136 HexDigitToInt(least_sig_digit);	160 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)

137 if (value >= 0x80 \|\| // Unescape all high-bit characters.	161 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)

138 // For 7-bit characters, the lookup table tells us all valid chars.	162 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)

139 (kUrlUnescape[value] \|\|	163 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)

140 // ...and we allow some additional unescaping when flags are set.	164 //

141 (value == ' ' && (rules & UnescapeRule::SPACES)) \|\|	165 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC

142 // Allow any of the prohibited but non-control characters when	166 // 3987 above has since added some new BiDi control characters.

143 // we're doing "special" chars.	167 // http://www.unicode.org/reports/tr9

144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) \|\|	168 //

145 // Additionally allow control characters if requested.	169 // U+061C ARABIC LETTER MARK (%D8%9C)

146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {	170 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)

147 // Use the unescaped version of the character.	171 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)

148 adjustments.push_back(i);	172 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)

149 result.push_back(value);	173 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)

150 i += 2;	174

151 } else {	175 unsigned char tmp_value1;
	Peter Kasting 2014/02/27 21:37:09 Nit: Just call this "temp" and use it for both tmp_value1 and tmp_value2; see below for how I'd do this. Anuj 2014/02/27 21:55:35 On 2014/02/27 21:37:09, Peter Kasting wrote: > Nit: Just call this "temp" and use it for both tmp_value1 and tmp_value2; see > below for how I'd do this. Done.
152 // Keep escaped. Append a percent and we'll get the following two	176 if (value == 0xD8) {
	Peter Kasting 2014/02/27 21:37:09 Nit: Combine conditionals here and hoist the comment: // Check for ALM. unsigned char temp; if ((value == 0xD8) && UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &temp) && (temp == 0x9C)) { ... Anuj 2014/02/27 21:55:35 On 2014/02/27 21:37:09, Peter Kasting wrote: > Nit: Combine conditionals here and hoist the comment: > > // Check for ALM. > unsigned char temp; > if ((value == 0xD8) && > UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &temp) && > (temp == 0x9C)) { > ... Done.
153 // digits on the next loops through.	177 // Possible Arabic Letter Mark.

154 result.push_back('%');	178 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1) &&

	179 (tmp_value1 == 0x9c)) {

	180 result.append(escaped_text, i, 6);
	Peter Kasting 2014/02/27 21:37:09 Nit: Indented too far Anuj 2014/02/27 21:55:35 On 2014/02/27 21:37:09, Peter Kasting wrote: > Nit: Indented too far Done.
	181 i += 5;

	182 continue;

155 }	183 }

	184 }

	185

	186 if (value == 0xE2) {
	Peter Kasting 2014/02/27 21:37:09 Nit: Similarly: // Check for other BiDi control characters. if ((value == 0xE2) && UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &temp) && ((temp == 0x80) && UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &temp) && ((temp == 0x8E) \|\| (temp == 0x8F) \|\| ((temp >= 0xAA) && (temp <= 0xAE)))) \|\| ((temp == 0x81) && UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &temp) && (temp >= 0xA6) && (temp <= 0xA9))) { ... Anuj 2014/02/27 21:55:35 On 2014/02/27 21:37:09, Peter Kasting wrote: > Nit: Similarly: > > // Check for other BiDi control characters. > if ((value == 0xE2) && > UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &temp) && > ((temp == 0x80) && > UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &temp) && > ((temp == 0x8E) \|\| (temp == 0x8F) \|\| > ((temp >= 0xAA) && (temp <= 0xAE)))) \|\| > ((temp == 0x81) && > UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &temp) && > (temp >= 0xA6) && (temp <= 0xA9))) { > ... I will pass on this change. I think compiler should be able to handle this much optimization. Peter Kasting 2014/02/27 22:08:38 On 2014/02/27 21:55:35, Anuj wrote: > On 2014/02/27 21:37:09, Peter Kasting wrote: > > Nit: Similarly: > > > > // Check for other BiDi control characters. > > if ((value == 0xE2) && > > UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &temp) && > > ((temp == 0x80) && > > UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &temp) && > > ((temp == 0x8E) \|\| (temp == 0x8F) \|\| > > ((temp >= 0xAA) && (temp <= 0xAE)))) \|\| > > ((temp == 0x81) && > > UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &temp) && > > (temp >= 0xA6) && (temp <= 0xA9))) { > > ... > > > I will pass on this change. I think compiler should be able to handle this much > optimization. 
It's not a question of optimized code, it's a question of trying to read what's here. The use of additional temps makes this confusing and the variable names don't help. An alternate implementation that is similar to yours in structure but preserves the "shorter code" aspect would be to rename \|value\| to \|first_byte\|, \|temp\| to \|second_byte\|, and then do this: // Check for other BiDi control characters. if ((first_byte == 0xE2) && UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && ((second_byte == 0x80) \|\| (second_byte == 0x81)) { unsigned char third_byte; if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) && ((second_byte == 0x80) ? ((third_byte == 0x8E) \|\| (third_byte == 0x8F) \|\| ((third_byte >= 0xAA) && (third_byte <= 0xAE))) : ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) { ... Anuj 2014/02/27 22:24:51 On 2014/02/27 22:08:38, Peter Kasting wrote: > On 2014/02/27 21:55:35, Anuj wrote: > > On 2014/02/27 21:37:09, Peter Kasting wrote: > > > Nit: Similarly: > > > > > > // Check for other BiDi control characters. > > > if ((value == 0xE2) && > > > UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &temp) && > > > ((temp == 0x80) && > > > UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &temp) && > > > ((temp == 0x8E) \|\| (temp == 0x8F) \|\| > > > ((temp >= 0xAA) && (temp <= 0xAE)))) \|\| > > > ((temp == 0x81) && > > > UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &temp) && > > > (temp >= 0xA6) && (temp <= 0xA9))) { > > > ... > > > > > > I will pass on this change. I think compiler should be able to handle this > much > > optimization. > > It's not a question of optimized code, it's a question of trying to read what's > here. The use of additional temps makes this confusing and the variable names > don't help. 
> > An alternate implementation that is similar to yours in structure but preserves > the "shorter code" aspect would be to rename \|value\| to \|first_byte\|, \|temp\| to > \|second_byte\|, and then do this: > > // Check for other BiDi control characters. > if ((first_byte == 0xE2) && > UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) && > ((second_byte == 0x80) \|\| (second_byte == 0x81)) { > unsigned char third_byte; > if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) && > ((second_byte == 0x80) ? > ((third_byte == 0x8E) \|\| (third_byte == 0x8F) \|\| > ((third_byte >= 0xAA) && (third_byte <= 0xAE))) : > ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) { > ... I made the exact same change just as you sent this comment :)
	187 // Possible BiDi control character.

	188 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &tmp_value1)) {

	189 if ((tmp_value1 == 0x80) \|\| (tmp_value1 == 0x81)) {

	190 unsigned char tmp_value2;

	191 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &tmp_value2)) {

	192 // Embeddings, Overrides and Marks have second byte 0x80.

	193 bool is_bidi_control_char = ((tmp_value1 == 0x80) &&

	194 ((tmp_value2 == 0x8E) \|\| (tmp_value2 == 0x8F) \|\|

	195 ((tmp_value2 >= 0xAA) && (tmp_value2 <= 0xAE))));

	196

	197 // Isolates have second byte 0x81.

	198 is_bidi_control_char \|= ((tmp_value1 == 0x81) &&

	199 ((tmp_value2 >= 0xA6) && (tmp_value2 <= 0xA9)));

	200 if (is_bidi_control_char) {

	201 result.append(escaped_text, i, 9);

	202 i += 8;

	203 continue;

	204 }

	205 }

	206 }

	207 }

	208 }

	209

	210 if (value >= 0x80 \|\| // Unescape all high-bit characters.

	211 // For 7-bit characters, the lookup table tells us all valid chars.

	212 (kUrlUnescape[value] \|\|

	213 // ...and we allow some additional unescaping when flags are set.

	214 (value == ' ' && (rules & UnescapeRule::SPACES)) \|\|

	215 // Allow any of the prohibited but non-control characters when

	216 // we're doing "special" chars.

	217 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) \|\|

	218 // Additionally allow control characters if requested.

	219 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {

	220 // Use the unescaped version of the character.

	221 adjustments.push_back(i);

	222 result.push_back(value);

	223 i += 2;

156 } else {	224 } else {

157 // Invalid escape sequence, just pass the percent through and continue	225 // Keep escaped. Append a percent and we'll get the following two

158 // right after it.	226 // digits on the next loops through.

159 result.push_back('%');	227 result.push_back('%');

160 }	228 }

161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&	229 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&

162 escaped_text[i] == '+') {	230 escaped_text[i] == '+') {

163 result.push_back(' ');	231 result.push_back(' ');

164 } else {	232 } else {

165 // Normal case for unescaped characters.	233 // Normal case for unescaped characters.

166 result.push_back(escaped_text[i]);	234 result.push_back(escaped_text[i]);

167 }	235 }

168 }	236 }

(...skipping 216 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
385 return;	453 return;

386 }	454 }

387 adjusted_offset -= 2;	455 adjusted_offset -= 2;

388 }	456 }

389 offset = adjusted_offset;	457 offset = adjusted_offset;

390 }	458 }

391	459

392 } // namespace internal	460 } // namespace internal

393	461

394 } // namespace net	462 } // namespace net

OLD	NEW

« base/i18n/rtl.h ('K') | « base/i18n/rtl.h ('k') | net/base/escape_unittest.cc » ('j') | no next file with comments »