net/base/escape.cc - Issue 181483008: Don't unescape BiDi control characters in URL components

Side by Side Diff: net/base/escape.cc

Issue 181483008: Don't unescape BiDi control characters in URL components (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Addressed comments - 3 Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/base/escape.h"	5 #include "net/base/escape.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8	8

9 #include "base/logging.h"	9 #include "base/logging.h"

10 #include "base/memory/scoped_ptr.h"	10 #include "base/memory/scoped_ptr.h"

(...skipping 79 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
90 // @ A B C D E F G H I J K L M N O	90 // @ A B C D E F G H I J K L M N O

91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	91 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

92 // P Q R S T U V W X Y Z [ \ ] ^ _	92 // P Q R S T U V W X Y Z [ \ ] ^ _

93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,	93 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,

94 // ` a b c d e f g h i j k l m n o	94 // ` a b c d e f g h i j k l m n o

95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	95 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

96 // p q r s t u v w x y z { | } ~ <NBSP>	96 // p q r s t u v w x y z { | } ~ <NBSP>

97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0	97 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0

98 };	98 };

99	99

	100 // Attempts to unescape the sequence at |index| within |escaped_text|. If

	101 // successful, sets |value| to the unescaped value. Returns whether

	102 // unescaping succeeded.

	103 template<typename STR>

	104 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,

	105 size_t index,

	106 unsigned char* value) {

	107 if ((index + 2) >= escaped_text.size())

	108 return false;

	109 if (escaped_text[index] != '%')

	110 return false;

	111 const typename STR::value_type most_sig_digit(

	112 static_cast<typename STR::value_type>(escaped_text[index + 1]));

	113 const typename STR::value_type least_sig_digit(

	114 static_cast<typename STR::value_type>(escaped_text[index + 2]));

	115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {

	116 *value = HexDigitToInt(most_sig_digit) * 16 +

	117 HexDigitToInt(least_sig_digit);

	118 return true;

	119 }

	120 return false;

	121 }

	122

100 template<typename STR>	123 template<typename STR>

101 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,	124 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,

102 UnescapeRule::Type rules,	125 UnescapeRule::Type rules,

103 std::vector<size_t>* offsets_for_adjustment) {	126 std::vector<size_t>* offsets_for_adjustment) {

104 if (offsets_for_adjustment) {	127 if (offsets_for_adjustment) {

105 std::for_each(offsets_for_adjustment->begin(),	128 std::for_each(offsets_for_adjustment->begin(),

106 offsets_for_adjustment->end(),	129 offsets_for_adjustment->end(),

107 base::LimitOffset<STR>(escaped_text.length()));	130 base::LimitOffset<STR>(escaped_text.length()));

108 }	131 }

109 // Do not unescape anything, return the |escaped_text| text.	132 // Do not unescape anything, return the |escaped_text| text.

110 if (rules == UnescapeRule::NONE)	133 if (rules == UnescapeRule::NONE)

111 return escaped_text;	134 return escaped_text;

112	135

113 // The output of the unescaping is always smaller than the input, so we can	136 // The output of the unescaping is always smaller than the input, so we can

114 // reserve the input size to make sure we have enough buffer and don't have	137 // reserve the input size to make sure we have enough buffer and don't have

115 // to allocate in the loop below.	138 // to allocate in the loop below.

116 STR result;	139 STR result;

117 result.reserve(escaped_text.length());	140 result.reserve(escaped_text.length());

118	141

119 // Locations of adjusted text.	142 // Locations of adjusted text.

120 net::internal::AdjustEncodingOffset::Adjustments adjustments;	143 net::internal::AdjustEncodingOffset::Adjustments adjustments;

121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {	144 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {

122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {	145 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {

123 // Non ASCII character, append as is.	146 // Non ASCII character, append as is.

124 result.push_back(escaped_text[i]);	147 result.push_back(escaped_text[i]);

125 continue;	148 continue;

126 }	149 }

127	150

128 char current_char = static_cast<char>(escaped_text[i]);	151 unsigned char first_byte;

129 if (current_char == '%' && i + 2 < max) {	152 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) {

130 const typename STR::value_type most_sig_digit(	153 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi

131 static_cast<typename STR::value_type>(escaped_text[i + 1]));	154 // control characters are not allowed to appear unescaped in URLs:

132 const typename STR::value_type least_sig_digit(	155 //

133 static_cast<typename STR::value_type>(escaped_text[i + 2]));	156 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E)

134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {	157 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F)

135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 +	158 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)

136 HexDigitToInt(least_sig_digit);	159 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)

137 if (value >= 0x80 || // Unescape all high-bit characters.	160 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)

138 // For 7-bit characters, the lookup table tells us all valid chars.	161 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)

139 (kUrlUnescape[value] ||	162 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)

140 // ...and we allow some additional unescaping when flags are set.	163 //

141 (value == ' ' && (rules & UnescapeRule::SPACES)) ||	164 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC

142 // Allow any of the prohibited but non-control characters when	165 // 3987 above has since added some new BiDi control characters.

143 // we're doing "special" chars.	166 // http://www.unicode.org/reports/tr9

144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||	167 //

145 // Additionally allow control characters if requested.	168 // U+061C ARABIC LETTER MARK (%D8%9C)

146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {	169 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)

147 // Use the unescaped version of the character.	170 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)

148 adjustments.push_back(i);	171 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)

149 result.push_back(value);	172 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)

150 i += 2;	173

151 } else {	174 unsigned char second_byte;

152 // Keep escaped. Append a percent and we'll get the following two	175 // Check for ALM.

153 // digits on the next loops through.	176 if ((first_byte == 0xD8) &&

154 result.push_back('%');	177 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&

	178 (second_byte == 0x9c)) {

	179 result.append(escaped_text, i, 6);

	180 i += 5;

	181 continue;

	182 }

	183

	184 // Check for other BiDi control characters.

	185 if ((first_byte == 0xE2) &&

	186 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&

	187 ((second_byte == 0x80) || (second_byte == 0x81))) {

	188 unsigned char third_byte;

	189 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) &&

	190 ((second_byte == 0x80) ?

	191 ((third_byte == 0x8E) || (third_byte == 0x8F) ||

	192 ((third_byte >= 0xAA) && (third_byte <= 0xAE))) :

	193 ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) {

	194 result.append(escaped_text, i, 9);

	195 i += 8;

	196 continue;

155 }	197 }

	198 }

	199

	200 if (first_byte >= 0x80 || // Unescape all high-bit characters.

	201 // For 7-bit characters, the lookup table tells us all valid chars.

	202 (kUrlUnescape[first_byte] ||

	203 // ...and we allow some additional unescaping when flags are set.

	204 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) ||

	205 // Allow any of the prohibited but non-control characters when

	206 // we're doing "special" chars.

	207 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||

	208 // Additionally allow control characters if requested.

	209 (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {

	210 // Use the unescaped version of the character.

	211 adjustments.push_back(i);

	212 result.push_back(first_byte);

	213 i += 2;

156 } else {	214 } else {

157 // Invalid escape sequence, just pass the percent through and continue	215 // Keep escaped. Append a percent and we'll get the following two

158 // right after it.	216 // digits on the next loops through.

159 result.push_back('%');	217 result.push_back('%');

160 }	218 }

161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&	219 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&

162 escaped_text[i] == '+') {	220 escaped_text[i] == '+') {

163 result.push_back(' ');	221 result.push_back(' ');

164 } else {	222 } else {

165 // Normal case for unescaped characters.	223 // Normal case for unescaped characters.

166 result.push_back(escaped_text[i]);	224 result.push_back(escaped_text[i]);

167 }	225 }

168 }	226 }

(...skipping 216 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
385 return;	443 return;

386 }	444 }

387 adjusted_offset -= 2;	445 adjusted_offset -= 2;

388 }	446 }

389 offset = adjusted_offset;	447 offset = adjusted_offset;

390 }	448 }

391	449

392 } // namespace internal	450 } // namespace internal

393	451

394 } // namespace net	452 } // namespace net

OLD	NEW

« no previous file with comments | « no previous file | net/base/escape_unittest.cc » ('j') | no next file with comments »