net/base/escape.cc - Issue 1180393003: Added characters that look like padlocks to URL unescaping blacklist.

Side by Side Diff: net/base/escape.cc

Issue 1180393003: Added characters that look like padlocks to URL unescaping blacklist. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/base/escape.h"	5 #include "net/base/escape.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8	8

9 #include "base/logging.h"	9 #include "base/logging.h"

10 #include "base/memory/scoped_ptr.h"	10 #include "base/memory/scoped_ptr.h"

(...skipping 146 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
157 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))	157 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))

158 return false;	158 return false;

159 if (second_byte == 0x80) {	159 if (second_byte == 0x80) {

160 return third_byte == 0x8E \|\|	160 return third_byte == 0x8E \|\|

161 third_byte == 0x8F \|\|	161 third_byte == 0x8F \|\|

162 (third_byte >= 0xAA && third_byte <= 0xAE);	162 (third_byte >= 0xAA && third_byte <= 0xAE);

163 }	163 }

164 return third_byte >= 0xA6 && third_byte <= 0xA9;	164 return third_byte >= 0xA6 && third_byte <= 0xA9;

165 }	165 }

166	166

	167 // Returns true if there is a four-byte banned char at \|index\|. \|first_byte\| is

	168 // the byte at \|index\|.

	169 template <typename STR>

	170 bool HasFourByteBannedCharAtIndex(const STR& escaped_text,

	171 unsigned char first_byte,

	172 size_t index) {

	173 // The following characters are blacklisted for spoofability concerns.

	174 // U+1F50F LOCK WITH INK PEN (%F0%9F%94%8F)

	175 // U+1F510 CLOSED LOCK WITH KEY (%F0%9F%94%90)

	176 // U+1F512 LOCK (%F0%9F%94%92)

	177 // U+1F513 OPEN LOCK (%F0%9F%94%93)

	178 if (first_byte != 0xF0)

	179 return false;

	180 unsigned char second_byte;

	181 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
	Peter Kasting 2015/06/16 06:36:15 Nit: Combine these next two conditionals.
	Matt Giuca 2015/06/16 07:55:34 Do you mean "if (!...(..., &second_byte) \|\| second_byte != 0x9f) return false;"? Or do you actually mean combining these calls to UnescapeUnsignedCharAtIndex into a single giant if statement? I'm hesitant to do any of these changes as I feel they make the code less readable (than simply checking each failing condition separately and returning false at that time). Also there is the precedent of the other two functions which I copied this one from. I'll let Matt give an opinion here.
	Peter Kasting 2015/06/16 08:05:08 On 2015/06/16 07:55:34, Matt Giuca wrote: "Do you mean if (!...(..., &second_byte) \|\| second_byte != 0x9f) return false;" -- Yes, I meant this. This reads more clearly to me than two conditionals. On "Or do you actually mean combining these calls to UnescapeUnsignedCharAtIndex into a single giant if statement?" -- That's the very last "optional" comment below; I'm not convinced this would be a good change.
	mmenke 2015/06/17 19:50:16 (In reply to the exchange above between Peter Kasting and Matt Giuca.) I think 4 ifs (1 per character) is a little easier to read than one massive if. Could keep the last two separate. I don't feel strongly about any of the options here, though.
	Matt Giuca 2015/06/22 07:27:03 Done. (Not a fan of this change, but don't want to bikeshed.)
	182 return false;

	183 if (second_byte != 0x9F)

	184 return false;

	185 unsigned char third_byte;

	186 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))
	Peter Kasting 2015/06/16 06:36:15 Nit: And these.
	Matt Giuca 2015/06/22 07:27:03 Done.
	187 return false;

	188 if (third_byte != 0x94)

	189 return false;

	190 unsigned char fourth_byte;

	191 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 9, &fourth_byte))
	Peter Kasting 2015/06/16 06:36:15 Nit: And maybe these (more optional). Also optional: Combine all the conditionals.
	Matt Giuca 2015/06/22 07:27:03 Done.
	192 return false;

	193 return fourth_byte == 0x8F \|\| fourth_byte == 0x90 \|\| fourth_byte == 0x92 \|\|

	194 fourth_byte == 0x93;

	195 }

	196

167 // Unescapes \|escaped_text\| according to \|rules\|, returning the resulting	197 // Unescapes \|escaped_text\| according to \|rules\|, returning the resulting

168 // string. Fills in an \|adjustments\| parameter, if non-NULL, so it reflects	198 // string. Fills in an \|adjustments\| parameter, if non-NULL, so it reflects

169 // the alterations done to the string that are not one-character-to-one-	199 // the alterations done to the string that are not one-character-to-one-

170 // character. The resulting \|adjustments\| will always be sorted by increasing	200 // character. The resulting \|adjustments\| will always be sorted by increasing

171 // offset.	201 // offset.

172 template<typename STR>	202 template<typename STR>

173 STR UnescapeURLWithAdjustmentsImpl(	203 STR UnescapeURLWithAdjustmentsImpl(

174 const STR& escaped_text,	204 const STR& escaped_text,

175 UnescapeRule::Type rules,	205 UnescapeRule::Type rules,

176 base::OffsetAdjuster::Adjustments* adjustments) {	206 base::OffsetAdjuster::Adjustments* adjustments) {

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
210 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC	240 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC

211 // 3987 above has since added some new BiDi control characters.	241 // 3987 above has since added some new BiDi control characters.

212 // http://www.unicode.org/reports/tr9	242 // http://www.unicode.org/reports/tr9

213 //	243 //

214 // U+061C ARABIC LETTER MARK (%D8%9C)	244 // U+061C ARABIC LETTER MARK (%D8%9C)

215 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)	245 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)

216 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)	246 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)

217 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)	247 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)

218 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)	248 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)

219 //	249 //

	250 // We also ban certain spoofable characters that could be used to imitate
	mmenke 2015/06/17 19:50:16 nit: Should avoid "we" in comments. Just use passive voice.
	Matt Giuca 2015/06/22 07:27:03 Done.
	251 // parts of the browser UI.
	mmenke 2015/06/17 19:50:16 nit: Mentioning the browser seems like a bit of a layering violation, though not exactly a serious one. Security UI? Agent UI?
	Matt Giuca 2015/06/22 07:27:03 I think we want to be fairly specific here. Yes, it is a bit of a layering violation, but these characters are banned for a fairly specific reason, which is that web browsers commonly use similar iconography to indicate security. (Note: I didn't say which web browser, so it's not Chrome-specific.)
	252 //

	253 // U+1F50F LOCK WITH INK PEN (%F0%9F%94%8F)

	254 // U+1F510 CLOSED LOCK WITH KEY (%F0%9F%94%90)

	255 // U+1F512 LOCK (%F0%9F%94%92)

	256 // U+1F513 OPEN LOCK (%F0%9F%94%93)

	257 //

220 // However, some schemes such as data: and file: need to parse the exact	258 // However, some schemes such as data: and file: need to parse the exact

221 // binary data when loading the URL. For that reason, CONTROL_CHARS allows	259 // binary data when loading the URL. For that reason, CONTROL_CHARS allows

222 // unescaping BiDi control characters.	260 // unescaping BiDi control characters.

223 // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed	261 // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed

224 // in the UI.	262 // in the UI.

225 if (!(rules & UnescapeRule::CONTROL_CHARS)) {	263 if (!(rules & UnescapeRule::CONTROL_CHARS)) {

226 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {	264 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {

227 // Keep Arabic Language Mark escaped.	265 // Keep Arabic Language Mark escaped.

228 result.append(escaped_text, i, 6);	266 result.append(escaped_text, i, 6);

229 i += 5;	267 i += 5;

230 continue;	268 continue;

231 }	269 }

232 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {	270 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {

233 // Keep BiDi control char escaped.	271 // Keep BiDi control char escaped.

234 result.append(escaped_text, i, 9);	272 result.append(escaped_text, i, 9);

235 i += 8;	273 i += 8;

236 continue;	274 continue;

237 }	275 }

	276 if (HasFourByteBannedCharAtIndex(escaped_text, first_byte, i)) {

	277 // Keep banned char escaped.

	278 result.append(escaped_text, i, 12);

	279 i += 11;

	280 continue;

	281 }

238 }	282 }

239	283

240 if (first_byte >= 0x80 \|\| // Unescape all high-bit characters.	284 if (first_byte >= 0x80 \|\| // Unescape all high-bit characters.

241 // For 7-bit characters, the lookup table tells us all valid chars.	285 // For 7-bit characters, the lookup table tells us all valid chars.

242 (kUrlUnescape[first_byte] \|\|	286 (kUrlUnescape[first_byte] \|\|

243 // ...and we allow some additional unescaping when flags are set.	287 // ...and we allow some additional unescaping when flags are set.

244 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) \|\|	288 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) \|\|

245 // Allow any of the prohibited but non-control characters when	289 // Allow any of the prohibited but non-control characters when

246 // we're doing "special" chars.	290 // we're doing "special" chars.

247 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) \|\|	291 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) \|\|

(...skipping 206 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
454 1, kEscapeToChars[i].replacement);	498 1, kEscapeToChars[i].replacement);

455 break;	499 break;

456 }	500 }

457 }	501 }

458 }	502 }

459 }	503 }

460 return text;	504 return text;

461 }	505 }

462	506

463 } // namespace net	507 } // namespace net

OLD	NEW

« no previous file with comments | « net/base/escape.h ('k') | net/base/escape_unittest.cc » ('j') | no next file with comments »