net/base/escape.cc - Issue 643963004: Unescape BiDi control chars while parsing data urls

Side by Side Diff: net/base/escape.cc

Issue 643963004: Unescape BiDi control chars while parsing data urls (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Inline IsLastTwoBytesofThreeByteBidiControlChar Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/base/escape.h"	5 #include "net/base/escape.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8	8

9 #include "base/logging.h"	9 #include "base/logging.h"

10 #include "base/memory/scoped_ptr.h"	10 #include "base/memory/scoped_ptr.h"

(...skipping 102 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
113 const typename STR::value_type least_sig_digit(	113 const typename STR::value_type least_sig_digit(

114 static_cast<typename STR::value_type>(escaped_text[index + 2]));	114 static_cast<typename STR::value_type>(escaped_text[index + 2]));

115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {	115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {

116 *value = HexDigitToInt(most_sig_digit) * 16 +	116 *value = HexDigitToInt(most_sig_digit) * 16 +

117 HexDigitToInt(least_sig_digit);	117 HexDigitToInt(least_sig_digit);

118 return true;	118 return true;

119 }	119 }

120 return false;	120 return false;

121 }	121 }

122	122

	123 // Returns true if there is an Arabic Language Mark at |index|. |first_byte|

	124 // is the byte at |index|.

	125 template<typename STR>

	126 bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,

	127 unsigned char first_byte,

	128 size_t index) {

	129 if (first_byte != 0xD8)

	130 return false;

	131 unsigned char second_byte;

	132 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))

	133 return false;

	134 return second_byte == 0x9c;

	135 }

	136

	137 // Returns true if there is a BiDi control char at |index|. |first_byte| is the

	138 // byte at |index|.

	139 template<typename STR>

	140 bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,

	141 unsigned char first_byte,

	142 size_t index) {

	143 if (first_byte != 0xE2)

	144 return false;

	145 unsigned char second_byte;

	146 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))

	147 return false;

	148 if (second_byte != 0x80 && second_byte != 0x81)

	149 return false;

	150 unsigned char third_byte;

	151 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))

	152 return false;

	153 if (second_byte == 0x80) {

	154 return third_byte == 0x8E ||

	155 third_byte == 0x8F ||

	156 (third_byte >= 0xAA && third_byte <= 0xAE);

	157 }

	158 return third_byte >= 0xA6 && third_byte <= 0xA9;

	159 }

	160

123 // Unescapes |escaped_text| according to |rules|, returning the resulting	161 // Unescapes |escaped_text| according to |rules|, returning the resulting

124 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects	162 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects

125 // the alterations done to the string that are not one-character-to-one-	163 // the alterations done to the string that are not one-character-to-one-

126 // character. The resulting |adjustments| will always be sorted by increasing	164 // character. The resulting |adjustments| will always be sorted by increasing

127 // offset.	165 // offset.

128 template<typename STR>	166 template<typename STR>

129 STR UnescapeURLWithAdjustmentsImpl(	167 STR UnescapeURLWithAdjustmentsImpl(

130 const STR& escaped_text,	168 const STR& escaped_text,

131 UnescapeRule::Type rules,	169 UnescapeRule::Type rules,

132 base::OffsetAdjuster::Adjustments* adjustments) {	170 base::OffsetAdjuster::Adjustments* adjustments) {

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
165 //	203 //

166 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC	204 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC

167 // 3987 above has since added some new BiDi control characters.	205 // 3987 above has since added some new BiDi control characters.

168 // http://www.unicode.org/reports/tr9	206 // http://www.unicode.org/reports/tr9

169 //	207 //

170 // U+061C ARABIC LETTER MARK (%D8%9C)	208 // U+061C ARABIC LETTER MARK (%D8%9C)

171 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)	209 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)

172 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)	210 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)

173 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)	211 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)

174 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)	212 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)

175	213 //

176 unsigned char second_byte;	214 // However, escaping these characters in data: urls result in
	brettw 2014/10/20 18:04:42: This sentence doesn't make sense to me. Here, we're unescaping characters but this comment talks about escaping them (since they're not ASCII, they will always be escaped when converted to a data URL). The old code was avoiding unescaping them for security reasons. The new code unescapes them sometimes when converting to the actual data, but not for display purposes. The comment should make this clear.
	meacer 2014/10/20 22:54:22 (in reply to brettw's comment above): Done.
177 // Check for ALM.	215 // escaped BiDi control characters being displayed in the rendered html,
	brettw 2014/10/20 18:04:42: The use of the word "render" in this patch (both here and in the description) is confusing to me. I think of rendering a URL as printing it to the user, as in the omnibox. Maybe we can use the phrase "load the data URL" or something. To blink, data URLs are loaded like any other URL.
	meacer 2014/10/20 22:54:22 (in reply to brettw's comment above): Edited CL description to make it clear.
178 if ((first_byte == 0xD8) &&	216 // so the parsing for data: urls is allowed to force unescaping of these

179 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&	217 // characters. DO NOT use BIDI_CONTROL_CHARS flag without talking to a

180 (second_byte == 0x9c)) {	218 // security person.

181 result.append(escaped_text, i, 6);	219 if (!(rules & UnescapeRule::BIDI_CONTROL_CHARS)) {

182 i += 5;	220 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {

183 continue;	221 result.append(escaped_text, i, 6);

184 }	222 i += 5;

185	223 continue;

186 // Check for other BiDi control characters.	224 }

187 if ((first_byte == 0xE2) &&	225 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {

188 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&

189 ((second_byte == 0x80) || (second_byte == 0x81))) {

190 unsigned char third_byte;

191 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) &&

192 ((second_byte == 0x80) ?

193 ((third_byte == 0x8E) || (third_byte == 0x8F) ||

194 ((third_byte >= 0xAA) && (third_byte <= 0xAE))) :

195 ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) {

196 result.append(escaped_text, i, 9);	226 result.append(escaped_text, i, 9);

197 i += 8;	227 i += 8;

198 continue;	228 continue;

199 }	229 }

200 }	230 }

201	231

202 if (first_byte >= 0x80 || // Unescape all high-bit characters.	232 if (first_byte >= 0x80 || // Unescape all high-bit characters.

203 // For 7-bit characters, the lookup table tells us all valid chars.	233 // For 7-bit characters, the lookup table tells us all valid chars.

204 (kUrlUnescape[first_byte] ||	234 (kUrlUnescape[first_byte] ||

205 // ...and we allow some additional unescaping when flags are set.	235 // ...and we allow some additional unescaping when flags are set.

(...skipping 196 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
402 1, kEscapeToChars[i].replacement);	432 1, kEscapeToChars[i].replacement);

403 break;	433 break;

404 }	434 }

405 }	435 }

406 }	436 }

407 }	437 }

408 return text;	438 return text;

409 }	439 }

410	440

411 } // namespace net	441 } // namespace net

OLD	NEW

« net/base/escape.h ('K') | « net/base/escape.h ('k') | net/base/escape_unittest.cc » ('j') | no next file with comments »