net/base/escape.cc - Issue 643963004: Unescape BiDi control chars while parsing data urls

Side by Side Diff: net/base/escape.cc

Issue 643963004: Unescape BiDi control chars while parsing data urls (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Fix test comment Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/base/escape.h"	5 #include "net/base/escape.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8	8

9 #include "base/logging.h"	9 #include "base/logging.h"

10 #include "base/memory/scoped_ptr.h"	10 #include "base/memory/scoped_ptr.h"

(...skipping 102 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
113 const typename STR::value_type least_sig_digit(	113 const typename STR::value_type least_sig_digit(

114 static_cast<typename STR::value_type>(escaped_text[index + 2]));	114 static_cast<typename STR::value_type>(escaped_text[index + 2]));

115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {	115 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {

116 *value = HexDigitToInt(most_sig_digit) * 16 +	116 *value = HexDigitToInt(most_sig_digit) * 16 +

117 HexDigitToInt(least_sig_digit);	117 HexDigitToInt(least_sig_digit);

118 return true;	118 return true;

119 }	119 }

120 return false;	120 return false;

121 }	121 }

122	122

	123 // Returns true if there is an Arabic Letter Mark (U+061C) at \|index\|. \|first_byte\|

	124 // is the byte at \|index\|.

	125 template<typename STR>

	126 bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,

	127 unsigned char first_byte,

	128 size_t index) {

	129 if (first_byte != 0xD8)

	130 return false;

	131 unsigned char second_byte;

	132 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))

	133 return false;

	134 return second_byte == 0x9c;

	135 }

	136

	137 // Returns true if second and third bytes are of a three byte BiDi control

	138 // character sequence.

	139 bool IsLastTwoBytesofThreeByteBidiControlChar(unsigned char second_byte,

	140 unsigned char third_byte) {
	mmenke 2014/10/20 15:46:40 This function name is a lie. It's actually IsLastT This function name is a lie. It's actually IsLastTwoBytesofThreeByteBidiControlCharIfSecondByteIs0x80Or0x81. Given that, I think it's better to just inline it. meacer 2014/10/20 17:23:26 Sounds reasonable, inlined. Show quoted text On 2014/10/20 15:46:40, mmenke wrote: > This function name is a lie. It's actually > IsLastTwoBytesofThreeByteBidiControlCharIfSecondByteIs0x80Or0x81. > > Given that, I think it's better to just inline it. Sounds reasonable, inlined.
	141 if (second_byte == 0x80) {

	142 return third_byte == 0x8E \|\|

	143 third_byte == 0x8F \|\|

	144 (third_byte >= 0xAA && third_byte <= 0xAE);

	145 }
	mmenke 2014/10/20 15:46:40 DCHECK_EQ(0x81, second_byte)? If this is a separa DCHECK_EQ(0x81, second_byte)? If this is a separate function, as you have it, it's a sanity check, if you inline the function as I suggest above, it's more of a reminder that second_byte != 0x80 implies second_byte == 0x81 meacer 2014/10/20 17:23:26 Done. Show quoted text On 2014/10/20 15:46:40, mmenke wrote: > DCHECK_EQ(0x81, second_byte)? If this is a separate function, as you have it, > it's a sanity check, if you inline the function as I suggest above, it's more of > a reminder that second_byte != 0x80 implies second_byte == 0x81 Done.
	146 return third_byte >= 0xA6 && third_byte <= 0xA9;

	147 }

	148

	149 // Returns true if there is a BiDi control char at \|index\|. \|first_byte\| is the

	150 // byte at \|index\|.

	151 template<typename STR>

	152 bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,

	153 unsigned char first_byte,

	154 size_t index) {

	155 if (first_byte != 0xE2)

	156 return false;

	157 unsigned char second_byte;

	158 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))

	159 return false;

	160 if (second_byte != 0x80 && second_byte != 0x81)

	161 return false;

	162 unsigned char third_byte;

	163 if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))

	164 return false;

	165 return IsLastTwoBytesofThreeByteBidiControlChar(second_byte, third_byte);

	166 }

	167

123 // Unescapes \|escaped_text\| according to \|rules\|, returning the resulting	168 // Unescapes \|escaped_text\| according to \|rules\|, returning the resulting

124 // string. Fills in an \|adjustments\| parameter, if non-NULL, so it reflects	169 // string. Fills in an \|adjustments\| parameter, if non-NULL, so it reflects

125 // the alterations done to the string that are not one-character-to-one-	170 // the alterations done to the string that are not one-character-to-one-

126 // character. The resulting \|adjustments\| will always be sorted by increasing	171 // character. The resulting \|adjustments\| will always be sorted by increasing

127 // offset.	172 // offset.

128 template<typename STR>	173 template<typename STR>

129 STR UnescapeURLWithAdjustmentsImpl(	174 STR UnescapeURLWithAdjustmentsImpl(

130 const STR& escaped_text,	175 const STR& escaped_text,

131 UnescapeRule::Type rules,	176 UnescapeRule::Type rules,

132 base::OffsetAdjuster::Adjustments* adjustments) {	177 base::OffsetAdjuster::Adjustments* adjustments) {

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
165 //	210 //

166 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC	211 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC

167 // 3987 above has since added some new BiDi control characters.	212 // 3987 above has since added some new BiDi control characters.

168 // http://www.unicode.org/reports/tr9	213 // http://www.unicode.org/reports/tr9

169 //	214 //

170 // U+061C ARABIC LETTER MARK (%D8%9C)	215 // U+061C ARABIC LETTER MARK (%D8%9C)

171 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)	216 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)

172 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)	217 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)

173 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)	218 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)

174 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)	219 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)

175	220 //

176 unsigned char second_byte;	221 // However, escaping these characters in data: urls results in

177 // Check for ALM.	222 // escaped BiDi control characters being displayed in the rendered html,

178 if ((first_byte == 0xD8) &&	223 // so the parsing for data: urls is allowed to force unescaping of these

179 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&	224 // characters. DO NOT use BIDI_CONTROL_CHARS flag without talking to a

180 (second_byte == 0x9c)) {	225 // security person.

181 result.append(escaped_text, i, 6);	226 if (!(rules & UnescapeRule::BIDI_CONTROL_CHARS)) {

182 i += 5;	227 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {

183 continue;	228 result.append(escaped_text, i, 6);

184 }	229 i += 5;

185	230 continue;

186 // Check for other BiDi control characters.	231 }

187 if ((first_byte == 0xE2) &&	232 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {

188 UnescapeUnsignedCharAtIndex(escaped_text, i + 3, &second_byte) &&

189 ((second_byte == 0x80) \|\| (second_byte == 0x81))) {

190 unsigned char third_byte;

191 if (UnescapeUnsignedCharAtIndex(escaped_text, i + 6, &third_byte) &&

192 ((second_byte == 0x80) ?

193 ((third_byte == 0x8E) \|\| (third_byte == 0x8F) \|\|

194 ((third_byte >= 0xAA) && (third_byte <= 0xAE))) :

195 ((third_byte >= 0xA6) && (third_byte <= 0xA9)))) {

196 result.append(escaped_text, i, 9);	233 result.append(escaped_text, i, 9);

197 i += 8;	234 i += 8;

198 continue;	235 continue;

199 }	236 }

200 }	237 }

201	238

202 if (first_byte >= 0x80 \|\| // Unescape all high-bit characters.	239 if (first_byte >= 0x80 \|\| // Unescape all high-bit characters.

203 // For 7-bit characters, the lookup table tells us all valid chars.	240 // For 7-bit characters, the lookup table tells us all valid chars.

204 (kUrlUnescape[first_byte] \|\|	241 (kUrlUnescape[first_byte] \|\|

205 // ...and we allow some additional unescaping when flags are set.	242 // ...and we allow some additional unescaping when flags are set.

(...skipping 196 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
402 1, kEscapeToChars[i].replacement);	439 1, kEscapeToChars[i].replacement);

403 break;	440 break;

404 }	441 }

405 }	442 }

406 }	443 }

407 }	444 }

408 return text;	445 return text;

409 }	446 }

410	447

411 } // namespace net	448 } // namespace net

OLD	NEW

« no previous file with comments | « net/base/escape.h ('k') | net/base/escape_unittest.cc » ('j') | no next file with comments »