net/http/http_content_disposition.cc - Issue 11478034: Add UMA for measuring Content-Disposition header use and abuse.

Side by Side Diff: net/http/http_content_disposition.cc

Issue 11478034: Add UMA for measuring Content-Disposition header use and abuse. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/http/http_content_disposition.h"	5 #include "net/http/http_content_disposition.h"

6	6

7 #include "base/base64.h"	7 #include "base/base64.h"

8 #include "base/i18n/icu_string_conversions.h"	8 #include "base/i18n/icu_string_conversions.h"

9 #include "base/logging.h"	9 #include "base/logging.h"

10 #include "base/string_util.h"	10 #include "base/string_util.h"

(...skipping 79 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
90 ucnv_close(converter);	90 ucnv_close(converter);

91 if (U_FAILURE(err))	91 if (U_FAILURE(err))

92 return false;	92 return false;

93 output->resize(output_length);	93 output->resize(output_length);

94 return true;	94 return true;

95 }	95 }

96	96

97 bool DecodeWord(const std::string& encoded_word,	97 bool DecodeWord(const std::string& encoded_word,

98 const std::string& referrer_charset,	98 const std::string& referrer_charset,

99 bool* is_rfc2047,	99 bool* is_rfc2047,

100 std::string* output) {	100 std::string* output,

	101 HttpContentDisposition::ParseResult* parse_result) {

101 *is_rfc2047 = false;	102 *is_rfc2047 = false;

102 output->clear();	103 output->clear();

103 if (encoded_word.empty())	104 if (encoded_word.empty())

104 return true;	105 return true;

105	106

106 if (!IsStringASCII(encoded_word)) {	107 if (!IsStringASCII(encoded_word)) {

107 // Try UTF-8, referrer_charset and the native OS default charset in turn.	108 // Try UTF-8, referrer_charset and the native OS default charset in turn.

108 if (IsStringUTF8(encoded_word)) {	109 if (IsStringUTF8(encoded_word)) {

109 *output = encoded_word;	110 *output = encoded_word;

110 } else {	111 } else {

111 string16 utf16_output;	112 string16 utf16_output;

112 if (!referrer_charset.empty() &&	113 if (!referrer_charset.empty() &&

113 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),	114 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),

114 base::OnStringConversionError::FAIL,	115 base::OnStringConversionError::FAIL,

115 &utf16_output)) {	116 &utf16_output)) {

116 *output = UTF16ToUTF8(utf16_output);	117 *output = UTF16ToUTF8(utf16_output);

117 } else {	118 } else {

118 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));	119 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));

119 }	120 }

120 }	121 }

121	122

	123 parse_result->has_non_ascii_strings = true;

122 return true;	124 return true;

123 }	125 }

124	126

125 // RFC 2047 : one of encoding methods supported by Firefox and relatively	127 // RFC 2047 : one of encoding methods supported by Firefox and relatively

126 // widely used by web servers.	128 // widely used by web servers.

127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.	129 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.

128 // We don't care about the length restriction (72 bytes) because	130 // We don't care about the length restriction (72 bytes) because

129 // many web servers generate encoded words longer than the limit.	131 // many web servers generate encoded words longer than the limit.

130 std::string tmp;	132 std::string decoded_word;

131 *is_rfc2047 = true;	133 *is_rfc2047 = true;

132 int part_index = 0;	134 int part_index = 0;

133 std::string charset;	135 std::string charset;

134 StringTokenizer t(encoded_word, "?");	136 StringTokenizer t(encoded_word, "?");

135 RFC2047EncodingType enc_type = Q_ENCODING;	137 RFC2047EncodingType enc_type = Q_ENCODING;

136 while (*is_rfc2047 && t.GetNext()) {	138 while (*is_rfc2047 && t.GetNext()) {

137 std::string part = t.token();	139 std::string part = t.token();

138 switch (part_index) {	140 switch (part_index) {

139 case 0:	141 case 0:

140 if (part != "=") {	142 if (part != "=") {

(...skipping 12 matching lines...) Expand all Loading...
153 part.find_first_of("bBqQ") == std::string::npos) {	155 part.find_first_of("bBqQ") == std::string::npos) {

154 *is_rfc2047 = false;	156 *is_rfc2047 = false;

155 break;	157 break;

156 }	158 }

157 if (part[0] == 'b' \|\| part[0] == 'B') {	159 if (part[0] == 'b' \|\| part[0] == 'B') {

158 enc_type = B_ENCODING;	160 enc_type = B_ENCODING;

159 }	161 }

160 ++part_index;	162 ++part_index;

161 break;	163 break;

162 case 3:	164 case 3:

163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);	165 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);

164 if (!*is_rfc2047) {	166 if (!*is_rfc2047) {

165 // Last minute failure. Invalid B/Q encoding. Rather than	167 // Last minute failure. Invalid B/Q encoding. Rather than

166 // passing it through, return now.	168 // passing it through, return now.

167 return false;	169 return false;

168 }	170 }

169 ++part_index;	171 ++part_index;

170 break;	172 break;

171 case 4:	173 case 4:

172 if (part != "=") {	174 if (part != "=") {

173 // Another last minute failure !	175 // Another last minute failure !

174 // Likely to be a case of two encoded-words in a row or	176 // Likely to be a case of two encoded-words in a row or

175 // an encoded word followed by a non-encoded word. We can be	177 // an encoded word followed by a non-encoded word. We can be

176 // generous, but it does not help much in terms of compatibility,	178 // generous, but it does not help much in terms of compatibility,

177 // I believe. Return immediately.	179 // I believe. Return immediately.

178 *is_rfc2047 = false;	180 *is_rfc2047 = false;

179 return false;	181 return false;

180 }	182 }

181 ++part_index;	183 ++part_index;

182 break;	184 break;

183 default:	185 default:

184 *is_rfc2047 = false;	186 *is_rfc2047 = false;

185 return false;	187 return false;

186 }	188 }

187 }	189 }

188	190

189 if (*is_rfc2047) {	191 if (*is_rfc2047) {

190 if (*(encoded_word.end() - 1) == '=') {	192 if (*(encoded_word.end() - 1) == '=') {

191 output->swap(tmp);	193 output->swap(decoded_word);

	194 parse_result->has_rfc2047_encoded_strings = true;

192 return true;	195 return true;

193 }	196 }

194 // encoded_word ending prematurely with '?' or extra '?'	197 // encoded_word ending prematurely with '?' or extra '?'

195 *is_rfc2047 = false;	198 *is_rfc2047 = false;

196 return false;	199 return false;

197 }	200 }

198	201

199 // We're not handling 'especial' characters quoted with '\', but	202 // We're not handling 'especial' characters quoted with '\', but

200 // it should be Ok because we're not an email client but a	203 // it should be Ok because we're not an email client but a

201 // web browser.	204 // web browser.

202	205

203 // What IE6/7 does: %-escaped UTF-8.	206 // What IE6/7 does: %-escaped UTF-8.

204 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);	207 decoded_word = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);

205 if (IsStringUTF8(tmp)) {	208 if (decoded_word != encoded_word)

206 output->swap(tmp);	209 parse_result->has_percent_encoded_strings = true;

	210 if (IsStringUTF8(decoded_word)) {

	211 output->swap(decoded_word);

207 return true;	212 return true;

208 // We can try either the OS default charset or 'origin charset' here,	213 // We can try either the OS default charset or 'origin charset' here,

209 // As far as I can tell, IE does not support it. However, I've seen	214 // As far as I can tell, IE does not support it. However, I've seen

210 // web servers emit %-escaped string in a legacy encoding (usually	215 // web servers emit %-escaped string in a legacy encoding (usually

211 // origin charset).	216 // origin charset).

212 // TODO(jungshik) : Test IE further and consider adding a fallback here.	217 // TODO(jungshik) : Test IE further and consider adding a fallback here.

213 }	218 }

214 return false;	219 return false;

215 }	220 }

216	221

217 // Decodes the value of a 'filename' or 'name' parameter given as \|input\|. The	222 // Decodes the value of a 'filename' or 'name' parameter given as \|input\|. The

218 // value is supposed to be of the form:	223 // value is supposed to be of the form:

219 //	224 //

220 // value = token \| quoted-string	225 // value = token \| quoted-string

221 //	226 //

222 // However we currently also allow RFC 2047 encoding and non-ASCII	227 // However we currently also allow RFC 2047 encoding and non-ASCII

223 // strings. Non-ASCII strings are interpreted based on \|referrer_charset\|.	228 // strings. Non-ASCII strings are interpreted based on \|referrer_charset\|.

224 bool DecodeFilenameValue(const std::string& input,	229 bool DecodeFilenameValue(const std::string& input,

225 const std::string& referrer_charset,	230 const std::string& referrer_charset,

226 std::string* output) {	231 std::string* output,

227 std::string tmp;	232 HttpContentDisposition::ParseResult* parse_result) {

	233 HttpContentDisposition::ParseResult current_parse_result;

	234 std::string decoded_value;

	235 bool is_previous_token_rfc2047 = true;

	236

228 // Tokenize with whitespace characters.	237 // Tokenize with whitespace characters.

229 StringTokenizer t(input, " \t\n\r");	238 StringTokenizer t(input, " \t\n\r");

230 t.set_options(StringTokenizer::RETURN_DELIMS);	239 t.set_options(StringTokenizer::RETURN_DELIMS);

231 bool is_previous_token_rfc2047 = true;

232 while (t.GetNext()) {	240 while (t.GetNext()) {

233 if (t.token_is_delim()) {	241 if (t.token_is_delim()) {

234 // If the previous non-delimiter token is not RFC2047-encoded,	242 // If the previous non-delimiter token is not RFC2047-encoded,

235 // put in a space in its place. Otherwise, skip over it.	243 // put in a space in its place. Otherwise, skip over it.

236 if (!is_previous_token_rfc2047) {	244 if (!is_previous_token_rfc2047)

237 tmp.push_back(' ');	245 decoded_value.push_back(' ');

238 }

239 continue;	246 continue;

240 }	247 }

241 // We don't support a single multibyte character split into	248 // We don't support a single multibyte character split into

242 // adjacent encoded words. Some broken mail clients emit headers	249 // adjacent encoded words. Some broken mail clients emit headers

243 // with that problem, but most web servers usually encode a filename	250 // with that problem, but most web servers usually encode a filename

244 // in a single encoded-word. Firefox/Thunderbird do not support	251 // in a single encoded-word. Firefox/Thunderbird do not support

245 // it, either.	252 // it, either.

246 std::string decoded;	253 std::string decoded;

247 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,	254 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,

248 &decoded))	255 &decoded, &current_parse_result))

249 return false;	256 return false;

250 tmp.append(decoded);	257 decoded_value.append(decoded);

251 }	258 }

252 output->swap(tmp);	259 output->swap(decoded_value);

	260 if (parse_result) {

	261 parse_result->has_non_ascii_strings =

	262 current_parse_result.has_non_ascii_strings;

	263 parse_result->has_percent_encoded_strings =

	264 current_parse_result.has_percent_encoded_strings;

	265 parse_result->has_rfc2047_encoded_strings =

	266 current_parse_result.has_rfc2047_encoded_strings;

	267 }

253 return true;	268 return true;

254 }	269 }

255	270

256 // Parses the charset and value-chars out of an ext-value string.	271 // Parses the charset and value-chars out of an ext-value string.

257 //	272 //

258 // ext-value = charset "'" [ language ] "'" value-chars	273 // ext-value = charset "'" [ language ] "'" value-chars

259 bool ParseExtValueComponents(const std::string& input,	274 bool ParseExtValueComponents(const std::string& input,

260 std::string* charset,	275 std::string* charset,

261 std::string* value_chars) {	276 std::string* value_chars) {

262 StringTokenizer t(input, "'");	277 StringTokenizer t(input, "'");

(...skipping 67 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
330 }	345 }

331	346

332 std::string unescaped = UnescapeURLComponent(value,	347 std::string unescaped = UnescapeURLComponent(value,

333 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS);	348 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS);

334	349

335 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);	350 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);

336 }	351 }

337	352

338 } // namespace	353 } // namespace

339	354

	355 HttpContentDisposition::ParseResult::ParseResult()

	356 : has_disposition_type(false),

	357 has_unknown_disposition_type(false),

	358 has_name(false),

	359 has_filename(false),

	360 has_ext_filename(false),

	361 has_non_ascii_strings(false),

	362 has_percent_encoded_strings(false),

	363 has_rfc2047_encoded_strings(false) {

	364 }

	365

340 HttpContentDisposition::HttpContentDisposition(	366 HttpContentDisposition::HttpContentDisposition(

341 const std::string& header, const std::string& referrer_charset)	367 const std::string& header, const std::string& referrer_charset)

342 : type_(INLINE) {	368 : type_(INLINE) {

343 Parse(header, referrer_charset);	369 Parse(header, referrer_charset);

344 }	370 }

345	371

346 HttpContentDisposition::~HttpContentDisposition() {	372 HttpContentDisposition::~HttpContentDisposition() {

347 }	373 }

348	374

349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(	375 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(

350 std::string::const_iterator begin, std::string::const_iterator end) {	376 std::string::const_iterator begin, std::string::const_iterator end) {

351 DCHECK(type_ == INLINE);	377 DCHECK(type_ == INLINE);

352 std::string::const_iterator delimiter = std::find(begin, end, ';');	378 std::string::const_iterator delimiter = std::find(begin, end, ';');

353	379

354 std::string::const_iterator type_begin = begin;	380 std::string::const_iterator type_begin = begin;

355 std::string::const_iterator type_end = delimiter;	381 std::string::const_iterator type_end = delimiter;

356 HttpUtil::TrimLWS(&type_begin, &type_end);	382 HttpUtil::TrimLWS(&type_begin, &type_end);

357	383

358 // If the disposition-type isn't a valid token then the	384 // If the disposition-type isn't a valid token then the

359 // Content-Disposition header is malformed, and we treat the first bytes as	385 // Content-Disposition header is malformed, and we treat the first bytes as

360 // a parameter rather than a disposition-type.	386 // a parameter rather than a disposition-type.

361 if (!HttpUtil::IsToken(type_begin, type_end))	387 if (!HttpUtil::IsToken(type_begin, type_end))

362 return begin;	388 return begin;

363	389

	390 parse_result_.has_disposition_type = true;

	391

364 DCHECK(std::find(type_begin, type_end, '=') == type_end);	392 DCHECK(std::find(type_begin, type_end, '=') == type_end);

365	393

366 if (!LowerCaseEqualsASCII(type_begin, type_end, "inline"))	394 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {

	395 type_ = INLINE;

	396 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {

367 type_ = ATTACHMENT;	397 type_ = ATTACHMENT;

	398 } else {

	399 parse_result_.has_unknown_disposition_type = true;

	400 type_ = ATTACHMENT;

	401 }

368 return delimiter;	402 return delimiter;

369 }	403 }

370	404

371 // http://tools.ietf.org/html/rfc6266	405 // http://tools.ietf.org/html/rfc6266

372 //	406 //

373 // content-disposition = "Content-Disposition" ":"	407 // content-disposition = "Content-Disposition" ":"

374 // disposition-type *( ";" disposition-parm )	408 // disposition-type *( ";" disposition-parm )

375 //	409 //

376 // disposition-type = "inline" \| "attachment" \| disp-ext-type	410 // disposition-type = "inline" \| "attachment" \| disp-ext-type

377 // ; case-insensitive	411 // ; case-insensitive

(...skipping 19 matching lines...) Expand all Loading...
397	431

398 std::string name;	432 std::string name;

399 std::string filename;	433 std::string filename;

400 std::string ext_filename;	434 std::string ext_filename;

401	435

402 HttpUtil::NameValuePairsIterator iter(pos, end, ';');	436 HttpUtil::NameValuePairsIterator iter(pos, end, ';');

403 while (iter.GetNext()) {	437 while (iter.GetNext()) {

404 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),	438 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),

405 iter.name_end(),	439 iter.name_end(),

406 "filename")) {	440 "filename")) {

407 DecodeFilenameValue(iter.value(), referrer_charset, &filename);	441 parse_result_.has_filename =

	442 DecodeFilenameValue(iter.value(), referrer_charset, &filename,

	443 &parse_result_);

408 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),	444 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),

409 iter.name_end(),	445 iter.name_end(),

410 "name")) {	446 "name")) {

411 DecodeFilenameValue(iter.value(), referrer_charset, &name);	447 parse_result_.has_name =

	448 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);

412 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),	449 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),

413 iter.name_end(),	450 iter.name_end(),

414 "filename*")) {	451 "filename*")) {

415 DecodeExtValue(iter.raw_value(), &ext_filename);	452 parse_result_.has_ext_filename =

	453 DecodeExtValue(iter.raw_value(), &ext_filename);

416 }	454 }

417 }	455 }

418	456

419 if (!ext_filename.empty())	457 if (!ext_filename.empty())

420 filename_ = ext_filename;	458 filename_ = ext_filename;

421 else if (!filename.empty())	459 else if (!filename.empty())

422 filename_ = filename;	460 filename_ = filename;

423 else	461 else

424 filename_ = name;	462 filename_ = name;

425 }	463 }

426	464

427 } // namespace net	465 } // namespace net

OLD	NEW

« content/browser/download/download_stats.cc ('K') | « net/http/http_content_disposition.h ('k') | net/http/http_content_disposition_unittest.cc » ('j') | no next file with comments »