net/http/http_content_disposition.cc - Issue 11478034: Add UMA for measuring Content-Disposition header use and abuse.

Side by Side Diff: net/http/http_content_disposition.cc

Issue 11478034: Add UMA for measuring Content-Disposition header use and abuse. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Only measure valid C-D headers Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/http/http_content_disposition.h"	5 #include "net/http/http_content_disposition.h"

6	6

7 #include "base/base64.h"	7 #include "base/base64.h"

8 #include "base/i18n/icu_string_conversions.h"	8 #include "base/i18n/icu_string_conversions.h"

9 #include "base/logging.h"	9 #include "base/logging.h"

10 #include "base/string_util.h"	10 #include "base/string_util.h"

(...skipping 77 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
88 ucnv_close(converter);	88 ucnv_close(converter);

89 if (U_FAILURE(err))	89 if (U_FAILURE(err))

90 return false;	90 return false;

91 output->resize(output_length);	91 output->resize(output_length);

92 return true;	92 return true;

93 }	93 }

94	94

95 bool DecodeWord(const std::string& encoded_word,	95 bool DecodeWord(const std::string& encoded_word,

96 const std::string& referrer_charset,	96 const std::string& referrer_charset,

97 bool* is_rfc2047,	97 bool* is_rfc2047,

98 std::string* output) {	98 std::string* output,

	99 net::HttpContentDisposition::ParseResult* parse_result) {

99 *is_rfc2047 = false;	100 *is_rfc2047 = false;

100 output->clear();	101 output->clear();

101 if (encoded_word.empty())	102 if (encoded_word.empty())

102 return true;	103 return true;

103	104

104 if (!IsStringASCII(encoded_word)) {	105 if (!IsStringASCII(encoded_word)) {

105 // Try UTF-8, referrer_charset and the native OS default charset in turn.	106 // Try UTF-8, referrer_charset and the native OS default charset in turn.

106 if (IsStringUTF8(encoded_word)) {	107 if (IsStringUTF8(encoded_word)) {

107 *output = encoded_word;	108 *output = encoded_word;

108 } else {	109 } else {

109 string16 utf16_output;	110 string16 utf16_output;

110 if (!referrer_charset.empty() &&	111 if (!referrer_charset.empty() &&

111 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),	112 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),

112 base::OnStringConversionError::FAIL,	113 base::OnStringConversionError::FAIL,

113 &utf16_output)) {	114 &utf16_output)) {

114 *output = UTF16ToUTF8(utf16_output);	115 *output = UTF16ToUTF8(utf16_output);

115 } else {	116 } else {

116 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));	117 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));

117 }	118 }

118 }	119 }

119	120

	121 parse_result->has_non_ascii_strings = true;

120 return true;	122 return true;

121 }	123 }

122	124

123 // RFC 2047 : one of encoding methods supported by Firefox and relatively	125 // RFC 2047 : one of encoding methods supported by Firefox and relatively

124 // widely used by web servers.	126 // widely used by web servers.

125 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.	127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.

126 // We don't care about the length restriction (72 bytes) because	128 // We don't care about the length restriction (72 bytes) because

127 // many web servers generate encoded words longer than the limit.	129 // many web servers generate encoded words longer than the limit.

128 std::string tmp;	130 std::string decoded_word;

129 *is_rfc2047 = true;	131 *is_rfc2047 = true;

130 int part_index = 0;	132 int part_index = 0;

131 std::string charset;	133 std::string charset;

132 StringTokenizer t(encoded_word, "?");	134 StringTokenizer t(encoded_word, "?");

133 RFC2047EncodingType enc_type = Q_ENCODING;	135 RFC2047EncodingType enc_type = Q_ENCODING;

134 while (*is_rfc2047 && t.GetNext()) {	136 while (*is_rfc2047 && t.GetNext()) {

135 std::string part = t.token();	137 std::string part = t.token();

136 switch (part_index) {	138 switch (part_index) {

137 case 0:	139 case 0:

138 if (part != "=") {	140 if (part != "=") {

(...skipping 12 matching lines...) Expand all Loading...
151 part.find_first_of("bBqQ") == std::string::npos) {	153 part.find_first_of("bBqQ") == std::string::npos) {

152 *is_rfc2047 = false;	154 *is_rfc2047 = false;

153 break;	155 break;

154 }	156 }

155 if (part[0] == 'b' || part[0] == 'B') {	157 if (part[0] == 'b' || part[0] == 'B') {

156 enc_type = B_ENCODING;	158 enc_type = B_ENCODING;

157 }	159 }

158 ++part_index;	160 ++part_index;

159 break;	161 break;

160 case 3:	162 case 3:

161 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);	163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);

162 if (!*is_rfc2047) {	164 if (!*is_rfc2047) {

163 // Last minute failure. Invalid B/Q encoding. Rather than	165 // Last minute failure. Invalid B/Q encoding. Rather than

164 // passing it through, return now.	166 // passing it through, return now.

165 return false;	167 return false;

166 }	168 }

167 ++part_index;	169 ++part_index;

168 break;	170 break;

169 case 4:	171 case 4:

170 if (part != "=") {	172 if (part != "=") {

171 // Another last minute failure !	173 // Another last minute failure !

172 // Likely to be a case of two encoded-words in a row or	174 // Likely to be a case of two encoded-words in a row or

173 // an encoded word followed by a non-encoded word. We can be	175 // an encoded word followed by a non-encoded word. We can be

174 // generous, but it does not help much in terms of compatibility,	176 // generous, but it does not help much in terms of compatibility,

175 // I believe. Return immediately.	177 // I believe. Return immediately.

176 *is_rfc2047 = false;	178 *is_rfc2047 = false;

177 return false;	179 return false;

178 }	180 }

179 ++part_index;	181 ++part_index;

180 break;	182 break;

181 default:	183 default:

182 *is_rfc2047 = false;	184 *is_rfc2047 = false;

183 return false;	185 return false;

184 }	186 }

185 }	187 }

186	188

187 if (*is_rfc2047) {	189 if (*is_rfc2047) {

188 if (*(encoded_word.end() - 1) == '=') {	190 if (*(encoded_word.end() - 1) == '=') {

189 output->swap(tmp);	191 output->swap(decoded_word);

	192 parse_result->has_rfc2047_encoded_strings = true;

190 return true;	193 return true;

191 }	194 }

192 // encoded_word ending prematurely with '?' or extra '?'	195 // encoded_word ending prematurely with '?' or extra '?'

193 *is_rfc2047 = false;	196 *is_rfc2047 = false;

194 return false;	197 return false;

195 }	198 }

196	199

197 // We're not handling 'especial' characters quoted with '\', but	200 // We're not handling 'especial' characters quoted with '\', but

198 // it should be Ok because we're not an email client but a	201 // it should be Ok because we're not an email client but a

199 // web browser.	202 // web browser.

200	203

201 // What IE6/7 does: %-escaped UTF-8.	204 // What IE6/7 does: %-escaped UTF-8.

202 tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES);	205 decoded_word = net::UnescapeURLComponent(encoded_word,

203 if (IsStringUTF8(tmp)) {	206 net::UnescapeRule::SPACES);

204 output->swap(tmp);	207 if (decoded_word != encoded_word)

	208 parse_result->has_percent_encoded_strings = true;

	209 if (IsStringUTF8(decoded_word)) {

	210 output->swap(decoded_word);

205 return true;	211 return true;

206 // We can try either the OS default charset or 'origin charset' here,	212 // We can try either the OS default charset or 'origin charset' here,

207 // As far as I can tell, IE does not support it. However, I've seen	213 // As far as I can tell, IE does not support it. However, I've seen

208 // web servers emit %-escaped string in a legacy encoding (usually	214 // web servers emit %-escaped string in a legacy encoding (usually

209 // origin charset).	215 // origin charset).

210 // TODO(jungshik) : Test IE further and consider adding a fallback here.	216 // TODO(jungshik) : Test IE further and consider adding a fallback here.

211 }	217 }

212 return false;	218 return false;

213 }	219 }

214	220

215 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The	221 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The

216 // value is supposed to be of the form:	222 // value is supposed to be of the form:

217 //	223 //

218 // value = token | quoted-string	224 // value = token | quoted-string

219 //	225 //

220 // However we currently also allow RFC 2047 encoding and non-ASCII	226 // However we currently also allow RFC 2047 encoding and non-ASCII

221 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.	227 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.

222 bool DecodeFilenameValue(const std::string& input,	228 bool DecodeFilenameValue(

223 const std::string& referrer_charset,	229 const std::string& input,

224 std::string* output) {	230 const std::string& referrer_charset,

225 std::string tmp;	231 std::string* output,

	232 net::HttpContentDisposition::ParseResult* parse_result) {

	233 net::HttpContentDisposition::ParseResult current_parse_result;

	234 std::string decoded_value;

	235 bool is_previous_token_rfc2047 = true;

	236

226 // Tokenize with whitespace characters.	237 // Tokenize with whitespace characters.

227 StringTokenizer t(input, " \t\n\r");	238 StringTokenizer t(input, " \t\n\r");

228 t.set_options(StringTokenizer::RETURN_DELIMS);	239 t.set_options(StringTokenizer::RETURN_DELIMS);

229 bool is_previous_token_rfc2047 = true;

230 while (t.GetNext()) {	240 while (t.GetNext()) {

231 if (t.token_is_delim()) {	241 if (t.token_is_delim()) {

232 // If the previous non-delimiter token is not RFC2047-encoded,	242 // If the previous non-delimiter token is not RFC2047-encoded,

233 // put in a space in its place. Otherwise, skip over it.	243 // put in a space in its place. Otherwise, skip over it.

234 if (!is_previous_token_rfc2047) {	244 if (!is_previous_token_rfc2047)

235 tmp.push_back(' ');	245 decoded_value.push_back(' ');

236 }

237 continue;	246 continue;

238 }	247 }

239 // We don't support a single multibyte character split into	248 // We don't support a single multibyte character split into

240 // adjacent encoded words. Some broken mail clients emit headers	249 // adjacent encoded words. Some broken mail clients emit headers

241 // with that problem, but most web servers usually encode a filename	250 // with that problem, but most web servers usually encode a filename

242 // in a single encoded-word. Firefox/Thunderbird do not support	251 // in a single encoded-word. Firefox/Thunderbird do not support

243 // it, either.	252 // it, either.

244 std::string decoded;	253 std::string decoded;

245 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,	254 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,

246 &decoded))	255 &decoded, &current_parse_result))

247 return false;	256 return false;

248 tmp.append(decoded);	257 decoded_value.append(decoded);

249 }	258 }

250 output->swap(tmp);	259 output->swap(decoded_value);

	260 if (parse_result && !output->empty()) {

	261 parse_result->has_non_ascii_strings =

	262 current_parse_result.has_non_ascii_strings;

	263 parse_result->has_percent_encoded_strings =

	264 current_parse_result.has_percent_encoded_strings;

	265 parse_result->has_rfc2047_encoded_strings =

	266 current_parse_result.has_rfc2047_encoded_strings;

	267 }

251 return true;	268 return true;

252 }	269 }

253	270

254 // Parses the charset and value-chars out of an ext-value string.	271 // Parses the charset and value-chars out of an ext-value string.

255 //	272 //

256 // ext-value = charset "'" [ language ] "'" value-chars	273 // ext-value = charset "'" [ language ] "'" value-chars

257 bool ParseExtValueComponents(const std::string& input,	274 bool ParseExtValueComponents(const std::string& input,

258 std::string* charset,	275 std::string* charset,

259 std::string* value_chars) {	276 std::string* value_chars) {

260 StringTokenizer t(input, "'");	277 StringTokenizer t(input, "'");

(...skipping 69 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
330 std::string unescaped = net::UnescapeURLComponent(	347 std::string unescaped = net::UnescapeURLComponent(

331 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);	348 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);

332	349

333 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);	350 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);

334 }	351 }

335	352

336 } // namespace	353 } // namespace

337	354

338 namespace net {	355 namespace net {

339	356

	357 HttpContentDisposition::ParseResult::ParseResult()

	358 : has_disposition_type(false),

	359 has_unknown_disposition_type(false),

	360 has_name(false),

	361 has_filename(false),

	362 has_ext_filename(false),

	363 has_non_ascii_strings(false),

	364 has_percent_encoded_strings(false),

	365 has_rfc2047_encoded_strings(false) {

	366 }

	367

340 HttpContentDisposition::HttpContentDisposition(	368 HttpContentDisposition::HttpContentDisposition(

341 const std::string& header, const std::string& referrer_charset)	369 const std::string& header, const std::string& referrer_charset)

342 : type_(INLINE) {	370 : type_(INLINE) {

343 Parse(header, referrer_charset);	371 Parse(header, referrer_charset);

344 }	372 }

345	373

346 HttpContentDisposition::~HttpContentDisposition() {	374 HttpContentDisposition::~HttpContentDisposition() {

347 }	375 }

348	376

349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(	377 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(

350 std::string::const_iterator begin, std::string::const_iterator end) {	378 std::string::const_iterator begin, std::string::const_iterator end) {

351 DCHECK(type_ == INLINE);	379 DCHECK(type_ == INLINE);

352 std::string::const_iterator delimiter = std::find(begin, end, ';');	380 std::string::const_iterator delimiter = std::find(begin, end, ';');

353	381

354 std::string::const_iterator type_begin = begin;	382 std::string::const_iterator type_begin = begin;

355 std::string::const_iterator type_end = delimiter;	383 std::string::const_iterator type_end = delimiter;

356 HttpUtil::TrimLWS(&type_begin, &type_end);	384 HttpUtil::TrimLWS(&type_begin, &type_end);

357	385

358 // If the disposition-type isn't a valid token then the	386 // If the disposition-type isn't a valid token then the

359 // Content-Disposition header is malformed, and we treat the first bytes as	387 // Content-Disposition header is malformed, and we treat the first bytes as

360 // a parameter rather than a disposition-type.	388 // a parameter rather than a disposition-type.

361 if (!HttpUtil::IsToken(type_begin, type_end))	389 if (!HttpUtil::IsToken(type_begin, type_end))

362 return begin;	390 return begin;

363	391

	392 parse_result_.has_disposition_type = true;

	393

364 DCHECK(std::find(type_begin, type_end, '=') == type_end);	394 DCHECK(std::find(type_begin, type_end, '=') == type_end);

365	395

366 if (!LowerCaseEqualsASCII(type_begin, type_end, "inline"))	396 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {

	397 type_ = INLINE;

	398 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {

367 type_ = ATTACHMENT;	399 type_ = ATTACHMENT;

	400 } else {

	401 parse_result_.has_unknown_disposition_type = true;

	402 type_ = ATTACHMENT;

	403 }

368 return delimiter;	404 return delimiter;

369 }	405 }

370	406

371 // http://tools.ietf.org/html/rfc6266	407 // http://tools.ietf.org/html/rfc6266

372 //	408 //

373 // content-disposition = "Content-Disposition" ":"	409 // content-disposition = "Content-Disposition" ":"

374 // disposition-type *( ";" disposition-parm )	410 // disposition-type *( ";" disposition-parm )

375 //	411 //

376 // disposition-type = "inline" | "attachment" | disp-ext-type	412 // disposition-type = "inline" | "attachment" | disp-ext-type

377 // ; case-insensitive	413 // ; case-insensitive

(...skipping 19 matching lines...) Expand all Loading...
397	433

398 std::string name;	434 std::string name;

399 std::string filename;	435 std::string filename;

400 std::string ext_filename;	436 std::string ext_filename;

401	437

402 HttpUtil::NameValuePairsIterator iter(pos, end, ';');	438 HttpUtil::NameValuePairsIterator iter(pos, end, ';');

403 while (iter.GetNext()) {	439 while (iter.GetNext()) {

404 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),	440 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),

405 iter.name_end(),	441 iter.name_end(),

406 "filename")) {	442 "filename")) {

407 DecodeFilenameValue(iter.value(), referrer_charset, &filename);	443 DecodeFilenameValue(iter.value(), referrer_charset, &filename,

	444 &parse_result_);

	445 parse_result_.has_filename = !filename.empty();

408 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),	446 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),

409 iter.name_end(),	447 iter.name_end(),

410 "name")) {	448 "name")) {

411 DecodeFilenameValue(iter.value(), referrer_charset, &name);	449 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);

	450 parse_result_.has_name = !name.empty();

412 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),	451 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),

413 iter.name_end(),	452 iter.name_end(),

414 "filename*")) {	453 "filename*")) {

415 DecodeExtValue(iter.raw_value(), &ext_filename);	454 DecodeExtValue(iter.raw_value(), &ext_filename);

	455 parse_result_.has_ext_filename = !ext_filename.empty();

416 }	456 }

417 }	457 }

418	458

419 if (!ext_filename.empty())	459 if (!ext_filename.empty())

420 filename_ = ext_filename;	460 filename_ = ext_filename;

421 else if (!filename.empty())	461 else if (!filename.empty())

422 filename_ = filename;	462 filename_ = filename;

423 else	463 else

424 filename_ = name;	464 filename_ = name;

425 }	465 }

426	466

427 } // namespace net	467 } // namespace net

OLD	NEW