Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(949)

Side by Side Diff: net/http/http_content_disposition.cc

Issue 11478034: Add UMA for measuring Content-Disposition header use and abuse. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Only measure valid C-D headers Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "net/http/http_content_disposition.h" 5 #include "net/http/http_content_disposition.h"
6 6
7 #include "base/base64.h" 7 #include "base/base64.h"
8 #include "base/i18n/icu_string_conversions.h" 8 #include "base/i18n/icu_string_conversions.h"
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/string_util.h" 10 #include "base/string_util.h"
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
88 ucnv_close(converter); 88 ucnv_close(converter);
89 if (U_FAILURE(err)) 89 if (U_FAILURE(err))
90 return false; 90 return false;
91 output->resize(output_length); 91 output->resize(output_length);
92 return true; 92 return true;
93 } 93 }
94 94
95 bool DecodeWord(const std::string& encoded_word, 95 bool DecodeWord(const std::string& encoded_word,
96 const std::string& referrer_charset, 96 const std::string& referrer_charset,
97 bool* is_rfc2047, 97 bool* is_rfc2047,
98 std::string* output) { 98 std::string* output,
99 net::HttpContentDisposition::ParseResult* parse_result) {
99 *is_rfc2047 = false; 100 *is_rfc2047 = false;
100 output->clear(); 101 output->clear();
101 if (encoded_word.empty()) 102 if (encoded_word.empty())
102 return true; 103 return true;
103 104
104 if (!IsStringASCII(encoded_word)) { 105 if (!IsStringASCII(encoded_word)) {
105 // Try UTF-8, referrer_charset and the native OS default charset in turn. 106 // Try UTF-8, referrer_charset and the native OS default charset in turn.
106 if (IsStringUTF8(encoded_word)) { 107 if (IsStringUTF8(encoded_word)) {
107 *output = encoded_word; 108 *output = encoded_word;
108 } else { 109 } else {
109 string16 utf16_output; 110 string16 utf16_output;
110 if (!referrer_charset.empty() && 111 if (!referrer_charset.empty() &&
111 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), 112 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
112 base::OnStringConversionError::FAIL, 113 base::OnStringConversionError::FAIL,
113 &utf16_output)) { 114 &utf16_output)) {
114 *output = UTF16ToUTF8(utf16_output); 115 *output = UTF16ToUTF8(utf16_output);
115 } else { 116 } else {
116 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); 117 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
117 } 118 }
118 } 119 }
119 120
121 parse_result->has_non_ascii_strings = true;
120 return true; 122 return true;
121 } 123 }
122 124
123 // RFC 2047 : one of encoding methods supported by Firefox and relatively 125 // RFC 2047 : one of encoding methods supported by Firefox and relatively
124 // widely used by web servers. 126 // widely used by web servers.
125 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. 127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
126 // We don't care about the length restriction (72 bytes) because 128 // We don't care about the length restriction (72 bytes) because
127 // many web servers generate encoded words longer than the limit. 129 // many web servers generate encoded words longer than the limit.
128 std::string tmp; 130 std::string decoded_word;
129 *is_rfc2047 = true; 131 *is_rfc2047 = true;
130 int part_index = 0; 132 int part_index = 0;
131 std::string charset; 133 std::string charset;
132 StringTokenizer t(encoded_word, "?"); 134 StringTokenizer t(encoded_word, "?");
133 RFC2047EncodingType enc_type = Q_ENCODING; 135 RFC2047EncodingType enc_type = Q_ENCODING;
134 while (*is_rfc2047 && t.GetNext()) { 136 while (*is_rfc2047 && t.GetNext()) {
135 std::string part = t.token(); 137 std::string part = t.token();
136 switch (part_index) { 138 switch (part_index) {
137 case 0: 139 case 0:
138 if (part != "=") { 140 if (part != "=") {
(...skipping 12 matching lines...) Expand all
151 part.find_first_of("bBqQ") == std::string::npos) { 153 part.find_first_of("bBqQ") == std::string::npos) {
152 *is_rfc2047 = false; 154 *is_rfc2047 = false;
153 break; 155 break;
154 } 156 }
155 if (part[0] == 'b' || part[0] == 'B') { 157 if (part[0] == 'b' || part[0] == 'B') {
156 enc_type = B_ENCODING; 158 enc_type = B_ENCODING;
157 } 159 }
158 ++part_index; 160 ++part_index;
159 break; 161 break;
160 case 3: 162 case 3:
161 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); 163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
162 if (!*is_rfc2047) { 164 if (!*is_rfc2047) {
163 // Last minute failure. Invalid B/Q encoding. Rather than 165 // Last minute failure. Invalid B/Q encoding. Rather than
164 // passing it through, return now. 166 // passing it through, return now.
165 return false; 167 return false;
166 } 168 }
167 ++part_index; 169 ++part_index;
168 break; 170 break;
169 case 4: 171 case 4:
170 if (part != "=") { 172 if (part != "=") {
171 // Another last minute failure ! 173 // Another last minute failure !
172 // Likely to be a case of two encoded-words in a row or 174 // Likely to be a case of two encoded-words in a row or
173 // an encoded word followed by a non-encoded word. We can be 175 // an encoded word followed by a non-encoded word. We can be
174 // generous, but it does not help much in terms of compatibility, 176 // generous, but it does not help much in terms of compatibility,
175 // I believe. Return immediately. 177 // I believe. Return immediately.
176 *is_rfc2047 = false; 178 *is_rfc2047 = false;
177 return false; 179 return false;
178 } 180 }
179 ++part_index; 181 ++part_index;
180 break; 182 break;
181 default: 183 default:
182 *is_rfc2047 = false; 184 *is_rfc2047 = false;
183 return false; 185 return false;
184 } 186 }
185 } 187 }
186 188
187 if (*is_rfc2047) { 189 if (*is_rfc2047) {
188 if (*(encoded_word.end() - 1) == '=') { 190 if (*(encoded_word.end() - 1) == '=') {
189 output->swap(tmp); 191 output->swap(decoded_word);
192 parse_result->has_rfc2047_encoded_strings = true;
190 return true; 193 return true;
191 } 194 }
192 // encoded_word ending prematurely with '?' or extra '?' 195 // encoded_word ending prematurely with '?' or extra '?'
193 *is_rfc2047 = false; 196 *is_rfc2047 = false;
194 return false; 197 return false;
195 } 198 }
196 199
197 // We're not handling 'especial' characters quoted with '\', but 200 // We're not handling 'especial' characters quoted with '\', but
198 // it should be Ok because we're not an email client but a 201 // it should be Ok because we're not an email client but a
199 // web browser. 202 // web browser.
200 203
201 // What IE6/7 does: %-escaped UTF-8. 204 // What IE6/7 does: %-escaped UTF-8.
202 tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES); 205 decoded_word = net::UnescapeURLComponent(encoded_word,
203 if (IsStringUTF8(tmp)) { 206 net::UnescapeRule::SPACES);
204 output->swap(tmp); 207 if (decoded_word != encoded_word)
208 parse_result->has_percent_encoded_strings = true;
209 if (IsStringUTF8(decoded_word)) {
210 output->swap(decoded_word);
205 return true; 211 return true;
206 // We can try either the OS default charset or 'origin charset' here, 212 // We can try either the OS default charset or 'origin charset' here,
207 // As far as I can tell, IE does not support it. However, I've seen 213 // As far as I can tell, IE does not support it. However, I've seen
208 // web servers emit %-escaped string in a legacy encoding (usually 214 // web servers emit %-escaped string in a legacy encoding (usually
209 // origin charset). 215 // origin charset).
210 // TODO(jungshik) : Test IE further and consider adding a fallback here. 216 // TODO(jungshik) : Test IE further and consider adding a fallback here.
211 } 217 }
212 return false; 218 return false;
213 } 219 }
214 220
215 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The 221 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
216 // value is supposed to be of the form: 222 // value is supposed to be of the form:
217 // 223 //
218 // value = token | quoted-string 224 // value = token | quoted-string
219 // 225 //
220 // However we currently also allow RFC 2047 encoding and non-ASCII 226 // However we currently also allow RFC 2047 encoding and non-ASCII
221 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. 227 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
222 bool DecodeFilenameValue(const std::string& input, 228 bool DecodeFilenameValue(
223 const std::string& referrer_charset, 229 const std::string& input,
224 std::string* output) { 230 const std::string& referrer_charset,
225 std::string tmp; 231 std::string* output,
232 net::HttpContentDisposition::ParseResult* parse_result) {
233 net::HttpContentDisposition::ParseResult current_parse_result;
234 std::string decoded_value;
235 bool is_previous_token_rfc2047 = true;
236
226 // Tokenize with whitespace characters. 237 // Tokenize with whitespace characters.
227 StringTokenizer t(input, " \t\n\r"); 238 StringTokenizer t(input, " \t\n\r");
228 t.set_options(StringTokenizer::RETURN_DELIMS); 239 t.set_options(StringTokenizer::RETURN_DELIMS);
229 bool is_previous_token_rfc2047 = true;
230 while (t.GetNext()) { 240 while (t.GetNext()) {
231 if (t.token_is_delim()) { 241 if (t.token_is_delim()) {
232 // If the previous non-delimiter token is not RFC2047-encoded, 242 // If the previous non-delimiter token is not RFC2047-encoded,
233 // put in a space in its place. Otherwise, skip over it. 243 // put in a space in its place. Otherwise, skip over it.
234 if (!is_previous_token_rfc2047) { 244 if (!is_previous_token_rfc2047)
235 tmp.push_back(' '); 245 decoded_value.push_back(' ');
236 }
237 continue; 246 continue;
238 } 247 }
239 // We don't support a single multibyte character split into 248 // We don't support a single multibyte character split into
240 // adjacent encoded words. Some broken mail clients emit headers 249 // adjacent encoded words. Some broken mail clients emit headers
241 // with that problem, but most web servers usually encode a filename 250 // with that problem, but most web servers usually encode a filename
242 // in a single encoded-word. Firefox/Thunderbird do not support 251 // in a single encoded-word. Firefox/Thunderbird do not support
243 // it, either. 252 // it, either.
244 std::string decoded; 253 std::string decoded;
245 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, 254 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
246 &decoded)) 255 &decoded, &current_parse_result))
247 return false; 256 return false;
248 tmp.append(decoded); 257 decoded_value.append(decoded);
249 } 258 }
250 output->swap(tmp); 259 output->swap(decoded_value);
260 if (parse_result && !output->empty()) {
261 parse_result->has_non_ascii_strings =
262 current_parse_result.has_non_ascii_strings;
263 parse_result->has_percent_encoded_strings =
264 current_parse_result.has_percent_encoded_strings;
265 parse_result->has_rfc2047_encoded_strings =
266 current_parse_result.has_rfc2047_encoded_strings;
267 }
251 return true; 268 return true;
252 } 269 }
253 270
254 // Parses the charset and value-chars out of an ext-value string. 271 // Parses the charset and value-chars out of an ext-value string.
255 // 272 //
256 // ext-value = charset "'" [ language ] "'" value-chars 273 // ext-value = charset "'" [ language ] "'" value-chars
257 bool ParseExtValueComponents(const std::string& input, 274 bool ParseExtValueComponents(const std::string& input,
258 std::string* charset, 275 std::string* charset,
259 std::string* value_chars) { 276 std::string* value_chars) {
260 StringTokenizer t(input, "'"); 277 StringTokenizer t(input, "'");
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
330 std::string unescaped = net::UnescapeURLComponent( 347 std::string unescaped = net::UnescapeURLComponent(
331 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); 348 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);
332 349
333 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); 350 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
334 } 351 }
335 352
336 } // namespace 353 } // namespace
337 354
338 namespace net { 355 namespace net {
339 356
357 HttpContentDisposition::ParseResult::ParseResult()
358 : has_disposition_type(false),
359 has_unknown_disposition_type(false),
360 has_name(false),
361 has_filename(false),
362 has_ext_filename(false),
363 has_non_ascii_strings(false),
364 has_percent_encoded_strings(false),
365 has_rfc2047_encoded_strings(false) {
366 }
367
340 HttpContentDisposition::HttpContentDisposition( 368 HttpContentDisposition::HttpContentDisposition(
341 const std::string& header, const std::string& referrer_charset) 369 const std::string& header, const std::string& referrer_charset)
342 : type_(INLINE) { 370 : type_(INLINE) {
343 Parse(header, referrer_charset); 371 Parse(header, referrer_charset);
344 } 372 }
345 373
346 HttpContentDisposition::~HttpContentDisposition() { 374 HttpContentDisposition::~HttpContentDisposition() {
347 } 375 }
348 376
349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( 377 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
350 std::string::const_iterator begin, std::string::const_iterator end) { 378 std::string::const_iterator begin, std::string::const_iterator end) {
351 DCHECK(type_ == INLINE); 379 DCHECK(type_ == INLINE);
352 std::string::const_iterator delimiter = std::find(begin, end, ';'); 380 std::string::const_iterator delimiter = std::find(begin, end, ';');
353 381
354 std::string::const_iterator type_begin = begin; 382 std::string::const_iterator type_begin = begin;
355 std::string::const_iterator type_end = delimiter; 383 std::string::const_iterator type_end = delimiter;
356 HttpUtil::TrimLWS(&type_begin, &type_end); 384 HttpUtil::TrimLWS(&type_begin, &type_end);
357 385
358 // If the disposition-type isn't a valid token then the 386 // If the disposition-type isn't a valid token then the
359 // Content-Disposition header is malformed, and we treat the first bytes as 387 // Content-Disposition header is malformed, and we treat the first bytes as
360 // a parameter rather than a disposition-type. 388 // a parameter rather than a disposition-type.
361 if (!HttpUtil::IsToken(type_begin, type_end)) 389 if (!HttpUtil::IsToken(type_begin, type_end))
362 return begin; 390 return begin;
363 391
392 parse_result_.has_disposition_type = true;
393
364 DCHECK(std::find(type_begin, type_end, '=') == type_end); 394 DCHECK(std::find(type_begin, type_end, '=') == type_end);
365 395
366 if (!LowerCaseEqualsASCII(type_begin, type_end, "inline")) 396 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
397 type_ = INLINE;
398 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
367 type_ = ATTACHMENT; 399 type_ = ATTACHMENT;
400 } else {
401 parse_result_.has_unknown_disposition_type = true;
402 type_ = ATTACHMENT;
403 }
368 return delimiter; 404 return delimiter;
369 } 405 }
370 406
371 // http://tools.ietf.org/html/rfc6266 407 // http://tools.ietf.org/html/rfc6266
372 // 408 //
373 // content-disposition = "Content-Disposition" ":" 409 // content-disposition = "Content-Disposition" ":"
374 // disposition-type *( ";" disposition-parm ) 410 // disposition-type *( ";" disposition-parm )
375 // 411 //
376 // disposition-type = "inline" | "attachment" | disp-ext-type 412 // disposition-type = "inline" | "attachment" | disp-ext-type
377 // ; case-insensitive 413 // ; case-insensitive
(...skipping 19 matching lines...) Expand all
397 433
398 std::string name; 434 std::string name;
399 std::string filename; 435 std::string filename;
400 std::string ext_filename; 436 std::string ext_filename;
401 437
402 HttpUtil::NameValuePairsIterator iter(pos, end, ';'); 438 HttpUtil::NameValuePairsIterator iter(pos, end, ';');
403 while (iter.GetNext()) { 439 while (iter.GetNext()) {
404 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), 440 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
405 iter.name_end(), 441 iter.name_end(),
406 "filename")) { 442 "filename")) {
407 DecodeFilenameValue(iter.value(), referrer_charset, &filename); 443 DecodeFilenameValue(iter.value(), referrer_charset, &filename,
444 &parse_result_);
445 parse_result_.has_filename = !filename.empty();
408 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), 446 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
409 iter.name_end(), 447 iter.name_end(),
410 "name")) { 448 "name")) {
411 DecodeFilenameValue(iter.value(), referrer_charset, &name); 449 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
450 parse_result_.has_name = !name.empty();
412 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), 451 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
413 iter.name_end(), 452 iter.name_end(),
414 "filename*")) { 453 "filename*")) {
415 DecodeExtValue(iter.raw_value(), &ext_filename); 454 DecodeExtValue(iter.raw_value(), &ext_filename);
455 parse_result_.has_ext_filename = !ext_filename.empty();
416 } 456 }
417 } 457 }
418 458
419 if (!ext_filename.empty()) 459 if (!ext_filename.empty())
420 filename_ = ext_filename; 460 filename_ = ext_filename;
421 else if (!filename.empty()) 461 else if (!filename.empty())
422 filename_ = filename; 462 filename_ = filename;
423 else 463 else
424 filename_ = name; 464 filename_ = name;
425 } 465 }
426 466
427 } // namespace net 467 } // namespace net
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698