Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(151)

Side by Side Diff: net/http/http_content_disposition.cc

Issue 11478034: Add UMA for measuring Content-Disposition header use and abuse. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "net/http/http_content_disposition.h" 5 #include "net/http/http_content_disposition.h"
6 6
7 #include "base/base64.h" 7 #include "base/base64.h"
8 #include "base/i18n/icu_string_conversions.h" 8 #include "base/i18n/icu_string_conversions.h"
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/string_util.h" 10 #include "base/string_util.h"
(...skipping 79 matching lines...) Expand 10 before | Expand all | Expand 10 after
90 ucnv_close(converter); 90 ucnv_close(converter);
91 if (U_FAILURE(err)) 91 if (U_FAILURE(err))
92 return false; 92 return false;
93 output->resize(output_length); 93 output->resize(output_length);
94 return true; 94 return true;
95 } 95 }
96 96
97 bool DecodeWord(const std::string& encoded_word, 97 bool DecodeWord(const std::string& encoded_word,
98 const std::string& referrer_charset, 98 const std::string& referrer_charset,
99 bool* is_rfc2047, 99 bool* is_rfc2047,
100 std::string* output) { 100 std::string* output,
101 HttpContentDisposition::ParseResult* parse_result) {
101 *is_rfc2047 = false; 102 *is_rfc2047 = false;
102 output->clear(); 103 output->clear();
103 if (encoded_word.empty()) 104 if (encoded_word.empty())
104 return true; 105 return true;
105 106
106 if (!IsStringASCII(encoded_word)) { 107 if (!IsStringASCII(encoded_word)) {
107 // Try UTF-8, referrer_charset and the native OS default charset in turn. 108 // Try UTF-8, referrer_charset and the native OS default charset in turn.
108 if (IsStringUTF8(encoded_word)) { 109 if (IsStringUTF8(encoded_word)) {
109 *output = encoded_word; 110 *output = encoded_word;
110 } else { 111 } else {
111 string16 utf16_output; 112 string16 utf16_output;
112 if (!referrer_charset.empty() && 113 if (!referrer_charset.empty() &&
113 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), 114 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
114 base::OnStringConversionError::FAIL, 115 base::OnStringConversionError::FAIL,
115 &utf16_output)) { 116 &utf16_output)) {
116 *output = UTF16ToUTF8(utf16_output); 117 *output = UTF16ToUTF8(utf16_output);
117 } else { 118 } else {
118 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); 119 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
119 } 120 }
120 } 121 }
121 122
123 parse_result->has_non_ascii_strings = true;
122 return true; 124 return true;
123 } 125 }
124 126
125 // RFC 2047 : one of encoding methods supported by Firefox and relatively 127 // RFC 2047 : one of encoding methods supported by Firefox and relatively
126 // widely used by web servers. 128 // widely used by web servers.
127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. 129 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
128 // We don't care about the length restriction (72 bytes) because 130 // We don't care about the length restriction (72 bytes) because
129 // many web servers generate encoded words longer than the limit. 131 // many web servers generate encoded words longer than the limit.
130 std::string tmp; 132 std::string decoded_word;
131 *is_rfc2047 = true; 133 *is_rfc2047 = true;
132 int part_index = 0; 134 int part_index = 0;
133 std::string charset; 135 std::string charset;
134 StringTokenizer t(encoded_word, "?"); 136 StringTokenizer t(encoded_word, "?");
135 RFC2047EncodingType enc_type = Q_ENCODING; 137 RFC2047EncodingType enc_type = Q_ENCODING;
136 while (*is_rfc2047 && t.GetNext()) { 138 while (*is_rfc2047 && t.GetNext()) {
137 std::string part = t.token(); 139 std::string part = t.token();
138 switch (part_index) { 140 switch (part_index) {
139 case 0: 141 case 0:
140 if (part != "=") { 142 if (part != "=") {
(...skipping 12 matching lines...) Expand all
153 part.find_first_of("bBqQ") == std::string::npos) { 155 part.find_first_of("bBqQ") == std::string::npos) {
154 *is_rfc2047 = false; 156 *is_rfc2047 = false;
155 break; 157 break;
156 } 158 }
157 if (part[0] == 'b' || part[0] == 'B') { 159 if (part[0] == 'b' || part[0] == 'B') {
158 enc_type = B_ENCODING; 160 enc_type = B_ENCODING;
159 } 161 }
160 ++part_index; 162 ++part_index;
161 break; 163 break;
162 case 3: 164 case 3:
163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); 165 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
164 if (!*is_rfc2047) { 166 if (!*is_rfc2047) {
165 // Last minute failure. Invalid B/Q encoding. Rather than 167 // Last minute failure. Invalid B/Q encoding. Rather than
166 // passing it through, return now. 168 // passing it through, return now.
167 return false; 169 return false;
168 } 170 }
169 ++part_index; 171 ++part_index;
170 break; 172 break;
171 case 4: 173 case 4:
172 if (part != "=") { 174 if (part != "=") {
173 // Another last minute failure ! 175 // Another last minute failure !
174 // Likely to be a case of two encoded-words in a row or 176 // Likely to be a case of two encoded-words in a row or
175 // an encoded word followed by a non-encoded word. We can be 177 // an encoded word followed by a non-encoded word. We can be
176 // generous, but it does not help much in terms of compatibility, 178 // generous, but it does not help much in terms of compatibility,
177 // I believe. Return immediately. 179 // I believe. Return immediately.
178 *is_rfc2047 = false; 180 *is_rfc2047 = false;
179 return false; 181 return false;
180 } 182 }
181 ++part_index; 183 ++part_index;
182 break; 184 break;
183 default: 185 default:
184 *is_rfc2047 = false; 186 *is_rfc2047 = false;
185 return false; 187 return false;
186 } 188 }
187 } 189 }
188 190
189 if (*is_rfc2047) { 191 if (*is_rfc2047) {
190 if (*(encoded_word.end() - 1) == '=') { 192 if (*(encoded_word.end() - 1) == '=') {
191 output->swap(tmp); 193 output->swap(decoded_word);
194 parse_result->has_rfc2047_encoded_strings = true;
192 return true; 195 return true;
193 } 196 }
194 // encoded_word ending prematurely with '?' or extra '?' 197 // encoded_word ending prematurely with '?' or extra '?'
195 *is_rfc2047 = false; 198 *is_rfc2047 = false;
196 return false; 199 return false;
197 } 200 }
198 201
199 // We're not handling 'especial' characters quoted with '\', but 202 // We're not handling 'especial' characters quoted with '\', but
200 // it should be Ok because we're not an email client but a 203 // it should be Ok because we're not an email client but a
201 // web browser. 204 // web browser.
202 205
203 // What IE6/7 does: %-escaped UTF-8. 206 // What IE6/7 does: %-escaped UTF-8.
204 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); 207 decoded_word = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
205 if (IsStringUTF8(tmp)) { 208 if (decoded_word != encoded_word)
206 output->swap(tmp); 209 parse_result->has_percent_encoded_strings = true;
210 if (IsStringUTF8(decoded_word)) {
211 output->swap(decoded_word);
207 return true; 212 return true;
208 // We can try either the OS default charset or 'origin charset' here, 213 // We can try either the OS default charset or 'origin charset' here,
209 // As far as I can tell, IE does not support it. However, I've seen 214 // As far as I can tell, IE does not support it. However, I've seen
210 // web servers emit %-escaped string in a legacy encoding (usually 215 // web servers emit %-escaped string in a legacy encoding (usually
211 // origin charset). 216 // origin charset).
212 // TODO(jungshik) : Test IE further and consider adding a fallback here. 217 // TODO(jungshik) : Test IE further and consider adding a fallback here.
213 } 218 }
214 return false; 219 return false;
215 } 220 }
216 221
217 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The 222 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
218 // value is supposed to be of the form: 223 // value is supposed to be of the form:
219 // 224 //
220 // value = token | quoted-string 225 // value = token | quoted-string
221 // 226 //
222 // However we currently also allow RFC 2047 encoding and non-ASCII 227 // However we currently also allow RFC 2047 encoding and non-ASCII
223 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. 228 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
224 bool DecodeFilenameValue(const std::string& input, 229 bool DecodeFilenameValue(const std::string& input,
225 const std::string& referrer_charset, 230 const std::string& referrer_charset,
226 std::string* output) { 231 std::string* output,
227 std::string tmp; 232 HttpContentDisposition::ParseResult* parse_result) {
233 HttpContentDisposition::ParseResult current_parse_result;
234 std::string decoded_value;
235 bool is_previous_token_rfc2047 = true;
236
228 // Tokenize with whitespace characters. 237 // Tokenize with whitespace characters.
229 StringTokenizer t(input, " \t\n\r"); 238 StringTokenizer t(input, " \t\n\r");
230 t.set_options(StringTokenizer::RETURN_DELIMS); 239 t.set_options(StringTokenizer::RETURN_DELIMS);
231 bool is_previous_token_rfc2047 = true;
232 while (t.GetNext()) { 240 while (t.GetNext()) {
233 if (t.token_is_delim()) { 241 if (t.token_is_delim()) {
234 // If the previous non-delimiter token is not RFC2047-encoded, 242 // If the previous non-delimiter token is not RFC2047-encoded,
235 // put in a space in its place. Otherwise, skip over it. 243 // put in a space in its place. Otherwise, skip over it.
236 if (!is_previous_token_rfc2047) { 244 if (!is_previous_token_rfc2047)
237 tmp.push_back(' '); 245 decoded_value.push_back(' ');
238 }
239 continue; 246 continue;
240 } 247 }
241 // We don't support a single multibyte character split into 248 // We don't support a single multibyte character split into
242 // adjacent encoded words. Some broken mail clients emit headers 249 // adjacent encoded words. Some broken mail clients emit headers
243 // with that problem, but most web servers usually encode a filename 250 // with that problem, but most web servers usually encode a filename
244 // in a single encoded-word. Firefox/Thunderbird do not support 251 // in a single encoded-word. Firefox/Thunderbird do not support
245 // it, either. 252 // it, either.
246 std::string decoded; 253 std::string decoded;
247 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, 254 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
248 &decoded)) 255 &decoded, &current_parse_result))
249 return false; 256 return false;
250 tmp.append(decoded); 257 decoded_value.append(decoded);
251 } 258 }
252 output->swap(tmp); 259 output->swap(decoded_value);
260 if (parse_result) {
261 parse_result->has_non_ascii_strings =
262 current_parse_result.has_non_ascii_strings;
263 parse_result->has_percent_encoded_strings =
264 current_parse_result.has_percent_encoded_strings;
265 parse_result->has_rfc2047_encoded_strings =
266 current_parse_result.has_rfc2047_encoded_strings;
267 }
253 return true; 268 return true;
254 } 269 }
255 270
256 // Parses the charset and value-chars out of an ext-value string. 271 // Parses the charset and value-chars out of an ext-value string.
257 // 272 //
258 // ext-value = charset "'" [ language ] "'" value-chars 273 // ext-value = charset "'" [ language ] "'" value-chars
259 bool ParseExtValueComponents(const std::string& input, 274 bool ParseExtValueComponents(const std::string& input,
260 std::string* charset, 275 std::string* charset,
261 std::string* value_chars) { 276 std::string* value_chars) {
262 StringTokenizer t(input, "'"); 277 StringTokenizer t(input, "'");
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after
330 } 345 }
331 346
332 std::string unescaped = UnescapeURLComponent(value, 347 std::string unescaped = UnescapeURLComponent(value,
333 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); 348 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
334 349
335 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); 350 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
336 } 351 }
337 352
338 } // namespace 353 } // namespace
339 354
355 HttpContentDisposition::ParseResult::ParseResult()
356 : has_disposition_type(false),
357 has_unknown_disposition_type(false),
358 has_name(false),
359 has_filename(false),
360 has_ext_filename(false),
361 has_non_ascii_strings(false),
362 has_percent_encoded_strings(false),
363 has_rfc2047_encoded_strings(false) {
364 }
365
340 HttpContentDisposition::HttpContentDisposition( 366 HttpContentDisposition::HttpContentDisposition(
341 const std::string& header, const std::string& referrer_charset) 367 const std::string& header, const std::string& referrer_charset)
342 : type_(INLINE) { 368 : type_(INLINE) {
343 Parse(header, referrer_charset); 369 Parse(header, referrer_charset);
344 } 370 }
345 371
346 HttpContentDisposition::~HttpContentDisposition() { 372 HttpContentDisposition::~HttpContentDisposition() {
347 } 373 }
348 374
349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( 375 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
350 std::string::const_iterator begin, std::string::const_iterator end) { 376 std::string::const_iterator begin, std::string::const_iterator end) {
351 DCHECK(type_ == INLINE); 377 DCHECK(type_ == INLINE);
352 std::string::const_iterator delimiter = std::find(begin, end, ';'); 378 std::string::const_iterator delimiter = std::find(begin, end, ';');
353 379
354 std::string::const_iterator type_begin = begin; 380 std::string::const_iterator type_begin = begin;
355 std::string::const_iterator type_end = delimiter; 381 std::string::const_iterator type_end = delimiter;
356 HttpUtil::TrimLWS(&type_begin, &type_end); 382 HttpUtil::TrimLWS(&type_begin, &type_end);
357 383
358 // If the disposition-type isn't a valid token then the 384 // If the disposition-type isn't a valid token then the
359 // Content-Disposition header is malformed, and we treat the first bytes as 385 // Content-Disposition header is malformed, and we treat the first bytes as
360 // a parameter rather than a disposition-type. 386 // a parameter rather than a disposition-type.
361 if (!HttpUtil::IsToken(type_begin, type_end)) 387 if (!HttpUtil::IsToken(type_begin, type_end))
362 return begin; 388 return begin;
363 389
390 parse_result_.has_disposition_type = true;
391
364 DCHECK(std::find(type_begin, type_end, '=') == type_end); 392 DCHECK(std::find(type_begin, type_end, '=') == type_end);
365 393
366 if (!LowerCaseEqualsASCII(type_begin, type_end, "inline")) 394 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) {
395 type_ = INLINE;
396 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) {
367 type_ = ATTACHMENT; 397 type_ = ATTACHMENT;
398 } else {
399 parse_result_.has_unknown_disposition_type = true;
400 type_ = ATTACHMENT;
401 }
368 return delimiter; 402 return delimiter;
369 } 403 }
370 404
371 // http://tools.ietf.org/html/rfc6266 405 // http://tools.ietf.org/html/rfc6266
372 // 406 //
373 // content-disposition = "Content-Disposition" ":" 407 // content-disposition = "Content-Disposition" ":"
374 // disposition-type *( ";" disposition-parm ) 408 // disposition-type *( ";" disposition-parm )
375 // 409 //
376 // disposition-type = "inline" | "attachment" | disp-ext-type 410 // disposition-type = "inline" | "attachment" | disp-ext-type
377 // ; case-insensitive 411 // ; case-insensitive
(...skipping 19 matching lines...) Expand all
397 431
398 std::string name; 432 std::string name;
399 std::string filename; 433 std::string filename;
400 std::string ext_filename; 434 std::string ext_filename;
401 435
402 HttpUtil::NameValuePairsIterator iter(pos, end, ';'); 436 HttpUtil::NameValuePairsIterator iter(pos, end, ';');
403 while (iter.GetNext()) { 437 while (iter.GetNext()) {
404 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), 438 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
405 iter.name_end(), 439 iter.name_end(),
406 "filename")) { 440 "filename")) {
407 DecodeFilenameValue(iter.value(), referrer_charset, &filename); 441 parse_result_.has_filename =
442 DecodeFilenameValue(iter.value(), referrer_charset, &filename,
443 &parse_result_);
408 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), 444 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(),
409 iter.name_end(), 445 iter.name_end(),
410 "name")) { 446 "name")) {
411 DecodeFilenameValue(iter.value(), referrer_charset, &name); 447 parse_result_.has_name =
448 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL);
412 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), 449 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(),
413 iter.name_end(), 450 iter.name_end(),
414 "filename*")) { 451 "filename*")) {
415 DecodeExtValue(iter.raw_value(), &ext_filename); 452 parse_result_.has_ext_filename =
453 DecodeExtValue(iter.raw_value(), &ext_filename);
416 } 454 }
417 } 455 }
418 456
419 if (!ext_filename.empty()) 457 if (!ext_filename.empty())
420 filename_ = ext_filename; 458 filename_ = ext_filename;
421 else if (!filename.empty()) 459 else if (!filename.empty())
422 filename_ = filename; 460 filename_ = filename;
423 else 461 else
424 filename_ = name; 462 filename_ = name;
425 } 463 }
426 464
427 } // namespace net 465 } // namespace net
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698