OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "net/http/http_content_disposition.h" | 5 #include "net/http/http_content_disposition.h" |
6 | 6 |
7 #include "base/base64.h" | 7 #include "base/base64.h" |
8 #include "base/i18n/icu_string_conversions.h" | 8 #include "base/i18n/icu_string_conversions.h" |
9 #include "base/logging.h" | 9 #include "base/logging.h" |
10 #include "base/string_util.h" | 10 #include "base/string_util.h" |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
88 ucnv_close(converter); | 88 ucnv_close(converter); |
89 if (U_FAILURE(err)) | 89 if (U_FAILURE(err)) |
90 return false; | 90 return false; |
91 output->resize(output_length); | 91 output->resize(output_length); |
92 return true; | 92 return true; |
93 } | 93 } |
94 | 94 |
95 bool DecodeWord(const std::string& encoded_word, | 95 bool DecodeWord(const std::string& encoded_word, |
96 const std::string& referrer_charset, | 96 const std::string& referrer_charset, |
97 bool* is_rfc2047, | 97 bool* is_rfc2047, |
98 std::string* output) { | 98 std::string* output, |
| 99 net::HttpContentDisposition::ParseResult* parse_result) { |
99 *is_rfc2047 = false; | 100 *is_rfc2047 = false; |
100 output->clear(); | 101 output->clear(); |
101 if (encoded_word.empty()) | 102 if (encoded_word.empty()) |
102 return true; | 103 return true; |
103 | 104 |
104 if (!IsStringASCII(encoded_word)) { | 105 if (!IsStringASCII(encoded_word)) { |
105 // Try UTF-8, referrer_charset and the native OS default charset in turn. | 106 // Try UTF-8, referrer_charset and the native OS default charset in turn. |
106 if (IsStringUTF8(encoded_word)) { | 107 if (IsStringUTF8(encoded_word)) { |
107 *output = encoded_word; | 108 *output = encoded_word; |
108 } else { | 109 } else { |
109 string16 utf16_output; | 110 string16 utf16_output; |
110 if (!referrer_charset.empty() && | 111 if (!referrer_charset.empty() && |
111 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), | 112 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), |
112 base::OnStringConversionError::FAIL, | 113 base::OnStringConversionError::FAIL, |
113 &utf16_output)) { | 114 &utf16_output)) { |
114 *output = UTF16ToUTF8(utf16_output); | 115 *output = UTF16ToUTF8(utf16_output); |
115 } else { | 116 } else { |
116 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); | 117 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); |
117 } | 118 } |
118 } | 119 } |
119 | 120 |
| 121 parse_result->has_non_ascii_strings = true; |
120 return true; | 122 return true; |
121 } | 123 } |
122 | 124 |
123 // RFC 2047 : one of the encoding methods supported by Firefox and relatively | 125 // RFC 2047 : one of the encoding methods supported by Firefox and relatively |
124 // widely used by web servers. | 126 // widely used by web servers. |
125 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. | 127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. |
126 // We don't care about the length restriction (72 bytes) because | 128 // We don't care about the length restriction (72 bytes) because |
127 // many web servers generate encoded words longer than the limit. | 129 // many web servers generate encoded words longer than the limit. |
128 std::string tmp; | 130 std::string decoded_word; |
129 *is_rfc2047 = true; | 131 *is_rfc2047 = true; |
130 int part_index = 0; | 132 int part_index = 0; |
131 std::string charset; | 133 std::string charset; |
132 StringTokenizer t(encoded_word, "?"); | 134 StringTokenizer t(encoded_word, "?"); |
133 RFC2047EncodingType enc_type = Q_ENCODING; | 135 RFC2047EncodingType enc_type = Q_ENCODING; |
134 while (*is_rfc2047 && t.GetNext()) { | 136 while (*is_rfc2047 && t.GetNext()) { |
135 std::string part = t.token(); | 137 std::string part = t.token(); |
136 switch (part_index) { | 138 switch (part_index) { |
137 case 0: | 139 case 0: |
138 if (part != "=") { | 140 if (part != "=") { |
(...skipping 12 matching lines...) Expand all Loading... |
151 part.find_first_of("bBqQ") == std::string::npos) { | 153 part.find_first_of("bBqQ") == std::string::npos) { |
152 *is_rfc2047 = false; | 154 *is_rfc2047 = false; |
153 break; | 155 break; |
154 } | 156 } |
155 if (part[0] == 'b' || part[0] == 'B') { | 157 if (part[0] == 'b' || part[0] == 'B') { |
156 enc_type = B_ENCODING; | 158 enc_type = B_ENCODING; |
157 } | 159 } |
158 ++part_index; | 160 ++part_index; |
159 break; | 161 break; |
160 case 3: | 162 case 3: |
161 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); | 163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word); |
162 if (!*is_rfc2047) { | 164 if (!*is_rfc2047) { |
163 // Last minute failure. Invalid B/Q encoding. Rather than | 165 // Last minute failure. Invalid B/Q encoding. Rather than |
164 // passing it through, return now. | 166 // passing it through, return now. |
165 return false; | 167 return false; |
166 } | 168 } |
167 ++part_index; | 169 ++part_index; |
168 break; | 170 break; |
169 case 4: | 171 case 4: |
170 if (part != "=") { | 172 if (part != "=") { |
171 // Another last minute failure ! | 173 // Another last minute failure ! |
172 // Likely to be a case of two encoded-words in a row or | 174 // Likely to be a case of two encoded-words in a row or |
173 // an encoded word followed by a non-encoded word. We can be | 175 // an encoded word followed by a non-encoded word. We can be |
174 // generous, but it does not help much in terms of compatibility, | 176 // generous, but it does not help much in terms of compatibility, |
175 // I believe. Return immediately. | 177 // I believe. Return immediately. |
176 *is_rfc2047 = false; | 178 *is_rfc2047 = false; |
177 return false; | 179 return false; |
178 } | 180 } |
179 ++part_index; | 181 ++part_index; |
180 break; | 182 break; |
181 default: | 183 default: |
182 *is_rfc2047 = false; | 184 *is_rfc2047 = false; |
183 return false; | 185 return false; |
184 } | 186 } |
185 } | 187 } |
186 | 188 |
187 if (*is_rfc2047) { | 189 if (*is_rfc2047) { |
188 if (*(encoded_word.end() - 1) == '=') { | 190 if (*(encoded_word.end() - 1) == '=') { |
189 output->swap(tmp); | 191 output->swap(decoded_word); |
| 192 parse_result->has_rfc2047_encoded_strings = true; |
190 return true; | 193 return true; |
191 } | 194 } |
192 // encoded_word ending prematurely with '?' or extra '?' | 195 // encoded_word ending prematurely with '?' or extra '?' |
193 *is_rfc2047 = false; | 196 *is_rfc2047 = false; |
194 return false; | 197 return false; |
195 } | 198 } |
196 | 199 |
197 // We're not handling 'especial' characters quoted with '\', but | 200 // We're not handling 'especial' characters quoted with '\', but |
198 // it should be Ok because we're not an email client but a | 201 // it should be Ok because we're not an email client but a |
199 // web browser. | 202 // web browser. |
200 | 203 |
201 // What IE6/7 does: %-escaped UTF-8. | 204 // What IE6/7 does: %-escaped UTF-8. |
202 tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES); | 205 decoded_word = net::UnescapeURLComponent(encoded_word, |
203 if (IsStringUTF8(tmp)) { | 206 net::UnescapeRule::SPACES); |
204 output->swap(tmp); | 207 if (decoded_word != encoded_word) |
| 208 parse_result->has_percent_encoded_strings = true; |
| 209 if (IsStringUTF8(decoded_word)) { |
| 210 output->swap(decoded_word); |
205 return true; | 211 return true; |
206 // We can try either the OS default charset or 'origin charset' here, | 212 // We can try either the OS default charset or 'origin charset' here, |
207 // As far as I can tell, IE does not support it. However, I've seen | 213 // As far as I can tell, IE does not support it. However, I've seen |
208 // web servers emit %-escaped string in a legacy encoding (usually | 214 // web servers emit %-escaped string in a legacy encoding (usually |
209 // origin charset). | 215 // origin charset). |
210 // TODO(jungshik) : Test IE further and consider adding a fallback here. | 216 // TODO(jungshik) : Test IE further and consider adding a fallback here. |
211 } | 217 } |
212 return false; | 218 return false; |
213 } | 219 } |
214 | 220 |
215 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The | 221 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The |
216 // value is supposed to be of the form: | 222 // value is supposed to be of the form: |
217 // | 223 // |
218 // value = token | quoted-string | 224 // value = token | quoted-string |
219 // | 225 // |
220 // However we currently also allow RFC 2047 encoding and non-ASCII | 226 // However we currently also allow RFC 2047 encoding and non-ASCII |
221 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. | 227 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. |
222 bool DecodeFilenameValue(const std::string& input, | 228 bool DecodeFilenameValue( |
223 const std::string& referrer_charset, | 229 const std::string& input, |
224 std::string* output) { | 230 const std::string& referrer_charset, |
225 std::string tmp; | 231 std::string* output, |
| 232 net::HttpContentDisposition::ParseResult* parse_result) { |
| 233 net::HttpContentDisposition::ParseResult current_parse_result; |
| 234 std::string decoded_value; |
| 235 bool is_previous_token_rfc2047 = true; |
| 236 |
226 // Tokenize with whitespace characters. | 237 // Tokenize with whitespace characters. |
227 StringTokenizer t(input, " \t\n\r"); | 238 StringTokenizer t(input, " \t\n\r"); |
228 t.set_options(StringTokenizer::RETURN_DELIMS); | 239 t.set_options(StringTokenizer::RETURN_DELIMS); |
229 bool is_previous_token_rfc2047 = true; | |
230 while (t.GetNext()) { | 240 while (t.GetNext()) { |
231 if (t.token_is_delim()) { | 241 if (t.token_is_delim()) { |
232 // If the previous non-delimiter token is not RFC2047-encoded, | 242 // If the previous non-delimiter token is not RFC2047-encoded, |
233 // put in a space in its place. Otherwise, skip over it. | 243 // put in a space in its place. Otherwise, skip over it. |
234 if (!is_previous_token_rfc2047) { | 244 if (!is_previous_token_rfc2047) |
235 tmp.push_back(' '); | 245 decoded_value.push_back(' '); |
236 } | |
237 continue; | 246 continue; |
238 } | 247 } |
239 // We don't support a single multibyte character split into | 248 // We don't support a single multibyte character split into |
240 // adjacent encoded words. Some broken mail clients emit headers | 249 // adjacent encoded words. Some broken mail clients emit headers |
241 // with that problem, but most web servers usually encode a filename | 250 // with that problem, but most web servers usually encode a filename |
242 // in a single encoded-word. Firefox/Thunderbird do not support | 251 // in a single encoded-word. Firefox/Thunderbird do not support |
243 // it, either. | 252 // it, either. |
244 std::string decoded; | 253 std::string decoded; |
245 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, | 254 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, |
246 &decoded)) | 255 &decoded, &current_parse_result)) |
247 return false; | 256 return false; |
248 tmp.append(decoded); | 257 decoded_value.append(decoded); |
249 } | 258 } |
250 output->swap(tmp); | 259 output->swap(decoded_value); |
| 260 if (parse_result && !output->empty()) { |
| 261 parse_result->has_non_ascii_strings = |
| 262 current_parse_result.has_non_ascii_strings; |
| 263 parse_result->has_percent_encoded_strings = |
| 264 current_parse_result.has_percent_encoded_strings; |
| 265 parse_result->has_rfc2047_encoded_strings = |
| 266 current_parse_result.has_rfc2047_encoded_strings; |
| 267 } |
251 return true; | 268 return true; |
252 } | 269 } |
253 | 270 |
254 // Parses the charset and value-chars out of an ext-value string. | 271 // Parses the charset and value-chars out of an ext-value string. |
255 // | 272 // |
256 // ext-value = charset "'" [ language ] "'" value-chars | 273 // ext-value = charset "'" [ language ] "'" value-chars |
257 bool ParseExtValueComponents(const std::string& input, | 274 bool ParseExtValueComponents(const std::string& input, |
258 std::string* charset, | 275 std::string* charset, |
259 std::string* value_chars) { | 276 std::string* value_chars) { |
260 StringTokenizer t(input, "'"); | 277 StringTokenizer t(input, "'"); |
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
330 std::string unescaped = net::UnescapeURLComponent( | 347 std::string unescaped = net::UnescapeURLComponent( |
331 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | 348 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); |
332 | 349 |
333 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); | 350 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); |
334 } | 351 } |
335 | 352 |
336 } // namespace | 353 } // namespace |
337 | 354 |
338 namespace net { | 355 namespace net { |
339 | 356 |
| 357 HttpContentDisposition::ParseResult::ParseResult() |
| 358 : has_disposition_type(false), |
| 359 has_unknown_disposition_type(false), |
| 360 has_name(false), |
| 361 has_filename(false), |
| 362 has_ext_filename(false), |
| 363 has_non_ascii_strings(false), |
| 364 has_percent_encoded_strings(false), |
| 365 has_rfc2047_encoded_strings(false) { |
| 366 } |
| 367 |
340 HttpContentDisposition::HttpContentDisposition( | 368 HttpContentDisposition::HttpContentDisposition( |
341 const std::string& header, const std::string& referrer_charset) | 369 const std::string& header, const std::string& referrer_charset) |
342 : type_(INLINE) { | 370 : type_(INLINE) { |
343 Parse(header, referrer_charset); | 371 Parse(header, referrer_charset); |
344 } | 372 } |
345 | 373 |
346 HttpContentDisposition::~HttpContentDisposition() { | 374 HttpContentDisposition::~HttpContentDisposition() { |
347 } | 375 } |
348 | 376 |
349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( | 377 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( |
350 std::string::const_iterator begin, std::string::const_iterator end) { | 378 std::string::const_iterator begin, std::string::const_iterator end) { |
351 DCHECK(type_ == INLINE); | 379 DCHECK(type_ == INLINE); |
352 std::string::const_iterator delimiter = std::find(begin, end, ';'); | 380 std::string::const_iterator delimiter = std::find(begin, end, ';'); |
353 | 381 |
354 std::string::const_iterator type_begin = begin; | 382 std::string::const_iterator type_begin = begin; |
355 std::string::const_iterator type_end = delimiter; | 383 std::string::const_iterator type_end = delimiter; |
356 HttpUtil::TrimLWS(&type_begin, &type_end); | 384 HttpUtil::TrimLWS(&type_begin, &type_end); |
357 | 385 |
358 // If the disposition-type isn't a valid token then the | 386 // If the disposition-type isn't a valid token then the |
359 // Content-Disposition header is malformed, and we treat the first bytes as | 387 // Content-Disposition header is malformed, and we treat the first bytes as |
360 // a parameter rather than a disposition-type. | 388 // a parameter rather than a disposition-type. |
361 if (!HttpUtil::IsToken(type_begin, type_end)) | 389 if (!HttpUtil::IsToken(type_begin, type_end)) |
362 return begin; | 390 return begin; |
363 | 391 |
| 392 parse_result_.has_disposition_type = true; |
| 393 |
364 DCHECK(std::find(type_begin, type_end, '=') == type_end); | 394 DCHECK(std::find(type_begin, type_end, '=') == type_end); |
365 | 395 |
366 if (!LowerCaseEqualsASCII(type_begin, type_end, "inline")) | 396 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) { |
| 397 type_ = INLINE; |
| 398 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) { |
367 type_ = ATTACHMENT; | 399 type_ = ATTACHMENT; |
| 400 } else { |
| 401 parse_result_.has_unknown_disposition_type = true; |
| 402 type_ = ATTACHMENT; |
| 403 } |
368 return delimiter; | 404 return delimiter; |
369 } | 405 } |
370 | 406 |
371 // http://tools.ietf.org/html/rfc6266 | 407 // http://tools.ietf.org/html/rfc6266 |
372 // | 408 // |
373 // content-disposition = "Content-Disposition" ":" | 409 // content-disposition = "Content-Disposition" ":" |
374 // disposition-type *( ";" disposition-parm ) | 410 // disposition-type *( ";" disposition-parm ) |
375 // | 411 // |
376 // disposition-type = "inline" | "attachment" | disp-ext-type | 412 // disposition-type = "inline" | "attachment" | disp-ext-type |
377 // ; case-insensitive | 413 // ; case-insensitive |
(...skipping 19 matching lines...) Expand all Loading... |
397 | 433 |
398 std::string name; | 434 std::string name; |
399 std::string filename; | 435 std::string filename; |
400 std::string ext_filename; | 436 std::string ext_filename; |
401 | 437 |
402 HttpUtil::NameValuePairsIterator iter(pos, end, ';'); | 438 HttpUtil::NameValuePairsIterator iter(pos, end, ';'); |
403 while (iter.GetNext()) { | 439 while (iter.GetNext()) { |
404 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), | 440 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), |
405 iter.name_end(), | 441 iter.name_end(), |
406 "filename")) { | 442 "filename")) { |
407 DecodeFilenameValue(iter.value(), referrer_charset, &filename); | 443 DecodeFilenameValue(iter.value(), referrer_charset, &filename, |
| 444 &parse_result_); |
| 445 parse_result_.has_filename = !filename.empty(); |
408 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), | 446 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), |
409 iter.name_end(), | 447 iter.name_end(), |
410 "name")) { | 448 "name")) { |
411 DecodeFilenameValue(iter.value(), referrer_charset, &name); | 449 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL); |
| 450 parse_result_.has_name = !name.empty(); |
412 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), | 451 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), |
413 iter.name_end(), | 452 iter.name_end(), |
414 "filename*")) { | 453 "filename*")) { |
415 DecodeExtValue(iter.raw_value(), &ext_filename); | 454 DecodeExtValue(iter.raw_value(), &ext_filename); |
| 455 parse_result_.has_ext_filename = !ext_filename.empty(); |
416 } | 456 } |
417 } | 457 } |
418 | 458 |
419 if (!ext_filename.empty()) | 459 if (!ext_filename.empty()) |
420 filename_ = ext_filename; | 460 filename_ = ext_filename; |
421 else if (!filename.empty()) | 461 else if (!filename.empty()) |
422 filename_ = filename; | 462 filename_ = filename; |
423 else | 463 else |
424 filename_ = name; | 464 filename_ = name; |
425 } | 465 } |
426 | 466 |
427 } // namespace net | 467 } // namespace net |
OLD | NEW |