| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "net/http/http_content_disposition.h" | |
| 6 | |
| 7 #include "base/base64.h" | |
| 8 #include "base/logging.h" | |
| 9 #include "base/strings/string_tokenizer.h" | |
| 10 #include "base/strings/string_util.h" | |
| 11 #include "base/strings/sys_string_conversions.h" | |
| 12 #include "base/strings/utf_string_conversions.h" | |
| 13 #include "net/base/net_string_util.h" | |
| 14 #include "net/base/net_util.h" | |
| 15 #include "net/http/http_util.h" | |
| 16 | |
| 17 namespace { | |
| 18 | |
// The two content encodings an RFC 2047 encoded-word may declare in its
// encoding field.
enum RFC2047EncodingType {
  Q_ENCODING = 0,  // "Q" encoding; handled by DecodeQEncoding().
  B_ENCODING = 1   // "B" encoding; handled with base::Base64Decode().
};
| 23 | |
| 24 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to | |
| 25 // decoding a quoted-printable string. Returns true if the input was valid. | |
| 26 bool DecodeQEncoding(const std::string& input, std::string* output) { | |
| 27 std::string temp; | |
| 28 temp.reserve(input.size()); | |
| 29 for (std::string::const_iterator it = input.begin(); it != input.end(); | |
| 30 ++it) { | |
| 31 if (*it == '_') { | |
| 32 temp.push_back(' '); | |
| 33 } else if (*it == '=') { | |
| 34 if ((input.end() - it < 3) || | |
| 35 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || | |
| 36 !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) | |
| 37 return false; | |
| 38 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + | |
| 39 HexDigitToInt(*(it + 2)); | |
| 40 temp.push_back(static_cast<char>(ch)); | |
| 41 ++it; | |
| 42 ++it; | |
| 43 } else if (0x20 < *it && *it < 0x7F && *it != '?') { | |
| 44 // In a Q-encoded word, only printable ASCII characters | |
| 45 // represent themselves. Besides, space, '=', '_' and '?' are | |
| 46 // not allowed, but they're already filtered out. | |
| 47 DCHECK_NE('=', *it); | |
| 48 DCHECK_NE('?', *it); | |
| 49 DCHECK_NE('_', *it); | |
| 50 temp.push_back(*it); | |
| 51 } else { | |
| 52 return false; | |
| 53 } | |
| 54 } | |
| 55 output->swap(temp); | |
| 56 return true; | |
| 57 } | |
| 58 | |
| 59 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding | |
| 60 // type is specified in |enc_type|. | |
| 61 bool DecodeBQEncoding(const std::string& part, | |
| 62 RFC2047EncodingType enc_type, | |
| 63 const std::string& charset, | |
| 64 std::string* output) { | |
| 65 std::string decoded; | |
| 66 if (!((enc_type == B_ENCODING) ? | |
| 67 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) { | |
| 68 return false; | |
| 69 } | |
| 70 | |
| 71 if (decoded.empty()) { | |
| 72 output->clear(); | |
| 73 return true; | |
| 74 } | |
| 75 | |
| 76 return net::ConvertToUtf8(decoded, charset.c_str(), output); | |
| 77 } | |
| 78 | |
| 79 bool DecodeWord(const std::string& encoded_word, | |
| 80 const std::string& referrer_charset, | |
| 81 bool* is_rfc2047, | |
| 82 std::string* output, | |
| 83 int* parse_result_flags) { | |
| 84 *is_rfc2047 = false; | |
| 85 output->clear(); | |
| 86 if (encoded_word.empty()) | |
| 87 return true; | |
| 88 | |
| 89 if (!base::IsStringASCII(encoded_word)) { | |
| 90 // Try UTF-8, referrer_charset and the native OS default charset in turn. | |
| 91 if (base::IsStringUTF8(encoded_word)) { | |
| 92 *output = encoded_word; | |
| 93 } else { | |
| 94 base::string16 utf16_output; | |
| 95 if (!referrer_charset.empty() && | |
| 96 net::ConvertToUTF16(encoded_word, referrer_charset.c_str(), | |
| 97 &utf16_output)) { | |
| 98 *output = base::UTF16ToUTF8(utf16_output); | |
| 99 } else { | |
| 100 *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word)); | |
| 101 } | |
| 102 } | |
| 103 | |
| 104 *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS; | |
| 105 return true; | |
| 106 } | |
| 107 | |
| 108 // RFC 2047 : one of encoding methods supported by Firefox and relatively | |
| 109 // widely used by web servers. | |
| 110 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. | |
| 111 // We don't care about the length restriction (72 bytes) because | |
| 112 // many web servers generate encoded words longer than the limit. | |
| 113 std::string decoded_word; | |
| 114 *is_rfc2047 = true; | |
| 115 int part_index = 0; | |
| 116 std::string charset; | |
| 117 base::StringTokenizer t(encoded_word, "?"); | |
| 118 RFC2047EncodingType enc_type = Q_ENCODING; | |
| 119 while (*is_rfc2047 && t.GetNext()) { | |
| 120 std::string part = t.token(); | |
| 121 switch (part_index) { | |
| 122 case 0: | |
| 123 if (part != "=") { | |
| 124 *is_rfc2047 = false; | |
| 125 break; | |
| 126 } | |
| 127 ++part_index; | |
| 128 break; | |
| 129 case 1: | |
| 130 // Do we need charset validity check here? | |
| 131 charset = part; | |
| 132 ++part_index; | |
| 133 break; | |
| 134 case 2: | |
| 135 if (part.size() > 1 || | |
| 136 part.find_first_of("bBqQ") == std::string::npos) { | |
| 137 *is_rfc2047 = false; | |
| 138 break; | |
| 139 } | |
| 140 if (part[0] == 'b' || part[0] == 'B') { | |
| 141 enc_type = B_ENCODING; | |
| 142 } | |
| 143 ++part_index; | |
| 144 break; | |
| 145 case 3: | |
| 146 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word); | |
| 147 if (!*is_rfc2047) { | |
| 148 // Last minute failure. Invalid B/Q encoding. Rather than | |
| 149 // passing it through, return now. | |
| 150 return false; | |
| 151 } | |
| 152 ++part_index; | |
| 153 break; | |
| 154 case 4: | |
| 155 if (part != "=") { | |
| 156 // Another last minute failure ! | |
| 157 // Likely to be a case of two encoded-words in a row or | |
| 158 // an encoded word followed by a non-encoded word. We can be | |
| 159 // generous, but it does not help much in terms of compatibility, | |
| 160 // I believe. Return immediately. | |
| 161 *is_rfc2047 = false; | |
| 162 return false; | |
| 163 } | |
| 164 ++part_index; | |
| 165 break; | |
| 166 default: | |
| 167 *is_rfc2047 = false; | |
| 168 return false; | |
| 169 } | |
| 170 } | |
| 171 | |
| 172 if (*is_rfc2047) { | |
| 173 if (*(encoded_word.end() - 1) == '=') { | |
| 174 output->swap(decoded_word); | |
| 175 *parse_result_flags |= | |
| 176 net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS; | |
| 177 return true; | |
| 178 } | |
| 179 // encoded_word ending prematurelly with '?' or extra '?' | |
| 180 *is_rfc2047 = false; | |
| 181 return false; | |
| 182 } | |
| 183 | |
| 184 // We're not handling 'especial' characters quoted with '\', but | |
| 185 // it should be Ok because we're not an email client but a | |
| 186 // web browser. | |
| 187 | |
| 188 // What IE6/7 does: %-escaped UTF-8. | |
| 189 decoded_word = net::UnescapeURLComponent(encoded_word, | |
| 190 net::UnescapeRule::SPACES); | |
| 191 if (decoded_word != encoded_word) | |
| 192 *parse_result_flags |= | |
| 193 net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS; | |
| 194 if (base::IsStringUTF8(decoded_word)) { | |
| 195 output->swap(decoded_word); | |
| 196 return true; | |
| 197 // We can try either the OS default charset or 'origin charset' here, | |
| 198 // As far as I can tell, IE does not support it. However, I've seen | |
| 199 // web servers emit %-escaped string in a legacy encoding (usually | |
| 200 // origin charset). | |
| 201 // TODO(jungshik) : Test IE further and consider adding a fallback here. | |
| 202 } | |
| 203 return false; | |
| 204 } | |
| 205 | |
| 206 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The | |
| 207 // value is supposed to be of the form: | |
| 208 // | |
| 209 // value = token | quoted-string | |
| 210 // | |
| 211 // However we currently also allow RFC 2047 encoding and non-ASCII | |
| 212 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. | |
| 213 bool DecodeFilenameValue(const std::string& input, | |
| 214 const std::string& referrer_charset, | |
| 215 std::string* output, | |
| 216 int* parse_result_flags) { | |
| 217 int current_parse_result_flags = 0; | |
| 218 std::string decoded_value; | |
| 219 bool is_previous_token_rfc2047 = true; | |
| 220 | |
| 221 // Tokenize with whitespace characters. | |
| 222 base::StringTokenizer t(input, " \t\n\r"); | |
| 223 t.set_options(base::StringTokenizer::RETURN_DELIMS); | |
| 224 while (t.GetNext()) { | |
| 225 if (t.token_is_delim()) { | |
| 226 // If the previous non-delimeter token is not RFC2047-encoded, | |
| 227 // put in a space in its place. Otheriwse, skip over it. | |
| 228 if (!is_previous_token_rfc2047) | |
| 229 decoded_value.push_back(' '); | |
| 230 continue; | |
| 231 } | |
| 232 // We don't support a single multibyte character split into | |
| 233 // adjacent encoded words. Some broken mail clients emit headers | |
| 234 // with that problem, but most web servers usually encode a filename | |
| 235 // in a single encoded-word. Firefox/Thunderbird do not support | |
| 236 // it, either. | |
| 237 std::string decoded; | |
| 238 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, | |
| 239 &decoded, ¤t_parse_result_flags)) | |
| 240 return false; | |
| 241 decoded_value.append(decoded); | |
| 242 } | |
| 243 output->swap(decoded_value); | |
| 244 if (parse_result_flags && !output->empty()) | |
| 245 *parse_result_flags |= current_parse_result_flags; | |
| 246 return true; | |
| 247 } | |
| 248 | |
// Splits an ext-value string into its charset and value-chars components:
//
//   ext-value = charset "'" [ language ] "'" value-chars
//
// |input| must contain exactly two single quotes, with a non-empty charset
// before the first and non-empty value-chars after the second. Whatever sits
// between the quotes (the language) is ignored. Returns true and fills in
// |charset| and |value_chars| on success; on failure both are left untouched.
bool ParseExtValueComponents(const std::string& input,
                             std::string* charset,
                             std::string* value_chars) {
  const std::string::size_type first_quote = input.find('\'');
  if (first_quote == std::string::npos)
    return false;

  const std::string::size_type second_quote =
      input.find('\'', first_quote + 1);
  if (second_quote == std::string::npos)
    return false;

  // A third quote anywhere makes the whole value invalid.
  if (input.find('\'', second_quote + 1) != std::string::npos)
    return false;

  std::string parsed_charset = input.substr(0, first_quote);
  std::string parsed_value = input.substr(second_quote + 1);
  if (parsed_charset.empty() || parsed_value.empty())
    return false;

  charset->swap(parsed_charset);
  value_chars->swap(parsed_value);
  return true;
}
| 288 | |
| 289 // http://tools.ietf.org/html/rfc5987#section-3.2 | |
| 290 // | |
| 291 // ext-value = charset "'" [ language ] "'" value-chars | |
| 292 // | |
| 293 // charset = "UTF-8" / "ISO-8859-1" / mime-charset | |
| 294 // | |
| 295 // mime-charset = 1*mime-charsetc | |
| 296 // mime-charsetc = ALPHA / DIGIT | |
| 297 // / "!" / "#" / "$" / "%" / "&" | |
| 298 // / "+" / "-" / "^" / "_" / "`" | |
| 299 // / "{" / "}" / "~" | |
| 300 // | |
| 301 // language = <Language-Tag, defined in [RFC5646], Section 2.1> | |
| 302 // | |
| 303 // value-chars = *( pct-encoded / attr-char ) | |
| 304 // | |
| 305 // pct-encoded = "%" HEXDIG HEXDIG | |
| 306 // | |
| 307 // attr-char = ALPHA / DIGIT | |
| 308 // / "!" / "#" / "$" / "&" / "+" / "-" / "." | |
| 309 // / "^" / "_" / "`" / "|" / "~" | |
| 310 bool DecodeExtValue(const std::string& param_value, std::string* decoded) { | |
| 311 if (param_value.find('"') != std::string::npos) | |
| 312 return false; | |
| 313 | |
| 314 std::string charset; | |
| 315 std::string value; | |
| 316 if (!ParseExtValueComponents(param_value, &charset, &value)) | |
| 317 return false; | |
| 318 | |
| 319 // RFC 5987 value should be ASCII-only. | |
| 320 if (!base::IsStringASCII(value)) { | |
| 321 decoded->clear(); | |
| 322 return true; | |
| 323 } | |
| 324 | |
| 325 std::string unescaped = net::UnescapeURLComponent( | |
| 326 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
| 327 | |
| 328 return net::ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded); | |
| 329 } | |
| 330 | |
| 331 } // namespace | |
| 332 | |
| 333 namespace net { | |
| 334 | |
| 335 HttpContentDisposition::HttpContentDisposition( | |
| 336 const std::string& header, const std::string& referrer_charset) | |
| 337 : type_(INLINE), | |
| 338 parse_result_flags_(INVALID) { | |
| 339 Parse(header, referrer_charset); | |
| 340 } | |
| 341 | |
| 342 HttpContentDisposition::~HttpContentDisposition() { | |
| 343 } | |
| 344 | |
| 345 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( | |
| 346 std::string::const_iterator begin, std::string::const_iterator end) { | |
| 347 DCHECK(type_ == INLINE); | |
| 348 std::string::const_iterator delimiter = std::find(begin, end, ';'); | |
| 349 | |
| 350 std::string::const_iterator type_begin = begin; | |
| 351 std::string::const_iterator type_end = delimiter; | |
| 352 HttpUtil::TrimLWS(&type_begin, &type_end); | |
| 353 | |
| 354 // If the disposition-type isn't a valid token the then the | |
| 355 // Content-Disposition header is malformed, and we treat the first bytes as | |
| 356 // a parameter rather than a disposition-type. | |
| 357 if (!HttpUtil::IsToken(type_begin, type_end)) | |
| 358 return begin; | |
| 359 | |
| 360 parse_result_flags_ |= HAS_DISPOSITION_TYPE; | |
| 361 | |
| 362 DCHECK(std::find(type_begin, type_end, '=') == type_end); | |
| 363 | |
| 364 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) { | |
| 365 type_ = INLINE; | |
| 366 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) { | |
| 367 type_ = ATTACHMENT; | |
| 368 } else { | |
| 369 parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE; | |
| 370 type_ = ATTACHMENT; | |
| 371 } | |
| 372 return delimiter; | |
| 373 } | |
| 374 | |
| 375 // http://tools.ietf.org/html/rfc6266 | |
| 376 // | |
| 377 // content-disposition = "Content-Disposition" ":" | |
| 378 // disposition-type *( ";" disposition-parm ) | |
| 379 // | |
| 380 // disposition-type = "inline" | "attachment" | disp-ext-type | |
| 381 // ; case-insensitive | |
| 382 // disp-ext-type = token | |
| 383 // | |
| 384 // disposition-parm = filename-parm | disp-ext-parm | |
| 385 // | |
| 386 // filename-parm = "filename" "=" value | |
| 387 // | "filename*" "=" ext-value | |
| 388 // | |
| 389 // disp-ext-parm = token "=" value | |
| 390 // | ext-token "=" ext-value | |
| 391 // ext-token = <the characters in token, followed by "*"> | |
| 392 // | |
| 393 void HttpContentDisposition::Parse(const std::string& header, | |
| 394 const std::string& referrer_charset) { | |
| 395 DCHECK(type_ == INLINE); | |
| 396 DCHECK(filename_.empty()); | |
| 397 | |
| 398 std::string::const_iterator pos = header.begin(); | |
| 399 std::string::const_iterator end = header.end(); | |
| 400 pos = ConsumeDispositionType(pos, end); | |
| 401 | |
| 402 std::string name; | |
| 403 std::string filename; | |
| 404 std::string ext_filename; | |
| 405 | |
| 406 HttpUtil::NameValuePairsIterator iter(pos, end, ';'); | |
| 407 while (iter.GetNext()) { | |
| 408 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), | |
| 409 iter.name_end(), | |
| 410 "filename")) { | |
| 411 DecodeFilenameValue(iter.value(), referrer_charset, &filename, | |
| 412 &parse_result_flags_); | |
| 413 if (!filename.empty()) | |
| 414 parse_result_flags_ |= HAS_FILENAME; | |
| 415 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), | |
| 416 iter.name_end(), | |
| 417 "name")) { | |
| 418 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL); | |
| 419 if (!name.empty()) | |
| 420 parse_result_flags_ |= HAS_NAME; | |
| 421 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), | |
| 422 iter.name_end(), | |
| 423 "filename*")) { | |
| 424 DecodeExtValue(iter.raw_value(), &ext_filename); | |
| 425 if (!ext_filename.empty()) | |
| 426 parse_result_flags_ |= HAS_EXT_FILENAME; | |
| 427 } | |
| 428 } | |
| 429 | |
| 430 if (!ext_filename.empty()) | |
| 431 filename_ = ext_filename; | |
| 432 else if (!filename.empty()) | |
| 433 filename_ = filename; | |
| 434 else | |
| 435 filename_ = name; | |
| 436 } | |
| 437 | |
| 438 } // namespace net | |
| OLD | NEW |