OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "net/http/http_content_disposition.h" | 5 #include "net/http/http_content_disposition.h" |
6 | 6 |
| 7 #include "base/base64.h" |
| 8 #include "base/i18n/icu_string_conversions.h" |
7 #include "base/logging.h" | 9 #include "base/logging.h" |
8 #include "base/string_util.h" | 10 #include "base/string_util.h" |
| 11 #include "base/sys_string_conversions.h" |
| 12 #include "base/utf_string_conversions.h" |
9 #include "net/base/net_util.h" | 13 #include "net/base/net_util.h" |
10 #include "net/http/http_util.h" | 14 #include "net/http/http_util.h" |
| 15 #include "unicode/ucnv.h" |
| 16 |
| 17 namespace { |
| 18 |
// Content-transfer encodings that an RFC 2047 encoded-word can declare in its
// <E> field.
enum RFC2047EncodingType {
  Q_ENCODING,  // "Q": similar to quoted-printable (RFC 2047 section 4.2).
  B_ENCODING   // "B": base64.
};
| 23 |
| 24 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to |
| 25 // decoding a quoted-printable string. Returns true if the input was valid. |
| 26 bool DecodeQEncoding(const std::string& input, std::string* output) { |
| 27 std::string temp; |
| 28 temp.reserve(input.size()); |
| 29 for (std::string::const_iterator it = input.begin(); it != input.end(); |
| 30 ++it) { |
| 31 if (*it == '_') { |
| 32 temp.push_back(' '); |
| 33 } else if (*it == '=') { |
| 34 if ((input.end() - it < 3) || |
| 35 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || |
| 36 !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) |
| 37 return false; |
| 38 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + |
| 39 HexDigitToInt(*(it + 2)); |
| 40 temp.push_back(static_cast<char>(ch)); |
| 41 ++it; |
| 42 ++it; |
| 43 } else if (0x20 < *it && *it < 0x7F && *it != '?') { |
| 44 // In a Q-encoded word, only printable ASCII characters |
| 45 // represent themselves. Besides, space, '=', '_' and '?' are |
| 46 // not allowed, but they're already filtered out. |
| 47 DCHECK_NE('=', *it); |
| 48 DCHECK_NE('?', *it); |
| 49 DCHECK_NE('_', *it); |
| 50 temp.push_back(*it); |
| 51 } else { |
| 52 return false; |
| 53 } |
| 54 } |
| 55 output->swap(temp); |
| 56 return true; |
| 57 } |
| 58 |
| 59 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding |
| 60 // type is specified in |enc_type|. |
| 61 bool DecodeBQEncoding(const std::string& part, |
| 62 RFC2047EncodingType enc_type, |
| 63 const std::string& charset, |
| 64 std::string* output) { |
| 65 std::string decoded; |
| 66 if (!((enc_type == B_ENCODING) ? |
| 67 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) |
| 68 return false; |
| 69 |
| 70 if (decoded.empty()) { |
| 71 output->clear(); |
| 72 return true; |
| 73 } |
| 74 |
| 75 UErrorCode err = U_ZERO_ERROR; |
| 76 UConverter* converter(ucnv_open(charset.c_str(), &err)); |
| 77 if (U_FAILURE(err)) |
| 78 return false; |
| 79 |
| 80 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. |
| 81 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes |
| 82 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a |
| 83 // trailing '\0'. |
| 84 size_t output_length = decoded.length() * 3 + 1; |
| 85 char* buf = WriteInto(output, output_length); |
| 86 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, |
| 87 decoded.data(), decoded.length(), &err); |
| 88 ucnv_close(converter); |
| 89 if (U_FAILURE(err)) |
| 90 return false; |
| 91 output->resize(output_length); |
| 92 return true; |
| 93 } |
| 94 |
| 95 bool DecodeWord(const std::string& encoded_word, |
| 96 const std::string& referrer_charset, |
| 97 bool* is_rfc2047, |
| 98 std::string* output) { |
| 99 *is_rfc2047 = false; |
| 100 output->clear(); |
| 101 if (encoded_word.empty()) |
| 102 return true; |
| 103 |
| 104 if (!IsStringASCII(encoded_word)) { |
| 105 // Try UTF-8, referrer_charset and the native OS default charset in turn. |
| 106 if (IsStringUTF8(encoded_word)) { |
| 107 *output = encoded_word; |
| 108 } else { |
| 109 string16 utf16_output; |
| 110 if (!referrer_charset.empty() && |
| 111 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), |
| 112 base::OnStringConversionError::FAIL, |
| 113 &utf16_output)) { |
| 114 *output = UTF16ToUTF8(utf16_output); |
| 115 } else { |
| 116 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); |
| 117 } |
| 118 } |
| 119 |
| 120 return true; |
| 121 } |
| 122 |
| 123 // RFC 2047 : one of encoding methods supported by Firefox and relatively |
| 124 // widely used by web servers. |
| 125 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. |
| 126 // We don't care about the length restriction (72 bytes) because |
| 127 // many web servers generate encoded words longer than the limit. |
| 128 std::string tmp; |
| 129 *is_rfc2047 = true; |
| 130 int part_index = 0; |
| 131 std::string charset; |
| 132 StringTokenizer t(encoded_word, "?"); |
| 133 RFC2047EncodingType enc_type = Q_ENCODING; |
| 134 while (*is_rfc2047 && t.GetNext()) { |
| 135 std::string part = t.token(); |
| 136 switch (part_index) { |
| 137 case 0: |
| 138 if (part != "=") { |
| 139 *is_rfc2047 = false; |
| 140 break; |
| 141 } |
| 142 ++part_index; |
| 143 break; |
| 144 case 1: |
| 145 // Do we need charset validity check here? |
| 146 charset = part; |
| 147 ++part_index; |
| 148 break; |
| 149 case 2: |
| 150 if (part.size() > 1 || |
| 151 part.find_first_of("bBqQ") == std::string::npos) { |
| 152 *is_rfc2047 = false; |
| 153 break; |
| 154 } |
| 155 if (part[0] == 'b' || part[0] == 'B') { |
| 156 enc_type = B_ENCODING; |
| 157 } |
| 158 ++part_index; |
| 159 break; |
| 160 case 3: |
| 161 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); |
| 162 if (!*is_rfc2047) { |
| 163 // Last minute failure. Invalid B/Q encoding. Rather than |
| 164 // passing it through, return now. |
| 165 return false; |
| 166 } |
| 167 ++part_index; |
| 168 break; |
| 169 case 4: |
| 170 if (part != "=") { |
| 171 // Another last minute failure ! |
| 172 // Likely to be a case of two encoded-words in a row or |
| 173 // an encoded word followed by a non-encoded word. We can be |
| 174 // generous, but it does not help much in terms of compatibility, |
| 175 // I believe. Return immediately. |
| 176 *is_rfc2047 = false; |
| 177 return false; |
| 178 } |
| 179 ++part_index; |
| 180 break; |
| 181 default: |
| 182 *is_rfc2047 = false; |
| 183 return false; |
| 184 } |
| 185 } |
| 186 |
| 187 if (*is_rfc2047) { |
| 188 if (*(encoded_word.end() - 1) == '=') { |
| 189 output->swap(tmp); |
| 190 return true; |
| 191 } |
| 192 // encoded_word ending prematurelly with '?' or extra '?' |
| 193 *is_rfc2047 = false; |
| 194 return false; |
| 195 } |
| 196 |
| 197 // We're not handling 'especial' characters quoted with '\', but |
| 198 // it should be Ok because we're not an email client but a |
| 199 // web browser. |
| 200 |
| 201 // What IE6/7 does: %-escaped UTF-8. |
| 202 tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES); |
| 203 if (IsStringUTF8(tmp)) { |
| 204 output->swap(tmp); |
| 205 return true; |
| 206 // We can try either the OS default charset or 'origin charset' here, |
| 207 // As far as I can tell, IE does not support it. However, I've seen |
| 208 // web servers emit %-escaped string in a legacy encoding (usually |
| 209 // origin charset). |
| 210 // TODO(jungshik) : Test IE further and consider adding a fallback here. |
| 211 } |
| 212 return false; |
| 213 } |
| 214 |
| 215 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The |
| 216 // value is supposed to be of the form: |
| 217 // |
| 218 // value = token | quoted-string |
| 219 // |
| 220 // However we currently also allow RFC 2047 encoding and non-ASCII |
| 221 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. |
| 222 bool DecodeFilenameValue(const std::string& input, |
| 223 const std::string& referrer_charset, |
| 224 std::string* output) { |
| 225 std::string tmp; |
| 226 // Tokenize with whitespace characters. |
| 227 StringTokenizer t(input, " \t\n\r"); |
| 228 t.set_options(StringTokenizer::RETURN_DELIMS); |
| 229 bool is_previous_token_rfc2047 = true; |
| 230 while (t.GetNext()) { |
| 231 if (t.token_is_delim()) { |
| 232 // If the previous non-delimeter token is not RFC2047-encoded, |
| 233 // put in a space in its place. Otheriwse, skip over it. |
| 234 if (!is_previous_token_rfc2047) { |
| 235 tmp.push_back(' '); |
| 236 } |
| 237 continue; |
| 238 } |
| 239 // We don't support a single multibyte character split into |
| 240 // adjacent encoded words. Some broken mail clients emit headers |
| 241 // with that problem, but most web servers usually encode a filename |
| 242 // in a single encoded-word. Firefox/Thunderbird do not support |
| 243 // it, either. |
| 244 std::string decoded; |
| 245 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, |
| 246 &decoded)) |
| 247 return false; |
| 248 tmp.append(decoded); |
| 249 } |
| 250 output->swap(tmp); |
| 251 return true; |
| 252 } |
| 253 |
| 254 // Parses the charset and value-chars out of an ext-value string. |
| 255 // |
| 256 // ext-value = charset "'" [ language ] "'" value-chars |
| 257 bool ParseExtValueComponents(const std::string& input, |
| 258 std::string* charset, |
| 259 std::string* value_chars) { |
| 260 StringTokenizer t(input, "'"); |
| 261 t.set_options(StringTokenizer::RETURN_DELIMS); |
| 262 std::string temp_charset; |
| 263 std::string temp_value; |
| 264 int numDelimsSeen = 0; |
| 265 while (t.GetNext()) { |
| 266 if (t.token_is_delim()) { |
| 267 ++numDelimsSeen; |
| 268 continue; |
| 269 } else { |
| 270 switch (numDelimsSeen) { |
| 271 case 0: |
| 272 temp_charset = t.token(); |
| 273 break; |
| 274 case 1: |
| 275 // Language is ignored. |
| 276 break; |
| 277 case 2: |
| 278 temp_value = t.token(); |
| 279 break; |
| 280 default: |
| 281 return false; |
| 282 } |
| 283 } |
| 284 } |
| 285 if (numDelimsSeen != 2) |
| 286 return false; |
| 287 if (temp_charset.empty() || temp_value.empty()) |
| 288 return false; |
| 289 charset->swap(temp_charset); |
| 290 value_chars->swap(temp_value); |
| 291 return true; |
| 292 } |
| 293 |
| 294 // http://tools.ietf.org/html/rfc5987#section-3.2 |
| 295 // |
| 296 // ext-value = charset "'" [ language ] "'" value-chars |
| 297 // |
| 298 // charset = "UTF-8" / "ISO-8859-1" / mime-charset |
| 299 // |
| 300 // mime-charset = 1*mime-charsetc |
| 301 // mime-charsetc = ALPHA / DIGIT |
| 302 // / "!" / "#" / "$" / "%" / "&" |
| 303 // / "+" / "-" / "^" / "_" / "`" |
| 304 // / "{" / "}" / "~" |
| 305 // |
| 306 // language = <Language-Tag, defined in [RFC5646], Section 2.1> |
| 307 // |
| 308 // value-chars = *( pct-encoded / attr-char ) |
| 309 // |
| 310 // pct-encoded = "%" HEXDIG HEXDIG |
| 311 // |
| 312 // attr-char = ALPHA / DIGIT |
| 313 // / "!" / "#" / "$" / "&" / "+" / "-" / "." |
| 314 // / "^" / "_" / "`" / "|" / "~" |
| 315 bool DecodeExtValue(const std::string& param_value, std::string* decoded) { |
| 316 if (param_value.find('"') != std::string::npos) |
| 317 return false; |
| 318 |
| 319 std::string charset; |
| 320 std::string value; |
| 321 if (!ParseExtValueComponents(param_value, &charset, &value)) |
| 322 return false; |
| 323 |
| 324 // RFC 5987 value should be ASCII-only. |
| 325 if (!IsStringASCII(value)) { |
| 326 decoded->clear(); |
| 327 return true; |
| 328 } |
| 329 |
| 330 std::string unescaped = net::UnescapeURLComponent( |
| 331 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); |
| 332 |
| 333 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); |
| 334 } |
| 335 |
| 336 } // namespace |
11 | 337 |
12 namespace net { | 338 namespace net { |
13 | 339 |
14 HttpContentDisposition::HttpContentDisposition( | 340 HttpContentDisposition::HttpContentDisposition( |
15 const std::string& header, const std::string& referrer_charset) | 341 const std::string& header, const std::string& referrer_charset) |
16 : type_(INLINE) { | 342 : type_(INLINE) { |
17 Parse(header, referrer_charset); | 343 Parse(header, referrer_charset); |
18 } | 344 } |
19 | 345 |
20 HttpContentDisposition::~HttpContentDisposition() { | 346 HttpContentDisposition::~HttpContentDisposition() { |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
92 | 418 |
93 if (!ext_filename.empty()) | 419 if (!ext_filename.empty()) |
94 filename_ = ext_filename; | 420 filename_ = ext_filename; |
95 else if (!filename.empty()) | 421 else if (!filename.empty()) |
96 filename_ = filename; | 422 filename_ = filename; |
97 else | 423 else |
98 filename_ = name; | 424 filename_ = name; |
99 } | 425 } |
100 | 426 |
101 } // namespace net | 427 } // namespace net |
OLD | NEW |