Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "net/http/http_content_disposition.h" | 5 #include "net/http/http_content_disposition.h" |
| 6 | 6 |
| 7 #include "base/base64.h" | |
| 8 #include "base/i18n/icu_string_conversions.h" | |
| 7 #include "base/logging.h" | 9 #include "base/logging.h" |
| 8 #include "base/string_util.h" | 10 #include "base/string_util.h" |
| 11 #include "base/sys_string_conversions.h" | |
| 12 #include "base/utf_string_conversions.h" | |
| 9 #include "net/base/net_util.h" | 13 #include "net/base/net_util.h" |
| 10 #include "net/http/http_util.h" | 14 #include "net/http/http_util.h" |
| 15 #include "unicode/ucnv.h" | |
| 11 | 16 |
| 12 namespace net { | 17 namespace net { |
| 13 | 18 |
| 19 namespace { | |
|
rvargas (doing something else)
2012/12/13 22:44:43
nit: It looks like all this code is fairly indepen
asanka
2012/12/13 23:28:47
Done in patch set 3.
| |
| 20 | |
| 21 enum RFC2047EncodingType { | |
| 22 Q_ENCODING, | |
| 23 B_ENCODING | |
| 24 }; | |
| 25 | |
| 26 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to | |
| 27 // decoding a quoted-printable string. Returns true if the input was valid; the result is built in a temporary and only swapped into the output on success. | |
| 28 bool DecodeQEncoding(const std::string& input, std::string* output) { | |
| 29 std::string temp; | |
| 30 temp.reserve(input.size()); | |
| 31 for (std::string::const_iterator it = input.begin(); it != input.end(); | |
| 32 ++it) { | |
| 33 if (*it == '_') { | |
| 34 temp.push_back(' '); | |
| 35 } else if (*it == '=') { | |
| 36 if ((input.end() - it < 3) || | |
| 37 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || | |
| 38 !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) | |
| 39 return false; | |
| 40 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + | |
| 41 HexDigitToInt(*(it + 2)); | |
| 42 temp.push_back(static_cast<char>(ch)); | |
| 43 ++it; | |
| 44 ++it; | |
| 45 } else if (0x20 < *it && *it < 0x7F && *it != '?') { | |
| 46 // In a Q-encoded word, only printable ASCII characters | |
| 47 // represent themselves. Space, '=', '_' and '?' may not appear | |
| 48 // literally either, but the checks above have already filtered them out. | |
| 49 DCHECK_NE('=', *it); | |
| 50 DCHECK_NE('?', *it); | |
| 51 DCHECK_NE('_', *it); | |
| 52 temp.push_back(*it); | |
| 53 } else { | |
| 54 return false; | |
| 55 } | |
| 56 } | |
| 57 output->swap(temp); | |
| 58 return true; | |
| 59 } | |
| 60 | |
| 61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding | |
| 62 // type is specified in |enc_type|. The decoded bytes are then converted from the given charset to UTF-8 with ICU. Returns false if the B/Q decoding fails, the converter for the charset cannot be opened, or the conversion reports an error. | |
| 63 bool DecodeBQEncoding(const std::string& part, | |
| 64 RFC2047EncodingType enc_type, | |
| 65 const std::string& charset, | |
| 66 std::string* output) { | |
| 67 std::string decoded; | |
| 68 if (!((enc_type == B_ENCODING) ? | |
| 69 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) | |
| 70 return false; | |
| 71 | |
| 72 if (decoded.empty()) { | |
| 73 output->clear(); | |
| 74 return true; | |
| 75 } | |
| 76 | |
| 77 UErrorCode err = U_ZERO_ERROR; | |
| 78 UConverter* converter(ucnv_open(charset.c_str(), &err)); | |
| 79 if (U_FAILURE(err)) | |
| 80 return false; | |
| 81 | |
| 82 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. | |
| 83 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes | |
| 84 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a | |
| 85 // trailing '\0'. NOTE(review): the conversion writes straight into the output string, so its contents should not be relied on when this function returns false. | |
| 86 size_t output_length = decoded.length() * 3 + 1; | |
| 87 char* buf = WriteInto(output, output_length); | |
| 88 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, | |
| 89 decoded.data(), decoded.length(), &err); | |
| 90 ucnv_close(converter); | |
| 91 if (U_FAILURE(err)) | |
| 92 return false; | |
| 93 output->resize(output_length); | |
| 94 return true; | |
| 95 } | |
| 96 | |
| 97 bool DecodeWord(const std::string& encoded_word, | |
| 98 const std::string& referrer_charset, | |
| 99 bool* is_rfc2047, | |
| 100 std::string* output) { | |
| 101 *is_rfc2047 = false; | |
| 102 output->clear(); | |
| 103 if (encoded_word.empty()) | |
| 104 return true; | |
| 105 | |
| 106 if (!IsStringASCII(encoded_word)) { | |
| 107 // Non-ASCII input is not treated as RFC 2047 (the flag stays false). Try UTF-8, referrer_charset and the native OS default charset in turn. | |
| 108 if (IsStringUTF8(encoded_word)) { | |
| 109 *output = encoded_word; | |
| 110 } else { | |
| 111 string16 utf16_output; | |
| 112 if (!referrer_charset.empty() && | |
| 113 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), | |
| 114 base::OnStringConversionError::FAIL, | |
| 115 &utf16_output)) { | |
| 116 *output = UTF16ToUTF8(utf16_output); | |
| 117 } else { | |
| 118 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); | |
| 119 } | |
| 120 } | |
| 121 | |
| 122 return true; | |
| 123 } | |
| 124 | |
| 125 // RFC 2047 : one of encoding methods supported by Firefox and relatively | |
| 126 // widely used by web servers. | |
| 127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. | |
| 128 // We don't care about the length restriction (72 bytes) because | |
| 129 // many web servers generate encoded words longer than the limit. | |
| 130 std::string tmp; | |
| 131 *is_rfc2047 = true; | |
| 132 int part_index = 0; | |
| 133 std::string charset; | |
| 134 StringTokenizer t(encoded_word, "?"); | |
| 135 RFC2047EncodingType enc_type = Q_ENCODING; | |
| 136 while (*is_rfc2047 && t.GetNext()) { | |
| 137 std::string part = t.token(); | |
| 138 switch (part_index) { | |
| 139 case 0: | |
| 140 if (part != "=") { | |
| 141 *is_rfc2047 = false; | |
| 142 break; | |
| 143 } | |
| 144 ++part_index; | |
| 145 break; | |
| 146 case 1: | |
| 147 // Do we need charset validity check here? | |
| 148 charset = part; | |
| 149 ++part_index; | |
| 150 break; | |
| 151 case 2: | |
| 152 if (part.size() > 1 || | |
| 153 part.find_first_of("bBqQ") == std::string::npos) { | |
| 154 *is_rfc2047 = false; | |
| 155 break; | |
| 156 } | |
| 157 if (part[0] == 'b' || part[0] == 'B') { | |
| 158 enc_type = B_ENCODING; | |
| 159 } | |
| 160 ++part_index; | |
| 161 break; | |
| 162 case 3: | |
| 163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); | |
| 164 if (!*is_rfc2047) { | |
| 165 // Last minute failure. Invalid B/Q encoding. Rather than | |
| 166 // passing it through, return now. | |
| 167 return false; | |
| 168 } | |
| 169 ++part_index; | |
| 170 break; | |
| 171 case 4: | |
| 172 if (part != "=") { | |
| 173 // Another last minute failure! | |
| 174 // Likely to be a case of two encoded-words in a row or | |
| 175 // an encoded word followed by a non-encoded word. We can be | |
| 176 // generous, but it does not help much in terms of compatibility, | |
| 177 // I believe. Return immediately. | |
| 178 *is_rfc2047 = false; | |
| 179 return false; | |
| 180 } | |
| 181 ++part_index; | |
| 182 break; | |
| 183 default: | |
| 184 *is_rfc2047 = false; | |
| 185 return false; | |
| 186 } | |
| 187 } | |
| 188 | |
| 189 if (*is_rfc2047) { | |
| 190 if (*(encoded_word.end() - 1) == '=') { | |
| 191 output->swap(tmp); | |
| 192 return true; | |
| 193 } | |
| 194 // encoded_word ending prematurely with '?' or extra '?'. NOTE(review): part_index is not checked above, so an input that ends in '=' but has fewer than five parts (e.g. a lone "=") appears to be accepted with empty output - confirm whether that is intended. | |
| 195 *is_rfc2047 = false; | |
| 196 return false; | |
| 197 } | |
| 198 | |
| 199 // We're not handling 'especial' characters quoted with '\', but | |
| 200 // it should be Ok because we're not an email client but a | |
| 201 // web browser. | |
| 202 | |
| 203 // What IE6/7 does: %-escaped UTF-8. | |
| 204 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); | |
| 205 if (IsStringUTF8(tmp)) { | |
| 206 output->swap(tmp); | |
| 207 return true; | |
| 208 // We can try either the OS default charset or 'origin charset' here, | |
| 209 // As far as I can tell, IE does not support it. However, I've seen | |
| 210 // web servers emit %-escaped string in a legacy encoding (usually | |
| 211 // origin charset). | |
| 212 // TODO(jungshik) : Test IE further and consider adding a fallback here. | |
| 213 } | |
| 214 return false; | |
| 215 } | |
| 216 | |
| 217 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The | |
| 218 // value is supposed to be of the form: | |
| 219 // | |
| 220 // value = token | quoted-string | |
| 221 // | |
| 222 // However we currently also allow RFC 2047 encoding and non-ASCII | |
| 223 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. Returns false, without modifying the output, as soon as any word fails to decode. | |
| 224 bool DecodeFilenameValue(const std::string& input, | |
| 225 const std::string& referrer_charset, | |
| 226 std::string* output) { | |
| 227 std::string tmp; | |
| 228 // Tokenize with whitespace characters. | |
| 229 StringTokenizer t(input, " \t\n\r"); | |
| 230 t.set_options(StringTokenizer::RETURN_DELIMS); | |
| 231 bool is_previous_token_rfc2047 = true; | |
| 232 while (t.GetNext()) { | |
| 233 if (t.token_is_delim()) { | |
| 234 // If the previous non-delimiter token is not RFC2047-encoded, | |
| 235 // put in a space in its place. Otherwise, skip over it. The flag starts out true, so whitespace before the first word is skipped as well. | |
| 236 if (!is_previous_token_rfc2047) { | |
| 237 tmp.push_back(' '); | |
| 238 } | |
| 239 continue; | |
| 240 } | |
| 241 // We don't support a single multibyte character split into | |
| 242 // adjacent encoded words. Some broken mail clients emit headers | |
| 243 // with that problem, but most web servers usually encode a filename | |
| 244 // in a single encoded-word. Firefox/Thunderbird do not support | |
| 245 // it, either. | |
| 246 std::string decoded; | |
| 247 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, | |
| 248 &decoded)) | |
| 249 return false; | |
| 250 tmp.append(decoded); | |
| 251 } | |
| 252 output->swap(tmp); | |
| 253 return true; | |
| 254 } | |
| 255 | |
| 256 // Parses the charset and value-chars out of an ext-value string. Returns false unless the input contains exactly two "'" delimiters and both the charset and the value-chars are non-empty; the outputs are only written on success. | |
| 257 // | |
| 258 // ext-value = charset "'" [ language ] "'" value-chars | |
| 259 bool ParseExtValueComponents(const std::string& input, | |
| 260 std::string* charset, | |
| 261 std::string* value_chars) { | |
| 262 StringTokenizer t(input, "'"); | |
| 263 t.set_options(StringTokenizer::RETURN_DELIMS); | |
| 264 std::string temp_charset; | |
| 265 std::string temp_value; | |
| 266 int numDelimsSeen = 0; | |
| 267 while (t.GetNext()) { | |
| 268 if (t.token_is_delim()) { | |
| 269 ++numDelimsSeen; | |
| 270 continue; | |
| 271 } else { | |
| 272 switch (numDelimsSeen) { | |
| 273 case 0: | |
| 274 temp_charset = t.token(); | |
| 275 break; | |
| 276 case 1: | |
| 277 // Language is ignored. | |
| 278 break; | |
| 279 case 2: | |
| 280 temp_value = t.token(); | |
| 281 break; | |
| 282 default: | |
| 283 return false; | |
| 284 } | |
| 285 } | |
| 286 } | |
| 287 if (numDelimsSeen != 2) | |
| 288 return false; | |
| 289 if (temp_charset.empty() || temp_value.empty()) | |
| 290 return false; | |
| 291 charset->swap(temp_charset); | |
| 292 value_chars->swap(temp_value); | |
| 293 return true; | |
| 294 } | |
| 295 | |
| 296 // Decodes the RFC 5987 ext-value in param_value into decoded. Returns false if the value contains a double quote or cannot be split into charset and value-chars; otherwise the value is percent-unescaped and the result of base::ConvertToUtf8AndNormalize is returned. | |
| 297 // Grammar, from http://tools.ietf.org/html/rfc5987#section-3.2 : | |
| 298 // ext-value = charset "'" [ language ] "'" value-chars | |
| 299 // | |
| 300 // charset = "UTF-8" / "ISO-8859-1" / mime-charset | |
| 301 // | |
| 302 // mime-charset = 1*mime-charsetc | |
| 303 // mime-charsetc = ALPHA / DIGIT | |
| 304 // / "!" / "#" / "$" / "%" / "&" | |
| 305 // / "+" / "-" / "^" / "_" / "`" | |
| 306 // / "{" / "}" / "~" | |
| 307 // | |
| 308 // language = <Language-Tag, defined in [RFC5646], Section 2.1> | |
| 309 // | |
| 310 // value-chars = *( pct-encoded / attr-char ) | |
| 311 // | |
| 312 // pct-encoded = "%" HEXDIG HEXDIG | |
| 313 // | |
| 314 // attr-char = ALPHA / DIGIT | |
| 315 // / "!" / "#" / "$" / "&" / "+" / "-" / "." | |
| 316 // / "^" / "_" / "`" / "|" / "~" | |
| 317 bool DecodeExtValue(const std::string& param_value, std::string* decoded) { | |
| 318 if (param_value.find('"') != std::string::npos) | |
| 319 return false; | |
| 320 | |
| 321 std::string charset; | |
| 322 std::string value; | |
| 323 if (!ParseExtValueComponents(param_value, &charset, &value)) | |
| 324 return false; | |
| 325 | |
| 326 // RFC 5987 value should be ASCII-only. A non-ASCII value is reported as success with an empty result rather than as a failure. | |
| 327 if (!IsStringASCII(value)) { | |
| 328 decoded->clear(); | |
| 329 return true; | |
| 330 } | |
| 331 | |
| 332 std::string unescaped = UnescapeURLComponent(value, | |
| 333 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); | |
| 334 | |
| 335 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); | |
| 336 } | |
| 337 | |
| 338 } // namespace | |
| 339 | |
| 14 HttpContentDisposition::HttpContentDisposition( | 340 HttpContentDisposition::HttpContentDisposition( |
| 15 const std::string& header, const std::string& referrer_charset) | 341 const std::string& header, const std::string& referrer_charset) |
| 16 : type_(INLINE) { | 342 : type_(INLINE) { |
| 17 Parse(header, referrer_charset); | 343 Parse(header, referrer_charset); |
| 18 } | 344 } |
| 19 | 345 |
| 20 HttpContentDisposition::~HttpContentDisposition() { | 346 HttpContentDisposition::~HttpContentDisposition() { |
| 21 } | 347 } |
| 22 | 348 |
| 23 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( | 349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( |
| (...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 92 | 418 |
| 93 if (!ext_filename.empty()) | 419 if (!ext_filename.empty()) |
| 94 filename_ = ext_filename; | 420 filename_ = ext_filename; |
| 95 else if (!filename.empty()) | 421 else if (!filename.empty()) |
| 96 filename_ = filename; | 422 filename_ = filename; |
| 97 else | 423 else |
| 98 filename_ = name; | 424 filename_ = name; |
| 99 } | 425 } |
| 100 | 426 |
| 101 } // namespace net | 427 } // namespace net |
| OLD | NEW |