Chromium Code Reviews| Index: net/http/http_content_disposition.cc |
| diff --git a/net/http/http_content_disposition.cc b/net/http/http_content_disposition.cc |
| index 52d9f4fdf2435e80b4394910d19a8d1ad33c4839..cd691ad56cbee54404f86899f607430a76eae72b 100644 |
| --- a/net/http/http_content_disposition.cc |
| +++ b/net/http/http_content_disposition.cc |
| @@ -4,13 +4,339 @@ |
| #include "net/http/http_content_disposition.h" |
| +#include "base/base64.h" |
| +#include "base/i18n/icu_string_conversions.h" |
| #include "base/logging.h" |
| #include "base/string_util.h" |
| +#include "base/sys_string_conversions.h" |
| +#include "base/utf_string_conversions.h" |
| #include "net/base/net_util.h" |
| #include "net/http/http_util.h" |
| +#include "unicode/ucnv.h" |
| namespace net { |
| +namespace { |
|
rvargas (doing something else)
2012/12/13 22:44:43
nit: It looks like all this code is fairly indepen
asanka
2012/12/13 23:28:47
Done in patch set 3.
|
| + |
// The transfer encodings that an RFC 2047 encoded-word can declare in its
// encoding field.
enum RFC2047EncodingType {
  Q_ENCODING,  // "Q": similar to quoted-printable (RFC 2047 section 4.2).
  B_ENCODING   // "B": base64.
};
| + |
| +// Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to |
| +// decoding a quoted-printable string. Returns true if the input was valid. |
| +bool DecodeQEncoding(const std::string& input, std::string* output) { |
| + std::string temp; |
| + temp.reserve(input.size()); |
| + for (std::string::const_iterator it = input.begin(); it != input.end(); |
| + ++it) { |
| + if (*it == '_') { |
| + temp.push_back(' '); |
| + } else if (*it == '=') { |
| + if ((input.end() - it < 3) || |
| + !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || |
| + !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) |
| + return false; |
| + unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + |
| + HexDigitToInt(*(it + 2)); |
| + temp.push_back(static_cast<char>(ch)); |
| + ++it; |
| + ++it; |
| + } else if (0x20 < *it && *it < 0x7F && *it != '?') { |
| + // In a Q-encoded word, only printable ASCII characters |
| + // represent themselves. Besides, space, '=', '_' and '?' are |
| + // not allowed, but they're already filtered out. |
| + DCHECK_NE('=', *it); |
| + DCHECK_NE('?', *it); |
| + DCHECK_NE('_', *it); |
| + temp.push_back(*it); |
| + } else { |
| + return false; |
| + } |
| + } |
| + output->swap(temp); |
| + return true; |
| +} |
| + |
| +// Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding |
| +// type is specified in |enc_type|. |
| +bool DecodeBQEncoding(const std::string& part, |
| + RFC2047EncodingType enc_type, |
| + const std::string& charset, |
| + std::string* output) { |
| + std::string decoded; |
| + if (!((enc_type == B_ENCODING) ? |
| + base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) |
| + return false; |
| + |
| + if (decoded.empty()) { |
| + output->clear(); |
| + return true; |
| + } |
| + |
| + UErrorCode err = U_ZERO_ERROR; |
| + UConverter* converter(ucnv_open(charset.c_str(), &err)); |
| + if (U_FAILURE(err)) |
| + return false; |
| + |
| + // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. |
| + // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes |
| + // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a |
| + // trailing '\0'. |
| + size_t output_length = decoded.length() * 3 + 1; |
| + char* buf = WriteInto(output, output_length); |
| + output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, |
| + decoded.data(), decoded.length(), &err); |
| + ucnv_close(converter); |
| + if (U_FAILURE(err)) |
| + return false; |
| + output->resize(output_length); |
| + return true; |
| +} |
| + |
| +bool DecodeWord(const std::string& encoded_word, |
| + const std::string& referrer_charset, |
| + bool* is_rfc2047, |
| + std::string* output) { |
| + *is_rfc2047 = false; |
| + output->clear(); |
| + if (encoded_word.empty()) |
| + return true; |
| + |
| + if (!IsStringASCII(encoded_word)) { |
| + // Try UTF-8, referrer_charset and the native OS default charset in turn. |
| + if (IsStringUTF8(encoded_word)) { |
| + *output = encoded_word; |
| + } else { |
| + string16 utf16_output; |
| + if (!referrer_charset.empty() && |
| + base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), |
| + base::OnStringConversionError::FAIL, |
| + &utf16_output)) { |
| + *output = UTF16ToUTF8(utf16_output); |
| + } else { |
| + *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); |
| + } |
| + } |
| + |
| + return true; |
| + } |
| + |
| + // RFC 2047 : one of encoding methods supported by Firefox and relatively |
| + // widely used by web servers. |
| + // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. |
| + // We don't care about the length restriction (72 bytes) because |
| + // many web servers generate encoded words longer than the limit. |
| + std::string tmp; |
| + *is_rfc2047 = true; |
| + int part_index = 0; |
| + std::string charset; |
| + StringTokenizer t(encoded_word, "?"); |
| + RFC2047EncodingType enc_type = Q_ENCODING; |
| + while (*is_rfc2047 && t.GetNext()) { |
| + std::string part = t.token(); |
| + switch (part_index) { |
| + case 0: |
| + if (part != "=") { |
| + *is_rfc2047 = false; |
| + break; |
| + } |
| + ++part_index; |
| + break; |
| + case 1: |
| + // Do we need charset validity check here? |
| + charset = part; |
| + ++part_index; |
| + break; |
| + case 2: |
| + if (part.size() > 1 || |
| + part.find_first_of("bBqQ") == std::string::npos) { |
| + *is_rfc2047 = false; |
| + break; |
| + } |
| + if (part[0] == 'b' || part[0] == 'B') { |
| + enc_type = B_ENCODING; |
| + } |
| + ++part_index; |
| + break; |
| + case 3: |
| + *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); |
| + if (!*is_rfc2047) { |
| + // Last minute failure. Invalid B/Q encoding. Rather than |
| + // passing it through, return now. |
| + return false; |
| + } |
| + ++part_index; |
| + break; |
| + case 4: |
| + if (part != "=") { |
| + // Another last minute failure ! |
| + // Likely to be a case of two encoded-words in a row or |
| + // an encoded word followed by a non-encoded word. We can be |
| + // generous, but it does not help much in terms of compatibility, |
| + // I believe. Return immediately. |
| + *is_rfc2047 = false; |
| + return false; |
| + } |
| + ++part_index; |
| + break; |
| + default: |
| + *is_rfc2047 = false; |
| + return false; |
| + } |
| + } |
| + |
| + if (*is_rfc2047) { |
| + if (*(encoded_word.end() - 1) == '=') { |
| + output->swap(tmp); |
| + return true; |
| + } |
| + // encoded_word ending prematurelly with '?' or extra '?' |
| + *is_rfc2047 = false; |
| + return false; |
| + } |
| + |
| + // We're not handling 'especial' characters quoted with '\', but |
| + // it should be Ok because we're not an email client but a |
| + // web browser. |
| + |
| + // What IE6/7 does: %-escaped UTF-8. |
| + tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); |
| + if (IsStringUTF8(tmp)) { |
| + output->swap(tmp); |
| + return true; |
| + // We can try either the OS default charset or 'origin charset' here, |
| + // As far as I can tell, IE does not support it. However, I've seen |
| + // web servers emit %-escaped string in a legacy encoding (usually |
| + // origin charset). |
| + // TODO(jungshik) : Test IE further and consider adding a fallback here. |
| + } |
| + return false; |
| +} |
| + |
| +// Decodes the value of a 'filename' or 'name' parameter given as |input|. The |
| +// value is supposed to be of the form: |
| +// |
| +// value = token | quoted-string |
| +// |
| +// However we currently also allow RFC 2047 encoding and non-ASCII |
| +// strings. Non-ASCII strings are interpreted based on |referrer_charset|. |
| +bool DecodeFilenameValue(const std::string& input, |
| + const std::string& referrer_charset, |
| + std::string* output) { |
| + std::string tmp; |
| + // Tokenize with whitespace characters. |
| + StringTokenizer t(input, " \t\n\r"); |
| + t.set_options(StringTokenizer::RETURN_DELIMS); |
| + bool is_previous_token_rfc2047 = true; |
| + while (t.GetNext()) { |
| + if (t.token_is_delim()) { |
| + // If the previous non-delimeter token is not RFC2047-encoded, |
| + // put in a space in its place. Otheriwse, skip over it. |
| + if (!is_previous_token_rfc2047) { |
| + tmp.push_back(' '); |
| + } |
| + continue; |
| + } |
| + // We don't support a single multibyte character split into |
| + // adjacent encoded words. Some broken mail clients emit headers |
| + // with that problem, but most web servers usually encode a filename |
| + // in a single encoded-word. Firefox/Thunderbird do not support |
| + // it, either. |
| + std::string decoded; |
| + if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, |
| + &decoded)) |
| + return false; |
| + tmp.append(decoded); |
| + } |
| + output->swap(tmp); |
| + return true; |
| +} |
| + |
| +// Parses the charset and value-chars out of an ext-value string. |
| +// |
| +// ext-value = charset "'" [ language ] "'" value-chars |
| +bool ParseExtValueComponents(const std::string& input, |
| + std::string* charset, |
| + std::string* value_chars) { |
| + StringTokenizer t(input, "'"); |
| + t.set_options(StringTokenizer::RETURN_DELIMS); |
| + std::string temp_charset; |
| + std::string temp_value; |
| + int numDelimsSeen = 0; |
| + while (t.GetNext()) { |
| + if (t.token_is_delim()) { |
| + ++numDelimsSeen; |
| + continue; |
| + } else { |
| + switch (numDelimsSeen) { |
| + case 0: |
| + temp_charset = t.token(); |
| + break; |
| + case 1: |
| + // Language is ignored. |
| + break; |
| + case 2: |
| + temp_value = t.token(); |
| + break; |
| + default: |
| + return false; |
| + } |
| + } |
| + } |
| + if (numDelimsSeen != 2) |
| + return false; |
| + if (temp_charset.empty() || temp_value.empty()) |
| + return false; |
| + charset->swap(temp_charset); |
| + value_chars->swap(temp_value); |
| + return true; |
| +} |
| + |
| +// http://tools.ietf.org/html/rfc5987#section-3.2 |
| +// |
| +// ext-value = charset "'" [ language ] "'" value-chars |
| +// |
| +// charset = "UTF-8" / "ISO-8859-1" / mime-charset |
| +// |
| +// mime-charset = 1*mime-charsetc |
| +// mime-charsetc = ALPHA / DIGIT |
| +// / "!" / "#" / "$" / "%" / "&" |
| +// / "+" / "-" / "^" / "_" / "`" |
| +// / "{" / "}" / "~" |
| +// |
| +// language = <Language-Tag, defined in [RFC5646], Section 2.1> |
| +// |
| +// value-chars = *( pct-encoded / attr-char ) |
| +// |
| +// pct-encoded = "%" HEXDIG HEXDIG |
| +// |
| +// attr-char = ALPHA / DIGIT |
| +// / "!" / "#" / "$" / "&" / "+" / "-" / "." |
| +// / "^" / "_" / "`" / "|" / "~" |
| +bool DecodeExtValue(const std::string& param_value, std::string* decoded) { |
| + if (param_value.find('"') != std::string::npos) |
| + return false; |
| + |
| + std::string charset; |
| + std::string value; |
| + if (!ParseExtValueComponents(param_value, &charset, &value)) |
| + return false; |
| + |
| + // RFC 5987 value should be ASCII-only. |
| + if (!IsStringASCII(value)) { |
| + decoded->clear(); |
| + return true; |
| + } |
| + |
| + std::string unescaped = UnescapeURLComponent(value, |
| + UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); |
| + |
| + return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); |
| +} |
| + |
| +} // namespace |
| + |
| HttpContentDisposition::HttpContentDisposition( |
| const std::string& header, const std::string& referrer_charset) |
| : type_(INLINE) { |