Index: net/http/http_content_disposition.cc |
diff --git a/net/http/http_content_disposition.cc b/net/http/http_content_disposition.cc |
index 52d9f4fdf2435e80b4394910d19a8d1ad33c4839..0726e93ee477b1a1ab3bf6a50b97fe49c773a664 100644 |
--- a/net/http/http_content_disposition.cc |
+++ b/net/http/http_content_disposition.cc |
@@ -4,10 +4,336 @@ |
#include "net/http/http_content_disposition.h" |
+#include "base/base64.h" |
+#include "base/i18n/icu_string_conversions.h" |
#include "base/logging.h" |
#include "base/string_util.h" |
+#include "base/sys_string_conversions.h" |
+#include "base/utf_string_conversions.h" |
#include "net/base/net_util.h" |
#include "net/http/http_util.h" |
+#include "unicode/ucnv.h" |
+ |
+namespace { |
+ |
+enum RFC2047EncodingType { |
+ Q_ENCODING, |
+ B_ENCODING |
+}; |
+ |
+// Decodes a "Q" encoded string as described in RFC 2047 section 4.2 (similar to |
+// quoted-printable). Returns true and sets |output| only if the input was valid. |
+bool DecodeQEncoding(const std::string& input, std::string* output) { |
+ std::string temp; |
+ temp.reserve(input.size()); |
+ for (std::string::const_iterator it = input.begin(); it != input.end(); |
+ ++it) { |
+ if (*it == '_') { |
+ temp.push_back(' '); |
+ } else if (*it == '=') { |
+ if ((input.end() - it < 3) || |
+ !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || |
+ !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) |
+ return false; |
+ unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + |
+ HexDigitToInt(*(it + 2)); |
+ temp.push_back(static_cast<char>(ch)); |
+ ++it; |
+ ++it; |
+ } else if (0x20 < *it && *it < 0x7F && *it != '?') { |
+ // In a Q-encoded word, only printable ASCII characters |
+ // represent themselves. Space, '=', '_' and '?' never do: the |
+ // earlier branches and the condition above already exclude them. |
+ DCHECK_NE('=', *it); |
+ DCHECK_NE('?', *it); |
+ DCHECK_NE('_', *it); |
+ temp.push_back(*it); |
+ } else { |
+ return false; |
+ } |
+ } |
+ output->swap(temp); |
+ return true; |
+} |
+ |
+// Decodes a "Q" or "B" encoded string (as selected by |enc_type|) per RFC 2047 |
+// section 4, then converts the decoded bytes from |charset| to UTF-8. |
+bool DecodeBQEncoding(const std::string& part, |
+ RFC2047EncodingType enc_type, |
+ const std::string& charset, |
+ std::string* output) { |
+ std::string decoded; |
+ if (!((enc_type == B_ENCODING) ? |
+ base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) |
+ return false; |
+ |
+ if (decoded.empty()) { |
+ output->clear(); |
+ return true; |
+ } |
+ |
+ UErrorCode err = U_ZERO_ERROR; |
+ UConverter* converter(ucnv_open(charset.c_str(), &err)); |
+ if (U_FAILURE(err)) |
+ return false; |
+ |
+ // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. |
+ // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes |
+ // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a |
+ // trailing '\0'. |
+ size_t output_length = decoded.length() * 3 + 1; |
+ char* buf = WriteInto(output, output_length); |
+ output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, |
+ decoded.data(), decoded.length(), &err); |
+ ucnv_close(converter); |
+ if (U_FAILURE(err)) |
+ return false; |
+ output->resize(output_length); |
+ return true; |
+} |
+ |
+bool DecodeWord(const std::string& encoded_word, |
+ const std::string& referrer_charset, |
+ bool* is_rfc2047, |
+ std::string* output) { |
+ *is_rfc2047 = false; |
+ output->clear(); |
+ if (encoded_word.empty()) |
+ return true; |
+ |
+ if (!IsStringASCII(encoded_word)) { |
+ // Try UTF-8, |referrer_charset| and the native OS default charset in turn. |
+ if (IsStringUTF8(encoded_word)) { |
+ *output = encoded_word; |
+ } else { |
+ string16 utf16_output; |
+ if (!referrer_charset.empty() && |
+ base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), |
+ base::OnStringConversionError::FAIL, |
+ &utf16_output)) { |
+ *output = UTF16ToUTF8(utf16_output); |
+ } else { |
+ *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); |
+ } |
+ } |
+ |
+ return true; |
+ } |
+ |
+ // RFC 2047: one of the encoding methods supported by Firefox and relatively |
+ // widely used by web servers. |
+ // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. |
+ // We don't care about the length restriction (72 bytes) because |
+ // many web servers generate encoded words longer than the limit. |
+ std::string tmp; |
+ *is_rfc2047 = true; |
+ int part_index = 0; |
+ std::string charset; |
+ StringTokenizer t(encoded_word, "?"); |
+ RFC2047EncodingType enc_type = Q_ENCODING; |
+ while (*is_rfc2047 && t.GetNext()) { |
+ std::string part = t.token(); |
+ switch (part_index) { |
+ case 0: |
+ if (part != "=") { |
+ *is_rfc2047 = false; |
+ break; |
+ } |
+ ++part_index; |
+ break; |
+ case 1: |
+ // Do we need a charset validity check here? |
+ charset = part; |
+ ++part_index; |
+ break; |
+ case 2: |
+ if (part.size() > 1 || |
+ part.find_first_of("bBqQ") == std::string::npos) { |
+ *is_rfc2047 = false; |
+ break; |
+ } |
+ if (part[0] == 'b' || part[0] == 'B') { |
+ enc_type = B_ENCODING; |
+ } |
+ ++part_index; |
+ break; |
+ case 3: |
+ *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); |
+ if (!*is_rfc2047) { |
+ // Last-minute failure: invalid B/Q encoding. Rather than |
+ // passing it through, return now. |
+ return false; |
+ } |
+ ++part_index; |
+ break; |
+ case 4: |
+ if (part != "=") { |
+ // Another last-minute failure! |
+ // Likely to be a case of two encoded-words in a row or |
+ // an encoded word followed by a non-encoded word. We can be |
+ // generous, but it does not help much in terms of compatibility, |
+ // I believe. Return immediately. |
+ *is_rfc2047 = false; |
+ return false; |
+ } |
+ ++part_index; |
+ break; |
+ default: |
+ *is_rfc2047 = false; |
+ return false; |
+ } |
+ } |
+ |
+ if (*is_rfc2047) { |
+ if (*(encoded_word.end() - 1) == '=') { |
+ output->swap(tmp); |
+ return true; |
+ } |
+ // |encoded_word| ends prematurely with '?' or has an extra '?'. |
+ *is_rfc2047 = false; |
+ return false; |
+ } |
+ |
+ // We're not handling 'especial' characters quoted with '\', but |
+ // that should be OK because we're a web browser, not an email |
+ // client. |
+ |
+ // What IE6/7 does: %-escaped UTF-8. |
+ tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES); |
+ if (IsStringUTF8(tmp)) { |
+ output->swap(tmp); |
+ return true; |
+ // We can try either the OS default charset or 'origin charset' here, |
+ // As far as I can tell, IE does not support it. However, I've seen |
+ // web servers emit %-escaped string in a legacy encoding (usually |
+ // origin charset). |
+ // TODO(jungshik) : Test IE further and consider adding a fallback here. |
+ } |
+ return false; |
+} |
+ |
+// Decodes the value of a 'filename' or 'name' parameter given as |input|. The |
+// value is supposed to be of the form: |
+// |
+// value = token | quoted-string |
+// |
+// However we currently also allow RFC 2047 encoding and non-ASCII |
+// strings. Non-ASCII strings are interpreted based on |referrer_charset|. |
+bool DecodeFilenameValue(const std::string& input, |
+ const std::string& referrer_charset, |
+ std::string* output) { |
+ std::string tmp; |
+ // Tokenize with whitespace characters. |
+ StringTokenizer t(input, " \t\n\r"); |
+ t.set_options(StringTokenizer::RETURN_DELIMS); |
+ bool is_previous_token_rfc2047 = true; |
+ while (t.GetNext()) { |
+ if (t.token_is_delim()) { |
+ // If the previous non-delimiter token is not RFC2047-encoded, |
+ // put in a space in its place. Otherwise, skip over it. |
+ if (!is_previous_token_rfc2047) { |
+ tmp.push_back(' '); |
+ } |
+ continue; |
+ } |
+ // We don't support a single multibyte character split into |
+ // adjacent encoded words. Some broken mail clients emit headers |
+ // with that problem, but most web servers usually encode a filename |
+ // in a single encoded-word. Firefox/Thunderbird do not support |
+ // it, either. |
+ std::string decoded; |
+ if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, |
+ &decoded)) |
+ return false; |
+ tmp.append(decoded); |
+ } |
+ output->swap(tmp); |
+ return true; |
+} |
+ |
+// Parses the charset and value-chars out of an ext-value string. |
+// Fails unless exactly two "'" are seen and both parts are non-empty. |
+// ext-value = charset "'" [ language ] "'" value-chars |
+bool ParseExtValueComponents(const std::string& input, |
+ std::string* charset, |
+ std::string* value_chars) { |
+ StringTokenizer t(input, "'"); |
+ t.set_options(StringTokenizer::RETURN_DELIMS); |
+ std::string temp_charset; |
+ std::string temp_value; |
+ int numDelimsSeen = 0; |
+ while (t.GetNext()) { |
+ if (t.token_is_delim()) { |
+ ++numDelimsSeen; |
+ continue; |
+ } else { |
+ switch (numDelimsSeen) { |
+ case 0: |
+ temp_charset = t.token(); |
+ break; |
+ case 1: |
+ // Language is ignored. |
+ break; |
+ case 2: |
+ temp_value = t.token(); |
+ break; |
+ default: |
+ return false; |
+ } |
+ } |
+ } |
+ if (numDelimsSeen != 2) |
+ return false; |
+ if (temp_charset.empty() || temp_value.empty()) |
+ return false; |
+ charset->swap(temp_charset); |
+ value_chars->swap(temp_value); |
+ return true; |
+} |
+ |
+// http://tools.ietf.org/html/rfc5987#section-3.2 |
+// |
+// ext-value = charset "'" [ language ] "'" value-chars |
+// |
+// charset = "UTF-8" / "ISO-8859-1" / mime-charset |
+// |
+// mime-charset = 1*mime-charsetc |
+// mime-charsetc = ALPHA / DIGIT |
+// / "!" / "#" / "$" / "%" / "&" |
+// / "+" / "-" / "^" / "_" / "`" |
+// / "{" / "}" / "~" |
+// |
+// language = <Language-Tag, defined in [RFC5646], Section 2.1> |
+// |
+// value-chars = *( pct-encoded / attr-char ) |
+// |
+// pct-encoded = "%" HEXDIG HEXDIG |
+// |
+// attr-char = ALPHA / DIGIT |
+// / "!" / "#" / "$" / "&" / "+" / "-" / "." |
+// / "^" / "_" / "`" / "|" / "~" |
+bool DecodeExtValue(const std::string& param_value, std::string* decoded) { |
+ if (param_value.find('"') != std::string::npos) |
+ return false; |
+ |
+ std::string charset; |
+ std::string value; |
+ if (!ParseExtValueComponents(param_value, &charset, &value)) |
+ return false; |
+ |
+ // RFC 5987 value should be ASCII-only. If not, succeed with an empty result. |
+ if (!IsStringASCII(value)) { |
+ decoded->clear(); |
+ return true; |
+ } |
+ |
+ std::string unescaped = net::UnescapeURLComponent( |
+ value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); |
+ |
+ return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); |
+} |
+ |
+} // namespace |
namespace net { |