net/http/http_content_disposition.cc - Issue 11471041: Move DecodeFilenameValue and DecodeExt value into http_content_disposition.

Side by Side Diff: net/http/http_content_disposition.cc

Issue 11471041: Move DecodeFilenameValue and DecodeExt value into http_content_disposition. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Move non-net code out of net namespace. Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/http/http_content_disposition.h"	5 #include "net/http/http_content_disposition.h"

6	6

	7 #include "base/base64.h"

	8 #include "base/i18n/icu_string_conversions.h"

7 #include "base/logging.h"	9 #include "base/logging.h"

8 #include "base/string_util.h"	10 #include "base/string_util.h"

	11 #include "base/sys_string_conversions.h"

	12 #include "base/utf_string_conversions.h"

9 #include "net/base/net_util.h"	13 #include "net/base/net_util.h"

10 #include "net/http/http_util.h"	14 #include "net/http/http_util.h"

	15 #include "unicode/ucnv.h"

	16

	17 namespace {

	18

	19 enum RFC2047EncodingType {

	20 Q_ENCODING,

	21 B_ENCODING

	22 };

	23

	24 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to

	25 // decoding a quoted-printable string. Returns true if the input was valid.

	26 bool DecodeQEncoding(const std::string& input, std::string* output) {

	27 std::string temp;

	28 temp.reserve(input.size());

	29 for (std::string::const_iterator it = input.begin(); it != input.end();

	30 ++it) {

	31 if (*it == '_') {

	32 temp.push_back(' ');

	33 } else if (*it == '=') {

	34 if ((input.end() - it < 3) ||

	35 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||

	36 !IsHexDigit(static_cast<unsigned char>(*(it + 2))))

	37 return false;

	38 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +

	39 HexDigitToInt(*(it + 2));

	40 temp.push_back(static_cast<char>(ch));

	41 ++it;

	42 ++it;

	43 } else if (0x20 < *it && *it < 0x7F && *it != '?') {

	44 // In a Q-encoded word, only printable ASCII characters

	45 // represent themselves. Besides, space, '=', '_' and '?' are

	46 // not allowed, but they're already filtered out.

	47 DCHECK_NE('=', *it);

	48 DCHECK_NE('?', *it);

	49 DCHECK_NE('_', *it);

	50 temp.push_back(*it);

	51 } else {

	52 return false;

	53 }

	54 }

	55 output->swap(temp);

	56 return true;

	57 }

	58

	59 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding

	60 // type is specified in |enc_type|.

	61 bool DecodeBQEncoding(const std::string& part,

	62 RFC2047EncodingType enc_type,

	63 const std::string& charset,

	64 std::string* output) {

	65 std::string decoded;

	66 if (!((enc_type == B_ENCODING) ?

	67 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))

	68 return false;

	69

	70 if (decoded.empty()) {

	71 output->clear();

	72 return true;

	73 }

	74

	75 UErrorCode err = U_ZERO_ERROR;

	76 UConverter* converter(ucnv_open(charset.c_str(), &err));

	77 if (U_FAILURE(err))

	78 return false;

	79

	80 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.

	81 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes

	82 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a

	83 // trailing '\0'.

	84 size_t output_length = decoded.length() * 3 + 1;

	85 char* buf = WriteInto(output, output_length);

	86 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,

	87 decoded.data(), decoded.length(), &err);

	88 ucnv_close(converter);

	89 if (U_FAILURE(err))

	90 return false;

	91 output->resize(output_length);

	92 return true;

	93 }

	94

	95 bool DecodeWord(const std::string& encoded_word,

	96 const std::string& referrer_charset,

	97 bool* is_rfc2047,

	98 std::string* output) {

	99 *is_rfc2047 = false;

	100 output->clear();

	101 if (encoded_word.empty())

	102 return true;

	103

	104 if (!IsStringASCII(encoded_word)) {

	105 // Try UTF-8, referrer_charset and the native OS default charset in turn.

	106 if (IsStringUTF8(encoded_word)) {

	107 *output = encoded_word;

	108 } else {

	109 string16 utf16_output;

	110 if (!referrer_charset.empty() &&

	111 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),

	112 base::OnStringConversionError::FAIL,

	113 &utf16_output)) {

	114 *output = UTF16ToUTF8(utf16_output);

	115 } else {

	116 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));

	117 }

	118 }

	119

	120 return true;

	121 }

	122

	123 // RFC 2047 : one of encoding methods supported by Firefox and relatively

	124 // widely used by web servers.

	125 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.

	126 // We don't care about the length restriction (72 bytes) because

	127 // many web servers generate encoded words longer than the limit.

	128 std::string tmp;

	129 *is_rfc2047 = true;

	130 int part_index = 0;

	131 std::string charset;

	132 StringTokenizer t(encoded_word, "?");

	133 RFC2047EncodingType enc_type = Q_ENCODING;

	134 while (*is_rfc2047 && t.GetNext()) {

	135 std::string part = t.token();

	136 switch (part_index) {

	137 case 0:

	138 if (part != "=") {

	139 *is_rfc2047 = false;

	140 break;

	141 }

	142 ++part_index;

	143 break;

	144 case 1:

	145 // Do we need charset validity check here?

	146 charset = part;

	147 ++part_index;

	148 break;

	149 case 2:

	150 if (part.size() > 1 ||

	151 part.find_first_of("bBqQ") == std::string::npos) {

	152 *is_rfc2047 = false;

	153 break;

	154 }

	155 if (part[0] == 'b' || part[0] == 'B') {

	156 enc_type = B_ENCODING;

	157 }

	158 ++part_index;

	159 break;

	160 case 3:

	161 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);

	162 if (!*is_rfc2047) {

	163 // Last minute failure. Invalid B/Q encoding. Rather than

	164 // passing it through, return now.

	165 return false;

	166 }

	167 ++part_index;

	168 break;

	169 case 4:

	170 if (part != "=") {

	171 // Another last minute failure !

	172 // Likely to be a case of two encoded-words in a row or

	173 // an encoded word followed by a non-encoded word. We can be

	174 // generous, but it does not help much in terms of compatibility,

	175 // I believe. Return immediately.

	176 *is_rfc2047 = false;

	177 return false;

	178 }

	179 ++part_index;

	180 break;

	181 default:

	182 *is_rfc2047 = false;

	183 return false;

	184 }

	185 }

	186

	187 if (*is_rfc2047) {

	188 if (*(encoded_word.end() - 1) == '=') {

	189 output->swap(tmp);

	190 return true;

	191 }

	192 // encoded_word ending prematurely with '?' or extra '?'

	193 *is_rfc2047 = false;

	194 return false;

	195 }

	196

	197 // We're not handling 'especial' characters quoted with '\', but

	198 // it should be Ok because we're not an email client but a

	199 // web browser.

	200

	201 // What IE6/7 does: %-escaped UTF-8.

	202 tmp = net::UnescapeURLComponent(encoded_word, net::UnescapeRule::SPACES);

	203 if (IsStringUTF8(tmp)) {

	204 output->swap(tmp);

	205 return true;

	206 // We can try either the OS default charset or 'origin charset' here,

	207 // As far as I can tell, IE does not support it. However, I've seen

	208 // web servers emit %-escaped string in a legacy encoding (usually

	209 // origin charset).

	210 // TODO(jungshik) : Test IE further and consider adding a fallback here.

	211 }

	212 return false;

	213 }

	214

	215 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The

	216 // value is supposed to be of the form:

	217 //

	218 // value = token | quoted-string

	219 //

	220 // However we currently also allow RFC 2047 encoding and non-ASCII

	221 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.

	222 bool DecodeFilenameValue(const std::string& input,

	223 const std::string& referrer_charset,

	224 std::string* output) {

	225 std::string tmp;

	226 // Tokenize with whitespace characters.

	227 StringTokenizer t(input, " \t\n\r");

	228 t.set_options(StringTokenizer::RETURN_DELIMS);

	229 bool is_previous_token_rfc2047 = true;

	230 while (t.GetNext()) {

	231 if (t.token_is_delim()) {

	232 // If the previous non-delimiter token is not RFC2047-encoded,

	233 // put in a space in its place. Otherwise, skip over it.

	234 if (!is_previous_token_rfc2047) {

	235 tmp.push_back(' ');

	236 }

	237 continue;

	238 }

	239 // We don't support a single multibyte character split into

	240 // adjacent encoded words. Some broken mail clients emit headers

	241 // with that problem, but most web servers usually encode a filename

	242 // in a single encoded-word. Firefox/Thunderbird do not support

	243 // it, either.

	244 std::string decoded;

	245 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,

	246 &decoded))

	247 return false;

	248 tmp.append(decoded);

	249 }

	250 output->swap(tmp);

	251 return true;

	252 }

	253

	254 // Parses the charset and value-chars out of an ext-value string.

	255 //

	256 // ext-value = charset "'" [ language ] "'" value-chars

	257 bool ParseExtValueComponents(const std::string& input,

	258 std::string* charset,

	259 std::string* value_chars) {

	260 StringTokenizer t(input, "'");

	261 t.set_options(StringTokenizer::RETURN_DELIMS);

	262 std::string temp_charset;

	263 std::string temp_value;

	264 int numDelimsSeen = 0;

	265 while (t.GetNext()) {

	266 if (t.token_is_delim()) {

	267 ++numDelimsSeen;

	268 continue;

	269 } else {

	270 switch (numDelimsSeen) {

	271 case 0:

	272 temp_charset = t.token();

	273 break;

	274 case 1:

	275 // Language is ignored.

	276 break;

	277 case 2:

	278 temp_value = t.token();

	279 break;

	280 default:

	281 return false;

	282 }

	283 }

	284 }

	285 if (numDelimsSeen != 2)

	286 return false;

	287 if (temp_charset.empty() || temp_value.empty())

	288 return false;

	289 charset->swap(temp_charset);

	290 value_chars->swap(temp_value);

	291 return true;

	292 }

	293

	294 // http://tools.ietf.org/html/rfc5987#section-3.2

	295 //

	296 // ext-value = charset "'" [ language ] "'" value-chars

	297 //

	298 // charset = "UTF-8" / "ISO-8859-1" / mime-charset

	299 //

	300 // mime-charset = 1*mime-charsetc

	301 // mime-charsetc = ALPHA / DIGIT

	302 // / "!" / "#" / "$" / "%" / "&"

	303 // / "+" / "-" / "^" / "_" / "`"

	304 // / "{" / "}" / "~"

	305 //

	306 // language = <Language-Tag, defined in [RFC5646], Section 2.1>

	307 //

	308 // value-chars = *( pct-encoded / attr-char )

	309 //

	310 // pct-encoded = "%" HEXDIG HEXDIG

	311 //

	312 // attr-char = ALPHA / DIGIT

	313 // / "!" / "#" / "$" / "&" / "+" / "-" / "."

	314 // / "^" / "_" / "`" / "|" / "~"

	315 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {

	316 if (param_value.find('"') != std::string::npos)

	317 return false;

	318

	319 std::string charset;

	320 std::string value;

	321 if (!ParseExtValueComponents(param_value, &charset, &value))

	322 return false;

	323

	324 // RFC 5987 value should be ASCII-only.

	325 if (!IsStringASCII(value)) {

	326 decoded->clear();

	327 return true;

	328 }

	329

	330 std::string unescaped = net::UnescapeURLComponent(

	331 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS);

	332

	333 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);

	334 }

	335

	336 } // namespace

11	337

12 namespace net {	338 namespace net {

13	339

14 HttpContentDisposition::HttpContentDisposition(	340 HttpContentDisposition::HttpContentDisposition(

15 const std::string& header, const std::string& referrer_charset)	341 const std::string& header, const std::string& referrer_charset)

16 : type_(INLINE) {	342 : type_(INLINE) {

17 Parse(header, referrer_charset);	343 Parse(header, referrer_charset);

18 }	344 }

19	345

20 HttpContentDisposition::~HttpContentDisposition() {	346 HttpContentDisposition::~HttpContentDisposition() {

(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
92	418

93 if (!ext_filename.empty())	419 if (!ext_filename.empty())

94 filename_ = ext_filename;	420 filename_ = ext_filename;

95 else if (!filename.empty())	421 else if (!filename.empty())

96 filename_ = filename;	422 filename_ = filename;

97 else	423 else

98 filename_ = name;	424 filename_ = name;

99 }	425 }

100	426

101 } // namespace net	427 } // namespace net

OLD	NEW

« no previous file with comments | « net/base/net_util_unittest.cc ('k') | no next file » | no next file with comments »