net/http/http_content_disposition.cc - Issue 11471041: Move DecodeFilenameValue and DecodeExt value into http_content_disposition.

Side by Side Diff: net/http/http_content_disposition.cc

Issue 11471041: Move DecodeFilenameValue and DecodeExt value into http_content_disposition. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 8 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/http/http_content_disposition.h"	5 #include "net/http/http_content_disposition.h"

6	6

	7 #include "base/base64.h"

	8 #include "base/i18n/icu_string_conversions.h"

7 #include "base/logging.h"	9 #include "base/logging.h"

8 #include "base/string_util.h"	10 #include "base/string_util.h"

	11 #include "base/sys_string_conversions.h"

	12 #include "base/utf_string_conversions.h"

9 #include "net/base/net_util.h"	13 #include "net/base/net_util.h"

10 #include "net/http/http_util.h"	14 #include "net/http/http_util.h"

	15 #include "unicode/ucnv.h"

11	16

12 namespace net {	17 namespace net {

13	18

	19 namespace {
	rvargas (doing something else) 2012/12/13 22:44:43 nit: It looks like all this code is fairly independent of net::. Do you mind keeping this namespace outside of net? asanka 2012/12/13 23:28:47 Done in patch set 3. Show quoted text On 2012/12/13 22:44:43, rvargas wrote: > nit: It looks like all this code is fairly independent of net::. Do you mind > keeping this namespace outside of net? Done in patch set 3.
	20

	21 enum RFC2047EncodingType {

	22 Q_ENCODING,

	23 B_ENCODING

	24 };

	25

	26 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to

	27 // decoding a quoted-printable string. Returns true if the input was valid.

	28 bool DecodeQEncoding(const std::string& input, std::string* output) {

	29 std::string temp;

	30 temp.reserve(input.size());

	31 for (std::string::const_iterator it = input.begin(); it != input.end();

	32 ++it) {

	33 if (*it == '_') {

	34 temp.push_back(' ');

	35 } else if (*it == '=') {

	36 if ((input.end() - it < 3) \|\|

	37 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) \|\|

	38 !IsHexDigit(static_cast<unsigned char>(*(it + 2))))

	39 return false;

	40 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +

	41 HexDigitToInt(*(it + 2));

	42 temp.push_back(static_cast<char>(ch));

	43 ++it;

	44 ++it;

	45 } else if (0x20 < *it && *it < 0x7F && *it != '?') {

	46 // In a Q-encoded word, only printable ASCII characters

	47 // represent themselves. Besides, space, '=', '_' and '?' are

	48 // not allowed, but they're already filtered out.

	49 DCHECK_NE('=', *it);

	50 DCHECK_NE('?', *it);

	51 DCHECK_NE('_', *it);

	52 temp.push_back(*it);

	53 } else {

	54 return false;

	55 }

	56 }

	57 output->swap(temp);

	58 return true;

	59 }

	60

	61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding

	62 // type is specified in \|enc_type\|.

	63 bool DecodeBQEncoding(const std::string& part,

	64 RFC2047EncodingType enc_type,

	65 const std::string& charset,

	66 std::string* output) {

	67 std::string decoded;

	68 if (!((enc_type == B_ENCODING) ?

	69 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))

	70 return false;

	71

	72 if (decoded.empty()) {

	73 output->clear();

	74 return true;

	75 }

	76

	77 UErrorCode err = U_ZERO_ERROR;

	78 UConverter* converter(ucnv_open(charset.c_str(), &err));

	79 if (U_FAILURE(err))

	80 return false;

	81

	82 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.

	83 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes

	84 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a

	85 // trailing '\0'.

	86 size_t output_length = decoded.length() * 3 + 1;

	87 char* buf = WriteInto(output, output_length);

	88 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,

	89 decoded.data(), decoded.length(), &err);

	90 ucnv_close(converter);

	91 if (U_FAILURE(err))

	92 return false;

	93 output->resize(output_length);

	94 return true;

	95 }

	96

	97 bool DecodeWord(const std::string& encoded_word,

	98 const std::string& referrer_charset,

	99 bool* is_rfc2047,

	100 std::string* output) {

	101 *is_rfc2047 = false;

	102 output->clear();

	103 if (encoded_word.empty())

	104 return true;

	105

	106 if (!IsStringASCII(encoded_word)) {

	107 // Try UTF-8, referrer_charset and the native OS default charset in turn.

	108 if (IsStringUTF8(encoded_word)) {

	109 *output = encoded_word;

	110 } else {

	111 string16 utf16_output;

	112 if (!referrer_charset.empty() &&

	113 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),

	114 base::OnStringConversionError::FAIL,

	115 &utf16_output)) {

	116 *output = UTF16ToUTF8(utf16_output);

	117 } else {

	118 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));

	119 }

	120 }

	121

	122 return true;

	123 }

	124

	125 // RFC 2047 : one of encoding methods supported by Firefox and relatively

	126 // widely used by web servers.

	127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.

	128 // We don't care about the length restriction (72 bytes) because

	129 // many web servers generate encoded words longer than the limit.

	130 std::string tmp;

	131 *is_rfc2047 = true;

	132 int part_index = 0;

	133 std::string charset;

	134 StringTokenizer t(encoded_word, "?");

	135 RFC2047EncodingType enc_type = Q_ENCODING;

	136 while (*is_rfc2047 && t.GetNext()) {

	137 std::string part = t.token();

	138 switch (part_index) {

	139 case 0:

	140 if (part != "=") {

	141 *is_rfc2047 = false;

	142 break;

	143 }

	144 ++part_index;

	145 break;

	146 case 1:

	147 // Do we need charset validity check here?

	148 charset = part;

	149 ++part_index;

	150 break;

	151 case 2:

	152 if (part.size() > 1 \|\|

	153 part.find_first_of("bBqQ") == std::string::npos) {

	154 *is_rfc2047 = false;

	155 break;

	156 }

	157 if (part[0] == 'b' \|\| part[0] == 'B') {

	158 enc_type = B_ENCODING;

	159 }

	160 ++part_index;

	161 break;

	162 case 3:

	163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);

	164 if (!*is_rfc2047) {

	165 // Last minute failure. Invalid B/Q encoding. Rather than

	166 // passing it through, return now.

	167 return false;

	168 }

	169 ++part_index;

	170 break;

	171 case 4:

	172 if (part != "=") {

	173 // Another last minute failure !

	174 // Likely to be a case of two encoded-words in a row or

	175 // an encoded word followed by a non-encoded word. We can be

	176 // generous, but it does not help much in terms of compatibility,

	177 // I believe. Return immediately.

	178 *is_rfc2047 = false;

	179 return false;

	180 }

	181 ++part_index;

	182 break;

	183 default:

	184 *is_rfc2047 = false;

	185 return false;

	186 }

	187 }

	188

	189 if (*is_rfc2047) {

	190 if (*(encoded_word.end() - 1) == '=') {

	191 output->swap(tmp);

	192 return true;

	193 }

	194 // encoded_word ending prematurely with '?' or extra '?'

	195 *is_rfc2047 = false;

	196 return false;

	197 }

	198

	199 // We're not handling 'especial' characters quoted with '\', but

	200 // it should be Ok because we're not an email client but a

	201 // web browser.

	202

	203 // What IE6/7 does: %-escaped UTF-8.

	204 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);

	205 if (IsStringUTF8(tmp)) {

	206 output->swap(tmp);

	207 return true;

	208 // We can try either the OS default charset or 'origin charset' here,

	209 // As far as I can tell, IE does not support it. However, I've seen

	210 // web servers emit %-escaped string in a legacy encoding (usually

	211 // origin charset).

	212 // TODO(jungshik) : Test IE further and consider adding a fallback here.

	213 }

	214 return false;

	215 }

	216

	217 // Decodes the value of a 'filename' or 'name' parameter given as \|input\|. The

	218 // value is supposed to be of the form:

	219 //

	220 // value = token \| quoted-string

	221 //

	222 // However we currently also allow RFC 2047 encoding and non-ASCII

	223 // strings. Non-ASCII strings are interpreted based on \|referrer_charset\|.

	224 bool DecodeFilenameValue(const std::string& input,

	225 const std::string& referrer_charset,

	226 std::string* output) {

	227 std::string tmp;

	228 // Tokenize with whitespace characters.

	229 StringTokenizer t(input, " \t\n\r");

	230 t.set_options(StringTokenizer::RETURN_DELIMS);

	231 bool is_previous_token_rfc2047 = true;

	232 while (t.GetNext()) {

	233 if (t.token_is_delim()) {

	234 // If the previous non-delimiter token is not RFC2047-encoded,

	235 // put in a space in its place. Otherwise, skip over it.

	236 if (!is_previous_token_rfc2047) {

	237 tmp.push_back(' ');

	238 }

	239 continue;

	240 }

	241 // We don't support a single multibyte character split into

	242 // adjacent encoded words. Some broken mail clients emit headers

	243 // with that problem, but most web servers usually encode a filename

	244 // in a single encoded-word. Firefox/Thunderbird do not support

	245 // it, either.

	246 std::string decoded;

	247 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,

	248 &decoded))

	249 return false;

	250 tmp.append(decoded);

	251 }

	252 output->swap(tmp);

	253 return true;

	254 }

	255

	256 // Parses the charset and value-chars out of an ext-value string.

	257 //

	258 // ext-value = charset "'" [ language ] "'" value-chars

	259 bool ParseExtValueComponents(const std::string& input,

	260 std::string* charset,

	261 std::string* value_chars) {

	262 StringTokenizer t(input, "'");

	263 t.set_options(StringTokenizer::RETURN_DELIMS);

	264 std::string temp_charset;

	265 std::string temp_value;

	266 int numDelimsSeen = 0;

	267 while (t.GetNext()) {

	268 if (t.token_is_delim()) {

	269 ++numDelimsSeen;

	270 continue;

	271 } else {

	272 switch (numDelimsSeen) {

	273 case 0:

	274 temp_charset = t.token();

	275 break;

	276 case 1:

	277 // Language is ignored.

	278 break;

	279 case 2:

	280 temp_value = t.token();

	281 break;

	282 default:

	283 return false;

	284 }

	285 }

	286 }

	287 if (numDelimsSeen != 2)

	288 return false;

	289 if (temp_charset.empty() \|\| temp_value.empty())

	290 return false;

	291 charset->swap(temp_charset);

	292 value_chars->swap(temp_value);

	293 return true;

	294 }

	295

	296 // http://tools.ietf.org/html/rfc5987#section-3.2

	297 //

	298 // ext-value = charset "'" [ language ] "'" value-chars

	299 //

	300 // charset = "UTF-8" / "ISO-8859-1" / mime-charset

	301 //

	302 // mime-charset = 1*mime-charsetc

	303 // mime-charsetc = ALPHA / DIGIT

	304 // / "!" / "#" / "$" / "%" / "&"

	305 // / "+" / "-" / "^" / "_" / "`"

	306 // / "{" / "}" / "~"

	307 //

	308 // language = <Language-Tag, defined in [RFC5646], Section 2.1>

	309 //

	310 // value-chars = *( pct-encoded / attr-char )

	311 //

	312 // pct-encoded = "%" HEXDIG HEXDIG

	313 //

	314 // attr-char = ALPHA / DIGIT

	315 // / "!" / "#" / "$" / "&" / "+" / "-" / "."

	316 // / "^" / "_" / "`" / "\|" / "~"

	317 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {

	318 if (param_value.find('"') != std::string::npos)

	319 return false;

	320

	321 std::string charset;

	322 std::string value;

	323 if (!ParseExtValueComponents(param_value, &charset, &value))

	324 return false;

	325

	326 // RFC 5987 value should be ASCII-only.

	327 if (!IsStringASCII(value)) {

	328 decoded->clear();

	329 return true;

	330 }

	331

	332 std::string unescaped = UnescapeURLComponent(value,

	333 UnescapeRule::SPACES \| UnescapeRule::URL_SPECIAL_CHARS);

	334

	335 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);

	336 }

	337

	338 } // namespace

	339

14 HttpContentDisposition::HttpContentDisposition(	340 HttpContentDisposition::HttpContentDisposition(

15 const std::string& header, const std::string& referrer_charset)	341 const std::string& header, const std::string& referrer_charset)

16 : type_(INLINE) {	342 : type_(INLINE) {

17 Parse(header, referrer_charset);	343 Parse(header, referrer_charset);

18 }	344 }

19	345

20 HttpContentDisposition::~HttpContentDisposition() {	346 HttpContentDisposition::~HttpContentDisposition() {

21 }	347 }

22	348

23 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(	349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(

(...skipping 68 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
92	418

93 if (!ext_filename.empty())	419 if (!ext_filename.empty())

94 filename_ = ext_filename;	420 filename_ = ext_filename;

95 else if (!filename.empty())	421 else if (!filename.empty())

96 filename_ = filename;	422 filename_ = filename;

97 else	423 else

98 filename_ = name;	424 filename_ = name;

99 }	425 }

100	426

101 } // namespace net	427 } // namespace net

OLD	NEW

« no previous file with comments | « net/base/net_util_unittest.cc ('k') | no next file » | no next file with comments »