Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(60)

Side by Side Diff: net/http/http_content_disposition.cc

Issue 11471041: Move DecodeFilenameValue and DecodeExt value into http_content_disposition. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « net/base/net_util_unittest.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "net/http/http_content_disposition.h" 5 #include "net/http/http_content_disposition.h"
6 6
7 #include "base/base64.h"
8 #include "base/i18n/icu_string_conversions.h"
7 #include "base/logging.h" 9 #include "base/logging.h"
8 #include "base/string_util.h" 10 #include "base/string_util.h"
11 #include "base/sys_string_conversions.h"
12 #include "base/utf_string_conversions.h"
9 #include "net/base/net_util.h" 13 #include "net/base/net_util.h"
10 #include "net/http/http_util.h" 14 #include "net/http/http_util.h"
15 #include "unicode/ucnv.h"
11 16
12 namespace net { 17 namespace net {
13 18
19 namespace {
rvargas (doing something else) 2012/12/13 22:44:43 nit: It looks like all this code is fairly indepen
asanka 2012/12/13 23:28:47 Done in patch set 3.
20
// The two transfer encodings an RFC 2047 encoded-word may use for its
// payload.
enum RFC2047EncodingType {
  // "Q" encoding; handled by DecodeQEncoding().
  Q_ENCODING,
  // "B" encoding; handled by base::Base64Decode().
  B_ENCODING
};
25
26 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
27 // decoding a quoted-printable string. Returns true if the input was valid.
28 bool DecodeQEncoding(const std::string& input, std::string* output) {
29 std::string temp;
30 temp.reserve(input.size());
31 for (std::string::const_iterator it = input.begin(); it != input.end();
32 ++it) {
33 if (*it == '_') {
34 temp.push_back(' ');
35 } else if (*it == '=') {
36 if ((input.end() - it < 3) ||
37 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
38 !IsHexDigit(static_cast<unsigned char>(*(it + 2))))
39 return false;
40 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 +
41 HexDigitToInt(*(it + 2));
42 temp.push_back(static_cast<char>(ch));
43 ++it;
44 ++it;
45 } else if (0x20 < *it && *it < 0x7F && *it != '?') {
46 // In a Q-encoded word, only printable ASCII characters
47 // represent themselves. Besides, space, '=', '_' and '?' are
48 // not allowed, but they're already filtered out.
49 DCHECK_NE('=', *it);
50 DCHECK_NE('?', *it);
51 DCHECK_NE('_', *it);
52 temp.push_back(*it);
53 } else {
54 return false;
55 }
56 }
57 output->swap(temp);
58 return true;
59 }
60
61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
62 // type is specified in |enc_type|.
63 bool DecodeBQEncoding(const std::string& part,
64 RFC2047EncodingType enc_type,
65 const std::string& charset,
66 std::string* output) {
67 std::string decoded;
68 if (!((enc_type == B_ENCODING) ?
69 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded)))
70 return false;
71
72 if (decoded.empty()) {
73 output->clear();
74 return true;
75 }
76
77 UErrorCode err = U_ZERO_ERROR;
78 UConverter* converter(ucnv_open(charset.c_str(), &err));
79 if (U_FAILURE(err))
80 return false;
81
82 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
83 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
84 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a
85 // trailing '\0'.
86 size_t output_length = decoded.length() * 3 + 1;
87 char* buf = WriteInto(output, output_length);
88 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length,
89 decoded.data(), decoded.length(), &err);
90 ucnv_close(converter);
91 if (U_FAILURE(err))
92 return false;
93 output->resize(output_length);
94 return true;
95 }
96
97 bool DecodeWord(const std::string& encoded_word,
98 const std::string& referrer_charset,
99 bool* is_rfc2047,
100 std::string* output) {
101 *is_rfc2047 = false;
102 output->clear();
103 if (encoded_word.empty())
104 return true;
105
106 if (!IsStringASCII(encoded_word)) {
107 // Try UTF-8, referrer_charset and the native OS default charset in turn.
108 if (IsStringUTF8(encoded_word)) {
109 *output = encoded_word;
110 } else {
111 string16 utf16_output;
112 if (!referrer_charset.empty() &&
113 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(),
114 base::OnStringConversionError::FAIL,
115 &utf16_output)) {
116 *output = UTF16ToUTF8(utf16_output);
117 } else {
118 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
119 }
120 }
121
122 return true;
123 }
124
125 // RFC 2047 : one of encoding methods supported by Firefox and relatively
126 // widely used by web servers.
127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
128 // We don't care about the length restriction (72 bytes) because
129 // many web servers generate encoded words longer than the limit.
130 std::string tmp;
131 *is_rfc2047 = true;
132 int part_index = 0;
133 std::string charset;
134 StringTokenizer t(encoded_word, "?");
135 RFC2047EncodingType enc_type = Q_ENCODING;
136 while (*is_rfc2047 && t.GetNext()) {
137 std::string part = t.token();
138 switch (part_index) {
139 case 0:
140 if (part != "=") {
141 *is_rfc2047 = false;
142 break;
143 }
144 ++part_index;
145 break;
146 case 1:
147 // Do we need charset validity check here?
148 charset = part;
149 ++part_index;
150 break;
151 case 2:
152 if (part.size() > 1 ||
153 part.find_first_of("bBqQ") == std::string::npos) {
154 *is_rfc2047 = false;
155 break;
156 }
157 if (part[0] == 'b' || part[0] == 'B') {
158 enc_type = B_ENCODING;
159 }
160 ++part_index;
161 break;
162 case 3:
163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
164 if (!*is_rfc2047) {
165 // Last minute failure. Invalid B/Q encoding. Rather than
166 // passing it through, return now.
167 return false;
168 }
169 ++part_index;
170 break;
171 case 4:
172 if (part != "=") {
173 // Another last minute failure !
174 // Likely to be a case of two encoded-words in a row or
175 // an encoded word followed by a non-encoded word. We can be
176 // generous, but it does not help much in terms of compatibility,
177 // I believe. Return immediately.
178 *is_rfc2047 = false;
179 return false;
180 }
181 ++part_index;
182 break;
183 default:
184 *is_rfc2047 = false;
185 return false;
186 }
187 }
188
189 if (*is_rfc2047) {
190 if (*(encoded_word.end() - 1) == '=') {
191 output->swap(tmp);
192 return true;
193 }
194 // encoded_word ending prematurelly with '?' or extra '?'
195 *is_rfc2047 = false;
196 return false;
197 }
198
199 // We're not handling 'especial' characters quoted with '\', but
200 // it should be Ok because we're not an email client but a
201 // web browser.
202
203 // What IE6/7 does: %-escaped UTF-8.
204 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
205 if (IsStringUTF8(tmp)) {
206 output->swap(tmp);
207 return true;
208 // We can try either the OS default charset or 'origin charset' here,
209 // As far as I can tell, IE does not support it. However, I've seen
210 // web servers emit %-escaped string in a legacy encoding (usually
211 // origin charset).
212 // TODO(jungshik) : Test IE further and consider adding a fallback here.
213 }
214 return false;
215 }
216
217 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
218 // value is supposed to be of the form:
219 //
220 // value = token | quoted-string
221 //
222 // However we currently also allow RFC 2047 encoding and non-ASCII
223 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
224 bool DecodeFilenameValue(const std::string& input,
225 const std::string& referrer_charset,
226 std::string* output) {
227 std::string tmp;
228 // Tokenize with whitespace characters.
229 StringTokenizer t(input, " \t\n\r");
230 t.set_options(StringTokenizer::RETURN_DELIMS);
231 bool is_previous_token_rfc2047 = true;
232 while (t.GetNext()) {
233 if (t.token_is_delim()) {
234 // If the previous non-delimeter token is not RFC2047-encoded,
235 // put in a space in its place. Otheriwse, skip over it.
236 if (!is_previous_token_rfc2047) {
237 tmp.push_back(' ');
238 }
239 continue;
240 }
241 // We don't support a single multibyte character split into
242 // adjacent encoded words. Some broken mail clients emit headers
243 // with that problem, but most web servers usually encode a filename
244 // in a single encoded-word. Firefox/Thunderbird do not support
245 // it, either.
246 std::string decoded;
247 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
248 &decoded))
249 return false;
250 tmp.append(decoded);
251 }
252 output->swap(tmp);
253 return true;
254 }
255
256 // Parses the charset and value-chars out of an ext-value string.
257 //
258 // ext-value = charset "'" [ language ] "'" value-chars
259 bool ParseExtValueComponents(const std::string& input,
260 std::string* charset,
261 std::string* value_chars) {
262 StringTokenizer t(input, "'");
263 t.set_options(StringTokenizer::RETURN_DELIMS);
264 std::string temp_charset;
265 std::string temp_value;
266 int numDelimsSeen = 0;
267 while (t.GetNext()) {
268 if (t.token_is_delim()) {
269 ++numDelimsSeen;
270 continue;
271 } else {
272 switch (numDelimsSeen) {
273 case 0:
274 temp_charset = t.token();
275 break;
276 case 1:
277 // Language is ignored.
278 break;
279 case 2:
280 temp_value = t.token();
281 break;
282 default:
283 return false;
284 }
285 }
286 }
287 if (numDelimsSeen != 2)
288 return false;
289 if (temp_charset.empty() || temp_value.empty())
290 return false;
291 charset->swap(temp_charset);
292 value_chars->swap(temp_value);
293 return true;
294 }
295
296 // http://tools.ietf.org/html/rfc5987#section-3.2
297 //
298 // ext-value = charset "'" [ language ] "'" value-chars
299 //
300 // charset = "UTF-8" / "ISO-8859-1" / mime-charset
301 //
302 // mime-charset = 1*mime-charsetc
303 // mime-charsetc = ALPHA / DIGIT
304 // / "!" / "#" / "$" / "%" / "&"
305 // / "+" / "-" / "^" / "_" / "`"
306 // / "{" / "}" / "~"
307 //
308 // language = <Language-Tag, defined in [RFC5646], Section 2.1>
309 //
310 // value-chars = *( pct-encoded / attr-char )
311 //
312 // pct-encoded = "%" HEXDIG HEXDIG
313 //
314 // attr-char = ALPHA / DIGIT
315 // / "!" / "#" / "$" / "&" / "+" / "-" / "."
316 // / "^" / "_" / "`" / "|" / "~"
317 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
318 if (param_value.find('"') != std::string::npos)
319 return false;
320
321 std::string charset;
322 std::string value;
323 if (!ParseExtValueComponents(param_value, &charset, &value))
324 return false;
325
326 // RFC 5987 value should be ASCII-only.
327 if (!IsStringASCII(value)) {
328 decoded->clear();
329 return true;
330 }
331
332 std::string unescaped = UnescapeURLComponent(value,
333 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
334
335 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded);
336 }
337
338 } // namespace
339
// Constructs the object from a raw header value. The disposition type is
// initialized to INLINE, then Parse() is run on |header| with
// |referrer_charset|.
HttpContentDisposition::HttpContentDisposition(
    const std::string& header, const std::string& referrer_charset)
  : type_(INLINE) {
  Parse(header, referrer_charset);
}
19 345
// Empty destructor body: no explicit cleanup is performed here.
HttpContentDisposition::~HttpContentDisposition() {
}
22 348
23 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( 349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after
92 418
93 if (!ext_filename.empty()) 419 if (!ext_filename.empty())
94 filename_ = ext_filename; 420 filename_ = ext_filename;
95 else if (!filename.empty()) 421 else if (!filename.empty())
96 filename_ = filename; 422 filename_ = filename;
97 else 423 else
98 filename_ = name; 424 filename_ = name;
99 } 425 }
100 426
101 } // namespace net 427 } // namespace net
OLDNEW
« no previous file with comments | « net/base/net_util_unittest.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698