OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "net/http/http_content_disposition.h" | |
6 | |
7 #include "base/base64.h" | |
8 #include "base/logging.h" | |
9 #include "base/strings/string_tokenizer.h" | |
10 #include "base/strings/string_util.h" | |
11 #include "base/strings/sys_string_conversions.h" | |
12 #include "base/strings/utf_string_conversions.h" | |
13 #include "net/base/net_string_util.h" | |
14 #include "net/base/net_util.h" | |
15 #include "net/http/http_util.h" | |
16 | |
17 namespace { | |
18 | |
// Identifies which of the two RFC 2047 "encoded-word" encodings a word uses.
enum RFC2047EncodingType {
  Q_ENCODING,  // "Q" encoding; handled by DecodeQEncoding().
  B_ENCODING   // "B" encoding; handled as base64.
};
23 | |
24 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to | |
25 // decoding a quoted-printable string. Returns true if the input was valid. | |
26 bool DecodeQEncoding(const std::string& input, std::string* output) { | |
27 std::string temp; | |
28 temp.reserve(input.size()); | |
29 for (std::string::const_iterator it = input.begin(); it != input.end(); | |
30 ++it) { | |
31 if (*it == '_') { | |
32 temp.push_back(' '); | |
33 } else if (*it == '=') { | |
34 if ((input.end() - it < 3) || | |
35 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || | |
36 !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) | |
37 return false; | |
38 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + | |
39 HexDigitToInt(*(it + 2)); | |
40 temp.push_back(static_cast<char>(ch)); | |
41 ++it; | |
42 ++it; | |
43 } else if (0x20 < *it && *it < 0x7F && *it != '?') { | |
44 // In a Q-encoded word, only printable ASCII characters | |
45 // represent themselves. Besides, space, '=', '_' and '?' are | |
46 // not allowed, but they're already filtered out. | |
47 DCHECK_NE('=', *it); | |
48 DCHECK_NE('?', *it); | |
49 DCHECK_NE('_', *it); | |
50 temp.push_back(*it); | |
51 } else { | |
52 return false; | |
53 } | |
54 } | |
55 output->swap(temp); | |
56 return true; | |
57 } | |
58 | |
59 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding | |
60 // type is specified in |enc_type|. | |
61 bool DecodeBQEncoding(const std::string& part, | |
62 RFC2047EncodingType enc_type, | |
63 const std::string& charset, | |
64 std::string* output) { | |
65 std::string decoded; | |
66 if (!((enc_type == B_ENCODING) ? | |
67 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) { | |
68 return false; | |
69 } | |
70 | |
71 if (decoded.empty()) { | |
72 output->clear(); | |
73 return true; | |
74 } | |
75 | |
76 return net::ConvertToUtf8(decoded, charset.c_str(), output); | |
77 } | |
78 | |
79 bool DecodeWord(const std::string& encoded_word, | |
80 const std::string& referrer_charset, | |
81 bool* is_rfc2047, | |
82 std::string* output, | |
83 int* parse_result_flags) { | |
84 *is_rfc2047 = false; | |
85 output->clear(); | |
86 if (encoded_word.empty()) | |
87 return true; | |
88 | |
89 if (!base::IsStringASCII(encoded_word)) { | |
90 // Try UTF-8, referrer_charset and the native OS default charset in turn. | |
91 if (base::IsStringUTF8(encoded_word)) { | |
92 *output = encoded_word; | |
93 } else { | |
94 base::string16 utf16_output; | |
95 if (!referrer_charset.empty() && | |
96 net::ConvertToUTF16(encoded_word, referrer_charset.c_str(), | |
97 &utf16_output)) { | |
98 *output = base::UTF16ToUTF8(utf16_output); | |
99 } else { | |
100 *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word)); | |
101 } | |
102 } | |
103 | |
104 *parse_result_flags |= net::HttpContentDisposition::HAS_NON_ASCII_STRINGS; | |
105 return true; | |
106 } | |
107 | |
108 // RFC 2047 : one of encoding methods supported by Firefox and relatively | |
109 // widely used by web servers. | |
110 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. | |
111 // We don't care about the length restriction (72 bytes) because | |
112 // many web servers generate encoded words longer than the limit. | |
113 std::string decoded_word; | |
114 *is_rfc2047 = true; | |
115 int part_index = 0; | |
116 std::string charset; | |
117 base::StringTokenizer t(encoded_word, "?"); | |
118 RFC2047EncodingType enc_type = Q_ENCODING; | |
119 while (*is_rfc2047 && t.GetNext()) { | |
120 std::string part = t.token(); | |
121 switch (part_index) { | |
122 case 0: | |
123 if (part != "=") { | |
124 *is_rfc2047 = false; | |
125 break; | |
126 } | |
127 ++part_index; | |
128 break; | |
129 case 1: | |
130 // Do we need charset validity check here? | |
131 charset = part; | |
132 ++part_index; | |
133 break; | |
134 case 2: | |
135 if (part.size() > 1 || | |
136 part.find_first_of("bBqQ") == std::string::npos) { | |
137 *is_rfc2047 = false; | |
138 break; | |
139 } | |
140 if (part[0] == 'b' || part[0] == 'B') { | |
141 enc_type = B_ENCODING; | |
142 } | |
143 ++part_index; | |
144 break; | |
145 case 3: | |
146 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word); | |
147 if (!*is_rfc2047) { | |
148 // Last minute failure. Invalid B/Q encoding. Rather than | |
149 // passing it through, return now. | |
150 return false; | |
151 } | |
152 ++part_index; | |
153 break; | |
154 case 4: | |
155 if (part != "=") { | |
156 // Another last minute failure ! | |
157 // Likely to be a case of two encoded-words in a row or | |
158 // an encoded word followed by a non-encoded word. We can be | |
159 // generous, but it does not help much in terms of compatibility, | |
160 // I believe. Return immediately. | |
161 *is_rfc2047 = false; | |
162 return false; | |
163 } | |
164 ++part_index; | |
165 break; | |
166 default: | |
167 *is_rfc2047 = false; | |
168 return false; | |
169 } | |
170 } | |
171 | |
172 if (*is_rfc2047) { | |
173 if (*(encoded_word.end() - 1) == '=') { | |
174 output->swap(decoded_word); | |
175 *parse_result_flags |= | |
176 net::HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS; | |
177 return true; | |
178 } | |
179 // encoded_word ending prematurelly with '?' or extra '?' | |
180 *is_rfc2047 = false; | |
181 return false; | |
182 } | |
183 | |
184 // We're not handling 'especial' characters quoted with '\', but | |
185 // it should be Ok because we're not an email client but a | |
186 // web browser. | |
187 | |
188 // What IE6/7 does: %-escaped UTF-8. | |
189 decoded_word = net::UnescapeURLComponent(encoded_word, | |
190 net::UnescapeRule::SPACES); | |
191 if (decoded_word != encoded_word) | |
192 *parse_result_flags |= | |
193 net::HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS; | |
194 if (base::IsStringUTF8(decoded_word)) { | |
195 output->swap(decoded_word); | |
196 return true; | |
197 // We can try either the OS default charset or 'origin charset' here, | |
198 // As far as I can tell, IE does not support it. However, I've seen | |
199 // web servers emit %-escaped string in a legacy encoding (usually | |
200 // origin charset). | |
201 // TODO(jungshik) : Test IE further and consider adding a fallback here. | |
202 } | |
203 return false; | |
204 } | |
205 | |
206 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The | |
207 // value is supposed to be of the form: | |
208 // | |
209 // value = token | quoted-string | |
210 // | |
211 // However we currently also allow RFC 2047 encoding and non-ASCII | |
212 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. | |
213 bool DecodeFilenameValue(const std::string& input, | |
214 const std::string& referrer_charset, | |
215 std::string* output, | |
216 int* parse_result_flags) { | |
217 int current_parse_result_flags = 0; | |
218 std::string decoded_value; | |
219 bool is_previous_token_rfc2047 = true; | |
220 | |
221 // Tokenize with whitespace characters. | |
222 base::StringTokenizer t(input, " \t\n\r"); | |
223 t.set_options(base::StringTokenizer::RETURN_DELIMS); | |
224 while (t.GetNext()) { | |
225 if (t.token_is_delim()) { | |
226 // If the previous non-delimeter token is not RFC2047-encoded, | |
227 // put in a space in its place. Otheriwse, skip over it. | |
228 if (!is_previous_token_rfc2047) | |
229 decoded_value.push_back(' '); | |
230 continue; | |
231 } | |
232 // We don't support a single multibyte character split into | |
233 // adjacent encoded words. Some broken mail clients emit headers | |
234 // with that problem, but most web servers usually encode a filename | |
235 // in a single encoded-word. Firefox/Thunderbird do not support | |
236 // it, either. | |
237 std::string decoded; | |
238 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, | |
239 &decoded, ¤t_parse_result_flags)) | |
240 return false; | |
241 decoded_value.append(decoded); | |
242 } | |
243 output->swap(decoded_value); | |
244 if (parse_result_flags && !output->empty()) | |
245 *parse_result_flags |= current_parse_result_flags; | |
246 return true; | |
247 } | |
248 | |
249 // Parses the charset and value-chars out of an ext-value string. | |
250 // | |
251 // ext-value = charset "'" [ language ] "'" value-chars | |
252 bool ParseExtValueComponents(const std::string& input, | |
253 std::string* charset, | |
254 std::string* value_chars) { | |
255 base::StringTokenizer t(input, "'"); | |
256 t.set_options(base::StringTokenizer::RETURN_DELIMS); | |
257 std::string temp_charset; | |
258 std::string temp_value; | |
259 int numDelimsSeen = 0; | |
260 while (t.GetNext()) { | |
261 if (t.token_is_delim()) { | |
262 ++numDelimsSeen; | |
263 continue; | |
264 } else { | |
265 switch (numDelimsSeen) { | |
266 case 0: | |
267 temp_charset = t.token(); | |
268 break; | |
269 case 1: | |
270 // Language is ignored. | |
271 break; | |
272 case 2: | |
273 temp_value = t.token(); | |
274 break; | |
275 default: | |
276 return false; | |
277 } | |
278 } | |
279 } | |
280 if (numDelimsSeen != 2) | |
281 return false; | |
282 if (temp_charset.empty() || temp_value.empty()) | |
283 return false; | |
284 charset->swap(temp_charset); | |
285 value_chars->swap(temp_value); | |
286 return true; | |
287 } | |
288 | |
289 // http://tools.ietf.org/html/rfc5987#section-3.2 | |
290 // | |
291 // ext-value = charset "'" [ language ] "'" value-chars | |
292 // | |
293 // charset = "UTF-8" / "ISO-8859-1" / mime-charset | |
294 // | |
295 // mime-charset = 1*mime-charsetc | |
296 // mime-charsetc = ALPHA / DIGIT | |
297 // / "!" / "#" / "$" / "%" / "&" | |
298 // / "+" / "-" / "^" / "_" / "`" | |
299 // / "{" / "}" / "~" | |
300 // | |
301 // language = <Language-Tag, defined in [RFC5646], Section 2.1> | |
302 // | |
303 // value-chars = *( pct-encoded / attr-char ) | |
304 // | |
305 // pct-encoded = "%" HEXDIG HEXDIG | |
306 // | |
307 // attr-char = ALPHA / DIGIT | |
308 // / "!" / "#" / "$" / "&" / "+" / "-" / "." | |
309 // / "^" / "_" / "`" / "|" / "~" | |
310 bool DecodeExtValue(const std::string& param_value, std::string* decoded) { | |
311 if (param_value.find('"') != std::string::npos) | |
312 return false; | |
313 | |
314 std::string charset; | |
315 std::string value; | |
316 if (!ParseExtValueComponents(param_value, &charset, &value)) | |
317 return false; | |
318 | |
319 // RFC 5987 value should be ASCII-only. | |
320 if (!base::IsStringASCII(value)) { | |
321 decoded->clear(); | |
322 return true; | |
323 } | |
324 | |
325 std::string unescaped = net::UnescapeURLComponent( | |
326 value, net::UnescapeRule::SPACES | net::UnescapeRule::URL_SPECIAL_CHARS); | |
327 | |
328 return net::ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded); | |
329 } | |
330 | |
331 } // namespace | |
332 | |
333 namespace net { | |
334 | |
335 HttpContentDisposition::HttpContentDisposition( | |
336 const std::string& header, const std::string& referrer_charset) | |
337 : type_(INLINE), | |
338 parse_result_flags_(INVALID) { | |
339 Parse(header, referrer_charset); | |
340 } | |
341 | |
342 HttpContentDisposition::~HttpContentDisposition() { | |
343 } | |
344 | |
345 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( | |
346 std::string::const_iterator begin, std::string::const_iterator end) { | |
347 DCHECK(type_ == INLINE); | |
348 std::string::const_iterator delimiter = std::find(begin, end, ';'); | |
349 | |
350 std::string::const_iterator type_begin = begin; | |
351 std::string::const_iterator type_end = delimiter; | |
352 HttpUtil::TrimLWS(&type_begin, &type_end); | |
353 | |
354 // If the disposition-type isn't a valid token the then the | |
355 // Content-Disposition header is malformed, and we treat the first bytes as | |
356 // a parameter rather than a disposition-type. | |
357 if (!HttpUtil::IsToken(type_begin, type_end)) | |
358 return begin; | |
359 | |
360 parse_result_flags_ |= HAS_DISPOSITION_TYPE; | |
361 | |
362 DCHECK(std::find(type_begin, type_end, '=') == type_end); | |
363 | |
364 if (LowerCaseEqualsASCII(type_begin, type_end, "inline")) { | |
365 type_ = INLINE; | |
366 } else if (LowerCaseEqualsASCII(type_begin, type_end, "attachment")) { | |
367 type_ = ATTACHMENT; | |
368 } else { | |
369 parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE; | |
370 type_ = ATTACHMENT; | |
371 } | |
372 return delimiter; | |
373 } | |
374 | |
375 // http://tools.ietf.org/html/rfc6266 | |
376 // | |
377 // content-disposition = "Content-Disposition" ":" | |
378 // disposition-type *( ";" disposition-parm ) | |
379 // | |
380 // disposition-type = "inline" | "attachment" | disp-ext-type | |
381 // ; case-insensitive | |
382 // disp-ext-type = token | |
383 // | |
384 // disposition-parm = filename-parm | disp-ext-parm | |
385 // | |
386 // filename-parm = "filename" "=" value | |
387 // | "filename*" "=" ext-value | |
388 // | |
389 // disp-ext-parm = token "=" value | |
390 // | ext-token "=" ext-value | |
391 // ext-token = <the characters in token, followed by "*"> | |
392 // | |
393 void HttpContentDisposition::Parse(const std::string& header, | |
394 const std::string& referrer_charset) { | |
395 DCHECK(type_ == INLINE); | |
396 DCHECK(filename_.empty()); | |
397 | |
398 std::string::const_iterator pos = header.begin(); | |
399 std::string::const_iterator end = header.end(); | |
400 pos = ConsumeDispositionType(pos, end); | |
401 | |
402 std::string name; | |
403 std::string filename; | |
404 std::string ext_filename; | |
405 | |
406 HttpUtil::NameValuePairsIterator iter(pos, end, ';'); | |
407 while (iter.GetNext()) { | |
408 if (filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), | |
409 iter.name_end(), | |
410 "filename")) { | |
411 DecodeFilenameValue(iter.value(), referrer_charset, &filename, | |
412 &parse_result_flags_); | |
413 if (!filename.empty()) | |
414 parse_result_flags_ |= HAS_FILENAME; | |
415 } else if (name.empty() && LowerCaseEqualsASCII(iter.name_begin(), | |
416 iter.name_end(), | |
417 "name")) { | |
418 DecodeFilenameValue(iter.value(), referrer_charset, &name, NULL); | |
419 if (!name.empty()) | |
420 parse_result_flags_ |= HAS_NAME; | |
421 } else if (ext_filename.empty() && LowerCaseEqualsASCII(iter.name_begin(), | |
422 iter.name_end(), | |
423 "filename*")) { | |
424 DecodeExtValue(iter.raw_value(), &ext_filename); | |
425 if (!ext_filename.empty()) | |
426 parse_result_flags_ |= HAS_EXT_FILENAME; | |
427 } | |
428 } | |
429 | |
430 if (!ext_filename.empty()) | |
431 filename_ = ext_filename; | |
432 else if (!filename.empty()) | |
433 filename_ = filename; | |
434 else | |
435 filename_ = name; | |
436 } | |
437 | |
438 } // namespace net | |
OLD | NEW |