OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "net/http/http_content_disposition.h" | 5 #include "net/http/http_content_disposition.h" |
6 | 6 |
7 #include "base/base64.h" | |
8 #include "base/i18n/icu_string_conversions.h" | |
7 #include "base/logging.h" | 9 #include "base/logging.h" |
8 #include "base/string_util.h" | 10 #include "base/string_util.h" |
11 #include "base/sys_string_conversions.h" | |
12 #include "base/utf_string_conversions.h" | |
9 #include "net/base/net_util.h" | 13 #include "net/base/net_util.h" |
10 #include "net/http/http_util.h" | 14 #include "net/http/http_util.h" |
15 #include "unicode/ucnv.h" | |
11 | 16 |
12 namespace net { | 17 namespace net { |
13 | 18 |
19 namespace { | |
rvargas (doing something else)
2012/12/13 22:44:43
nit: It looks like all this code is fairly indepen
asanka
2012/12/13 23:28:47
Done in patch set 3.
| |
20 | |
// The two content-transfer encodings an RFC 2047 encoded-word may use.
enum RFC2047EncodingType {
  Q_ENCODING = 0,  // "Q": similar to quoted-printable.
  B_ENCODING = 1   // "B": base64.
};
25 | |
26 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to | |
27 // decoding a quoted-printable string. Returns true if the input was valid. | |
28 bool DecodeQEncoding(const std::string& input, std::string* output) { | |
29 std::string temp; | |
30 temp.reserve(input.size()); | |
31 for (std::string::const_iterator it = input.begin(); it != input.end(); | |
32 ++it) { | |
33 if (*it == '_') { | |
34 temp.push_back(' '); | |
35 } else if (*it == '=') { | |
36 if ((input.end() - it < 3) || | |
37 !IsHexDigit(static_cast<unsigned char>(*(it + 1))) || | |
38 !IsHexDigit(static_cast<unsigned char>(*(it + 2)))) | |
39 return false; | |
40 unsigned char ch = HexDigitToInt(*(it + 1)) * 16 + | |
41 HexDigitToInt(*(it + 2)); | |
42 temp.push_back(static_cast<char>(ch)); | |
43 ++it; | |
44 ++it; | |
45 } else if (0x20 < *it && *it < 0x7F && *it != '?') { | |
46 // In a Q-encoded word, only printable ASCII characters | |
47 // represent themselves. Besides, space, '=', '_' and '?' are | |
48 // not allowed, but they're already filtered out. | |
49 DCHECK_NE('=', *it); | |
50 DCHECK_NE('?', *it); | |
51 DCHECK_NE('_', *it); | |
52 temp.push_back(*it); | |
53 } else { | |
54 return false; | |
55 } | |
56 } | |
57 output->swap(temp); | |
58 return true; | |
59 } | |
60 | |
61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding | |
62 // type is specified in |enc_type|. | |
63 bool DecodeBQEncoding(const std::string& part, | |
64 RFC2047EncodingType enc_type, | |
65 const std::string& charset, | |
66 std::string* output) { | |
67 std::string decoded; | |
68 if (!((enc_type == B_ENCODING) ? | |
69 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) | |
70 return false; | |
71 | |
72 if (decoded.empty()) { | |
73 output->clear(); | |
74 return true; | |
75 } | |
76 | |
77 UErrorCode err = U_ZERO_ERROR; | |
78 UConverter* converter(ucnv_open(charset.c_str(), &err)); | |
79 if (U_FAILURE(err)) | |
80 return false; | |
81 | |
82 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. | |
83 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes | |
84 // in UTF-8. Therefore, the expansion ratio is 3 at most. Add one for a | |
85 // trailing '\0'. | |
86 size_t output_length = decoded.length() * 3 + 1; | |
87 char* buf = WriteInto(output, output_length); | |
88 output_length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, output_length, | |
89 decoded.data(), decoded.length(), &err); | |
90 ucnv_close(converter); | |
91 if (U_FAILURE(err)) | |
92 return false; | |
93 output->resize(output_length); | |
94 return true; | |
95 } | |
96 | |
97 bool DecodeWord(const std::string& encoded_word, | |
98 const std::string& referrer_charset, | |
99 bool* is_rfc2047, | |
100 std::string* output) { | |
101 *is_rfc2047 = false; | |
102 output->clear(); | |
103 if (encoded_word.empty()) | |
104 return true; | |
105 | |
106 if (!IsStringASCII(encoded_word)) { | |
107 // Try UTF-8, referrer_charset and the native OS default charset in turn. | |
108 if (IsStringUTF8(encoded_word)) { | |
109 *output = encoded_word; | |
110 } else { | |
111 string16 utf16_output; | |
112 if (!referrer_charset.empty() && | |
113 base::CodepageToUTF16(encoded_word, referrer_charset.c_str(), | |
114 base::OnStringConversionError::FAIL, | |
115 &utf16_output)) { | |
116 *output = UTF16ToUTF8(utf16_output); | |
117 } else { | |
118 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word)); | |
119 } | |
120 } | |
121 | |
122 return true; | |
123 } | |
124 | |
125 // RFC 2047 : one of encoding methods supported by Firefox and relatively | |
126 // widely used by web servers. | |
127 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. | |
128 // We don't care about the length restriction (72 bytes) because | |
129 // many web servers generate encoded words longer than the limit. | |
130 std::string tmp; | |
131 *is_rfc2047 = true; | |
132 int part_index = 0; | |
133 std::string charset; | |
134 StringTokenizer t(encoded_word, "?"); | |
135 RFC2047EncodingType enc_type = Q_ENCODING; | |
136 while (*is_rfc2047 && t.GetNext()) { | |
137 std::string part = t.token(); | |
138 switch (part_index) { | |
139 case 0: | |
140 if (part != "=") { | |
141 *is_rfc2047 = false; | |
142 break; | |
143 } | |
144 ++part_index; | |
145 break; | |
146 case 1: | |
147 // Do we need charset validity check here? | |
148 charset = part; | |
149 ++part_index; | |
150 break; | |
151 case 2: | |
152 if (part.size() > 1 || | |
153 part.find_first_of("bBqQ") == std::string::npos) { | |
154 *is_rfc2047 = false; | |
155 break; | |
156 } | |
157 if (part[0] == 'b' || part[0] == 'B') { | |
158 enc_type = B_ENCODING; | |
159 } | |
160 ++part_index; | |
161 break; | |
162 case 3: | |
163 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); | |
164 if (!*is_rfc2047) { | |
165 // Last minute failure. Invalid B/Q encoding. Rather than | |
166 // passing it through, return now. | |
167 return false; | |
168 } | |
169 ++part_index; | |
170 break; | |
171 case 4: | |
172 if (part != "=") { | |
173 // Another last minute failure ! | |
174 // Likely to be a case of two encoded-words in a row or | |
175 // an encoded word followed by a non-encoded word. We can be | |
176 // generous, but it does not help much in terms of compatibility, | |
177 // I believe. Return immediately. | |
178 *is_rfc2047 = false; | |
179 return false; | |
180 } | |
181 ++part_index; | |
182 break; | |
183 default: | |
184 *is_rfc2047 = false; | |
185 return false; | |
186 } | |
187 } | |
188 | |
189 if (*is_rfc2047) { | |
190 if (*(encoded_word.end() - 1) == '=') { | |
191 output->swap(tmp); | |
192 return true; | |
193 } | |
194 // encoded_word ending prematurelly with '?' or extra '?' | |
195 *is_rfc2047 = false; | |
196 return false; | |
197 } | |
198 | |
199 // We're not handling 'especial' characters quoted with '\', but | |
200 // it should be Ok because we're not an email client but a | |
201 // web browser. | |
202 | |
203 // What IE6/7 does: %-escaped UTF-8. | |
204 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); | |
205 if (IsStringUTF8(tmp)) { | |
206 output->swap(tmp); | |
207 return true; | |
208 // We can try either the OS default charset or 'origin charset' here, | |
209 // As far as I can tell, IE does not support it. However, I've seen | |
210 // web servers emit %-escaped string in a legacy encoding (usually | |
211 // origin charset). | |
212 // TODO(jungshik) : Test IE further and consider adding a fallback here. | |
213 } | |
214 return false; | |
215 } | |
216 | |
217 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The | |
218 // value is supposed to be of the form: | |
219 // | |
220 // value = token | quoted-string | |
221 // | |
222 // However we currently also allow RFC 2047 encoding and non-ASCII | |
223 // strings. Non-ASCII strings are interpreted based on |referrer_charset|. | |
224 bool DecodeFilenameValue(const std::string& input, | |
225 const std::string& referrer_charset, | |
226 std::string* output) { | |
227 std::string tmp; | |
228 // Tokenize with whitespace characters. | |
229 StringTokenizer t(input, " \t\n\r"); | |
230 t.set_options(StringTokenizer::RETURN_DELIMS); | |
231 bool is_previous_token_rfc2047 = true; | |
232 while (t.GetNext()) { | |
233 if (t.token_is_delim()) { | |
234 // If the previous non-delimeter token is not RFC2047-encoded, | |
235 // put in a space in its place. Otheriwse, skip over it. | |
236 if (!is_previous_token_rfc2047) { | |
237 tmp.push_back(' '); | |
238 } | |
239 continue; | |
240 } | |
241 // We don't support a single multibyte character split into | |
242 // adjacent encoded words. Some broken mail clients emit headers | |
243 // with that problem, but most web servers usually encode a filename | |
244 // in a single encoded-word. Firefox/Thunderbird do not support | |
245 // it, either. | |
246 std::string decoded; | |
247 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047, | |
248 &decoded)) | |
249 return false; | |
250 tmp.append(decoded); | |
251 } | |
252 output->swap(tmp); | |
253 return true; | |
254 } | |
255 | |
256 // Parses the charset and value-chars out of an ext-value string. | |
257 // | |
258 // ext-value = charset "'" [ language ] "'" value-chars | |
259 bool ParseExtValueComponents(const std::string& input, | |
260 std::string* charset, | |
261 std::string* value_chars) { | |
262 StringTokenizer t(input, "'"); | |
263 t.set_options(StringTokenizer::RETURN_DELIMS); | |
264 std::string temp_charset; | |
265 std::string temp_value; | |
266 int numDelimsSeen = 0; | |
267 while (t.GetNext()) { | |
268 if (t.token_is_delim()) { | |
269 ++numDelimsSeen; | |
270 continue; | |
271 } else { | |
272 switch (numDelimsSeen) { | |
273 case 0: | |
274 temp_charset = t.token(); | |
275 break; | |
276 case 1: | |
277 // Language is ignored. | |
278 break; | |
279 case 2: | |
280 temp_value = t.token(); | |
281 break; | |
282 default: | |
283 return false; | |
284 } | |
285 } | |
286 } | |
287 if (numDelimsSeen != 2) | |
288 return false; | |
289 if (temp_charset.empty() || temp_value.empty()) | |
290 return false; | |
291 charset->swap(temp_charset); | |
292 value_chars->swap(temp_value); | |
293 return true; | |
294 } | |
295 | |
296 // http://tools.ietf.org/html/rfc5987#section-3.2 | |
297 // | |
298 // ext-value = charset "'" [ language ] "'" value-chars | |
299 // | |
300 // charset = "UTF-8" / "ISO-8859-1" / mime-charset | |
301 // | |
302 // mime-charset = 1*mime-charsetc | |
303 // mime-charsetc = ALPHA / DIGIT | |
304 // / "!" / "#" / "$" / "%" / "&" | |
305 // / "+" / "-" / "^" / "_" / "`" | |
306 // / "{" / "}" / "~" | |
307 // | |
308 // language = <Language-Tag, defined in [RFC5646], Section 2.1> | |
309 // | |
310 // value-chars = *( pct-encoded / attr-char ) | |
311 // | |
312 // pct-encoded = "%" HEXDIG HEXDIG | |
313 // | |
314 // attr-char = ALPHA / DIGIT | |
315 // / "!" / "#" / "$" / "&" / "+" / "-" / "." | |
316 // / "^" / "_" / "`" / "|" / "~" | |
317 bool DecodeExtValue(const std::string& param_value, std::string* decoded) { | |
318 if (param_value.find('"') != std::string::npos) | |
319 return false; | |
320 | |
321 std::string charset; | |
322 std::string value; | |
323 if (!ParseExtValueComponents(param_value, &charset, &value)) | |
324 return false; | |
325 | |
326 // RFC 5987 value should be ASCII-only. | |
327 if (!IsStringASCII(value)) { | |
328 decoded->clear(); | |
329 return true; | |
330 } | |
331 | |
332 std::string unescaped = UnescapeURLComponent(value, | |
333 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); | |
334 | |
335 return base::ConvertToUtf8AndNormalize(unescaped, charset, decoded); | |
336 } | |
337 | |
338 } // namespace | |
339 | |
14 HttpContentDisposition::HttpContentDisposition( | 340 HttpContentDisposition::HttpContentDisposition( |
15 const std::string& header, const std::string& referrer_charset) | 341 const std::string& header, const std::string& referrer_charset) |
16 : type_(INLINE) { | 342 : type_(INLINE) { |
17 Parse(header, referrer_charset); | 343 Parse(header, referrer_charset); |
18 } | 344 } |
19 | 345 |
20 HttpContentDisposition::~HttpContentDisposition() { | 346 HttpContentDisposition::~HttpContentDisposition() { |
21 } | 347 } |
22 | 348 |
23 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( | 349 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType( |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
92 | 418 |
93 if (!ext_filename.empty()) | 419 if (!ext_filename.empty()) |
94 filename_ = ext_filename; | 420 filename_ = ext_filename; |
95 else if (!filename.empty()) | 421 else if (!filename.empty()) |
96 filename_ = filename; | 422 filename_ = filename; |
97 else | 423 else |
98 filename_ = name; | 424 filename_ = name; |
99 } | 425 } |
100 | 426 |
101 } // namespace net | 427 } // namespace net |
OLD | NEW |