OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h" | |
6 | |
7 #include <vector> | |
8 | |
9 #include "base/string_util.h" | |
10 #include "base/values.h" | |
11 #include "net/base/escape.h" | |
12 #include "net/url_request/url_request.h" | |
13 #include "third_party/re2/re2/re2.h" | |
14 | |
15 using base::DictionaryValue; | |
16 using base::ListValue; | |
17 using base::StringPiece; | |
18 using re2::RE2; | |
19 | |
20 namespace extensions { | |
21 | |
22 // Parses URLencoded forms, see | |
23 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . | |
24 class FormDataParserUrlEncoded : public FormDataParser { | |
25 public: | |
26 FormDataParserUrlEncoded(); | |
27 virtual ~FormDataParserUrlEncoded(); | |
28 | |
29 // Implementation of FormDataParser. | |
30 virtual bool AllDataReadOK() OVERRIDE; | |
31 virtual bool GetNextNameValue(Result* result) OVERRIDE; | |
32 virtual bool SetSource(const base::StringPiece& source) OVERRIDE; | |
33 | |
34 private: | |
35 // The pattern to match a single name-value pair. Ideally this should be | |
36 // static, so that it is constructed only once, independently on how many | |
37 // parser instances we have. However, then we would run into exit-time | |
38 // destructors problems. | |
39 const RE2 pattern_; | |
40 | |
41 static const size_t args_size_ = 2u; // Auxiliary constant for using RE2. | |
42 static const net::UnescapeRule::Type unescape_rules_; | |
43 | |
44 re2::StringPiece source_; | |
45 bool source_set_; | |
46 | |
47 // Auxiliary store for using RE2. | |
48 std::string name_; | |
49 std::string value_; | |
50 const RE2::Arg arg_name_; | |
51 const RE2::Arg arg_value_; | |
52 const RE2::Arg* args_[args_size_]; | |
53 | |
54 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); | |
55 }; | |
56 | |
57 // The following class, FormDataParserMultipart, parses forms encoded as | |
58 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart | |
59 // encoding) and 5322 (MIME-headers). | |
60 // | |
61 // Implementation details | |
62 // | |
63 // The original grammar from RFC 2046 is this, "multipart-body" being the root | |
64 // non-terminal: | |
65 // | |
66 // boundary := 0*69<bchars> bcharsnospace | |
67 // bchars := bcharsnospace / " " | |
68 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," | |
69 // / "-" / "." / "/" / ":" / "=" / "?" | |
70 // dash-boundary := "--" boundary | |
71 // multipart-body := [preamble CRLF] | |
72 // dash-boundary transport-padding CRLF | |
73 // body-part *encapsulation | |
74 // close-delimiter transport-padding | |
75 // [CRLF epilogue] | |
76 // transport-padding := *LWSP-char | |
77 // encapsulation := delimiter transport-padding CRLF body-part | |
78 // delimiter := CRLF dash-boundary | |
79 // close-delimiter := delimiter "--" | |
80 // preamble := discard-text | |
81 // epilogue := discard-text | |
82 // discard-text := *(*text CRLF) *text | |
83 // body-part := MIME-part-headers [CRLF *OCTET] | |
84 // OCTET := <any 0-255 octet value> | |
85 // | |
86 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF, | |
87 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the | |
88 // English alphabet, respectively. | |
89 // The non-terminal "text" is presumably just any text, excluding line breaks. | |
90 // The non-terminal "LWSP-char" is not directly defined in the original grammar | |
91 // but it means "linear whitespace", which is a space or a horizontal tab. | |
92 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use | |
93 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322: | |
94 // | |
95 // MIME-part-headers := field-name ":" unstructured CRLF | |
96 // field-name := 1*ftext | |
97 // ftext := %d33-57 / ; Printable US-ASCII | |
98 // %d59-126 ; characters not including ":". | |
99 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which | |
100 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and | |
101 // "CRLF<horizontal tab>", which serve for "folding". | |
102 // | |
103 // The FormDataParserMultipart class reads the input source and tries to parse it | |
104 // according to the grammar above, rooted at the "multipart-body" non-terminal. | |
105 // This happens in stages: | |
106 // | |
107 // 1. The optional preamble and the initial dash-boundary with transport padding | |
108 // and a CRLF are read and ignored. | |
109 // | |
110 // 2. Repeatedly each body part is read. The body parts can either serve to | |
111 // upload a file, or just a string of bytes. | |
112 // 2.a. The headers of that part are searched for the "content-disposition" | |
113 // header, which contains the name of the value represented by that body | |
114 // part. If the body-part is for file upload, that header also contains a | |
115 // filename. | |
116 // 2.b. The "*OCTET" part of the body part is then read and passed as the value | |
117 // of the name-value pair for body parts representing a string of bytes. | |
118 // For body parts for uploading a file the "*OCTET" part is just ignored | |
119 // and the filename is used for value instead. | |
120 // | |
121 // 3. The final close-delimiter and epilogue are read and ignored. | |
122 // | |
123 // IMPORTANT NOTE | |
124 // This parser supports multiple sources, i.e., SetSource can be called multiple | |
125 // times if the input is spread over several byte blocks. However, the split | |
126 // may only occur inside a body part, right after the trailing CRLF of headers. | |
127 class FormDataParserMultipart : public FormDataParser { | |
128 public: | |
129 explicit FormDataParserMultipart(const std::string& boundary_separator); | |
130 virtual ~FormDataParserMultipart(); | |
131 | |
132 // Implementation of FormDataParser. | |
133 virtual bool AllDataReadOK() OVERRIDE; | |
134 virtual bool GetNextNameValue(Result* result) OVERRIDE; | |
135 virtual bool SetSource(const base::StringPiece& source) OVERRIDE; | |
136 | |
137 private: | |
138 enum State { | |
139 STATE_INIT, // No input read yet. | |
140 STATE_READY, // Ready to call GetNextNameValue. | |
141 STATE_FINISHED, // Read the input until the end. | |
142 STATE_SUSPEND, // Waiting until a new |source_| is set. | |
143 STATE_ERROR | |
144 }; | |
145 | |
146 // Produces a regexp to match the |boundary| string. | |
147 static std::string GetDashBoundaryPattern(const std::string& boundary); | |
148 | |
149 // Tests whether |input| has a prefix matching |pattern|. | |
150 static bool LookAhead(const RE2& pattern, const re2::StringPiece& input); | |
151 | |
152 // If source_ starts with a header, consumes it. If the header is | |
153 // Content-Disposition, it also extracts |name| from "name=" and possibly | |
154 // |value| from "filename=" fields of that header. It only touches |name| or | |
155 // |value| if it finds the respective fields for them. Returns true if it | |
156 // consumed a header, false if it did not. Sets |value_assigned| to true if it | |
157 // has assigned to value, otherwise it sets it to false. | |
158 bool TryReadHeader(base::StringPiece* name, | |
159 base::StringPiece* value, | |
160 bool* value_assigned); | |
161 | |
162 // Helper to GetNextNameValue. Attempts to read the data portion of a body | |
163 // part. If |value| is not NULL but empty, it sets it to contain the data | |
164 // portion. Returns true when the reading was successful. | |
165 bool GetNextNameValueContinue(base::StringPiece* value); | |
166 | |
167 // Ideally those should be static, so that they are constructed only once, | |
168 // independently on how many parser instances we have. However, then we would | |
169 // run into exit-time destructors problems. | |
170 const RE2 transfer_padding_pattern_; | |
171 const RE2 crlf_pattern_; | |
172 const RE2 closing_pattern_; | |
173 const RE2 epilogue_pattern_; | |
174 const RE2 crlf_free_pattern_; | |
175 const RE2 preamble_pattern_; | |
176 const RE2 header_pattern_; | |
177 const RE2 content_disposition_pattern_; | |
178 const RE2 name_pattern_; | |
179 const RE2 value_pattern_; | |
180 | |
181 const RE2 dash_boundary_pattern_; | |
182 | |
183 // Because of initialisation dependency, |state_| needs to be declared after | |
184 // |dash_boundary_pattern_|. | |
185 State state_; | |
186 | |
187 // The parsed message can be split into multiple sources which we read | |
188 // sequentially. | |
189 re2::StringPiece source_; | |
190 | |
191 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); | |
192 }; | |
193 | |
194 // Implementation of FormDataParser and FormDataParser::Result . | |
195 | |
196 FormDataParser::Result::Result() {} | |
197 FormDataParser::Result::~Result() {} | |
198 | |
199 void FormDataParser::Result::Reset() { | |
200 name_.erase(); | |
201 value_.erase(); | |
202 } | |
203 | |
204 FormDataParser::~FormDataParser() {} | |
205 | |
206 // static | |
207 scoped_ptr<FormDataParser> FormDataParser::Create( | |
208 const net::URLRequest* request) { | |
209 std::string value; | |
210 const bool found = request->extra_request_headers().GetHeader( | |
211 net::HttpRequestHeaders::kContentType, &value); | |
212 return Create(found ? &value : NULL); | |
213 } | |
214 | |
215 // static | |
216 scoped_ptr<FormDataParser> FormDataParser::Create( | |
217 const std::string* content_type_header) { | |
218 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE}; | |
219 ParserChoice choice = ERROR_CHOICE; | |
220 std::string boundary; | |
221 | |
222 if (content_type_header == NULL) { | |
223 choice = URL_ENCODED; | |
224 } else { | |
225 const std::string content_type( | |
226 content_type_header->substr(0, content_type_header->find(';'))); | |
227 | |
228 if (base::strcasecmp( | |
229 content_type.c_str(), "application/x-www-form-urlencoded") == 0) { | |
230 choice = URL_ENCODED; | |
231 } else if (base::strcasecmp( | |
232 content_type.c_str(), "multipart/form-data") == 0) { | |
233 static const char kBoundaryString[] = "boundary="; | |
234 size_t offset = content_type_header->find(kBoundaryString); | |
235 if (offset == std::string::npos) { | |
236 // Malformed header. | |
237 return scoped_ptr<FormDataParser>(); | |
238 } | |
239 offset += sizeof(kBoundaryString) - 1; | |
240 boundary = content_type_header->substr( | |
241 offset, content_type_header->find(';', offset)); | |
242 if (!boundary.empty()) | |
243 choice = MULTIPART; | |
244 } | |
245 } | |
246 // Other cases are unparseable, including when |content_type| is "text/plain". | |
247 | |
248 switch (choice) { | |
249 case URL_ENCODED: | |
250 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded()); | |
251 case MULTIPART: | |
252 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary)); | |
253 default: // In other words, case ERROR_CHOICE: | |
254 return scoped_ptr<FormDataParser>(); | |
255 } | |
256 } | |
257 | |
258 FormDataParser::FormDataParser() {} | |
259 | |
260 // Implementation of FormDataParserUrlEncoded. | |
261 | |
262 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ = | |
263 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS | | |
264 net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE; | |
265 | |
266 FormDataParserUrlEncoded::FormDataParserUrlEncoded() | |
267 : pattern_("([^=]*)=([^&]*)&?"), | |
268 source_(NULL), | |
269 source_set_(false), | |
270 arg_name_(&name_), | |
271 arg_value_(&value_) { | |
272 args_[0] = &arg_name_; | |
273 args_[1] = &arg_value_; | |
274 } | |
275 | |
276 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {} | |
277 | |
278 bool FormDataParserUrlEncoded::AllDataReadOK() { | |
279 // All OK means we read the whole source. | |
280 return source_set_ && source_.size() == 0; | |
281 } | |
282 | |
283 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) { | |
284 if (!source_set_) | |
285 return false; | |
286 | |
287 bool success = RE2::ConsumeN(&source_, pattern_, args_, args_size_); | |
288 if (success) { | |
289 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_)); | |
290 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_)); | |
291 } | |
292 return success; | |
293 } | |
294 | |
295 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) { | |
296 if (source_set_) | |
297 return false; // We do not allow multiple sources for this parser. | |
298 source_.set(source.data(), source.size()); | |
299 source_set_ = true; | |
300 return true; | |
301 } | |
302 | |
303 // Implementation of FormDataParserMultipart. | |
304 | |
305 // static | |
306 std::string FormDataParserMultipart::GetDashBoundaryPattern( | |
307 const std::string& boundary) { | |
308 static const char escape_closing_quote[] = "\\\\E"; | |
309 static const RE2 unquote_pattern(escape_closing_quote); | |
vabr (Chromium)
2012/08/30 12:26:48
Note to myself -- make this a non-static data memb
| |
310 #define OPEN_QUOTE "\\Q" | |
311 static const char opening_quote[] = OPEN_QUOTE; | |
312 static const char closing_quote[] = "\\E"; | |
313 | |
314 std::string output(OPEN_QUOTE "--"); // Let us start with the "--". | |
315 #undef OPEN_QUOTE | |
316 re2::StringPiece seek_unquote(boundary); | |
317 const char* copy_start = boundary.data(); | |
318 size_t copy_length = boundary.size(); | |
319 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern)) { | |
320 copy_length = seek_unquote.data() - copy_start; | |
321 output.append(copy_start, copy_length); | |
322 output.append(escape_closing_quote); | |
323 output.append(opening_quote); | |
324 copy_start = seek_unquote.data(); | |
325 } | |
326 copy_length = (boundary.data() + boundary.size()) - copy_start; | |
327 output.append(copy_start, copy_length); | |
328 output.append(closing_quote); | |
329 return output; | |
330 } | |
331 | |
332 // static | |
333 bool FormDataParserMultipart::LookAhead(const RE2& pattern, | |
334 const re2::StringPiece& input) { | |
335 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0); | |
336 } | |
337 | |
338 #define CONTENT_DISPOSITION "content-disposition:" | |
339 FormDataParserMultipart::FormDataParserMultipart( | |
340 const std::string& boundary_separator) | |
341 : transfer_padding_pattern_("[ \\t]*\\r\\n"), | |
342 crlf_pattern_("\\r\\n"), | |
343 closing_pattern_("--[ \\t]*"), | |
344 epilogue_pattern_("|\\r\\n(?s:.)*"), | |
345 crlf_free_pattern_("(?:[^\\r]|\\r+[^\\r\\n])*"), | |
346 preamble_pattern_(".*?"), | |
347 header_pattern_("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"), | |
348 content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"), | |
349 name_pattern_("\\bname=\"([^\"]*)\""), | |
350 value_pattern_("\\bfilename=\"([^\"]*)\""), | |
351 dash_boundary_pattern_(GetDashBoundaryPattern(boundary_separator)), | |
352 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR) {} | |
353 | |
354 FormDataParserMultipart::~FormDataParserMultipart() {} | |
355 | |
356 bool FormDataParserMultipart::AllDataReadOK() { | |
357 return state_ == STATE_FINISHED; | |
358 } | |
359 | |
360 bool FormDataParserMultipart::GetNextNameValueContinue( | |
361 base::StringPiece* value) { | |
362 const char* data_start = source_.data(); | |
363 while (!LookAhead(dash_boundary_pattern_, source_)) { | |
364 if (!RE2::Consume(&source_, crlf_free_pattern_) || | |
365 !RE2::Consume(&source_, crlf_pattern_)) { | |
366 state_ = STATE_ERROR; | |
367 return false; | |
368 } | |
369 } | |
370 if (value != NULL) { | |
371 if (source_.data() == data_start) { | |
372 // No data in this body part. | |
373 state_ = STATE_ERROR; | |
374 return false; | |
375 } | |
376 // Subtract 2u for the trailing "\r\n". | |
377 value->set(data_start, source_.data() - data_start - 2u); | |
378 } | |
379 | |
380 // Finally, read the dash-boundary and either skip to the next body part, or | |
381 // finish reading the source. | |
382 CHECK(RE2::Consume(&source_, dash_boundary_pattern_)); | |
383 if (LookAhead(closing_pattern_, source_)) { | |
384 CHECK(RE2::Consume(&source_, closing_pattern_)); | |
385 if (RE2::Consume(&source_, epilogue_pattern_)) | |
386 state_ = STATE_FINISHED; | |
387 else | |
388 state_ = STATE_ERROR; | |
389 } else { // Next body part ahead. | |
390 if (!RE2::Consume(&source_, transfer_padding_pattern_)) | |
391 state_ = STATE_ERROR; | |
392 } | |
393 return state_ != STATE_ERROR; | |
394 } | |
395 | |
396 bool FormDataParserMultipart::GetNextNameValue(Result* result) { | |
397 if (source_.size() == 0 || state_ != STATE_READY) | |
398 return false; | |
399 | |
400 // 1. Read body-part headers. | |
401 base::StringPiece name; | |
402 base::StringPiece value; | |
403 bool value_assigned = false; | |
404 bool value_assigned_temp; | |
405 while (TryReadHeader(&name, &value, &value_assigned_temp)) | |
406 value_assigned |= value_assigned_temp; | |
407 if (name.size() == 0) { | |
408 state_ = STATE_ERROR; | |
409 return false; | |
410 } | |
411 | |
412 // 2. Read the trailing CRLF after headers. | |
413 if (!RE2::Consume(&source_, crlf_pattern_)) { | |
414 state_ = STATE_ERROR; | |
415 return false; | |
416 } | |
417 | |
418 // 3. Read the data of this body part, i.e., everything until the first | |
419 // dash-boundary. | |
420 bool return_value = true; | |
421 if (value_assigned && source_.size() == 0) // Wait for a new source? | |
422 state_ = STATE_SUSPEND; | |
423 else | |
424 return_value = GetNextNameValueContinue(value_assigned ? NULL : &value); | |
425 | |
426 result->set_name(name); | |
427 result->set_value(value); | |
428 | |
429 return return_value; | |
430 } | |
431 | |
432 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) { | |
433 if (source.data() == NULL || source_.size() != 0) | |
434 return false; | |
435 source_.set(source.data(), source.size()); | |
436 | |
437 switch (state_) { | |
438 case STATE_INIT: | |
439 // Seek behind the preamble. | |
440 while (!LookAhead(dash_boundary_pattern_, source_)) { | |
441 if (!RE2::Consume(&source_, preamble_pattern_)) { | |
442 state_ = STATE_ERROR; | |
443 break; | |
444 } | |
445 } | |
446 // Read dash-boundary, transfer padding, and CRLF. | |
447 if (state_ != STATE_ERROR) { | |
448 if (!RE2::Consume(&source_, dash_boundary_pattern_) || | |
449 !RE2::Consume(&source_, transfer_padding_pattern_)) | |
450 state_ = STATE_ERROR; | |
451 else | |
452 state_ = STATE_READY; | |
453 } | |
454 break; | |
455 case STATE_READY: // Nothing to do. | |
456 break; | |
457 case STATE_SUSPEND: | |
458 state_ = GetNextNameValueContinue(NULL) ? STATE_READY : STATE_ERROR; | |
459 break; | |
460 default: | |
461 state_ = STATE_ERROR; | |
462 } | |
463 return state_ != STATE_ERROR; | |
464 } | |
465 | |
466 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name, | |
467 base::StringPiece* value, | |
468 bool* value_assigned) { | |
469 static const size_t content_disposition_value_offset = | |
470 sizeof(CONTENT_DISPOSITION) - 1; | |
471 #undef CONTENT_DISPOSITION | |
472 | |
473 *value_assigned = false; | |
474 const char* header_start = source_.data(); | |
475 if (!RE2::Consume(&source_, header_pattern_)) | |
476 return false; | |
477 // (*) After this point we must return true, because we consumed one header. | |
478 | |
479 // Subtract 2u for the trailing "\r\n". | |
480 re2::StringPiece header(header_start, source_.data() - header_start - 2u); | |
481 | |
482 // Now we check whether |header| is a Content-Disposition header, and try | |
483 // to extract name and possibly value from it. | |
484 if (LookAhead(content_disposition_pattern_, header)) { | |
485 re2::StringPiece groups[2u]; | |
486 | |
487 if (!name_pattern_.Match(header, | |
488 content_disposition_value_offset, header.size(), | |
489 RE2::UNANCHORED, groups, 2)) { | |
490 state_ = STATE_ERROR; | |
491 return true; // See (*) for why true. | |
492 } | |
493 name->set(groups[1].data(), groups[1].size()); | |
494 | |
495 if (!value_pattern_.Match(header, | |
496 content_disposition_value_offset, header.size(), | |
497 RE2::UNANCHORED, groups, 2)) | |
498 return true; // See (*) for why true. | |
499 value->set(groups[1].data(), groups[1].size()); | |
500 *value_assigned = true; | |
501 } | |
502 return true; | |
503 } | |
504 | |
505 } // namespace extensions | |
OLD | NEW |