Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1150)

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: More comments from Dominic Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h"
6
7 #include <vector>
8
9 #include "base/lazy_instance.h"
10 #include "base/string_util.h"
11 #include "base/values.h"
12 #include "net/base/escape.h"
13 #include "net/url_request/url_request.h"
14 #include "third_party/re2/re2/re2.h"
15
16 using base::DictionaryValue;
17 using base::ListValue;
18 using base::StringPiece;
19 using re2::RE2;
20
21 namespace extensions {
22
23 namespace {
24
// Lower-case name of the header (colon included) that carries the name and the
// optional filename of a multipart body part. It is a macro rather than a
// constant so that it can be pasted into string literals at compile time.
#define CONTENT_DISPOSITION "content-disposition:"

// Regular expression source which matches the two-character text "\E".
const char g_escape_closing_quote[] = "\\\\E";

// Number of characters in CONTENT_DISPOSITION; the terminating NUL is not
// counted.
const size_t g_content_disposition_length = sizeof(CONTENT_DISPOSITION) - 1;
30
31 // A wrapper struct for static RE2 objects to be held as LazyInstance.
32 struct Patterns {
33 Patterns();
34 ~Patterns();
35 const RE2 transfer_padding_pattern;
36 const RE2 crlf_pattern;
37 const RE2 closing_pattern;
38 const RE2 epilogue_pattern;
39 const RE2 crlf_free_pattern;
40 const RE2 preamble_pattern;
41 const RE2 header_pattern;
42 const RE2 content_disposition_pattern;
43 const RE2 name_pattern;
44 const RE2 value_pattern;
45 const RE2 unquote_pattern;
46 const RE2 url_encoded_pattern;
47 };
48
49 Patterns::Patterns()
50 : transfer_padding_pattern("[ \\t]*\\r\\n"),
51 crlf_pattern("\\r\\n"),
52 closing_pattern("--[ \\t]*"),
53 epilogue_pattern("|\\r\\n(?s:.)*"),
54 crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),
55 preamble_pattern(".*?"),
56 header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
57 content_disposition_pattern("(?i:" CONTENT_DISPOSITION ")"),
58 name_pattern("\\bname=\"([^\"]*)\""),
59 value_pattern("\\bfilename=\"([^\"]*)\""),
60 unquote_pattern(g_escape_closing_quote),
61 url_encoded_pattern("([^=]*)=([^&]*)&?") {}
62
63 #undef CONTENT_DISPOSITION
64
65 Patterns::~Patterns() {}
66
67 static base::LazyInstance<Patterns>::Leaky g_patterns =
68 LAZY_INSTANCE_INITIALIZER;
69
70 } // namespace
71
72 // Parses URLencoded forms, see
73 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
74 class FormDataParserUrlEncoded : public FormDataParser {
75 public:
76 FormDataParserUrlEncoded();
77 virtual ~FormDataParserUrlEncoded();
78
79 // Implementation of FormDataParser.
80 virtual bool AllDataReadOK() OVERRIDE;
81 virtual bool GetNextNameValue(Result* result) OVERRIDE;
82 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
83
84 private:
85 // The pattern to match a single name-value pair. This could be even static,
86 // but then we would have to spend more code on initializing the cached
87 // pointer to g_patterns.Get() .
88 const RE2& pattern() const {
89 return patterns_->url_encoded_pattern;
90 }
91
92 // Auxiliary constant for using RE2. Number of arguments for parsing
93 // name-value pairs (one for name, one for value).
94 static const size_t args_size_ = 2u;
95 static const net::UnescapeRule::Type unescape_rules_;
96
97 re2::StringPiece source_;
98 bool source_set_;
99
100 // Auxiliary store for using RE2.
101 std::string name_;
102 std::string value_;
103 const RE2::Arg arg_name_;
104 const RE2::Arg arg_value_;
105 const RE2::Arg* args_[args_size_];
106
107 // Caching the pointer to g_patterns.Get() .
108 const Patterns* patterns_;
109
110 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
111 };
112
113 // The following class, FormDataParserMultipart, parses forms encoded as
114 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
115 // encoding) and 5322 (MIME-headers).
116 //
117 // Implementation details
118 //
119 // The original grammar from RFC 2046 is this, "multipart-body" being the root
120 // non-terminal:
121 //
122 // boundary := 0*69<bchars> bcharsnospace
123 // bchars := bcharsnospace / " "
124 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
125 // / "-" / "." / "/" / ":" / "=" / "?"
126 // dash-boundary := "--" boundary
127 // multipart-body := [preamble CRLF]
128 // dash-boundary transport-padding CRLF
129 // body-part *encapsulation
130 // close-delimiter transport-padding
131 // [CRLF epilogue]
132 // transport-padding := *LWSP-char
133 // encapsulation := delimiter transport-padding CRLF body-part
134 // delimiter := CRLF dash-boundary
135 // close-delimiter := delimiter "--"
136 // preamble := discard-text
137 // epilogue := discard-text
138 // discard-text := *(*text CRLF) *text
139 // body-part := MIME-part-headers [CRLF *OCTET]
140 // OCTET := <any 0-255 octet value>
141 //
142 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
143 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
144 // English alphabet, respectively.
145 // The non-terminal "text" is presumably just any text, excluding line breaks.
146 // The non-terminal "LWSP-char" is not directly defined in the original grammar
147 // but it means "linear whitespace", which is a space or a horizontal tab.
148 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
149 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
150 //
151 // MIME-part-headers := field-name ":" unstructured CRLF
152 // field-name := 1*ftext
153 // ftext := %d33-57 / ; Printable US-ASCII
154 // %d59-126 ; characters not including ":".
155 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
156 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
157 // "CRLF<horizontal tab>", which serve for "folding".
158 //
159 // The FormDataParserMultipart class reads the input source and tries to parse it
160 // according to the grammar above, rooted at the "multipart-body" non-terminal.
161 // This happens in stages:
162 //
163 // 1. The optional preamble and the initial dash-boundary with transport padding
164 // and a CRLF are read and ignored.
165 //
166 // 2. Repeatedly each body part is read. The body parts can either serve to
167 // upload a file, or just a string of bytes.
168 // 2.a. The headers of that part are searched for the "content-disposition"
169 // header, which contains the name of the value represented by that body
170 // part. If the body-part is for file upload, that header also contains a
171 // filename.
172 // 2.b. The "*OCTET" part of the body part is then read and passed as the value
173 // of the name-value pair for body parts representing a string of bytes.
174 // For body parts for uploading a file the "*OCTET" part is just ignored
175 // and the filename is used for value instead.
176 //
177 // 3. The final close-delimiter and epilogue are read and ignored.
178 //
179 // IMPORTANT NOTE
180 // This parser supports multiple sources, i.e., SetSource can be called multiple
181 // times if the input is spread over several byte blocks. However, the split
182 // may only occur inside a body part, right after the trailing CRLF of headers.
183 class FormDataParserMultipart : public FormDataParser {
184 public:
185 explicit FormDataParserMultipart(const std::string& boundary_separator);
186 virtual ~FormDataParserMultipart();
187
188 // Implementation of FormDataParser.
189 virtual bool AllDataReadOK() OVERRIDE;
190 virtual bool GetNextNameValue(Result* result) OVERRIDE;
191 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
192
193 private:
194 enum State {
195 STATE_INIT, // No input read yet.
196 STATE_READY, // Ready to call GetNextNameValue.
197 STATE_FINISHED, // Read the input until the end.
198 STATE_SUSPEND, // Waiting until a new |source_| is set.
199 STATE_ERROR
200 };
201
202 // Produces a regexp to match the string "--" + |literal|. The idea is to
203 // represent "--" + |literal| as a "quoted pattern", a verbatim copy enclosed
204 // in "\\Q" and "\\E". The only catch is to watch out ofr occurences of "\\E"
205 // inside |literal|. Those must be excluded from the quote and the backslash
206 // doubly escaped. For example, for literal == "abc\\Edef" the result is
207 // "\\Q--abc\\E\\\\E\\Qdef\\E".
208 static std::string CreateBoundaryPatternFromLiteral(
209 const std::string& literal);
210
211 // Tests whether |input| has a prefix matching |pattern|.
212 static bool StartsWithPattern(const re2::StringPiece& input,
213 const RE2& pattern);
214
215 // If |source_| starts with a header, seeks |source_| beyond the header. If
216 // the header is Content-Disposition, extracts |name| from "name=" and
217 // possibly |value| from "filename=" fields of that header. Only if the
218 // "name" or "filename" fields are found, then |name| or |value| are touched.
219 // Returns true iff |source_| is seeked forward. Sets |value_assigned|
220 // to true iff |value| has been assigned to.
221 bool TryReadHeader(base::StringPiece* name,
222 base::StringPiece* value,
223 bool* value_assigned);
224
225 // Helper to GetNextNameValue. Expects that the input starts with a data
226 // portion of a body part. An attempt is made to read the input until the end
227 // of that body part. If |data| is not NULL, it is set to contain the data
228 // portion. Returns true iff the reading was successful.
229 bool FinishReadingPart(base::StringPiece* data);
230
231 // These methods could be even static, but then we would have to spend more
232 // code on initializing the cached pointer to g_patterns.Get() .
233 const RE2& transfer_padding_pattern() const {
234 return patterns_->transfer_padding_pattern;
235 }
236 const RE2& crlf_pattern() const {
237 return patterns_->crlf_pattern;
238 }
239 const RE2& closing_pattern() const {
240 return patterns_->closing_pattern;
241 }
242 const RE2& epilogue_pattern() const {
243 return patterns_->epilogue_pattern;
244 }
245 const RE2& crlf_free_pattern() const {
246 return patterns_->crlf_free_pattern;
247 }
248 const RE2& preamble_pattern() const {
249 return patterns_->preamble_pattern;
250 }
251 const RE2& header_pattern() const {
252 return patterns_->header_pattern;
253 }
254 const RE2& content_disposition_pattern() const {
255 return patterns_->content_disposition_pattern;
256 }
257 const RE2& name_pattern() const {
258 return patterns_->name_pattern;
259 }
260 const RE2& value_pattern() const {
261 return patterns_->value_pattern;
262 }
263 // However, this is used in a static method so it needs to be static.
264 static const RE2& unquote_pattern() {
265 return g_patterns.Get().unquote_pattern; // No caching g_patterns here.
266 }
267
268 const RE2 dash_boundary_pattern_;
269
270 // Because of initialisation dependency, |state_| needs to be declared after
271 // |dash_boundary_pattern_|.
272 State state_;
273
274 // The parsed message can be split into multiple sources which we read
275 // sequentially.
276 re2::StringPiece source_;
277
278 // Caching the pointer to g_patterns.Get() .
battre 2012/09/12 18:08:40 nit: - space before .
279 const Patterns* patterns_;
280
281 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
282 };
283
284 // Implementation of FormDataParser and FormDataParser::Result .
285
286 FormDataParser::Result::Result() {}
287 FormDataParser::Result::~Result() {}
288
289 void FormDataParser::Result::Reset() {
290 name_.erase();
291 value_.erase();
292 }
293
294 FormDataParser::~FormDataParser() {}
295
296 // static
297 scoped_ptr<FormDataParser> FormDataParser::Create(
298 const net::URLRequest* request) {
299 std::string value;
300 const bool found = request->extra_request_headers().GetHeader(
301 net::HttpRequestHeaders::kContentType, &value);
302 return Create(found ? &value : NULL);
303 }
304
305 // static
306 scoped_ptr<FormDataParser> FormDataParser::Create(
307 const std::string* content_type_header) {
308 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
309 ParserChoice choice = ERROR_CHOICE;
310 std::string boundary;
311
312 if (content_type_header == NULL) {
313 choice = URL_ENCODED;
314 } else {
315 const std::string content_type(
316 content_type_header->substr(0, content_type_header->find(';')));
317
318 if (base::strcasecmp(
319 content_type.c_str(), "application/x-www-form-urlencoded") == 0) {
320 choice = URL_ENCODED;
321 } else if (base::strcasecmp(
322 content_type.c_str(), "multipart/form-data") == 0) {
323 static const char kBoundaryString[] = "boundary=";
324 size_t offset = content_type_header->find(kBoundaryString);
325 if (offset == std::string::npos) {
326 // Malformed header.
327 return scoped_ptr<FormDataParser>();
328 }
329 offset += sizeof(kBoundaryString) - 1;
330 boundary = content_type_header->substr(
331 offset, content_type_header->find(';', offset));
332 if (!boundary.empty())
333 choice = MULTIPART;
334 }
335 }
336 // Other cases are unparseable, including when |content_type| is "text/plain".
337
338 switch (choice) {
339 case URL_ENCODED:
340 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());
341 case MULTIPART:
342 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));
343 default: // In other words, case ERROR_CHOICE:
344 return scoped_ptr<FormDataParser>();
345 }
346 }
347
348 FormDataParser::FormDataParser() {}
349
350 // Implementation of FormDataParserUrlEncoded.
351
352 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =
353 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS |
354 net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
355
356 FormDataParserUrlEncoded::FormDataParserUrlEncoded()
357 : source_(NULL),
358 source_set_(false),
359 arg_name_(&name_),
360 arg_value_(&value_),
361 patterns_(&(g_patterns.Get())) {
362 args_[0] = &arg_name_;
363 args_[1] = &arg_value_;
364 }
365
366 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}
367
368 bool FormDataParserUrlEncoded::AllDataReadOK() {
369 // All OK means we read the whole source.
370 return source_set_ && source_.size() == 0;
371 }
372
373 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
374 if (!source_set_)
375 return false;
376
377 bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_);
378 if (success) {
379 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));
380 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));
381 }
382 return success;
383 }
384
385 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {
386 if (source_set_)
387 return false; // We do not allow multiple sources for this parser.
388 source_.set(source.data(), source.size());
389 source_set_ = true;
390 return true;
391 }
392
393 // Implementation of FormDataParserMultipart.
394
395 // static
396 std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral(
397 const std::string& literal) {
398 #define OPEN_QUOTE "\\Q"
399 static const char quote[] = OPEN_QUOTE;
400 static const char unquote[] = "\\E";
401
402 // The result always starts with opening the qoute and then "--".
403 std::string result(OPEN_QUOTE "--");
404 #undef OPEN_QUOTE
405
406 // This StringPiece is used below to record the next occurrence of "\\E" in
407 // |literal|.
408 re2::StringPiece seek_unquote(literal);
409 const char* copy_start = literal.data();
410 size_t copy_length = literal.size();
411
412 // Find all "\\E" in |literal| and exclude them from the \Q...\E quote.
413 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) {
414 copy_length = seek_unquote.data() - copy_start;
415 result.append(copy_start, copy_length);
416 result.append(g_escape_closing_quote);
417 result.append(quote);
418 copy_start = seek_unquote.data();
419 }
420
421 // Finish the last \Q...\E quote.
422 copy_length = (literal.data() + literal.size()) - copy_start;
423 result.append(copy_start, copy_length);
424 result.append(unquote);
425 return result;
426 }
427
428 // static
429 bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input,
430 const RE2& pattern) {
431 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);
432 }
433
434 FormDataParserMultipart::FormDataParserMultipart(
435 const std::string& boundary_separator)
436 : dash_boundary_pattern_(
437 CreateBoundaryPatternFromLiteral(boundary_separator)),
438 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR),
439 patterns_(&(g_patterns.Get())) {}
440
441 FormDataParserMultipart::~FormDataParserMultipart() {}
442
443 bool FormDataParserMultipart::AllDataReadOK() {
444 return state_ == STATE_FINISHED;
445 }
446
447 bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) {
448 const char* data_start = source_.data();
449 while (!StartsWithPattern(source_, dash_boundary_pattern_)) {
450 if (!RE2::Consume(&source_, crlf_free_pattern()) ||
451 !RE2::Consume(&source_, crlf_pattern())) {
452 state_ = STATE_ERROR;
453 return false;
454 }
455 }
456 if (data != NULL) {
457 if (source_.data() == data_start) {
458 // No data in this body part.
459 state_ = STATE_ERROR;
460 return false;
461 }
462 // Subtract 2u for the trailing "\r\n".
463 data->set(data_start, source_.data() - data_start - 2u);
464 }
465
466 // Finally, read the dash-boundary and either skip to the next body part, or
467 // finish reading the source.
468 CHECK(RE2::Consume(&source_, dash_boundary_pattern_));
469 if (StartsWithPattern(source_, closing_pattern())) {
470 CHECK(RE2::Consume(&source_, closing_pattern()));
471 if (RE2::Consume(&source_, epilogue_pattern()))
472 state_ = STATE_FINISHED;
473 else
474 state_ = STATE_ERROR;
475 } else { // Next body part ahead.
476 if (!RE2::Consume(&source_, transfer_padding_pattern()))
477 state_ = STATE_ERROR;
478 }
479 return state_ != STATE_ERROR;
480 }
481
482 bool FormDataParserMultipart::GetNextNameValue(Result* result) {
483 if (source_.size() == 0 || state_ != STATE_READY)
484 return false;
485
486 // 1. Read body-part headers.
487 base::StringPiece name;
488 base::StringPiece value;
489 bool value_assigned = false;
490 bool value_assigned_temp;
491 while (TryReadHeader(&name, &value, &value_assigned_temp))
492 value_assigned |= value_assigned_temp;
493 if (name.size() == 0 || state_ == STATE_ERROR) {
494 state_ = STATE_ERROR;
495 return false;
496 }
497
498 // 2. Read the trailing CRLF after headers.
499 if (!RE2::Consume(&source_, crlf_pattern())) {
500 state_ = STATE_ERROR;
501 return false;
502 }
503
504 // 3. Read the data of this body part, i.e., everything until the first
505 // dash-boundary.
506 bool return_value;
507 if (value_assigned && source_.size() == 0) { // Wait for a new source?
508 return_value = true;
509 state_ = STATE_SUSPEND;
510 } else {
511 return_value = FinishReadingPart(value_assigned ? NULL : &value);
512 }
513
514 std::string unescaped_name = net::UnescapeURLComponent(
515 name.as_string(),
516 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS);
517 result->set_name(unescaped_name);
518 result->set_value(value);
519
520 return return_value;
521 }
522
523 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {
524 if (source.data() == NULL || source_.size() != 0)
525 return false;
526 source_.set(source.data(), source.size());
527
528 switch (state_) {
529 case STATE_INIT:
530 // Seek behind the preamble.
531 while (!StartsWithPattern(source_, dash_boundary_pattern_)) {
532 if (!RE2::Consume(&source_, preamble_pattern())) {
533 state_ = STATE_ERROR;
534 break;
535 }
536 }
537 // Read dash-boundary, transfer padding, and CRLF.
538 if (state_ != STATE_ERROR) {
539 if (!RE2::Consume(&source_, dash_boundary_pattern_) ||
540 !RE2::Consume(&source_, transfer_padding_pattern()))
541 state_ = STATE_ERROR;
542 else
543 state_ = STATE_READY;
544 }
545 break;
546 case STATE_READY: // Nothing to do.
547 break;
548 case STATE_SUSPEND:
549 state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR;
550 break;
551 default:
552 state_ = STATE_ERROR;
553 }
554 return state_ != STATE_ERROR;
555 }
556
557 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,
558 base::StringPiece* value,
559 bool* value_assigned) {
560 *value_assigned = false;
561 const char* header_start = source_.data();
562 if (!RE2::Consume(&source_, header_pattern()))
563 return false;
564 // (*) After this point we must return true, because we consumed one header.
565
566 // Subtract 2u for the trailing "\r\n".
567 re2::StringPiece header(header_start, source_.data() - header_start - 2u);
568
569 if (!StartsWithPattern(header, content_disposition_pattern()))
570 return true; // Skip headers that don't describe the content-disposition.
571
572 re2::StringPiece groups[2u];
573
574 if (!name_pattern().Match(header,
575 g_content_disposition_length, header.size(),
576 RE2::UNANCHORED, groups, 2)) {
577 state_ = STATE_ERROR;
578 return true; // See (*) for why true.
579 }
580 name->set(groups[1].data(), groups[1].size());
581
582 if (value_pattern().Match(header,
583 g_content_disposition_length, header.size(),
584 RE2::UNANCHORED, groups, 2)) {
585 value->set(groups[1].data(), groups[1].size());
586 *value_assigned = true;
587 }
588 return true;
589 }
590
591 } // namespace extensions
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698