Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6)

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Introducing LazyInstance for "static" RE2 Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h"
6
7 #include <vector>
8
9 #include "base/lazy_instance.h"
10 #include "base/string_util.h"
11 #include "base/values.h"
12 #include "net/base/escape.h"
13 #include "net/url_request/url_request.h"
14 #include "third_party/re2/re2/re2.h"
15
16 using base::DictionaryValue;
17 using base::ListValue;
18 using base::StringPiece;
19 using re2::RE2;
20
21 namespace extensions {
22
23 namespace {
24
// Name of the Content-Disposition header in lowercase, including the colon.
// It is a macro (rather than a constant) so that it can be concatenated with
// adjacent string literals at compile time.
#define CONTENT_DISPOSITION "content-disposition:"

// The three characters backslash, backslash, 'E'. Interpreted as a regular
// expression this matches the two-character text "\E".
static const char g_escape_closing_quote[] = "\\\\E";

// Number of characters in CONTENT_DISPOSITION, not counting the trailing NUL.
static const size_t g_content_disposition_length =
    sizeof(CONTENT_DISPOSITION) - 1;
30
31 // A wrapper struct for static RE2 objects to be held as LazyInstance.
32 struct Patterns {
33 Patterns();
34 ~Patterns();
35 const RE2 transfer_padding_pattern_;
36 const RE2 crlf_pattern_;
37 const RE2 closing_pattern_;
38 const RE2 epilogue_pattern_;
39 const RE2 crlf_free_pattern_;
40 const RE2 preamble_pattern_;
41 const RE2 header_pattern_;
42 const RE2 content_disposition_pattern_;
43 const RE2 name_pattern_;
44 const RE2 value_pattern_;
45 const RE2 unquote_pattern_;
46 const RE2 url_encoded_pattern_;
47 };
48
49 Patterns::Patterns()
50 : transfer_padding_pattern_("[ \\t]*\\r\\n"),
51 crlf_pattern_("\\r\\n"),
52 closing_pattern_("--[ \\t]*"),
53 epilogue_pattern_("|\\r\\n(?s:.)*"),
54 crlf_free_pattern_("(?:[^\\r]|\\r+[^\\r\\n])*"),
55 preamble_pattern_(".*?"),
56 header_pattern_("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
57 content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"),
58 name_pattern_("\\bname=\"([^\"]*)\""),
59 value_pattern_("\\bfilename=\"([^\"]*)\""),
60 unquote_pattern_(g_escape_closing_quote),
61 url_encoded_pattern_("([^=]*)=([^&]*)&?") {}
62
63 #undef CONTENT_DISPOSITION
64
65 Patterns::~Patterns() {}
66
67 static base::LazyInstance<Patterns> g_patterns = LAZY_INSTANCE_INITIALIZER;
68
69 } // namespace
70
71 // Parses URLencoded forms, see
72 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
73 class FormDataParserUrlEncoded : public FormDataParser {
74 public:
75 FormDataParserUrlEncoded();
76 virtual ~FormDataParserUrlEncoded();
77
78 // Implementation of FormDataParser.
79 virtual bool AllDataReadOK() OVERRIDE;
80 virtual bool GetNextNameValue(Result* result) OVERRIDE;
81 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
82
83 private:
84 // The pattern to match a single name-value pair.
85 const RE2& pattern() {
86 return g_patterns.Get().url_encoded_pattern_;
87 }
88
89 static const size_t args_size_ = 2u; // Auxiliary constant for using RE2.
90 static const net::UnescapeRule::Type unescape_rules_;
91
92 re2::StringPiece source_;
93 bool source_set_;
94
95 // Auxiliary store for using RE2.
96 std::string name_;
97 std::string value_;
98 const RE2::Arg arg_name_;
99 const RE2::Arg arg_value_;
100 const RE2::Arg* args_[args_size_];
101
102 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
103 };
104
105 // The following class, FormDataParserMultipart, parses forms encoded as
106 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
107 // encoding) and 5322 (MIME-headers).
108 //
109 // Implementation details
110 //
111 // The original grammar from RFC 2046 is this, "multipart-body" being the root
112 // non-terminal:
113 //
114 // boundary := 0*69<bchars> bcharsnospace
115 // bchars := bcharsnospace / " "
116 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
117 // / "-" / "." / "/" / ":" / "=" / "?"
118 // dash-boundary := "--" boundary
119 // multipart-body := [preamble CRLF]
120 // dash-boundary transport-padding CRLF
121 // body-part *encapsulation
122 // close-delimiter transport-padding
123 // [CRLF epilogue]
124 // transport-padding := *LWSP-char
125 // encapsulation := delimiter transport-padding CRLF body-part
126 // delimiter := CRLF dash-boundary
127 // close-delimiter := delimiter "--"
128 // preamble := discard-text
129 // epilogue := discard-text
130 // discard-text := *(*text CRLF) *text
131 // body-part := MIME-part-headers [CRLF *OCTET]
132 // OCTET := <any 0-255 octet value>
133 //
134 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
135 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
136 // English alphabet, respectively.
137 // The non-terminal "text" is presumably just any text, excluding line breaks.
138 // The non-terminal "LWSP-char" is not directly defined in the original grammar
139 // but it means "linear whitespace", which is a space or a horizontal tab.
140 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
141 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
142 //
143 // MIME-part-headers := field-name ":" unstructured CRLF
144 // field-name := 1*ftext
145 // ftext := %d33-57 / ; Printable US-ASCII
146 // %d59-126 ; characters not including ":".
147 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
148 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
149 // "CRLF<horizontal tab>", which serve for "folding".
150 //
151 // The FormDataParserMultipart class reads the input source and tries to parse it
152 // according to the grammar above, rooted at the "multipart-body" non-terminal.
153 // This happens in stages:
154 //
155 // 1. The optional preamble and the initial dash-boundary with transport padding
156 // and a CRLF are read and ignored.
157 //
158 // 2. Repeatedly each body part is read. The body parts can either serve to
159 // upload a file, or just a string of bytes.
160 // 2.a. The headers of that part are searched for the "content-disposition"
161 // header, which contains the name of the value represented by that body
162 // part. If the body-part is for file upload, that header also contains a
163 // filename.
164 // 2.b. The "*OCTET" part of the body part is then read and passed as the value
165 // of the name-value pair for body parts representing a string of bytes.
166 // For body parts for uploading a file the "*OCTET" part is just ignored
167 // and the filename is used for value instead.
168 //
169 // 3. The final close-delimiter and epilogue are read and ignored.
170 //
171 // IMPORTANT NOTE
172 // This parser supports multiple sources, i.e., SetSource can be called multiple
173 // times if the input is spread over several byte blocks. However, the split
174 // may only occur inside a body part, right after the trailing CRLF of headers.
175 class FormDataParserMultipart : public FormDataParser {
176 public:
177 explicit FormDataParserMultipart(const std::string& boundary_separator);
178 virtual ~FormDataParserMultipart();
179
180 // Implementation of FormDataParser.
181 virtual bool AllDataReadOK() OVERRIDE;
182 virtual bool GetNextNameValue(Result* result) OVERRIDE;
183 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
184
185 private:
186 enum State {
187 STATE_INIT, // No input read yet.
188 STATE_READY, // Ready to call GetNextNameValue.
189 STATE_FINISHED, // Read the input until the end.
190 STATE_SUSPEND, // Waiting until a new |source_| is set.
191 STATE_ERROR
192 };
193
194 // Produces a regexp to match the string "--" + |literal|.
195 static std::string GetBoundaryPatternFromLiteral(const std::string& literal);
196
197 // Tests whether |input| has a prefix matching |pattern|.
198 static bool LookAhead(const RE2& pattern, const re2::StringPiece& input);
199
200 // If source_ starts with a header, consumes it. If the header is
201 // Content-Disposition, it also extracts |name| from "name=" and possibly
202 // |value| from "filename=" fields of that header. It only touches |name| or
203 // |value| if it finds the respective fields for them. Returns true if it
204 // consumed a header, false if it did not. Sets |value_assigned| to true if it
205 // has assigned to value, otherwise it sets it to false.
206 bool TryReadHeader(base::StringPiece* name,
207 base::StringPiece* value,
208 bool* value_assigned);
209
210 // Helper to GetNextNameValue. Expects that the input starts with a data
211 // portion of a body part. It then attempts to read the input until the end of
212 // that body part. If |data| is not NULL, it sets it to contain the data
213 // portion. Returns true when the reading was successful.
214 bool FinishReadingPart(base::StringPiece* data);
215
216 static const RE2& transfer_padding_pattern() {
217 return g_patterns.Get().transfer_padding_pattern_;
218 }
219 static const RE2& crlf_pattern() {
220 return g_patterns.Get().crlf_pattern_;
221 }
222 static const RE2& closing_pattern() {
223 return g_patterns.Get().closing_pattern_;
224 }
225 static const RE2& epilogue_pattern() {
226 return g_patterns.Get().epilogue_pattern_;
227 }
228 static const RE2& crlf_free_pattern() {
229 return g_patterns.Get().crlf_free_pattern_;
230 }
231 static const RE2& preamble_pattern() {
232 return g_patterns.Get().preamble_pattern_;
233 }
234 static const RE2& header_pattern() {
235 return g_patterns.Get().header_pattern_;
236 }
237 static const RE2& content_disposition_pattern() {
238 return g_patterns.Get().content_disposition_pattern_;
239 }
240 static const RE2& name_pattern() {
241 return g_patterns.Get().name_pattern_;
242 }
243 static const RE2& value_pattern() {
244 return g_patterns.Get().value_pattern_;
245 }
246 static const RE2& unquote_pattern() {
247 return g_patterns.Get().unquote_pattern_;
248 }
249
250 const RE2 dash_boundary_pattern_;
251
252 // Because of initialisation dependency, |state_| needs to be declared after
253 // |dash_boundary_pattern_|.
254 State state_;
255
256 // The parsed message can be split into multiple sources which we read
257 // sequentially.
258 re2::StringPiece source_;
259
260 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
261 };
262
263 // Implementation of FormDataParser and FormDataParser::Result .
264
265 FormDataParser::Result::Result() {}
266 FormDataParser::Result::~Result() {}
267
268 void FormDataParser::Result::Reset() {
269 name_.erase();
270 value_.erase();
271 }
272
273 FormDataParser::~FormDataParser() {}
274
275 // static
276 scoped_ptr<FormDataParser> FormDataParser::Create(
277 const net::URLRequest* request) {
278 std::string value;
279 const bool found = request->extra_request_headers().GetHeader(
280 net::HttpRequestHeaders::kContentType, &value);
281 return Create(found ? &value : NULL);
282 }
283
284 // static
285 scoped_ptr<FormDataParser> FormDataParser::Create(
286 const std::string* content_type_header) {
287 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
288 ParserChoice choice = ERROR_CHOICE;
289 std::string boundary;
290
291 if (content_type_header == NULL) {
292 choice = URL_ENCODED;
293 } else {
294 const std::string content_type(
295 content_type_header->substr(0, content_type_header->find(';')));
296
297 if (base::strcasecmp(
298 content_type.c_str(), "application/x-www-form-urlencoded") == 0) {
299 choice = URL_ENCODED;
300 } else if (base::strcasecmp(
301 content_type.c_str(), "multipart/form-data") == 0) {
302 static const char kBoundaryString[] = "boundary=";
303 size_t offset = content_type_header->find(kBoundaryString);
304 if (offset == std::string::npos) {
305 // Malformed header.
306 return scoped_ptr<FormDataParser>();
307 }
308 offset += sizeof(kBoundaryString) - 1;
309 boundary = content_type_header->substr(
310 offset, content_type_header->find(';', offset));
311 if (!boundary.empty())
312 choice = MULTIPART;
313 }
314 }
315 // Other cases are unparseable, including when |content_type| is "text/plain".
316
317 switch (choice) {
318 case URL_ENCODED:
319 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());
320 case MULTIPART:
321 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));
322 default: // In other words, case ERROR_CHOICE:
323 return scoped_ptr<FormDataParser>();
324 }
325 }
326
327 FormDataParser::FormDataParser() {}
328
329 // Implementation of FormDataParserUrlEncoded.
330
331 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =
332 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS |
333 net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
334
335 FormDataParserUrlEncoded::FormDataParserUrlEncoded()
336 : source_(NULL),
337 source_set_(false),
338 arg_name_(&name_),
339 arg_value_(&value_) {
340 args_[0] = &arg_name_;
341 args_[1] = &arg_value_;
342 }
343
344 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}
345
346 bool FormDataParserUrlEncoded::AllDataReadOK() {
347 // All OK means we read the whole source.
348 return source_set_ && source_.size() == 0;
349 }
350
351 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
352 if (!source_set_)
353 return false;
354
355 bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_);
356 if (success) {
357 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));
358 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));
359 }
360 return success;
361 }
362
363 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {
364 if (source_set_)
365 return false; // We do not allow multiple sources for this parser.
366 source_.set(source.data(), source.size());
367 source_set_ = true;
368 return true;
369 }
370
371 // Implementation of FormDataParserMultipart.
372
373 // static
374 std::string FormDataParserMultipart::GetBoundaryPatternFromLiteral(
375 const std::string& literal) {
376 #define OPEN_QUOTE "\\Q"
377 static const char opening_quote[] = OPEN_QUOTE;
378 static const char closing_quote[] = "\\E";
379
380 std::string output(OPEN_QUOTE "--"); // Let us start with the "--".
381 #undef OPEN_QUOTE
382 re2::StringPiece seek_unquote(literal);
383 const char* copy_start = literal.data();
384 size_t copy_length = literal.size();
385 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) {
386 copy_length = seek_unquote.data() - copy_start;
387 output.append(copy_start, copy_length);
388 output.append(g_escape_closing_quote);
389 output.append(opening_quote);
390 copy_start = seek_unquote.data();
391 }
392 copy_length = (literal.data() + literal.size()) - copy_start;
393 output.append(copy_start, copy_length);
394 output.append(closing_quote);
395 return output;
396 }
397
398 // static
399 bool FormDataParserMultipart::LookAhead(const RE2& pattern,
400 const re2::StringPiece& input) {
401 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);
402 }
403
404 FormDataParserMultipart::FormDataParserMultipart(
405 const std::string& boundary_separator)
406 : dash_boundary_pattern_(GetBoundaryPatternFromLiteral(boundary_separator)),
407 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR) {}
408
409 FormDataParserMultipart::~FormDataParserMultipart() {}
410
411 bool FormDataParserMultipart::AllDataReadOK() {
412 return state_ == STATE_FINISHED;
413 }
414
415 bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) {
416 const char* data_start = source_.data();
417 while (!LookAhead(dash_boundary_pattern_, source_)) {
418 if (!RE2::Consume(&source_, crlf_free_pattern()) ||
419 !RE2::Consume(&source_, crlf_pattern())) {
420 state_ = STATE_ERROR;
421 return false;
422 }
423 }
424 if (data != NULL) {
425 if (source_.data() == data_start) {
426 // No data in this body part.
427 state_ = STATE_ERROR;
428 return false;
429 }
430 // Subtract 2u for the trailing "\r\n".
431 data->set(data_start, source_.data() - data_start - 2u);
432 }
433
434 // Finally, read the dash-boundary and either skip to the next body part, or
435 // finish reading the source.
436 CHECK(RE2::Consume(&source_, dash_boundary_pattern_));
437 if (LookAhead(closing_pattern(), source_)) {
438 CHECK(RE2::Consume(&source_, closing_pattern()));
439 if (RE2::Consume(&source_, epilogue_pattern()))
440 state_ = STATE_FINISHED;
441 else
442 state_ = STATE_ERROR;
443 } else { // Next body part ahead.
444 if (!RE2::Consume(&source_, transfer_padding_pattern()))
445 state_ = STATE_ERROR;
446 }
447 return state_ != STATE_ERROR;
448 }
449
450 bool FormDataParserMultipart::GetNextNameValue(Result* result) {
451 if (source_.size() == 0 || state_ != STATE_READY)
452 return false;
453
454 // 1. Read body-part headers.
455 base::StringPiece name;
456 base::StringPiece value;
457 bool value_assigned = false;
458 bool value_assigned_temp;
459 while (TryReadHeader(&name, &value, &value_assigned_temp))
460 value_assigned |= value_assigned_temp;
461 if (name.size() == 0) {
462 state_ = STATE_ERROR;
463 return false;
464 }
465
466 // 2. Read the trailing CRLF after headers.
467 if (!RE2::Consume(&source_, crlf_pattern())) {
468 state_ = STATE_ERROR;
469 return false;
470 }
471
472 // 3. Read the data of this body part, i.e., everything until the first
473 // dash-boundary.
474 bool return_value = true;
475 if (value_assigned && source_.size() == 0) // Wait for a new source?
476 state_ = STATE_SUSPEND;
477 else
478 return_value = FinishReadingPart(value_assigned ? NULL : &value);
479
480 std::string unescaped_name = net::UnescapeURLComponent(
481 name.as_string(),
482 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS);
483 result->set_name(unescaped_name);
484 result->set_value(value);
485
486 return return_value;
487 }
488
489 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {
490 if (source.data() == NULL || source_.size() != 0)
491 return false;
492 source_.set(source.data(), source.size());
493
494 switch (state_) {
495 case STATE_INIT:
496 // Seek behind the preamble.
497 while (!LookAhead(dash_boundary_pattern_, source_)) {
498 if (!RE2::Consume(&source_, preamble_pattern())) {
499 state_ = STATE_ERROR;
500 break;
501 }
502 }
503 // Read dash-boundary, transfer padding, and CRLF.
504 if (state_ != STATE_ERROR) {
505 if (!RE2::Consume(&source_, dash_boundary_pattern_) ||
506 !RE2::Consume(&source_, transfer_padding_pattern()))
507 state_ = STATE_ERROR;
508 else
509 state_ = STATE_READY;
510 }
511 break;
512 case STATE_READY: // Nothing to do.
513 break;
514 case STATE_SUSPEND:
515 state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR;
516 break;
517 default:
518 state_ = STATE_ERROR;
519 }
520 return state_ != STATE_ERROR;
521 }
522
523 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,
524 base::StringPiece* value,
525 bool* value_assigned) {
526 *value_assigned = false;
527 const char* header_start = source_.data();
528 if (!RE2::Consume(&source_, header_pattern()))
529 return false;
530 // (*) After this point we must return true, because we consumed one header.
531
532 // Subtract 2u for the trailing "\r\n".
533 re2::StringPiece header(header_start, source_.data() - header_start - 2u);
534
535 // Now we check whether |header| is a Content-Disposition header, and try
536 // to extract name and possibly value from it.
537 if (LookAhead(content_disposition_pattern(), header)) {
538 re2::StringPiece groups[2u];
539
540 if (!name_pattern().Match(header,
541 g_content_disposition_length, header.size(),
542 RE2::UNANCHORED, groups, 2)) {
543 state_ = STATE_ERROR;
544 return true; // See (*) for why true.
545 }
546 name->set(groups[1].data(), groups[1].size());
547
548 if (!value_pattern().Match(header,
549 g_content_disposition_length, header.size(),
550 RE2::UNANCHORED, groups, 2))
551 return true; // See (*) for why true.
552 value->set(groups[1].data(), groups[1].size());
553 *value_assigned = true;
554 }
555 return true;
556 }
557
558 } // namespace extensions
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698