Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(377)

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Unescaping field name + corresponding api tests enhanced Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h"
6
7 #include <vector>
8
9 #include "base/string_util.h"
10 #include "base/values.h"
11 #include "net/base/escape.h"
12 #include "net/url_request/url_request.h"
13 #include "third_party/re2/re2/re2.h"
14
15 using base::DictionaryValue;
16 using base::ListValue;
17 using base::StringPiece;
18 using re2::RE2;
19
20 namespace extensions {
21
22 // Parses URL-encoded forms ("name=value" pairs separated by '&'), see
23 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
24 class FormDataParserUrlEncoded : public FormDataParser {
25 public:
26 FormDataParserUrlEncoded();
27 virtual ~FormDataParserUrlEncoded();
28
29 // Implementation of FormDataParser.
30 virtual bool AllDataReadOK() OVERRIDE;
31 virtual bool GetNextNameValue(Result* result) OVERRIDE;
32 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
33
34 private:
35 // The pattern to match a single name-value pair. Ideally this should be
36 // static, so that it is constructed only once, regardless of how many
37 // parser instances we have. However, then we would run into problems with
38 // exit-time destructors.
39 const RE2 pattern_;
40
41 static const size_t args_size_ = 2u; // Number of capture arguments passed to RE2.
42 static const net::UnescapeRule::Type unescape_rules_; // Applied to names and values.
43
44 re2::StringPiece source_; // The not-yet-consumed rest of the input.
45 bool source_set_; // True once SetSource() has accepted a source.
46
47 // Auxiliary store for using RE2: the strings receive the captured groups,
48 std::string name_; // and the Arg objects below point RE2 at those strings.
49 std::string value_;
50 const RE2::Arg arg_name_;
51 const RE2::Arg arg_value_;
52 const RE2::Arg* args_[args_size_];
53
54 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
55 };
56
57 // The following class, FormDataParserMultipart, parses forms encoded as
58 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
59 // encoding) and 5322 (MIME-headers).
60 //
61 // Implementation details
62 //
63 // The original grammar from RFC 2046 is this, "multipart-body" being the root
64 // non-terminal:
65 //
66 // boundary := 0*69<bchars> bcharsnospace
67 // bchars := bcharsnospace / " "
68 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
69 // / "-" / "." / "/" / ":" / "=" / "?"
70 // dash-boundary := "--" boundary
71 // multipart-body := [preamble CRLF]
72 // dash-boundary transport-padding CRLF
73 // body-part *encapsulation
74 // close-delimiter transport-padding
75 // [CRLF epilogue]
76 // transport-padding := *LWSP-char
77 // encapsulation := delimiter transport-padding CRLF body-part
78 // delimiter := CRLF dash-boundary
79 // close-delimiter := delimiter "--"
80 // preamble := discard-text
81 // epilogue := discard-text
82 // discard-text := *(*text CRLF) *text
83 // body-part := MIME-part-headers [CRLF *OCTET]
84 // OCTET := <any 0-255 octet value>
85 //
86 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
87 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
88 // English alphabet, respectively.
89 // The non-terminal "text" is presumably just any text, excluding line breaks.
90 // The non-terminal "LWSP-char" is not directly defined in the original grammar
91 // but it means "linear whitespace", which is a space or a horizontal tab.
92 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
93 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
94 //
95 // MIME-part-headers := field-name ":" unstructured CRLF
96 // field-name := 1*ftext
97 // ftext := %d33-57 / ; Printable US-ASCII
98 // %d59-126 ; characters not including ":".
99 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
100 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
101 // "CRLF<horizontal tab>", which serve for "folding".
102 //
103 // The FormDataParserMultipart class reads the input source and tries to parse it
104 // according to the grammar above, rooted at the "multipart-body" non-terminal.
105 // This happens in stages:
106 //
107 // 1. The optional preamble and the initial dash-boundary with transport padding
108 // and a CRLF are read and ignored.
109 //
110 // 2. Repeatedly each body part is read. The body parts can either serve to
111 // upload a file, or just a string of bytes.
112 // 2.a. The headers of that part are searched for the "content-disposition"
113 // header, which contains the name of the value represented by that body
114 // part. If the body-part is for file upload, that header also contains a
115 // filename.
116 // 2.b. The "*OCTET" part of the body part is then read and passed as the value
117 // of the name-value pair for body parts representing a string of bytes.
118 // For body parts for uploading a file the "*OCTET" part is just ignored
119 // and the filename is used for value instead.
120 //
121 // 3. The final close-delimiter and epilogue are read and ignored.
122 //
123 // IMPORTANT NOTE
124 // This parser supports multiple sources, i.e., SetSource can be called multiple
125 // times if the input is spread over several byte blocks. However, the split
126 // may only occur inside a body part, right after the trailing CRLF of headers.
127 class FormDataParserMultipart : public FormDataParser {
128 public:
129 explicit FormDataParserMultipart(const std::string& boundary_separator);
130 virtual ~FormDataParserMultipart();
131
132 // Implementation of FormDataParser.
133 virtual bool AllDataReadOK() OVERRIDE;
134 virtual bool GetNextNameValue(Result* result) OVERRIDE;
135 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
136
137 private:
138 enum State {
139 STATE_INIT, // No input read yet.
140 STATE_READY, // Ready to call GetNextNameValue.
141 STATE_FINISHED, // Read the input until the end.
142 STATE_SUSPEND, // Waiting until a new |source_| is set.
143 STATE_ERROR // The input could not be parsed.
144 };
145
146 // Produces a regexp matching "--" followed by the literal |boundary| string.
147 static std::string GetDashBoundaryPattern(const std::string& boundary);
148
149 // Tests whether |input| has a prefix matching |pattern|.
150 static bool LookAhead(const RE2& pattern, const re2::StringPiece& input);
151
152 // If source_ starts with a header, consumes it. If the header is
153 // Content-Disposition, it also extracts |name| from "name=" and possibly
154 // |value| from "filename=" fields of that header. It only touches |name| or
155 // |value| if it finds the respective fields for them. Returns true if it
156 // consumed a header, false if it did not. Sets |value_assigned| to true if it
157 // has assigned to value, otherwise it sets it to false.
158 bool TryReadHeader(base::StringPiece* name,
159 base::StringPiece* value,
160 bool* value_assigned);
161
162 // Helper to GetNextNameValue. Attempts to read the data portion of a body
163 // part. If |value| is not NULL, it sets it to contain the data portion
164 // (without its trailing CRLF). Returns true when the reading was successful.
165 bool GetNextNameValueContinue(base::StringPiece* value);
166
167 // Ideally those should be static, so that they are constructed only once,
168 // regardless of how many parser instances we have. However, then we would
169 // run into problems with exit-time destructors.
170 const RE2 transfer_padding_pattern_;
171 const RE2 crlf_pattern_;
172 const RE2 closing_pattern_;
173 const RE2 epilogue_pattern_;
174 const RE2 crlf_free_pattern_;
175 const RE2 preamble_pattern_;
176 const RE2 header_pattern_;
177 const RE2 content_disposition_pattern_;
178 const RE2 name_pattern_;
179 const RE2 value_pattern_;
180
181 const RE2 dash_boundary_pattern_;
182
183 // Because of initialisation dependency, |state_| needs to be declared after
184 // |dash_boundary_pattern_|.
185 State state_;
186
187 // The parsed message can be split into multiple sources which we read
188 // sequentially.
189 re2::StringPiece source_;
190
191 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
192 };
193
194 // Implementation of FormDataParser and FormDataParser::Result .
195
196 FormDataParser::Result::Result() {}
197 FormDataParser::Result::~Result() {}
198
199 void FormDataParser::Result::Reset() {
200 name_.erase();
201 value_.erase();
202 }
203
204 FormDataParser::~FormDataParser() {}
205
206 // static
207 scoped_ptr<FormDataParser> FormDataParser::Create(
208 const net::URLRequest* request) {
209 std::string value;
210 const bool found = request->extra_request_headers().GetHeader(
211 net::HttpRequestHeaders::kContentType, &value);
212 return Create(found ? &value : NULL);
213 }
214
215 // static
216 scoped_ptr<FormDataParser> FormDataParser::Create(
217 const std::string* content_type_header) {
218 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
219 ParserChoice choice = ERROR_CHOICE;
220 std::string boundary;
221
222 if (content_type_header == NULL) {
223 choice = URL_ENCODED;
224 } else {
225 const std::string content_type(
226 content_type_header->substr(0, content_type_header->find(';')));
227
228 if (base::strcasecmp(
229 content_type.c_str(), "application/x-www-form-urlencoded") == 0) {
230 choice = URL_ENCODED;
231 } else if (base::strcasecmp(
232 content_type.c_str(), "multipart/form-data") == 0) {
233 static const char kBoundaryString[] = "boundary=";
234 size_t offset = content_type_header->find(kBoundaryString);
235 if (offset == std::string::npos) {
236 // Malformed header.
237 return scoped_ptr<FormDataParser>();
238 }
239 offset += sizeof(kBoundaryString) - 1;
240 boundary = content_type_header->substr(
241 offset, content_type_header->find(';', offset));
242 if (!boundary.empty())
243 choice = MULTIPART;
244 }
245 }
246 // Other cases are unparseable, including when |content_type| is "text/plain".
247
248 switch (choice) {
249 case URL_ENCODED:
250 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());
251 case MULTIPART:
252 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));
253 default: // In other words, case ERROR_CHOICE:
254 return scoped_ptr<FormDataParser>();
255 }
256 }
257
258 FormDataParser::FormDataParser() {}
259
260 // Implementation of FormDataParserUrlEncoded.
261
262 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =
263 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS |
264 net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
265
266 FormDataParserUrlEncoded::FormDataParserUrlEncoded()
267 : pattern_("([^=]*)=([^&]*)&?"),
268 source_(NULL),
269 source_set_(false),
270 arg_name_(&name_),
271 arg_value_(&value_) {
272 args_[0] = &arg_name_;
273 args_[1] = &arg_value_;
274 }
275
276 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}
277
278 bool FormDataParserUrlEncoded::AllDataReadOK() {
279 // All OK means we read the whole source.
280 return source_set_ && source_.size() == 0;
281 }
282
283 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
284 if (!source_set_)
285 return false;
286
287 bool success = RE2::ConsumeN(&source_, pattern_, args_, args_size_);
288 if (success) {
289 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));
290 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));
291 }
292 return success;
293 }
294
295 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {
296 if (source_set_)
297 return false; // We do not allow multiple sources for this parser.
298 source_.set(source.data(), source.size());
299 source_set_ = true;
300 return true;
301 }
302
303 // Implementation of FormDataParserMultipart.
304
305 // static
306 std::string FormDataParserMultipart::GetDashBoundaryPattern(
307 const std::string& boundary) {
308 static const char escape_closing_quote[] = "\\\\E";
309 static const RE2 unquote_pattern(escape_closing_quote);
310 #define OPEN_QUOTE "\\Q"
311 static const char opening_quote[] = OPEN_QUOTE;
312 static const char closing_quote[] = "\\E";
313
314 std::string output(OPEN_QUOTE "--"); // Let us start with the "--".
315 #undef OPEN_QUOTE
316 re2::StringPiece seek_unquote(boundary);
317 const char* copy_start = boundary.data();
318 size_t copy_length = boundary.size();
319 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern)) {
320 copy_length = seek_unquote.data() - copy_start;
321 output.append(copy_start, copy_length);
322 output.append(escape_closing_quote);
323 output.append(opening_quote);
324 copy_start = seek_unquote.data();
325 }
326 copy_length = (boundary.data() + boundary.size()) - copy_start;
327 output.append(copy_start, copy_length);
328 output.append(closing_quote);
329 return output;
330 }
331
332 // static
333 bool FormDataParserMultipart::LookAhead(const RE2& pattern,
334 const re2::StringPiece& input) {
335 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);
336 }
337
338 #define CONTENT_DISPOSITION "content-disposition:"
339 FormDataParserMultipart::FormDataParserMultipart(
340 const std::string& boundary_separator)
341 : transfer_padding_pattern_("[ \\t]*\\r\\n"),
342 crlf_pattern_("\\r\\n"),
343 closing_pattern_("--[ \\t]*"),
344 epilogue_pattern_("|\\r\\n(?s:.)*"),
345 crlf_free_pattern_("(?:[^\\r]|\\r+[^\\r\\n])*"),
346 preamble_pattern_(".*?"),
347 header_pattern_("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
348 content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"),
349 name_pattern_("\\bname=\"([^\"]*)\""),
350 value_pattern_("\\bfilename=\"([^\"]*)\""),
351 dash_boundary_pattern_(GetDashBoundaryPattern(boundary_separator)),
352 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR) {}
353
354 FormDataParserMultipart::~FormDataParserMultipart() {}
355
356 bool FormDataParserMultipart::AllDataReadOK() {
357 return state_ == STATE_FINISHED;
358 }
359
360 bool FormDataParserMultipart::GetNextNameValueContinue(
361 base::StringPiece* value) {
362 const char* data_start = source_.data();
363 while (!LookAhead(dash_boundary_pattern_, source_)) {
364 if (!RE2::Consume(&source_, crlf_free_pattern_) ||
365 !RE2::Consume(&source_, crlf_pattern_)) {
366 state_ = STATE_ERROR;
367 return false;
368 }
369 }
370 if (value != NULL) {
371 if (source_.data() == data_start) {
372 // No data in this body part.
373 state_ = STATE_ERROR;
374 return false;
375 }
376 // Subtract 2u for the trailing "\r\n".
377 value->set(data_start, source_.data() - data_start - 2u);
378 }
379
380 // Finally, read the dash-boundary and either skip to the next body part, or
381 // finish reading the source.
382 CHECK(RE2::Consume(&source_, dash_boundary_pattern_));
383 if (LookAhead(closing_pattern_, source_)) {
384 CHECK(RE2::Consume(&source_, closing_pattern_));
385 if (RE2::Consume(&source_, epilogue_pattern_))
386 state_ = STATE_FINISHED;
387 else
388 state_ = STATE_ERROR;
389 } else { // Next body part ahead.
390 if (!RE2::Consume(&source_, transfer_padding_pattern_))
391 state_ = STATE_ERROR;
392 }
393 return state_ != STATE_ERROR;
394 }
395
396 bool FormDataParserMultipart::GetNextNameValue(Result* result) {
397 if (source_.size() == 0 || state_ != STATE_READY)
398 return false;
399
400 // 1. Read body-part headers.
401 base::StringPiece name;
402 base::StringPiece value;
403 bool value_assigned = false;
404 bool value_assigned_temp;
405 while (TryReadHeader(&name, &value, &value_assigned_temp))
406 value_assigned |= value_assigned_temp;
407 if (name.size() == 0) {
408 state_ = STATE_ERROR;
409 return false;
410 }
411
412 // 2. Read the trailing CRLF after headers.
413 if (!RE2::Consume(&source_, crlf_pattern_)) {
414 state_ = STATE_ERROR;
415 return false;
416 }
417
418 // 3. Read the data of this body part, i.e., everything until the first
419 // dash-boundary.
420 bool return_value = true;
421 if (value_assigned && source_.size() == 0) // Wait for a new source?
422 state_ = STATE_SUSPEND;
423 else
424 return_value = GetNextNameValueContinue(value_assigned ? NULL : &value);
425
426 std::string unescaped_name = net::UnescapeURLComponent(
427 name.as_string(),
428 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS);
429 result->set_name(unescaped_name);
vabr (Chromium) 2012/09/03 09:55:30 Here is the added name-unescaping.
430 result->set_value(value);
431
432 return return_value;
433 }
434
435 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {
436 if (source.data() == NULL || source_.size() != 0)
437 return false;
438 source_.set(source.data(), source.size());
439
440 switch (state_) {
441 case STATE_INIT:
442 // Seek behind the preamble.
443 while (!LookAhead(dash_boundary_pattern_, source_)) {
444 if (!RE2::Consume(&source_, preamble_pattern_)) {
445 state_ = STATE_ERROR;
446 break;
447 }
448 }
449 // Read dash-boundary, transfer padding, and CRLF.
450 if (state_ != STATE_ERROR) {
451 if (!RE2::Consume(&source_, dash_boundary_pattern_) ||
452 !RE2::Consume(&source_, transfer_padding_pattern_))
453 state_ = STATE_ERROR;
454 else
455 state_ = STATE_READY;
456 }
457 break;
458 case STATE_READY: // Nothing to do.
459 break;
460 case STATE_SUSPEND:
461 state_ = GetNextNameValueContinue(NULL) ? STATE_READY : STATE_ERROR;
462 break;
463 default:
464 state_ = STATE_ERROR;
465 }
466 return state_ != STATE_ERROR;
467 }
468
469 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,
470 base::StringPiece* value,
471 bool* value_assigned) {
472 static const size_t content_disposition_value_offset =
473 sizeof(CONTENT_DISPOSITION) - 1;
474 #undef CONTENT_DISPOSITION
475
476 *value_assigned = false;
477 const char* header_start = source_.data();
478 if (!RE2::Consume(&source_, header_pattern_))
479 return false;
480 // (*) After this point we must return true, because we consumed one header.
481
482 // Subtract 2u for the trailing "\r\n".
483 re2::StringPiece header(header_start, source_.data() - header_start - 2u);
484
485 // Now we check whether |header| is a Content-Disposition header, and try
486 // to extract name and possibly value from it.
487 if (LookAhead(content_disposition_pattern_, header)) {
488 re2::StringPiece groups[2u];
489
490 if (!name_pattern_.Match(header,
491 content_disposition_value_offset, header.size(),
492 RE2::UNANCHORED, groups, 2)) {
493 state_ = STATE_ERROR;
494 return true; // See (*) for why true.
495 }
496 name->set(groups[1].data(), groups[1].size());
497
498 if (!value_pattern_.Match(header,
499 content_disposition_value_offset, header.size(),
500 RE2::UNANCHORED, groups, 2))
501 return true; // See (*) for why true.
502 value->set(groups[1].data(), groups[1].size());
503 *value_assigned = true;
504 }
505 return true;
506 }
507
508 } // namespace extensions
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698