chrome/browser/extensions/api/web_request/form_data_parser.cc - Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Making non-trivial data-members non-static Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « chrome/browser/extensions/api/web_request/form_data_parser.h ('k') | chrome/browser/extensions/api/web_request/form_data_parser_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h"

	6

	7 #include <vector>

	8

	9 #include "base/string_util.h"

	10 #include "base/values.h"

	11 #include "net/base/escape.h"

	12 #include "net/url_request/url_request.h"

	13 #include "third_party/re2/re2/re2.h"

	14

	15 using base::DictionaryValue;

	16 using base::ListValue;

	17 using base::StringPiece;

	18 using re2::RE2;

	19

	20 namespace extensions {

	21

	22 // Parses URLencoded forms, see

	23 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .

	24 class FormDataParserUrlEncoded : public FormDataParser {

	25 public:

	26 FormDataParserUrlEncoded();

	27 virtual ~FormDataParserUrlEncoded();

	28

	29 // Implementation of FormDataParser.

	30 virtual bool AllDataReadOK() OVERRIDE;

	31 virtual bool GetNextNameValue(Result* result) OVERRIDE;

	32 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

	33

	34 private:

	35 // The pattern to match a single name-value pair. Ideally this should be

	36 // static, so that it is constructed only once, independently on how many

	37 // parser instances we have. However, then we would run into exit-time

	38 // destructors problems.

	39 const RE2 pattern_;

	40

	41 static const size_t args_size_ = 2u; // Auxiliary constant for using RE2.

	42 static const net::UnescapeRule::Type unescape_rules_;

	43

	44 re2::StringPiece source_;

	45 bool source_set_;

	46

	47 // Auxiliary store for using RE2.

	48 std::string name_;

	49 std::string value_;

	50 const RE2::Arg arg_name_;

	51 const RE2::Arg arg_value_;

	52 const RE2::Arg* args_[args_size_];

	53

	54 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);

	55 };

	56

	57 // The following class, FormDataParserMultipart, parses forms encoded as

	58 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart

	59 // encoding) and 5322 (MIME-headers).

	60 //

	61 // Implementation details

	62 //

	63 // The original grammar from RFC 2046 is this, "multipart-body" being the root

	64 // non-terminal:

	65 //

	66 // boundary := 0*69<bchars> bcharsnospace

	67 // bchars := bcharsnospace / " "

	68 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","

	69 // / "-" / "." / "/" / ":" / "=" / "?"

	70 // dash-boundary := "--" boundary

	71 // multipart-body := [preamble CRLF]

	72 // dash-boundary transport-padding CRLF

	73 // body-part *encapsulation

	74 // close-delimiter transport-padding

	75 // [CRLF epilogue]

	76 // transport-padding := *LWSP-char

	77 // encapsulation := delimiter transport-padding CRLF body-part

	78 // delimiter := CRLF dash-boundary

	79 // close-delimiter := delimiter "--"

	80 // preamble := discard-text

	81 // epilogue := discard-text

	82 // discard-text := *(*text CRLF) *text

	83 // body-part := MIME-part-headers [CRLF *OCTET]

	84 // OCTET := <any 0-255 octet value>

	85 //

	86 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,

	87 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the

	88 // English alphabet, respectively.

	89 // The non-terminal "text" is presumably just any text, excluding line breaks.

	90 // The non-terminal "LWSP-char" is not directly defined in the original grammar

	91 // but it means "linear whitespace", which is a space or a horizontal tab.

	92 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use

	93 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:

	94 //

	95 // MIME-part-headers := field-name ":" unstructured CRLF

	96 // field-name := 1*ftext

	97 // ftext := %d33-57 / ; Printable US-ASCII

	98 // %d59-126 ; characters not including ":".

	99 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which

	100 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and

	101 // "CRLF<horizontal tab>", which serve for "folding".

	102 //

	103 // The FormDataParserMultipart class reads the input source and tries to parse
	    // it

	104 // according to the grammar above, rooted at the "multipart-body" non-terminal.

	105 // This happens in stages:

	106 //

	107 // 1. The optional preamble and the initial dash-boundary with transport padding

	108 // and a CRLF are read and ignored.

	109 //

	110 // 2. Repeatedly each body part is read. The body parts can either serve to

	111 // upload a file, or just a string of bytes.

	112 // 2.a. The headers of that part are searched for the "content-disposition"

	113 // header, which contains the name of the value represented by that body

	114 // part. If the body-part is for file upload, that header also contains a

	115 // filename.

	116 // 2.b. The "*OCTET" part of the body part is then read and passed as the value

	117 // of the name-value pair for body parts representing a string of bytes.

	118 // For body parts for uploading a file the "*OCTET" part is just ignored

	119 // and the filename is used for value instead.

	120 //

	121 // 3. The final close-delimiter and epilogue are read and ignored.

	122 //

	123 // IMPORTANT NOTE

	124 // This parser supports multiple sources, i.e., SetSource can be called multiple

	125 // times if the input is spread over several byte blocks. However, the split

	126 // may only occur inside a body part, right after the trailing CRLF of headers.

	127 class FormDataParserMultipart : public FormDataParser {

	128 public:

	129 explicit FormDataParserMultipart(const std::string& boundary_separator);

	130 virtual ~FormDataParserMultipart();

	131

	132 // Implementation of FormDataParser.

	133 virtual bool AllDataReadOK() OVERRIDE;

	134 virtual bool GetNextNameValue(Result* result) OVERRIDE;

	135 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

	136

	137 private:

	138 enum State {

	139 STATE_INIT, // No input read yet.

	140 STATE_READY, // Ready to call GetNextNameValue.

	141 STATE_FINISHED, // Read the input until the end.

	142 STATE_SUSPEND, // Waiting until a new \|source_\| is set.

	143 STATE_ERROR

	144 };

	145

	146 // Produces a regexp to match the \|boundary\| string.

	147 static std::string GetDashBoundaryPattern(const std::string& boundary);

	148

	149 // Tests whether \|input\| has a prefix matching \|pattern\|.

	150 static bool LookAhead(const RE2& pattern, const re2::StringPiece& input);

	151

	152 // If source_ starts with a header, consumes it. If the header is

	153 // Content-Disposition, it also extracts \|name\| from "name=" and possibly

	154 // \|value\| from "filename=" fields of that header. It only touches \|name\| or

	155 // \|value\| if it finds the respective fields for them. Returns true if it

	156 // consumed a header, false if it did not. Sets \|value_assigned\| to true if it

	157 // has assigned to value, otherwise it sets it to false.

	158 bool TryReadHeader(base::StringPiece* name,

	159 base::StringPiece* value,

	160 bool* value_assigned);

	161

	162 // Helper to GetNextNameValue. Attempts to read the data portion of a body

	163 // part. If \|value\| is not NULL but empty, it sets it to contain the data

	164 // portion. Returns true when the reading was successful.

	165 bool GetNextNameValueContinue(base::StringPiece* value);

	166

	167 // Ideally those should be static, so that they are constructed only once,

	168 // independently on how many parser instances we have. However, then we would

	169 // run into exit-time destructors problems.

	170 const RE2 transfer_padding_pattern_;

	171 const RE2 crlf_pattern_;

	172 const RE2 closing_pattern_;

	173 const RE2 epilogue_pattern_;

	174 const RE2 crlf_free_pattern_;

	175 const RE2 preamble_pattern_;

	176 const RE2 header_pattern_;

	177 const RE2 content_disposition_pattern_;

	178 const RE2 name_pattern_;

	179 const RE2 value_pattern_;

	180

	181 const RE2 dash_boundary_pattern_;

	182

	183 // Because of initialisation dependency, \|state_\| needs to be declared after

	184 // \|dash_boundary_pattern_\|.

	185 State state_;

	186

	187 // The parsed message can be split into multiple sources which we read

	188 // sequentially.

	189 re2::StringPiece source_;

	190

	191 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);

	192 };

	193

	194 // Implementation of FormDataParser and FormDataParser::Result .

	195

	196 FormDataParser::Result::Result() {}

	197 FormDataParser::Result::~Result() {}

	198

	199 void FormDataParser::Result::Reset() {

	200 name_.erase();

	201 value_.erase();

	202 }

	203

	204 FormDataParser::~FormDataParser() {}

	205

	206 // static

	207 scoped_ptr<FormDataParser> FormDataParser::Create(

	208 const net::URLRequest* request) {

	209 std::string value;

	210 const bool found = request->extra_request_headers().GetHeader(

	211 net::HttpRequestHeaders::kContentType, &value);

	212 return Create(found ? &value : NULL);

	213 }

	214

	215 // static

	216 scoped_ptr<FormDataParser> FormDataParser::Create(

	217 const std::string* content_type_header) {

	218 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};

	219 ParserChoice choice = ERROR_CHOICE;

	220 std::string boundary;

	221

	222 if (content_type_header == NULL) {

	223 choice = URL_ENCODED;

	224 } else {

	225 const std::string content_type(

	226 content_type_header->substr(0, content_type_header->find(';')));

	227

	228 if (base::strcasecmp(

	229 content_type.c_str(), "application/x-www-form-urlencoded") == 0) {

	230 choice = URL_ENCODED;

	231 } else if (base::strcasecmp(

	232 content_type.c_str(), "multipart/form-data") == 0) {

	233 static const char kBoundaryString[] = "boundary=";

	234 size_t offset = content_type_header->find(kBoundaryString);

	235 if (offset == std::string::npos) {

	236 // Malformed header.

	237 return scoped_ptr<FormDataParser>();

	238 }

	239 offset += sizeof(kBoundaryString) - 1;

	240 boundary = content_type_header->substr(

	241 offset, content_type_header->find(';', offset));

	242 if (!boundary.empty())

	243 choice = MULTIPART;

	244 }

	245 }

	246 // Other cases are unparseable, including when \|content_type\| is "text/plain".

	247

	248 switch (choice) {

	249 case URL_ENCODED:

	250 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());

	251 case MULTIPART:

	252 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));

	253 default: // In other words, case ERROR_CHOICE:

	254 return scoped_ptr<FormDataParser>();

	255 }

	256 }

	257

	258 FormDataParser::FormDataParser() {}

	259

	260 // Implementation of FormDataParserUrlEncoded.

	261

	262 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =

	263 net::UnescapeRule::URL_SPECIAL_CHARS \| net::UnescapeRule::CONTROL_CHARS \|

	264 net::UnescapeRule::SPACES \| net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;

	265

	266 FormDataParserUrlEncoded::FormDataParserUrlEncoded()

	267 : pattern_("([^=])=([^&])&?"),

	268 source_(NULL),

	269 source_set_(false),

	270 arg_name_(&name_),

	271 arg_value_(&value_) {

	272 args_[0] = &arg_name_;

	273 args_[1] = &arg_value_;

	274 }

	275

	276 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}

	277

	278 bool FormDataParserUrlEncoded::AllDataReadOK() {

	279 // All OK means we read the whole source.

	280 return source_set_ && source_.size() == 0;

	281 }

	282

	283 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {

	284 if (!source_set_)

	285 return false;

	286

	287 bool success = RE2::ConsumeN(&source_, pattern_, args_, args_size_);

	288 if (success) {

	289 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));

	290 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));

	291 }

	292 return success;

	293 }

	294

	295 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {

	296 if (source_set_)

	297 return false; // We do not allow multiple sources for this parser.

	298 source_.set(source.data(), source.size());

	299 source_set_ = true;

	300 return true;

	301 }

	302

	303 // Implementation of FormDataParserMultipart.

	304

	305 // static

	306 std::string FormDataParserMultipart::GetDashBoundaryPattern(

	307 const std::string& boundary) {

	308 static const char escape_closing_quote[] = "\\\\E";

	309 static const RE2 unquote_pattern(escape_closing_quote);
	vabr (Chromium) 2012/08/30 12:26:48 Note to myself -- make this a non-static data memb Note to myself -- make this a non-static data member. (This instance slipped through during the recent removal of static data-members with non-trivial destructors.)
	310 #define OPEN_QUOTE "\\Q"

	311 static const char opening_quote[] = OPEN_QUOTE;

	312 static const char closing_quote[] = "\\E";

	313

	314 std::string output(OPEN_QUOTE "--"); // Let us start with the "--".

	315 #undef OPEN_QUOTE

	316 re2::StringPiece seek_unquote(boundary);

	317 const char* copy_start = boundary.data();

	318 size_t copy_length = boundary.size();

	319 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern)) {

	320 copy_length = seek_unquote.data() - copy_start;

	321 output.append(copy_start, copy_length);

	322 output.append(escape_closing_quote);

	323 output.append(opening_quote);

	324 copy_start = seek_unquote.data();

	325 }

	326 copy_length = (boundary.data() + boundary.size()) - copy_start;

	327 output.append(copy_start, copy_length);

	328 output.append(closing_quote);

	329 return output;

	330 }

	331

	332 // static

	333 bool FormDataParserMultipart::LookAhead(const RE2& pattern,

	334 const re2::StringPiece& input) {

	335 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);

	336 }

	337

	338 #define CONTENT_DISPOSITION "content-disposition:"

	339 FormDataParserMultipart::FormDataParserMultipart(

	340 const std::string& boundary_separator)

	341 : transfer_padding_pattern_("[ \\t]*\\r\\n"),

	342 crlf_pattern_("\\r\\n"),

	343 closing_pattern_("--[ \\t]*"),

	344 epilogue_pattern_("\|\\r\\n(?s:.)*"),

	345 crlf_free_pattern_("(?:[^\\r]\|\\r+[^\\r\\n])*"),

	346 preamble_pattern_(".*?"),

	347 header_pattern_("[!-9;-~]+:(.\|\\r\\n[\\t ])*\\r\\n"),

	348 content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"),

	349 name_pattern_("\\bname=\"([^\"]*)\""),

	350 value_pattern_("\\bfilename=\"([^\"]*)\""),

	351 dash_boundary_pattern_(GetDashBoundaryPattern(boundary_separator)),

	352 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR) {}

	353

	354 FormDataParserMultipart::~FormDataParserMultipart() {}

	355

	356 bool FormDataParserMultipart::AllDataReadOK() {

	357 return state_ == STATE_FINISHED;

	358 }

	359

	360 bool FormDataParserMultipart::GetNextNameValueContinue(

	361 base::StringPiece* value) {

	362 const char* data_start = source_.data();

	363 while (!LookAhead(dash_boundary_pattern_, source_)) {

	364 if (!RE2::Consume(&source_, crlf_free_pattern_) \|\|

	365 !RE2::Consume(&source_, crlf_pattern_)) {

	366 state_ = STATE_ERROR;

	367 return false;

	368 }

	369 }

	370 if (value != NULL) {

	371 if (source_.data() == data_start) {

	372 // No data in this body part.

	373 state_ = STATE_ERROR;

	374 return false;

	375 }

	376 // Subtract 2u for the trailing "\r\n".

	377 value->set(data_start, source_.data() - data_start - 2u);

	378 }

	379

	380 // Finally, read the dash-boundary and either skip to the next body part, or

	381 // finish reading the source.

	382 CHECK(RE2::Consume(&source_, dash_boundary_pattern_));

	383 if (LookAhead(closing_pattern_, source_)) {

	384 CHECK(RE2::Consume(&source_, closing_pattern_));

	385 if (RE2::Consume(&source_, epilogue_pattern_))

	386 state_ = STATE_FINISHED;

	387 else

	388 state_ = STATE_ERROR;

	389 } else { // Next body part ahead.

	390 if (!RE2::Consume(&source_, transfer_padding_pattern_))

	391 state_ = STATE_ERROR;

	392 }

	393 return state_ != STATE_ERROR;

	394 }

	395

	396 bool FormDataParserMultipart::GetNextNameValue(Result* result) {

	397 if (source_.size() == 0 \|\| state_ != STATE_READY)

	398 return false;

	399

	400 // 1. Read body-part headers.

	401 base::StringPiece name;

	402 base::StringPiece value;

	403 bool value_assigned = false;

	404 bool value_assigned_temp;

	405 while (TryReadHeader(&name, &value, &value_assigned_temp))

	406 value_assigned \|= value_assigned_temp;

	407 if (name.size() == 0) {

	408 state_ = STATE_ERROR;

	409 return false;

	410 }

	411

	412 // 2. Read the trailing CRLF after headers.

	413 if (!RE2::Consume(&source_, crlf_pattern_)) {

	414 state_ = STATE_ERROR;

	415 return false;

	416 }

	417

	418 // 3. Read the data of this body part, i.e., everything until the first

	419 // dash-boundary.

	420 bool return_value = true;

	421 if (value_assigned && source_.size() == 0) // Wait for a new source?

	422 state_ = STATE_SUSPEND;

	423 else

	424 return_value = GetNextNameValueContinue(value_assigned ? NULL : &value);

	425

	426 result->set_name(name);

	427 result->set_value(value);

	428

	429 return return_value;

	430 }

	431

	432 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {

	433 if (source.data() == NULL \|\| source_.size() != 0)

	434 return false;

	435 source_.set(source.data(), source.size());

	436

	437 switch (state_) {

	438 case STATE_INIT:

	439 // Seek behind the preamble.

	440 while (!LookAhead(dash_boundary_pattern_, source_)) {

	441 if (!RE2::Consume(&source_, preamble_pattern_)) {

	442 state_ = STATE_ERROR;

	443 break;

	444 }

	445 }

	446 // Read dash-boundary, transfer padding, and CRLF.

	447 if (state_ != STATE_ERROR) {

	448 if (!RE2::Consume(&source_, dash_boundary_pattern_) \|\|

	449 !RE2::Consume(&source_, transfer_padding_pattern_))

	450 state_ = STATE_ERROR;

	451 else

	452 state_ = STATE_READY;

	453 }

	454 break;

	455 case STATE_READY: // Nothing to do.

	456 break;

	457 case STATE_SUSPEND:

	458 state_ = GetNextNameValueContinue(NULL) ? STATE_READY : STATE_ERROR;

	459 break;

	460 default:

	461 state_ = STATE_ERROR;

	462 }

	463 return state_ != STATE_ERROR;

	464 }

	465

	466 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,

	467 base::StringPiece* value,

	468 bool* value_assigned) {

	469 static const size_t content_disposition_value_offset =

	470 sizeof(CONTENT_DISPOSITION) - 1;

	471 #undef CONTENT_DISPOSITION

	472

	473 *value_assigned = false;

	474 const char* header_start = source_.data();

	475 if (!RE2::Consume(&source_, header_pattern_))

	476 return false;

	477 // (*) After this point we must return true, because we consumed one header.

	478

	479 // Subtract 2u for the trailing "\r\n".

	480 re2::StringPiece header(header_start, source_.data() - header_start - 2u);

	481

	482 // Now we check whether \|header\| is a Content-Disposition header, and try

	483 // to extract name and possibly value from it.

	484 if (LookAhead(content_disposition_pattern_, header)) {

	485 re2::StringPiece groups[2u];

	486

	487 if (!name_pattern_.Match(header,

	488 content_disposition_value_offset, header.size(),

	489 RE2::UNANCHORED, groups, 2)) {

	490 state_ = STATE_ERROR;

	491 return true; // See (*) for why true.

	492 }

	493 name->set(groups[1].data(), groups[1].size());

	494

	495 if (!value_pattern_.Match(header,

	496 content_disposition_value_offset, header.size(),

	497 RE2::UNANCHORED, groups, 2))

	498 return true; // See (*) for why true.

	499 value->set(groups[1].data(), groups[1].size());

	500 *value_assigned = true;

	501 }

	502 return true;

	503 }

	504

	505 } // namespace extensions

OLD	NEW