chrome/browser/extensions/api/web_request/form_data_parser.cc - Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: One more static RE2 object made non-static Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « chrome/browser/extensions/api/web_request/form_data_parser.h ('k') | chrome/browser/extensions/api/web_request/form_data_parser_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h"

	6

	7 #include <vector>

	8

	9 #include "base/string_util.h"

	10 #include "base/values.h"

	11 #include "net/base/escape.h"

	12 #include "net/url_request/url_request.h"

	13 #include "third_party/re2/re2/re2.h"

	14

	15 using base::DictionaryValue;

	16 using base::ListValue;

	17 using base::StringPiece;

	18 using re2::RE2;

	19

	20 namespace extensions {

	21

	22 // Parses URL-encoded forms (a sequence of "name=value" pairs joined by '&'), see

	23 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .

	24 class FormDataParserUrlEncoded : public FormDataParser {

	25 public:

	26 FormDataParserUrlEncoded();

	27 virtual ~FormDataParserUrlEncoded();

	28

	29 // Implementation of FormDataParser.

	30 virtual bool AllDataReadOK() OVERRIDE; // True once a source was set and fully consumed.

	31 virtual bool GetNextNameValue(Result* result) OVERRIDE; // Consumes one pair and unescapes it.

	32 virtual bool SetSource(const base::StringPiece& source) OVERRIDE; // Succeeds only on the first call.

	33

	34 private:

	35 // The pattern to match a single name-value pair. Ideally this should be

	36 // static, so that it is constructed only once, independently of how many

	37 // parser instances we have. However, then we would run into exit-time

	38 // destructor problems.

	39 const RE2 pattern_;

	40

	41 static const size_t args_size_ = 2u; // Number of RE2 arguments: one for the name, one for the value.

	42 static const net::UnescapeRule::Type unescape_rules_; // Flags used to unescape names and values.

	43

	44 re2::StringPiece source_; // Not yet parsed remainder of the input.

	45 bool source_set_; // Set to true by the first SetSource() call.

	46

	47 // Auxiliary store for using RE2: the captures land in |name_| and |value_| through |arg_name_| and |arg_value_|, which |args_| points to.

	48 std::string name_;

	49 std::string value_;

	50 const RE2::Arg arg_name_;

	51 const RE2::Arg arg_value_;

	52 const RE2::Arg* args_[args_size_];

	53

	54 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);

	55 };

	56

	57 // The following class, FormDataParserMultipart, parses forms encoded as

	58 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart

	59 // encoding) and 5322 (MIME-headers).

	60 //

	61 // Implementation details

	62 //

	63 // The original grammar from RFC 2046 is this, "multipart-body" being the root

	64 // non-terminal:

	65 //

	66 // boundary := 0*69<bchars> bcharsnospace

	67 // bchars := bcharsnospace / " "

	68 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","

	69 // / "-" / "." / "/" / ":" / "=" / "?"

	70 // dash-boundary := "--" boundary

	71 // multipart-body := [preamble CRLF]

	72 // dash-boundary transport-padding CRLF

	73 // body-part *encapsulation

	74 // close-delimiter transport-padding

	75 // [CRLF epilogue]

	76 // transport-padding := *LWSP-char

	77 // encapsulation := delimiter transport-padding CRLF body-part

	78 // delimiter := CRLF dash-boundary

	79 // close-delimiter := delimiter "--"

	80 // preamble := discard-text

	81 // epilogue := discard-text

	82 // discard-text := *(*text CRLF) *text

	83 // body-part := MIME-part-headers [CRLF *OCTET]

	84 // OCTET := <any 0-255 octet value>

	85 //

	86 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,

	87 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the

	88 // English alphabet, respectively.

	89 // The non-terminal "text" is presumably just any text, excluding line breaks.

	90 // The non-terminal "LWSP-char" is not directly defined in the original grammar

	91 // but it means "linear whitespace", which is a space or a horizontal tab.

	92 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use

	93 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:

	94 //

	95 // MIME-part-headers := field-name ":" unstructured CRLF

	96 // field-name := 1*ftext

	97 // ftext := %d33-57 / ; Printable US-ASCII

	98 // %d59-126 ; characters not including ":".

	99 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which

	100 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and

	101 // "CRLF<horizontal tab>", which serve for "folding".

	102 //

	103 // The FormDataParserMultipart class reads the input source and tries to parse it

	104 // according to the grammar above, rooted at the "multipart-body" non-terminal.

	105 // This happens in stages:

	106 //

	107 // 1. The optional preamble and the initial dash-boundary with transport padding

	108 // and a CRLF are read and ignored.

	109 //

	110 // 2. Repeatedly each body part is read. The body parts can either serve to

	111 // upload a file, or just a string of bytes.

	112 // 2.a. The headers of that part are searched for the "content-disposition"

	113 // header, which contains the name of the value represented by that body

	114 // part. If the body-part is for file upload, that header also contains a

	115 // filename.

	116 // 2.b. The "*OCTET" part of the body part is then read and passed as the value

	117 // of the name-value pair for body parts representing a string of bytes.

	118 // For body parts for uploading a file the "*OCTET" part is just ignored

	119 // and the filename is used for value instead.

	120 //

	121 // 3. The final close-delimiter and epilogue are read and ignored.

	122 //

	123 // IMPORTANT NOTE

	124 // This parser supports multiple sources, i.e., SetSource can be called multiple

	125 // times if the input is spread over several byte blocks. However, the split

	126 // may only occur inside a body part, right after the trailing CRLF of headers.

	127 class FormDataParserMultipart : public FormDataParser {

	128 public:

	129 explicit FormDataParserMultipart(const std::string& boundary_separator);

	130 virtual ~FormDataParserMultipart();

	131

	132 // Implementation of FormDataParser.

	133 virtual bool AllDataReadOK() OVERRIDE; // True iff the parser reached STATE_FINISHED.

	134 virtual bool GetNextNameValue(Result* result) OVERRIDE;

	135 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

	136

	137 private:

	138 enum State {

	139 STATE_INIT, // No input read yet.

	140 STATE_READY, // Ready to call GetNextNameValue.

	141 STATE_FINISHED, // Read the input until the end.

	142 STATE_SUSPEND, // Waiting until a new \|source_\| is set.

	143 STATE_ERROR // Parsing failed, or the boundary pattern could not be compiled.

	144 };

	145

	146 // Produces a regexp matching "--" followed by the literal \|boundary\| string.

	147 static std::string GetDashBoundaryPattern(const std::string& boundary);

	148

	149 // Tests whether \|input\| has a prefix matching \|pattern\|.

	150 static bool LookAhead(const RE2& pattern, const re2::StringPiece& input);

	151

	152 // If source_ starts with a header, consumes it. If the header is

	153 // Content-Disposition, it also extracts \|name\| from "name=" and possibly

	154 // \|value\| from "filename=" fields of that header. It only touches \|name\| or

	155 // \|value\| if it finds the respective fields for them. Returns true if it

	156 // consumed a header, false if it did not. Sets \|value_assigned\| to true if it

	157 // has assigned to value, otherwise it sets it to false.

	158 bool TryReadHeader(base::StringPiece* name,

	159 base::StringPiece* value,

	160 bool* value_assigned);

	161

	162 // Helper to GetNextNameValue. Reads the data portion of a body part up to

	163 // the next dash-boundary and consumes that boundary. If \|value\| is not NULL,

	164 // it is set to the data portion. Returns true when the reading was successful.

	165 bool GetNextNameValueContinue(base::StringPiece* value);

	166

	167 // Ideally those should be static, so that they are constructed only once,

	168 // independently of how many parser instances we have. However, then we would

	169 // run into exit-time destructor problems.

	170 const RE2 transfer_padding_pattern_; // Optional spaces/tabs, then CRLF.

	171 const RE2 crlf_pattern_; // A single CRLF.

	172 const RE2 closing_pattern_; // "--" and optional spaces/tabs after the last dash-boundary.

	173 const RE2 epilogue_pattern_; // Nothing, or CRLF followed by arbitrary text.

	174 const RE2 crlf_free_pattern_; // Text containing no CRLF.

	175 const RE2 preamble_pattern_; // Used to skip text before the first dash-boundary.

	176 const RE2 header_pattern_; // One header, including folded continuation lines.

	177 const RE2 content_disposition_pattern_; // Case-insensitive "content-disposition:".

	178 const RE2 name_pattern_; // Captures the quoted argument of name="...".

	179 const RE2 value_pattern_; // Captures the quoted argument of filename="...".

	180

	181 const RE2 dash_boundary_pattern_; // Built from the boundary by GetDashBoundaryPattern().

	182

	183 // Because of initialisation dependency, \|state_\| needs to be declared after

	184 // \|dash_boundary_pattern_\|.

	185 State state_;

	186

	187 // The parsed message can be split into multiple sources which we read

	188 // sequentially.

	189 re2::StringPiece source_;

	190

	191 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);

	192 };

	193

	194 // Implementation of FormDataParser and FormDataParser::Result .

	195

	196 FormDataParser::Result::Result() {}

	197 FormDataParser::Result::~Result() {}

	198

	199 void FormDataParser::Result::Reset() {

	200 name_.erase();

	201 value_.erase();

	202 }

	203

	204 FormDataParser::~FormDataParser() {}

	205

	206 // static

	207 scoped_ptr<FormDataParser> FormDataParser::Create(

	208 const net::URLRequest* request) {

	209 std::string value;

	210 const bool found = request->extra_request_headers().GetHeader(

	211 net::HttpRequestHeaders::kContentType, &value);

	212 return Create(found ? &value : NULL);

	213 }

	214

	215 // static

	216 scoped_ptr<FormDataParser> FormDataParser::Create(

	217 const std::string* content_type_header) {

	218 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};

	219 ParserChoice choice = ERROR_CHOICE;

	220 std::string boundary;

	221

	222 if (content_type_header == NULL) {

	223 choice = URL_ENCODED;

	224 } else {

	225 const std::string content_type(

	226 content_type_header->substr(0, content_type_header->find(';')));

	227

	228 if (base::strcasecmp(

	229 content_type.c_str(), "application/x-www-form-urlencoded") == 0) {

	230 choice = URL_ENCODED;

	231 } else if (base::strcasecmp(

	232 content_type.c_str(), "multipart/form-data") == 0) {

	233 static const char kBoundaryString[] = "boundary=";

	234 size_t offset = content_type_header->find(kBoundaryString);

	235 if (offset == std::string::npos) {

	236 // Malformed header.

	237 return scoped_ptr<FormDataParser>();

	238 }

	239 offset += sizeof(kBoundaryString) - 1;

	240 boundary = content_type_header->substr(

	241 offset, content_type_header->find(';', offset));

	242 if (!boundary.empty())

	243 choice = MULTIPART;

	244 }

	245 }

	246 // Other cases are unparseable, including when \|content_type\| is "text/plain".

	247

	248 switch (choice) {

	249 case URL_ENCODED:

	250 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());

	251 case MULTIPART:

	252 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));

	253 default: // In other words, case ERROR_CHOICE:

	254 return scoped_ptr<FormDataParser>();

	255 }

	256 }

	257

	258 FormDataParser::FormDataParser() {}

	259

	260 // Implementation of FormDataParserUrlEncoded.

	261

	262 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =

	263 net::UnescapeRule::URL_SPECIAL_CHARS \| net::UnescapeRule::CONTROL_CHARS \|

	264 net::UnescapeRule::SPACES \| net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;

	265

	266 FormDataParserUrlEncoded::FormDataParserUrlEncoded()

	267 : pattern_("([^=])=([^&])&?"),

	268 source_(NULL),

	269 source_set_(false),

	270 arg_name_(&name_),

	271 arg_value_(&value_) {

	272 args_[0] = &arg_name_;

	273 args_[1] = &arg_value_;

	274 }

	275

	276 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}

	277

	278 bool FormDataParserUrlEncoded::AllDataReadOK() {

	279 // All OK means we read the whole source.

	280 return source_set_ && source_.size() == 0;

	281 }

	282

	283 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {

	284 if (!source_set_)

	285 return false;

	286

	287 bool success = RE2::ConsumeN(&source_, pattern_, args_, args_size_);

	288 if (success) {

	289 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));

	290 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));

	291 }

	292 return success;

	293 }

	294

	295 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {

	296 if (source_set_)

	297 return false; // We do not allow multiple sources for this parser.

	298 source_.set(source.data(), source.size());

	299 source_set_ = true;

	300 return true;

	301 }

	302

	303 // Implementation of FormDataParserMultipart.

	304

	305 // static

	306 std::string FormDataParserMultipart::GetDashBoundaryPattern(
	tkent 2012/09/04 07:43:53 nit: The function name doesn't represent what it d nit: The function name doesn't represent what it does. It should be named as "EscapeUnquote", "MakePatternFromLiteral" or something. vabr (Chromium) 2012/09/04 11:45:25 Good point. I chose "GetBoundaryPatternFromLiteral Show quoted text On 2012/09/04 07:43:53, Kent Tamura wrote: > nit: > The function name doesn't represent what it does. > It should be named as "EscapeUnquote", "MakePatternFromLiteral" or something. Good point. I chose "GetBoundaryPatternFromLiteral" because the function makes one boundary-specific thing: it prepends "--".
	307 const std::string& boundary) {

	 // Returns a regexp source that matches "--" followed by the text of
	 // |boundary| taken literally. The text is wrapped in a \Q...\E quote. Any
	 // backslash-E sequence inside |boundary| would end that quote early, so
	 // after each such sequence the pattern for a literal backslash-E is
	 // inserted and the quote is reopened.

	308 static const char escape_closing_quote[] = "\\\\E"; // Regexp source matching a literal backslash followed by 'E'.

	309 // The following should be ideally static, to spare execution time. See the

	310 // comment at const RE2 data members of FormDataParserMultipart. Note that

	311 // this method is only called once for each instance of

	312 // FormDataParserMultipart, so we keep \|unquote_pattern\| local even though

	313 // non-static.

	314 const RE2 unquote_pattern(escape_closing_quote);

	315 #define OPEN_QUOTE "\\Q"

	316 static const char opening_quote[] = OPEN_QUOTE;

	317 static const char closing_quote[] = "\\E";

	318

	319 std::string output(OPEN_QUOTE "--"); // Let us start with the "--".

	320 #undef OPEN_QUOTE

	321 re2::StringPiece seek_unquote(boundary); // Shrinks from the front past each match found.

	322 const char* copy_start = boundary.data(); // First byte of |boundary| not yet copied to |output|.

	323 size_t copy_length = boundary.size();

	324 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern)) {

	325 copy_length = seek_unquote.data() - copy_start; // Up to and including the match, which closes the quote.

	326 output.append(copy_start, copy_length);

	327 output.append(escape_closing_quote); // Outside the quote: match the sequence itself.

	328 output.append(opening_quote); // Resume quoting for the rest of the boundary.

	329 copy_start = seek_unquote.data();

	330 }

	331 copy_length = (boundary.data() + boundary.size()) - copy_start; // Whatever follows the last match.

	332 output.append(copy_start, copy_length);

	333 output.append(closing_quote);

	334 return output;

	335 }

	336

	337 // static

	338 bool FormDataParserMultipart::LookAhead(const RE2& pattern,

	339 const re2::StringPiece& input) {

	340 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);

	341 }

	342

	343 #define CONTENT_DISPOSITION "content-disposition:"

	344 FormDataParserMultipart::FormDataParserMultipart(

	345 const std::string& boundary_separator)

	346 : transfer_padding_pattern_("[ \\t]*\\r\\n"),

	347 crlf_pattern_("\\r\\n"),

	348 closing_pattern_("--[ \\t]*"),

	349 epilogue_pattern_("\|\\r\\n(?s:.)*"),

	350 crlf_free_pattern_("(?:[^\\r]\|\\r+[^\\r\\n])*"),

	351 preamble_pattern_(".*?"),

	352 header_pattern_("[!-9;-~]+:(.\|\\r\\n[\\t ])*\\r\\n"),

	353 content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"),

	354 name_pattern_("\\bname=\"([^\"]*)\""),

	355 value_pattern_("\\bfilename=\"([^\"]*)\""),

	356 dash_boundary_pattern_(GetDashBoundaryPattern(boundary_separator)),

	357 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR) {}

	358

	359 FormDataParserMultipart::~FormDataParserMultipart() {}

	360

	361 bool FormDataParserMultipart::AllDataReadOK() {

	362 return state_ == STATE_FINISHED;

	363 }

	364

	365 bool FormDataParserMultipart::GetNextNameValueContinue(
	tkent 2012/09/04 07:43:53 nit: Again, the function name isn't good. The name nit: Again, the function name isn't good. The name should be "GetDataOfCurrentPartAndSkipUntilNextBoudnary()" or something. vabr (Chromium) 2012/09/04 11:45:25 Changed to "FinishReadingPart", and the argument n Show quoted text On 2012/09/04 07:43:53, Kent Tamura wrote: > nit: > Again, the function name isn't good. > The name should be "GetDataOfCurrentPartAndSkipUntilNextBoudnary()" or > something. Changed to "FinishReadingPart", and the argument name from \|value\| to \|data\|, to make it clear that both: 1) the body part is read until its end, and 2) the output parameter should contain data, without making the name painfully long. Comment at the declaration also edited to explain properly what the method does.
	366 base::StringPiece* value) {

	 // Reads the rest of the current body part: skips (and, if |value| is not
	 // NULL, reports) its data, consumes the dash-boundary that ends it, and
	 // then either finishes the input or positions |source_| at the next part.
	 // Returns false and sets STATE_ERROR on malformed input.

	367 const char* data_start = source_.data(); // Where the data of this part begins.

	368 while (!LookAhead(dash_boundary_pattern_, source_)) { // Advance one CRLF-terminated line at a time.

	369 if (!RE2::Consume(&source_, crlf_free_pattern_) \|\|

	370 !RE2::Consume(&source_, crlf_pattern_)) {

	371 state_ = STATE_ERROR; // Input ended before a dash-boundary was found.

	372 return false;

	373 }

	374 }

	375 if (value != NULL) {

	376 if (source_.data() == data_start) {

	377 // The dash-boundary starts right at \|data_start\|, so the CRLF that has to precede it is missing.

	378 state_ = STATE_ERROR;

	379 return false;

	380 }

	381 // Subtract 2u for the trailing "\r\n".

	382 value->set(data_start, source_.data() - data_start - 2u);

	383 }

	384

	385 // Finally, read the dash-boundary and either skip to the next body part, or

	386 // finish reading the source.

	387 CHECK(RE2::Consume(&source_, dash_boundary_pattern_)); // Cannot fail: the loop above only exits when the boundary is ahead.

	388 if (LookAhead(closing_pattern_, source_)) {

	389 CHECK(RE2::Consume(&source_, closing_pattern_));

	390 if (RE2::Consume(&source_, epilogue_pattern_))

	391 state_ = STATE_FINISHED;

	392 else

	393 state_ = STATE_ERROR;

	394 } else { // Next body part ahead.

	395 if (!RE2::Consume(&source_, transfer_padding_pattern_))

	396 state_ = STATE_ERROR;

	397 }

	398 return state_ != STATE_ERROR; // \|state_\| is left unchanged when another body part follows.

	399 }

	400

	401 bool FormDataParserMultipart::GetNextNameValue(Result* result) {

	402 if (source_.size() == 0 \|\| state_ != STATE_READY)

	403 return false;

	404

	405 // 1. Read body-part headers.

	406 base::StringPiece name;

	407 base::StringPiece value;

	408 bool value_assigned = false;

	409 bool value_assigned_temp;

	410 while (TryReadHeader(&name, &value, &value_assigned_temp))

	411 value_assigned \|= value_assigned_temp;

	412 if (name.size() == 0) {

	413 state_ = STATE_ERROR;

	414 return false;

	415 }

	416

	417 // 2. Read the trailing CRLF after headers.

	418 if (!RE2::Consume(&source_, crlf_pattern_)) {

	419 state_ = STATE_ERROR;

	420 return false;

	421 }

	422

	423 // 3. Read the data of this body part, i.e., everything until the first

	424 // dash-boundary.

	425 bool return_value = true;

	426 if (value_assigned && source_.size() == 0) // Wait for a new source?

	427 state_ = STATE_SUSPEND;

	428 else

	429 return_value = GetNextNameValueContinue(value_assigned ? NULL : &value);

	430

	431 std::string unescaped_name = net::UnescapeURLComponent(

	432 name.as_string(),

	433 net::UnescapeRule::URL_SPECIAL_CHARS \| net::UnescapeRule::CONTROL_CHARS);

	434 result->set_name(unescaped_name);

	435 result->set_value(value);

	436

	437 return return_value;

	438 }

	439

	440 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {

	441 if (source.data() == NULL \|\| source_.size() != 0)

	442 return false;

	443 source_.set(source.data(), source.size());

	444

	445 switch (state_) {

	446 case STATE_INIT:

	447 // Seek behind the preamble.

	448 while (!LookAhead(dash_boundary_pattern_, source_)) {

	449 if (!RE2::Consume(&source_, preamble_pattern_)) {

	450 state_ = STATE_ERROR;

	451 break;

	452 }

	453 }

	454 // Read dash-boundary, transfer padding, and CRLF.

	455 if (state_ != STATE_ERROR) {

	456 if (!RE2::Consume(&source_, dash_boundary_pattern_) \|\|

	457 !RE2::Consume(&source_, transfer_padding_pattern_))

	458 state_ = STATE_ERROR;

	459 else

	460 state_ = STATE_READY;

	461 }

	462 break;

	463 case STATE_READY: // Nothing to do.

	464 break;

	465 case STATE_SUSPEND:

	466 state_ = GetNextNameValueContinue(NULL) ? STATE_READY : STATE_ERROR;

	467 break;

	468 default:

	469 state_ = STATE_ERROR;

	470 }

	471 return state_ != STATE_ERROR;

	472 }

	473

	474 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,

	475 base::StringPiece* value,

	476 bool* value_assigned) {

	477 static const size_t content_disposition_value_offset =

	478 sizeof(CONTENT_DISPOSITION) - 1;

	479 #undef CONTENT_DISPOSITION

	480

	481 *value_assigned = false;

	482 const char* header_start = source_.data();

	483 if (!RE2::Consume(&source_, header_pattern_))

	484 return false;

	485 // (*) After this point we must return true, because we consumed one header.

	486

	487 // Subtract 2u for the trailing "\r\n".

	488 re2::StringPiece header(header_start, source_.data() - header_start - 2u);

	489

	490 // Now we check whether \|header\| is a Content-Disposition header, and try

	491 // to extract name and possibly value from it.

	492 if (LookAhead(content_disposition_pattern_, header)) {

	493 re2::StringPiece groups[2u];

	494

	495 if (!name_pattern_.Match(header,

	496 content_disposition_value_offset, header.size(),

	497 RE2::UNANCHORED, groups, 2)) {

	498 state_ = STATE_ERROR;

	499 return true; // See (*) for why true.

	500 }

	501 name->set(groups[1].data(), groups[1].size());

	502

	503 if (!value_pattern_.Match(header,

	504 content_disposition_value_offset, header.size(),

	505 RE2::UNANCHORED, groups, 2))

	506 return true; // See (*) for why true.

	507 value->set(groups[1].data(), groups[1].size());

	508 *value_assigned = true;

	509 }

	510 return true;

	511 }

	512

	513 } // namespace extensions

OLD	NEW