chrome/browser/extensions/api/web_request/form_data_parser.cc - Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Introducing LazyInstance for "static" RE2 Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « chrome/browser/extensions/api/web_request/form_data_parser.h ('k') | chrome/browser/extensions/api/web_request/form_data_parser_unittest.cc » ('j') | chrome/common/extensions/api/web_request.json » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h"

	6

	7 #include <vector>

	8

	9 #include "base/lazy_instance.h"

	10 #include "base/string_util.h"

	11 #include "base/values.h"

	12 #include "net/base/escape.h"

	13 #include "net/url_request/url_request.h"

	14 #include "third_party/re2/re2/re2.h"

	15

	16 using base::DictionaryValue;

	17 using base::ListValue;

	18 using base::StringPiece;

	19 using re2::RE2;

	20

	21 namespace extensions {

	22

	23 namespace {

	24

	25 #define CONTENT_DISPOSITION "content-disposition:"

	26

	27 static const char g_escape_closing_quote[] = "\\\\E";

	28 static const size_t g_content_disposition_length =

	29 sizeof(CONTENT_DISPOSITION) - 1;

	30

	31 // A wrapper struct for static RE2 objects to be held as LazyInstance.

	32 struct Patterns {

	33 Patterns();

	34 ~Patterns();

	35 const RE2 transfer_padding_pattern_;

	36 const RE2 crlf_pattern_;

	37 const RE2 closing_pattern_;

	38 const RE2 epilogue_pattern_;

	39 const RE2 crlf_free_pattern_;

	40 const RE2 preamble_pattern_;

	41 const RE2 header_pattern_;

	42 const RE2 content_disposition_pattern_;

	43 const RE2 name_pattern_;

	44 const RE2 value_pattern_;

	45 const RE2 unquote_pattern_;

	46 const RE2 url_encoded_pattern_;

	47 };

	48

	49 Patterns::Patterns()

	50 : transfer_padding_pattern_("[ \\t]*\\r\\n"),

	51 crlf_pattern_("\\r\\n"),

	52 closing_pattern_("--[ \\t]*"),

	53 epilogue_pattern_("\|\\r\\n(?s:.)*"),

	54 crlf_free_pattern_("(?:[^\\r]\|\\r+[^\\r\\n])*"),

	55 preamble_pattern_(".*?"),

	56 header_pattern_("[!-9;-~]+:(.\|\\r\\n[\\t ])*\\r\\n"),

	57 content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"),

	58 name_pattern_("\\bname=\"([^\"]*)\""),

	59 value_pattern_("\\bfilename=\"([^\"]*)\""),

	60 unquote_pattern_(g_escape_closing_quote),

	61 url_encoded_pattern_("([^=])=([^&])&?") {}

	62

	63 #undef CONTENT_DISPOSITION

	64

	65 Patterns::~Patterns() {}

	66

	67 static base::LazyInstance<Patterns> g_patterns = LAZY_INSTANCE_INITIALIZER;

	68

	69 } // namespace

	70

	71 // Parses URLencoded forms, see

	72 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .

	73 class FormDataParserUrlEncoded : public FormDataParser {

	74 public:

	75 FormDataParserUrlEncoded();

	76 virtual ~FormDataParserUrlEncoded();

	77

	78 // Implementation of FormDataParser.

	79 virtual bool AllDataReadOK() OVERRIDE;

	80 virtual bool GetNextNameValue(Result* result) OVERRIDE;

	81 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

	82

	83 private:

	84 // The pattern to match a single name-value pair.

	85 const RE2& pattern() {

	86 return g_patterns.Get().url_encoded_pattern_;

	87 }

	88

	89 static const size_t args_size_ = 2u; // Auxiliary constant for using RE2.

	90 static const net::UnescapeRule::Type unescape_rules_;

	91

	92 re2::StringPiece source_;

	93 bool source_set_;

	94

	95 // Auxiliary store for using RE2.

	96 std::string name_;

	97 std::string value_;

	98 const RE2::Arg arg_name_;

	99 const RE2::Arg arg_value_;

	100 const RE2::Arg* args_[args_size_];

	101

	102 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);

	103 };

	104

	105 // The following class, FormDataParserMultipart, parses forms encoded as

	106 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart

	107 // encoding) and 5322 (MIME-headers).

	108 //

	109 // Implementation details

	110 //

	111 // The original grammar from RFC 2046 is this, "multipart-body" being the root

	112 // non-terminal:

	113 //

	114 // boundary := 0*69<bchars> bcharsnospace

	115 // bchars := bcharsnospace / " "

	116 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","

	117 // / "-" / "." / "/" / ":" / "=" / "?"

	118 // dash-boundary := "--" boundary

	119 // multipart-body := [preamble CRLF]

	120 // dash-boundary transport-padding CRLF

	121 // body-part *encapsulation

	122 // close-delimiter transport-padding

	123 // [CRLF epilogue]

	124 // transport-padding := *LWSP-char

	125 // encapsulation := delimiter transport-padding CRLF body-part

	126 // delimiter := CRLF dash-boundary

	127 // close-delimiter := delimiter "--"

	128 // preamble := discard-text

	129 // epilogue := discard-text

	130 // discard-text := (text CRLF) *text

	131 // body-part := MIME-part-headers [CRLF *OCTET]

	132 // OCTET := <any 0-255 octet value>

	133 //

	134 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,

	135 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the

	136 // English alphabet, respectively.

	137 // The non-terminal "text" is presumably just any text, excluding line breaks.

	138 // The non-terminal "LWSP-char" is not directly defined in the original grammar

	139 // but it means "linear whitespace", which is a space or a horizontal tab.

	140 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use

	141 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:

	142 //

	143 // MIME-part-headers := field-name ":" unstructured CRLF

	144 // field-name := 1*ftext

	145 // ftext := %d33-57 / ; Printable US-ASCII

	146 // %d59-126 ; characters not including ":".

	147 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which

	148 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and

	149 // "CRLF<horizontal tab>", which serve for "folding".

	150 //

	151 // The FormDataParserMultipart class reads the input source and tries to parse it

	152 // according to the grammar above, rooted at the "multipart-body" non-terminal.

	153 // This happens in stages:

	154 //

	155 // 1. The optional preamble and the initial dash-boundary with transport padding

	156 // and a CRLF are read and ignored.

	157 //

	158 // 2. Repeatedly each body part is read. The body parts can either serve to

	159 // upload a file, or just a string of bytes.

	160 // 2.a. The headers of that part are searched for the "content-disposition"

	161 // header, which contains the name of the value represented by that body

	162 // part. If the body-part is for file upload, that header also contains a

	163 // filename.

	164 // 2.b. The "*OCTET" part of the body part is then read and passed as the value

	165 // of the name-value pair for body parts representing a string of bytes.

	166 // For body parts for uploading a file the "*OCTET" part is just ignored

	167 // and the filename is used for value instead.

	168 //

	169 // 3. The final close-delimiter and epilogue are read and ignored.

	170 //

	171 // IMPORTANT NOTE

	172 // This parser supports multiple sources, i.e., SetSource can be called multiple

	173 // times if the input is spread over several byte blocks. However, the split

	174 // may only occur inside a body part, right after the trailing CRLF of headers.

	175 class FormDataParserMultipart : public FormDataParser {

	176 public:

	177 explicit FormDataParserMultipart(const std::string& boundary_separator);

	178 virtual ~FormDataParserMultipart();

	179

	180 // Implementation of FormDataParser.

	181 virtual bool AllDataReadOK() OVERRIDE;

	182 virtual bool GetNextNameValue(Result* result) OVERRIDE;

	183 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

	184

	185 private:

	186 enum State {

	187 STATE_INIT, // No input read yet.

	188 STATE_READY, // Ready to call GetNextNameValue.

	189 STATE_FINISHED, // Read the input until the end.

	190 STATE_SUSPEND, // Waiting until a new \|source_\| is set.

	191 STATE_ERROR

	192 };

	193

	194 // Produces a regexp to match the string "--" + \|literal\|.

	195 static std::string GetBoundaryPatternFromLiteral(const std::string& literal);

	196

	197 // Tests whether \|input\| has a prefix matching \|pattern\|.

	198 static bool LookAhead(const RE2& pattern, const re2::StringPiece& input);

	199

	200 // If source_ starts with a header, consumes it. If the header is

	201 // Content-Disposition, it also extracts \|name\| from "name=" and possibly

	202 // \|value\| from "filename=" fields of that header. It only touches \|name\| or

	203 // \|value\| if it finds the respective fields for them. Returns true if it

	204 // consumed a header, false if it did not. Sets \|value_assigned\| to true if it

	205 // has assigned to value, otherwise it sets it to false.

	206 bool TryReadHeader(base::StringPiece* name,

	207 base::StringPiece* value,

	208 bool* value_assigned);

	209

	210 // Helper to GetNextNameValue. Expects that the input starts with a data

	211 // portion of a body part. It then attempts to read the input until the end of

	212 // that body part. If \|data\| is not NULL, it sets it to contain the data

	213 // portion. Returns true when the reading was successful.

	214 bool FinishReadingPart(base::StringPiece* data);

	215

	216 static const RE2& transfer_padding_pattern() {

	217 return g_patterns.Get().transfer_padding_pattern_;

	218 }

	219 static const RE2& crlf_pattern() {

	220 return g_patterns.Get().crlf_pattern_;

	221 }

	222 static const RE2& closing_pattern() {

	223 return g_patterns.Get().closing_pattern_;

	224 }

	225 static const RE2& epilogue_pattern() {

	226 return g_patterns.Get().epilogue_pattern_;

	227 }

	228 static const RE2& crlf_free_pattern() {

	229 return g_patterns.Get().crlf_free_pattern_;

	230 }

	231 static const RE2& preamble_pattern() {

	232 return g_patterns.Get().preamble_pattern_;

	233 }

	234 static const RE2& header_pattern() {

	235 return g_patterns.Get().header_pattern_;

	236 }

	237 static const RE2& content_disposition_pattern() {

	238 return g_patterns.Get().content_disposition_pattern_;

	239 }

	240 static const RE2& name_pattern() {

	241 return g_patterns.Get().name_pattern_;

	242 }

	243 static const RE2& value_pattern() {

	244 return g_patterns.Get().value_pattern_;

	245 }

	246 static const RE2& unquote_pattern() {

	247 return g_patterns.Get().unquote_pattern_;

	248 }

	249

	250 const RE2 dash_boundary_pattern_;

	251

	252 // Because of initialisation dependency, \|state_\| needs to be declared after

	253 // \|dash_boundary_pattern_\|.

	254 State state_;

	255

	256 // The parsed message can be split into multiple sources which we read

	257 // sequentially.

	258 re2::StringPiece source_;

	259

	260 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);

	261 };

	262

	263 // Implementation of FormDataParser and FormDataParser::Result .

	264

	265 FormDataParser::Result::Result() {}

	266 FormDataParser::Result::~Result() {}

	267

	268 void FormDataParser::Result::Reset() {

	269 name_.erase();

	270 value_.erase();

	271 }

	272

	273 FormDataParser::~FormDataParser() {}

	274

	275 // static

	276 scoped_ptr<FormDataParser> FormDataParser::Create(

	277 const net::URLRequest* request) {

	278 std::string value;

	279 const bool found = request->extra_request_headers().GetHeader(

	280 net::HttpRequestHeaders::kContentType, &value);

	281 return Create(found ? &value : NULL);

	282 }

	283

	284 // static

	285 scoped_ptr<FormDataParser> FormDataParser::Create(

	286 const std::string* content_type_header) {

	287 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};

	288 ParserChoice choice = ERROR_CHOICE;

	289 std::string boundary;

	290

	291 if (content_type_header == NULL) {

	292 choice = URL_ENCODED;

	293 } else {

	294 const std::string content_type(

	295 content_type_header->substr(0, content_type_header->find(';')));

	296

	297 if (base::strcasecmp(

	298 content_type.c_str(), "application/x-www-form-urlencoded") == 0) {

	299 choice = URL_ENCODED;

	300 } else if (base::strcasecmp(

	301 content_type.c_str(), "multipart/form-data") == 0) {

	302 static const char kBoundaryString[] = "boundary=";

	303 size_t offset = content_type_header->find(kBoundaryString);

	304 if (offset == std::string::npos) {

	305 // Malformed header.

	306 return scoped_ptr<FormDataParser>();

	307 }

	308 offset += sizeof(kBoundaryString) - 1;

	309 boundary = content_type_header->substr(

	310 offset, content_type_header->find(';', offset));

	311 if (!boundary.empty())

	312 choice = MULTIPART;

	313 }

	314 }

	315 // Other cases are unparseable, including when \|content_type\| is "text/plain".

	316

	317 switch (choice) {

	318 case URL_ENCODED:

	319 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());

	320 case MULTIPART:

	321 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));

	322 default: // In other words, case ERROR_CHOICE:

	323 return scoped_ptr<FormDataParser>();

	324 }

	325 }

	326

	327 FormDataParser::FormDataParser() {}

	328

	329 // Implementation of FormDataParserUrlEncoded.

	330

	331 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =

	332 net::UnescapeRule::URL_SPECIAL_CHARS \| net::UnescapeRule::CONTROL_CHARS \|

	333 net::UnescapeRule::SPACES \| net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;

	334

	335 FormDataParserUrlEncoded::FormDataParserUrlEncoded()

	336 : source_(NULL),

	337 source_set_(false),

	338 arg_name_(&name_),

	339 arg_value_(&value_) {

	340 args_[0] = &arg_name_;

	341 args_[1] = &arg_value_;

	342 }

	343

	344 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}

	345

	346 bool FormDataParserUrlEncoded::AllDataReadOK() {

	347 // All OK means we read the whole source.

	348 return source_set_ && source_.size() == 0;

	349 }

	350

	351 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {

	352 if (!source_set_)

	353 return false;

	354

	355 bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_);

	356 if (success) {

	357 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));

	358 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));

	359 }

	360 return success;

	361 }

	362

	363 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {

	364 if (source_set_)

	365 return false; // We do not allow multiple sources for this parser.

	366 source_.set(source.data(), source.size());

	367 source_set_ = true;

	368 return true;

	369 }

	370

	371 // Implementation of FormDataParserMultipart.

	372

	373 // static

	374 std::string FormDataParserMultipart::GetBoundaryPatternFromLiteral(

	375 const std::string& literal) {

	376 #define OPEN_QUOTE "\\Q"

	377 static const char opening_quote[] = OPEN_QUOTE;

	378 static const char closing_quote[] = "\\E";

	379

	380 std::string output(OPEN_QUOTE "--"); // Let us start with the "--".

	381 #undef OPEN_QUOTE

	382 re2::StringPiece seek_unquote(literal);

	383 const char* copy_start = literal.data();

	384 size_t copy_length = literal.size();

	385 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) {

	386 copy_length = seek_unquote.data() - copy_start;

	387 output.append(copy_start, copy_length);

	388 output.append(g_escape_closing_quote);

	389 output.append(opening_quote);

	390 copy_start = seek_unquote.data();

	391 }

	392 copy_length = (literal.data() + literal.size()) - copy_start;

	393 output.append(copy_start, copy_length);

	394 output.append(closing_quote);

	395 return output;

	396 }

	397

	398 // static

	399 bool FormDataParserMultipart::LookAhead(const RE2& pattern,

	400 const re2::StringPiece& input) {

	401 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);

	402 }

	403

	404 FormDataParserMultipart::FormDataParserMultipart(

	405 const std::string& boundary_separator)

	406 : dash_boundary_pattern_(GetBoundaryPatternFromLiteral(boundary_separator)),

	407 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR) {}

	408

	409 FormDataParserMultipart::~FormDataParserMultipart() {}

	410

	411 bool FormDataParserMultipart::AllDataReadOK() {

	412 return state_ == STATE_FINISHED;

	413 }

	414

	415 bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) {

	416 const char* data_start = source_.data();

	417 while (!LookAhead(dash_boundary_pattern_, source_)) {

	418 if (!RE2::Consume(&source_, crlf_free_pattern()) \|\|

	419 !RE2::Consume(&source_, crlf_pattern())) {

	420 state_ = STATE_ERROR;

	421 return false;

	422 }

	423 }

	424 if (data != NULL) {

	425 if (source_.data() == data_start) {

	426 // No data in this body part.

	427 state_ = STATE_ERROR;

	428 return false;

	429 }

	430 // Subtract 2u for the trailing "\r\n".

	431 data->set(data_start, source_.data() - data_start - 2u);

	432 }

	433

	434 // Finally, read the dash-boundary and either skip to the next body part, or

	435 // finish reading the source.

	436 CHECK(RE2::Consume(&source_, dash_boundary_pattern_));

	437 if (LookAhead(closing_pattern(), source_)) {

	438 CHECK(RE2::Consume(&source_, closing_pattern()));

	439 if (RE2::Consume(&source_, epilogue_pattern()))

	440 state_ = STATE_FINISHED;

	441 else

	442 state_ = STATE_ERROR;

	443 } else { // Next body part ahead.

	444 if (!RE2::Consume(&source_, transfer_padding_pattern()))

	445 state_ = STATE_ERROR;

	446 }

	447 return state_ != STATE_ERROR;

	448 }

	449

	450 bool FormDataParserMultipart::GetNextNameValue(Result* result) {

	451 if (source_.size() == 0 \|\| state_ != STATE_READY)

	452 return false;

	453

	454 // 1. Read body-part headers.

	455 base::StringPiece name;

	456 base::StringPiece value;

	457 bool value_assigned = false;

	458 bool value_assigned_temp;

	459 while (TryReadHeader(&name, &value, &value_assigned_temp))

	460 value_assigned \|= value_assigned_temp;

	461 if (name.size() == 0) {

	462 state_ = STATE_ERROR;

	463 return false;

	464 }

	465

	466 // 2. Read the trailing CRLF after headers.

	467 if (!RE2::Consume(&source_, crlf_pattern())) {

	468 state_ = STATE_ERROR;

	469 return false;

	470 }

	471

	472 // 3. Read the data of this body part, i.e., everything until the first

	473 // dash-boundary.

	474 bool return_value = true;

	475 if (value_assigned && source_.size() == 0) // Wait for a new source?

	476 state_ = STATE_SUSPEND;

	477 else

	478 return_value = FinishReadingPart(value_assigned ? NULL : &value);

	479

	480 std::string unescaped_name = net::UnescapeURLComponent(

	481 name.as_string(),

	482 net::UnescapeRule::URL_SPECIAL_CHARS \| net::UnescapeRule::CONTROL_CHARS);

	483 result->set_name(unescaped_name);

	484 result->set_value(value);

	485

	486 return return_value;

	487 }

	488

	489 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {

	490 if (source.data() == NULL \|\| source_.size() != 0)

	491 return false;

	492 source_.set(source.data(), source.size());

	493

	494 switch (state_) {

	495 case STATE_INIT:

	496 // Seek behind the preamble.

	497 while (!LookAhead(dash_boundary_pattern_, source_)) {

	498 if (!RE2::Consume(&source_, preamble_pattern())) {

	499 state_ = STATE_ERROR;

	500 break;

	501 }

	502 }

	503 // Read dash-boundary, transfer padding, and CRLF.

	504 if (state_ != STATE_ERROR) {

	505 if (!RE2::Consume(&source_, dash_boundary_pattern_) \|\|

	506 !RE2::Consume(&source_, transfer_padding_pattern()))

	507 state_ = STATE_ERROR;

	508 else

	509 state_ = STATE_READY;

	510 }

	511 break;

	512 case STATE_READY: // Nothing to do.

	513 break;

	514 case STATE_SUSPEND:

	515 state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR;

	516 break;

	517 default:

	518 state_ = STATE_ERROR;

	519 }

	520 return state_ != STATE_ERROR;

	521 }

	522

	523 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,

	524 base::StringPiece* value,

	525 bool* value_assigned) {

	526 *value_assigned = false;

	527 const char* header_start = source_.data();

	528 if (!RE2::Consume(&source_, header_pattern()))

	529 return false;

	530 // (*) After this point we must return true, because we consumed one header.

	531

	532 // Subtract 2u for the trailing "\r\n".

	533 re2::StringPiece header(header_start, source_.data() - header_start - 2u);

	534

	535 // Now we check whether \|header\| is a Content-Disposition header, and try

	536 // to extract name and possibly value from it.

	537 if (LookAhead(content_disposition_pattern(), header)) {

	538 re2::StringPiece groups[2u];

	539

	540 if (!name_pattern().Match(header,

	541 g_content_disposition_length, header.size(),

	542 RE2::UNANCHORED, groups, 2)) {

	543 state_ = STATE_ERROR;

	544 return true; // See (*) for why true.

	545 }

	546 name->set(groups[1].data(), groups[1].size());

	547

	548 if (!value_pattern().Match(header,

	549 g_content_disposition_length, header.size(),

	550 RE2::UNANCHORED, groups, 2))

	551 return true; // See (*) for why true.

	552 value->set(groups[1].data(), groups[1].size());

	553 *value_assigned = true;

	554 }

	555 return true;

	556 }

	557

	558 } // namespace extensions

OLD	NEW