chrome/browser/extensions/api/web_request/form_data_parser.h - Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.h

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Corrected the multipart parser + parsedForm->formData Created 8 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_

	6 #define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_

	7

	8 #include <string>

	9 #include <vector>

	10

	11 #include "base/memory/scoped_ptr.h"

	12 // Cannot forward declare StringPiece because it is a typedef.

	13 #include "base/string_piece.h"

	14

	15 namespace net {

	16 class URLRequest;

	17 }

	18

	19 namespace extensions {

	20

	21 // Abstract interface shared by all form data parsers.
	// Instances are obtained through the static Create() factories (the
	// constructor is protected). A caller hands over input with SetSource() and
	// then calls GetNextNameValue() repeatedly until it returns false.

	22 class FormDataParser {

	23 public:

	// One parsed name-value pair. Both strings are stored by value in
	// std::string members, so a Result does not point into the parsed source.
	24 class Result {

	25 public:

	26 Result();

	27 ~Result();

	// Returns a reference to the stored name.
	28 const std::string& name() const {

	29 return name_;

	30 }

	// Returns a reference to the stored value.
	31 const std::string& value() const {

	32 return value_;

	33 }

	// StringPiece overloads: copy the referenced bytes into the stored string.
	34 void set_name(const base::StringPiece& str) {

	35 str.CopyToString(&name_);

	36 }

	37 void set_value(const base::StringPiece& str) {

	38 str.CopyToString(&value_);

	39 }

	// std::string overloads: plain copy assignment into the stored string.
	40 void set_name(const std::string& str) {

	41 name_ = str;

	42 }

	43 void set_value(const std::string& str) {

	44 value_ = str;

	45 }

	// Defined outside this header.
	// NOTE(review): presumably clears both name and value -- confirm in the
	// implementation file.
	46 void Reset();

	47
	battre 2012/08/16 19:18:03 nit: -1 new line nit: -1 new line vabr (Chromium) 2012/08/17 18:29:57 Done. Show quoted text On 2012/08/16 19:18:03, battre wrote: > nit: -1 new line Done.
	48

	49 private:

	// Owned copies of the pair, written only through the setters above.
	50 std::string name_;

	51 std::string value_;
	battre 2012/08/16 19:18:03 DISALLOW_COPY_AND_ASSIGN(Result); + #include "bas DISALLOW_COPY_AND_ASSIGN(Result); + #include "base/basictypes.h" vabr (Chromium) 2012/08/17 18:29:57 Done. Show quoted text On 2012/08/16 19:18:03, battre wrote: > DISALLOW_COPY_AND_ASSIGN(Result); > > + #include "base/basictypes.h" Done.
	52 };

	53

	54 virtual ~FormDataParser();

	55

	56 // Creates a correct parser instance based on the \|request\|. Returns NULL

	57 // on failure.

	58 static scoped_ptr<FormDataParser> Create(const net::URLRequest* request);

	59

	60 // Creates a correct parser instance based on \|content_type_header\|, the

	61 // "Content-Type" request header value. If \|content_type_header\| is NULL, it

	62 // defaults to "application/x-www-form-urlencoded". Returns NULL on failure.

	63 static scoped_ptr<FormDataParser> Create(

	64 const std::string* content_type_header);

	65

	66 // Returns true if there was some data, it was well formed and all was read.

	67 virtual bool AllDataReadOK() = 0;

	68

	69 // Returns the next name-value pair as \|result\|. After SetSource has

	70 // succeeded, this allows to iterate over all pairs in the source.

	71 // Returns true as long as a new pair was successfully found.

	72 virtual bool GetNextNameValue(Result* result) = 0;

	73

	74 // Sets the \|source\| of the data to be parsed. The ownership is left with the

	75 // caller and the source should live until \|this\| dies or \|this->SetSource()\|

	76 // is called again, whichever comes sooner. Returns true on success.

	77 virtual bool SetSource(const std::vector<char>* source) = 0;

	78

	79 protected:

	80 FormDataParser();

	81

	82 private:

	83 DISALLOW_COPY_AND_ASSIGN(FormDataParser);

	84 };

	85

	86 // Parser for URL-encoded forms (name=value pairs joined by ampersands), see

	87 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
	// Only declarations are visible in this header; the parsing logic itself
	// lives in the implementation file.

	88 class FormDataParserUrlEncoded : public FormDataParser {

	89 public:

	90 FormDataParserUrlEncoded();

	91 virtual ~FormDataParserUrlEncoded();

	92

	93 // Implementation of FormDataParser.

	94 virtual bool AllDataReadOK() OVERRIDE;

	95 virtual bool GetNextNameValue(Result* result) OVERRIDE;

	96 virtual bool SetSource(const std::vector<char>* source) OVERRIDE;

	97

	98 private:

	99 // Gets next char from |source_|, seeks, and does book-keeping of = and &.

	100 // Returns false if end of |source_| was reached, otherwise true.
	// The character read is handed back through the out-parameter |c|.

	101 bool GetNextChar(char* c);

	102 // Once called the parser gives up and claims any results so far invalid.

	103 void Abort();

	104

	// Input currently being parsed. Not owned: the FormDataParser::SetSource
	// contract leaves ownership with the caller.
	105 const std::vector<char>* source_;

	// NOTE(review): presumably set by Abort() and consulted by the public
	// methods to report failure -- confirm in the implementation file.
	106 bool aborted_;

	107

	108 // Variables from this block are only to be written to by GetNextChar.

	109 std::vector<char>::const_iterator offset_; // Next char to be read.

	110 size_t equality_signs_; // How many '=' were read so far.

	111 size_t amp_signs_; // How many '&' were read so far.

	112 bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')?

	113

	114 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);

	115 };

	116

	117 // The following class, FormDataParserMultipart, parses forms encoded as

	118 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart

	119 // encoding) and 822 (MIME-headers).

	120 //

	121 // Implementation details

	122 //

	123 // The original grammar from RFC 2046 is this, "multipart-body" being the root

	124 // non-terminal:

	125 //

	126 // boundary := 0*69<bchars> bcharsnospace

	127 // bchars := bcharsnospace / " "

	128 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","

	129 // / "-" / "." / "/" / ":" / "=" / "?"

	130 // dash-boundary := "--" boundary

	131 // multipart-body := [preamble CRLF]

	132 // dash-boundary transport-padding CRLF

	133 // body-part *encapsulation

	134 // close-delimiter transport-padding

	135 // [CRLF epilogue]

	136 // transport-padding := *LWSP-char

	137 // encapsulation := delimiter transport-padding CRLF body-part

	138 // delimiter := CRLF dash-boundary

	139 // close-delimiter := delimiter "--"

	140 // preamble := discard-text

	141 // epilogue := discard-text

	142 // discard-text := (text CRLF) *text

	143 // body-part := MIME-part-headers [CRLF *OCTET]

	144 // OCTET := <any 0-255 octet value>

	145 //

	146 // Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters

	147 // of the English alphabet, respectively.

	148 // The non-terminal "text" is presumably just any text, excluding line breaks.

	149 // The non-terminal "LWSP-char" is not directly defined in the original grammar

	150 // but it means "linear whitespace", which is a space or a horizontal tab.

	151 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is in

	152 // English defined in RFC 822, and can be presented as follows:

	153 //

	154 // MIME-part-headers := *MIME-part-header

	155 // MIME-part-header := name ':' *(text / whitespace) linebreak

	156 // linebreak := '\r' / '\n' / CRLF

	157 // whitespace := LWSP-char / CRLF LWSP-char

	158 // name := namechar *namechar

	159 // namechar := <ASCII char between 33 and 126, excluding ':'>

	160 //

	161 // These sets of rules together compose a grammar, with the root non-terminal

	162 // "multipart-body". This grammar defines a regular language. Indeed, if the

	163 // non-terminals are ordered in this way:

	164 // namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace <

	165 // linebreak < MIME-part-header < MIME-part-headers < bcharsnospace <

	166 // bchars < boundary < dash-boundary < delimiter < close-delimiter <

	167 // discard-text < transport-padding < OCTET < body-part < encapsulation <

	168 // multipart-body

	169 // then it is easy to verify that whenever A<B then no grammar rule with head

	170 // A contains B in the body. By induction on the above order, each non-terminal

	171 // defines a regular language: a non-terminal C is defined by a rule C := exp,

	172 // where "exp" is an expression composed from character constants, non-terminals

	173 // less than C, and the following closure operations of regular languages:

	174 // concatenation, union and Kleene-star. By induction, all the lesser

	175 // non-terminals represent regular languages, thus "exp" also represents a

	176 // regular language. In particular, the root non-terminal (and thus the grammar)

	177 // defines a regular language.

	178 //

	179 // The FormDataParserMultipart class uses a finite automaton to represent this

	180 // language. It is easiest to view it in an extended form, with longer words

	181 // allowed to label a single transition to keep the number of states low.

	182 // Important states have full-word names, unimportant states (always with only

	183 // one incoming label) have names abbreviating the incoming label, possibly

	184 // with an index.

	185 //

	186 // Automaton for "multipart-body":
	vabr (Chromium) 2012/08/16 08:00:59 An alternative to hand-writing the automaton would An alternative to hand-writing the automaton would be to encode the "multipart-body" as a regular expression and use, e.g., RE2 library to parse it. Pros: * shorter and clearer code * easily checkable against errors in language specification * very probably faster, although not sure how significantly Cons: * expects the whole input string in a continuous segment of memory, whereas we have several displaced instances of vector<char>; we don't really want to copy them just to have them in one place, they can be big I decided the single "con" was heavier than the three "pros", but feel free to let me know if you think opposite and/or have suggestions at overcoming the "con".
	187 // Initial state = Start

	188 // Final states = {End, IgnoreEpilogue}

	189 // Implicit state (when a transition is missing) = Error

	190 // Transition table ('*' is a label matching everything not matched by other

	191 // labels leaving the same state):

	192 // FROM LABEL TO

	193 // Start dash-boundary DB1

	194 // CR CR1

	195 // * IgnorePreamble

	196 // CR1 LF Start

	197 // * IgnorePreamble

	198 // IgnorePreamble CR CR1

	199 // * IgnorePreamble

	200 // DB1 LWSP-char DB1

	201 // CR CR2

	202 // CR2 LF Part

	203 // Part <ASCII 33-126, excluding ':'> Name

	204 // CR CR3

	205 // Name <ASCII 33-126, excluding ':'> Name

	206 // ':' Colon

	207 // Colon LF End1

	208 // CR End2

	209 // * Colon

	210 // End1 CR CR3

	211 // <ASCII 33-126, excluding ':'> Name

	212 // End2 LF End3

	213 // CR CR3

	214 // <ASCII 33-126, excluding ':'> Name

	215 // End3 LWSP-char Colon

	216 // CR CR3

	217 // <ASCII 33-126, excluding ':'> Name

	218 // CR3 LF PreData

	219 // PreData dash-boundary DB2

	220 // CR CR4

	221 // * Data

	222 // CR4 LF Data2

	223 // * Data

	224 // Data CR CR4

	225 // * Data

	226 // Data2 dash-boundary DB2

	227 // * CR4

	228 // DB2 LWSP-char DB1

	229 // CR CR2

	230 // '-' D

	231 // D '-' End

	232 // End LWSP-char End

	233 // CR CR5

	234 // CR5 LF IgnoreEpilogue

	235 // IgnoreEpilogue * IgnoreEpilogue

	236 //

	237 // The automaton itself only allows to check that the input is a well-formed

	238 // multipart encoding of a form. To also extract the data, additional logic is

	239 // added:

	240 // * The header "Content-Disposition" (read between Part and PreData) contains

	241 // the elements name=... and optionally filename=... The former is the name

	242 // of the corresponding field of a form. The latter is only present if that

	243 // field was a file-upload, and contains the path to the uploaded file.

	244 // * The data of a message part is read between PreData and DB2, excluding the

	245 // last CR LF dash-boundary.

	246 //

	247 // IMPORTANT NOTE

	248 // This parser supports multiple sources, i.e., SetSource can be called multiple

	249 // times if the input is spread over several byte vectors. However, the split

	250 // must not occur in the middle of a transition of the above-described automaton,

	251 // e.g., if there is a transition StateA --dash-boundary--> StateB, then the

	252 // whole string with the dash-boundary must be contained in the first source,

	253 // or in the other. Also, the split must not occur in the middle of a header,

	254 // or a part body data. A message part from one source must be read via

	255 // GetNextNameValue before setting up a new source.

	// Parser for multipart-encoded form data. It walks the input with the finite
	// automaton documented in the long comment preceding this class; the State
	// and Transition enums below mirror the names used in that transition table.
	256 class FormDataParserMultipart : public FormDataParser {

	257 public:

	// NOTE(review): |boundary_separator| is presumably the boundary parameter
	// taken from the Content-Type header and the basis for |dash_boundary_| --
	// confirm in the implementation file.
	258 explicit FormDataParserMultipart(const std::string& boundary_separator);

	259 virtual ~FormDataParserMultipart();

	260

	261 // Implementation of FormDataParser.

	262 virtual bool AllDataReadOK() OVERRIDE;

	263 virtual bool GetNextNameValue(Result* result) OVERRIDE;

	264 virtual bool SetSource(const std::vector<char>* source) OVERRIDE;

	265

	266 private:

	267 // State and Transition are numbered to make sure they form a continuous block

	268 // of numbers for array indexing in lookup tables. If changing State or

	269 // Transition, don't forget to update k*Size and the lookup tables.

	// States of the automaton. kStart is its initial state, and kError is the
	// implicit state entered when no transition is available.
	270 enum State {

	271 kStart = 0,

	272 kCR1 = 1,

	273 kIgnorePreamble = 2,

	274 kDB1 = 3,

	275 kCR2 = 4,

	276 kPart = 5,

	277 kName = 6,

	278 kColonS = 7, // "S" to distinguish it from the transition kColonT.

	279 kEnd1 = 8,

	280 kEnd2 = 9,

	281 kEnd3 = 10,

	282 kCR3 = 11,

	283 kPreData = 12,

	284 kCR4 = 13,

	285 kData = 14,

	286 kData2 = 15,

	287 kDB2 = 16,

	288 kD = 17,

	289 kEnd = 18,

	290 kCR5 = 19,

	291 kIgnoreEpilogue = 20,

	292 kError = 21

	293 };

	// Labels of the automaton transitions.
	294 enum Transition {

	295 kLF = 0,

	296 kCR = 1,

	297 kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'.

	298 kLwsp = 3,

	299 kDashBoundary = 4,

	300 kColonT = 5, // "T" to distinguish it from the state kColonS.

	301 kDash = 6, // Meaning '-', not "--".

	302 kAny = 7 // To represent '*'.

	303 };

	// Number of enumerators in State (kStart through kError).
	304 static const size_t kStateSize = 22;

	// Number of enumerators in Transition (kLF through kAny).
	305 static const size_t kTransitionSize = 8;

	306

	307 // Lookup tables:
	// The tables are only declared here; their contents are defined elsewhere.
	// NOTE(review): they are declared non-const. If nothing writes to them
	// after initialization they could be made const -- confirm against the
	// implementation file.

	308 // Maps transitions with one-character label to that character (else to 0).

	309 static char kTransitionToChar[];

	310 // Indices of transitions available in state |s| in |kAvailableTransitions|

	311 // start at kStateToTransition[s] and the last transition for |s| is always

	312 // kAny. The target state corresponding to transition kAvailableTransitions[i]

	313 // is kNextState[i].

	314 static Transition kAvailableTransitions[];

	315 static State kNextState[];

	316 static size_t kStateToTransition[];

	317

	318 // Reads the source until the next name-value pair is read. Returns true if

	319 // |next_name_| and |next_value_| were successfully updated.

	320 bool ReadNextNameValue();

	321 // One step of the automaton, based on |state_| and the input from |source_|

	322 // to be read. Updates the |offset_| iterator. Returns true on success.

	323 bool DoStep();

	324 // Tests whether the input pointed to by |offset_| allows to read transition

	325 // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read.

	326 size_t LookUp(Transition t);

	327

	328 // Extracts "name" and possibly "value" from a Content-Disposition header.

	329 // Writes directly into |next_name_| and |next_value_|. Returns true on

	330 // success and false otherwise.

	331 bool ParseHeader(const base::StringPiece& header);

	332

	// Returns true when |state_| is one of the two final (accepting) states of
	// the automaton, kEnd or kIgnoreEpilogue. Only reads |state_|.
	333 bool InFinalState() {

	334 return state_ == kEnd \|\| state_ == kIgnoreEpilogue;

	335 }

	336

	337 // The parsed message can be split into multiple sources which we read

	338 // sequentially.
	// |source_| is the source currently being read; it is not owned (ownership
	// stays with the caller of SetSource, per the FormDataParser interface).

	339 const std::vector<char>* source_;

	// Read position, advanced by DoStep().
	340 std::vector<char>::const_iterator offset_;

	341 // The dash-boundary string is used for all sources.

	342 const std::string dash_boundary_;

	// Current state of the automaton.
	343 State state_;

	344 // The next result to be returned by GetNextNameValue. It is stored as a pair

	345 // of StringPieces instead of a Result, to avoid one copy of the data (note

	346 // that Result stores a copy of the data in std::string, whereas StringPiece

	347 // is just a pointer to source_).

	348 base::StringPiece next_name_;

	349 base::StringPiece next_value_;

	// NOTE(review): the meaning of this flag is not evident from the header;
	// presumably it records whether the current part header supplied a value
	// (for example a filename) -- confirm in the implementation file.
	350 bool value_name_present_;

	351

	352 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);

	353 };

	354

	355 } // namespace extensions

	356

	357 #endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_

OLD	NEW