Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3069)

Unified Diff: chrome/browser/extensions/api/web_request/form_data_parser.h

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Corrected the multipart parser + parsedForm->formData Created 8 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/extensions/api/web_request/form_data_parser.h
diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.h b/chrome/browser/extensions/api/web_request/form_data_parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..3de822e7307357899c6ff2621811c643f9db6eae
--- /dev/null
+++ b/chrome/browser/extensions/api/web_request/form_data_parser.h
@@ -0,0 +1,357 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
+#define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
+
+#include <string>
+#include <vector>
+
+#include "base/memory/scoped_ptr.h"
+// Cannot forward declare StringPiece because it is a typedef.
+#include "base/string_piece.h"
+
+namespace net {
+class URLRequest;
+}
+
+namespace extensions {
+
+// Interface for the form data parsers.
+class FormDataParser {
+ public:
+ class Result {
+ public:
+ Result();
+ ~Result();
+ const std::string& name() const {
+ return name_;
+ }
+ const std::string& value() const {
+ return value_;
+ }
+ void set_name(const base::StringPiece& str) {  // Stores a copy of |str|.
+ str.CopyToString(&name_);
+ }
+ void set_value(const base::StringPiece& str) {  // Stores a copy of |str|.
+ str.CopyToString(&value_);
+ }
+ void set_name(const std::string& str) {
+ name_ = str;
+ }
+ void set_value(const std::string& str) {
+ value_ = str;
+ }
+ void Reset();
+
battre 2012/08/16 19:18:03 nit: -1 new line
vabr (Chromium) 2012/08/17 18:29:57 Done.
+
+ private:
+ std::string name_;
+ std::string value_;
battre 2012/08/16 19:18:03 DISALLOW_COPY_AND_ASSIGN(Result); + #include "bas
vabr (Chromium) 2012/08/17 18:29:57 Done.
+ };
+
+ virtual ~FormDataParser();
+
+ // Creates a correct parser instance based on the |request|. Returns NULL
+ // on failure.
+ static scoped_ptr<FormDataParser> Create(const net::URLRequest* request);
+
+ // Creates a correct parser instance based on |content_type_header|, the
+ // "Content-Type" request header value. If |content_type_header| is NULL, it
+ // defaults to "application/x-www-form-urlencoded". Returns NULL on failure.
+ static scoped_ptr<FormDataParser> Create(
+ const std::string* content_type_header);
+
+ // Returns true if there was some data, it was well formed, and all was read.
+ virtual bool AllDataReadOK() = 0;
+
+ // Writes the next name-value pair into |result|. After SetSource has
+ // succeeded, calling this repeatedly iterates over all pairs in the source.
+ // Returns true as long as a new pair was successfully found.
+ virtual bool GetNextNameValue(Result* result) = 0;
+
+ // Sets the |source| of the data to be parsed. The ownership is left with the
+ // caller and the source should live until |this| dies or |this->SetSource()|
+ // is called again, whichever comes sooner. Returns true on success.
+ virtual bool SetSource(const std::vector<char>* source) = 0;
+
+ protected:
+ FormDataParser();
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(FormDataParser);
+};
+
+// Parses URLencoded forms, see
+// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
+class FormDataParserUrlEncoded : public FormDataParser {
+ public:
+ FormDataParserUrlEncoded();
+ virtual ~FormDataParserUrlEncoded();
+
+ // Implementation of FormDataParser.
+ virtual bool AllDataReadOK() OVERRIDE;
+ virtual bool GetNextNameValue(Result* result) OVERRIDE;
+ virtual bool SetSource(const std::vector<char>* source) OVERRIDE;
+
+ private:
+ // Gets the next char from |source_|, seeks, and keeps count of '=' and '&'.
+ // Returns false if the end of |source_| was reached, otherwise true.
+ bool GetNextChar(char* c);
+ // Once called, the parser gives up and treats any results so far as invalid.
+ void Abort();
+
+ const std::vector<char>* source_;  // Not owned; see FormDataParser::SetSource.
+ bool aborted_;
+
+ // Variables from this block are only to be written to by GetNextChar.
+ std::vector<char>::const_iterator offset_; // Next char to be read.
+ size_t equality_signs_; // How many '=' were read so far.
+ size_t amp_signs_; // How many '&' were read so far.
+ bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')?
+
+ DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
+};
+
+// The following class, FormDataParserMultipart, parses forms encoded as
+// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
+// encoding) and 822 (MIME-headers).
+//
+// Implementation details
+//
+// The original grammar from RFC 2046 is this, "multipart-body" being the root
+// non-terminal:
+//
+// boundary := 0*69<bchars> bcharsnospace
+// bchars := bcharsnospace / " "
+// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
+// / "-" / "." / "/" / ":" / "=" / "?"
+// dash-boundary := "--" boundary
+// multipart-body := [preamble CRLF]
+// dash-boundary transport-padding CRLF
+// body-part *encapsulation
+// close-delimiter transport-padding
+// [CRLF epilogue]
+// transport-padding := *LWSP-char
+// encapsulation := delimiter transport-padding CRLF body-part
+// delimiter := CRLF dash-boundary
+// close-delimiter := delimiter "--"
+// preamble := discard-text
+// epilogue := discard-text
+// discard-text := *(*text CRLF) *text
+// body-part := MIME-part-headers [CRLF *OCTET]
+// OCTET := <any 0-255 octet value>
+//
+// Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters
+// of the English alphabet, respectively.
+// The non-terminal "text" is presumably just any text, excluding line breaks.
+// The non-terminal "LWSP-char" is not directly defined in the original grammar
+// but it means "linear whitespace", which is a space or a horizontal tab.
+// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is
+// defined in prose in RFC 822, and can be presented as follows:
+//
+// MIME-part-headers := *MIME-part-header
+// MIME-part-header := name ':' *(text / whitespace) linebreak
+// linebreak := '\r' / '\n' / CRLF
+// whitespace := LWSP-char / CRLF LWSP-char
+// name := namechar *namechar
+// namechar := <ASCII char between 33 and 126, excluding ':'>
+//
+// These sets of rules together compose a grammar, with the root non-terminal
+// "multipart-body". This grammar defines a regular language. Indeed, if the
+// non-terminals are ordered in this way:
+// namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace <
+// linebreak < MIME-part-header < MIME-part-headers < bcharsnospace <
+// bchars < boundary < dash-boundary < delimiter < close-delimiter <
+// discard-text < transport-padding < OCTET < body-part < encapsulation <
+// multipart-body
+// then it is easy to verify that whenever A<B then no grammar rule with head
+// A contains B in the body. By induction on the above order, each non-terminal
+// defines a regular language: a non-terminal C is defined by a rule C := exp,
+// where "exp" is an expression composed from character constants, non-terminals
+// less than C, and the following closure operations of regular languages:
+// concatenation, union and Kleene-star. By induction, all the lesser
+// non-terminals represent regular languages, thus "exp" also represents a
+// regular language. In particular, the root non-terminal (and thus the grammar)
+// defines a regular language.
+//
+// The FormDataParserMultipart class uses a finite automaton to represent this
+// language. It is easiest to view it in an extended form, with longer words
+// allowed to label a single transition to keep the number of states low.
+// Important states have full-word names, unimportant states (always with only
+// one incoming label) have names abbreviating the incoming label, possibly
+// with an index.
+//
+// Automaton for "multipart-body":
vabr (Chromium) 2012/08/16 08:00:59 An alternative to hand-writing the automaton would
+// Initial state = Start
+// Final states = {End, IgnoreEpilogue}
+// Implicit state (when a transition is missing) = Error
+// Transition table ('*' is a label matching everything not matched by other
+// labels leaving the same state):
+// FROM LABEL TO
+// Start dash-boundary DB1
+// CR CR1
+// * IgnorePreamble
+// CR1 LF Start
+// * IgnorePreamble
+// IgnorePreamble CR CR1
+// * IgnorePreamble
+// DB1 LWSP-char DB1
+// CR CR2
+// CR2 LF Part
+// Part <ASCII 33-126, excluding ':'> Name
+// CR CR3
+// Name <ASCII 33-126, excluding ':'> Name
+// ':' Colon
+// Colon LF End1
+// CR End2
+// * Colon
+// End1 CR CR3
+// <ASCII 33-126, excluding ':'> Name
+// End2 LF End3
+// CR CR3
+// <ASCII 33-126, excluding ':'> Name
+// End3 LWSP-char Colon
+// CR CR3
+// <ASCII 33-126, excluding ':'> Name
+// CR3 LF PreData
+// PreData dash-boundary DB2
+// CR CR4
+// * Data
+// CR4 LF Data2
+// * Data
+// Data CR CR4
+// * Data
+// Data2 dash-boundary DB2
+// * CR4
+// DB2 LWSP-char DB1
+// CR CR2
+// '-' D
+// D '-' End
+// End LWSP-char End
+// CR CR5
+// CR5 LF IgnoreEpilogue
+// IgnoreEpilogue * IgnoreEpilogue
+//
+// The automaton itself only allows checking that the input is a well-formed
+// multipart encoding of a form. To also extract the data, additional logic is
+// added:
+// * The header "Content-Disposition" (read between Part and PreData) contains
+// the elements name=... and optionally filename=... The former is the name
+// of the corresponding field of a form. The latter is only present if that
+// field was a file-upload, and contains the path to the uploaded file.
+// * The data of a message part is read between PreData and DB2, excluding the
+// last CR LF dash-boundary.
+//
+// IMPORTANT NOTE
+// This parser supports multiple sources, i.e., SetSource can be called multiple
+// times if the input is spread over several byte vectors. However, the split
+// must not occur in the middle of a transition of the automaton described
+// above, e.g., if there is a transition StateA --dash-boundary--> StateB, then
+// the whole string with the dash-boundary must be contained in the first
+// source, or in the other. Also, the split must not occur in the middle of a
+// header, or of a part's body data. A message part from one source must be
+// read via GetNextNameValue before setting up a new source.
+class FormDataParserMultipart : public FormDataParser {
+ public:
+ explicit FormDataParserMultipart(const std::string& boundary_separator);
+ virtual ~FormDataParserMultipart();
+
+ // Implementation of FormDataParser.
+ virtual bool AllDataReadOK() OVERRIDE;
+ virtual bool GetNextNameValue(Result* result) OVERRIDE;
+ virtual bool SetSource(const std::vector<char>* source) OVERRIDE;
+
+ private:
+ // State and Transition are numbered to make sure they form a continuous block
+ // of numbers for array indexing in lookup tables. If changing State or
+ // Transition, don't forget to update k*Size and the lookup tables.
+ enum State {
+ kStart = 0,
+ kCR1 = 1,
+ kIgnorePreamble = 2,
+ kDB1 = 3,
+ kCR2 = 4,
+ kPart = 5,
+ kName = 6,
+ kColonS = 7, // "S" to distinguish it from the transition kColonT.
+ kEnd1 = 8,
+ kEnd2 = 9,
+ kEnd3 = 10,
+ kCR3 = 11,
+ kPreData = 12,
+ kCR4 = 13,
+ kData = 14,
+ kData2 = 15,
+ kDB2 = 16,
+ kD = 17,
+ kEnd = 18,
+ kCR5 = 19,
+ kIgnoreEpilogue = 20,
+ kError = 21
+ };
+ enum Transition {
+ kLF = 0,
+ kCR = 1,
+ kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'.
+ kLwsp = 3,
+ kDashBoundary = 4,
+ kColonT = 5, // "T" to distinguish it from the state kColonS.
+ kDash = 6, // Meaning '-', not "--".
+ kAny = 7 // To represent '*'.
+ };
+ static const size_t kStateSize = 22;
+ static const size_t kTransitionSize = 8;
+
+ // Lookup tables:
+ // Maps transitions with one-character label to that character (else to 0).
+ static char kTransitionToChar[];
+ // Indices of transitions available in state |s| in |kAvailableTransitions|
+ // start at kStateToTransition[s] and the last transition for |s| is always
+ // kAny. The target state corresponding to transition kAvailableTransitions[i]
+ // is kNextState[i].
+ static Transition kAvailableTransitions[];
+ static State kNextState[];
+ static size_t kStateToTransition[];
+
+ // Reads the source until the next name-value pair is read. Returns true if
+ // |next_name_| and |next_value_| were successfully updated.
+ bool ReadNextNameValue();
+ // One step of the automaton, based on |state_| and the input from |source_|
+ // to be read. Updates the |offset_| iterator. Returns true on success.
+ bool DoStep();
+ // Tests whether the input pointed to by |offset_| allows reading transition
+ // |t|. Returns the number of bytes to be read, or 0 if |t| cannot be read.
+ size_t LookUp(Transition t);
+
+ // Extracts "name" and possibly "value" from a Content-Disposition header.
+ // Writes directly into |next_name_| and |next_value_|. Returns true on
+ // success and false otherwise.
+ bool ParseHeader(const base::StringPiece& header);
+
+ bool InFinalState() {  // True iff |state_| is kEnd or kIgnoreEpilogue.
+ return state_ == kEnd || state_ == kIgnoreEpilogue;
+ }
+
+ // The parsed message can be split into multiple sources which we read
+ // sequentially.
+ const std::vector<char>* source_;  // Not owned; see FormDataParser::SetSource.
+ std::vector<char>::const_iterator offset_;
+ // The dash-boundary string is used for all sources.
+ const std::string dash_boundary_;
+ State state_;
+ // The next result to be returned by GetNextNameValue. It is stored as a pair
+ // of StringPieces instead of a Result, to avoid one copy of the data (note
+ // that Result stores a copy of the data in std::string, whereas StringPiece
+ // is just a pointer to source_).
+ base::StringPiece next_name_;
+ base::StringPiece next_value_;
+ bool value_name_present_;  // NOTE(review): undocumented; meaning to confirm.
+
+ DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
+};
+
+} // namespace extensions
+
+#endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_

Powered by Google App Engine
This is Rietveld 408576698