Index: chrome/browser/extensions/api/web_request/form_data_parser.h |
diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.h b/chrome/browser/extensions/api/web_request/form_data_parser.h |
new file mode 100644 |
index 0000000000000000000000000000000000000000..5058cc412c3aec134a6b09d22da73b6031bac36d |
--- /dev/null |
+++ b/chrome/browser/extensions/api/web_request/form_data_parser.h |
@@ -0,0 +1,361 @@ |
+// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ |
+#define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ |
+ |
+#include <string> |
+#include <vector> |
+ |
+#include "base/basictypes.h" |
+#include "base/memory/scoped_ptr.h" |
+// Cannot forward declare StringPiece because it is a typedef. |
+#include "base/string_piece.h" |
+ |
+namespace net { |
+class URLRequest; |
+} |
+ |
+namespace extensions { |
+ |
+// Common interface of the form data parsers declared in this file. |
+class FormDataParser { |
+ public: |
+ class Result { |
+ public: |
+ Result(); |
+ ~Result(); |
+ const std::string& name() const { |
+ return name_; |
+ } |
+ const std::string& value() const { |
+ return value_; |
+ } |
+ void set_name(const base::StringPiece& str) { |
+ str.CopyToString(&name_); |
+ } |
+ void set_value(const base::StringPiece& str) { |
+ str.CopyToString(&value_); |
+ } |
+ void set_name(const std::string& str) { |
+ name_ = str; |
+ } |
+ void set_value(const std::string& str) { |
+ value_ = str; |
+ } |
+ void Reset(); |
+ |
+ private: |
+ std::string name_; |
+ std::string value_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(Result); |
+ }; |
+ |
+ virtual ~FormDataParser(); |
+ |
+ // Creates a parser instance appropriate for the |request|. Returns NULL |
+ // on failure. |
+ static scoped_ptr<FormDataParser> Create(const net::URLRequest* request); |
+ |
+ // Creates a parser instance appropriate for |content_type_header|, the |
+ // "Content-Type" request header value. If |content_type_header| is NULL, it |
+ // defaults to "application/x-www-form-urlencoded". Returns NULL on failure. |
+ static scoped_ptr<FormDataParser> Create( |
+ const std::string* content_type_header); |
+ |
+ // Returns true if there was some data, it was well formed and all was read. |
+ virtual bool AllDataReadOK() = 0; |
+ |
+ // Returns the next name-value pair in |result|. After SetSource has |
+ // succeeded, calling this repeatedly iterates over all pairs in the source. |
+ // Returns true as long as a new pair was successfully found. |
+ virtual bool GetNextNameValue(Result* result) = 0; |
+ |
+ // Sets the |source| of the data to be parsed. The ownership is left with the |
+ // caller and the source must live until |this| dies or |this->SetSource()| |
+ // is called again, whichever comes sooner. Returns true on success. |
+ virtual bool SetSource(const base::StringPiece& source) = 0; |
+ |
+ protected: |
+ FormDataParser(); |
+ |
+ private: |
+ DISALLOW_COPY_AND_ASSIGN(FormDataParser); |
+}; |
+ |
+// Parses URL-encoded forms, see |
+// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . |
+class FormDataParserUrlEncoded : public FormDataParser { |
tkent
2012/08/24 14:26:50
It seems this class is not referred by files other
vabr (Chromium)
2012/08/24 16:16:59
Done, thanks for spotting this.
|
+ public: |
+ FormDataParserUrlEncoded(); |
+ virtual ~FormDataParserUrlEncoded(); |
+ |
+ // Implementation of FormDataParser. |
+ virtual bool AllDataReadOK() OVERRIDE; |
+ virtual bool GetNextNameValue(Result* result) OVERRIDE; |
+ virtual bool SetSource(const base::StringPiece& source) OVERRIDE; |
+ |
+ private: |
+ // Gets the next char from |source_|, seeks, and keeps count of '=' and '&'. |
+ // Returns false if the end of |source_| was reached, otherwise true. |
+ bool GetNextChar(char* c); |
+ // Once called, the parser gives up and treats any results so far as invalid. |
+ void Abort(); |
+ |
+ base::StringPiece source_; |
+ const char* source_end_; |
+ bool aborted_; |
+ |
+ // Variables in this block must only be written to by GetNextChar. |
+ const char* offset_; // Next char to be read. |
+ size_t equality_signs_; // How many '=' were read so far. |
+ size_t amp_signs_; // How many '&' were read so far. |
+ bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')? |
+ |
+ DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); |
+}; |
+ |
+// The following class, FormDataParserMultipart, parses forms encoded as |
+// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart |
+// encoding) and 822 (MIME-headers). |
+// |
+// Implementation details |
+// |
+// The original grammar from RFC 2046 is this, "multipart-body" being the root |
+// non-terminal: |
+// |
+// boundary := 0*69<bchars> bcharsnospace |
+// bchars := bcharsnospace / " " |
+// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," |
+// / "-" / "." / "/" / ":" / "=" / "?" |
+// dash-boundary := "--" boundary |
+// multipart-body := [preamble CRLF] |
+// dash-boundary transport-padding CRLF |
+// body-part *encapsulation |
+// close-delimiter transport-padding |
+// [CRLF epilogue] |
+// transport-padding := *LWSP-char |
+// encapsulation := delimiter transport-padding CRLF body-part |
+// delimiter := CRLF dash-boundary |
+// close-delimiter := delimiter "--" |
+// preamble := discard-text |
+// epilogue := discard-text |
+// discard-text := *(*text CRLF) *text |
+// body-part := MIME-part-headers [CRLF *OCTET] |
+// OCTET := <any 0-255 octet value> |
+// |
+// Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters |
+// of the English alphabet, respectively. |
+// The non-terminal "text" is presumably just any text, excluding line breaks. |
+// The non-terminal "LWSP-char" is not directly defined in the original grammar |
+// but it means "linear whitespace", which is a space or a horizontal tab. |
+// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is in |
+// English defined in RFC 822, and can be presented as follows: |
+// |
+// MIME-part-headers := *MIME-part-header |
+// MIME-part-header := name ':' *(text / whitespace) linebreak |
+// linebreak := '\r' / '\n' / CRLF |
+// whitespace := LWSP-char / CRLF LWSP-char |
+// name := namechar *namechar |
+// namechar := <ASCII char between 33 and 126, excluding ':'> |
+// |
+// These sets of rules together compose a grammar, with the root non-terminal |
+// "multipart-body". This grammar defines a regular language. Indeed, if the |
+// non-terminals are ordered in this way: |
+// namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace < |
+// linebreak < MIME-part-header < MIME-part-headers < bcharsnospace < |
+// bchars < boundary < dash-boundary < delimiter < close-delimiter < |
+// discard-text < transport-padding < OCTET < body-part < encapsulation < |
+// multipart-body |
+// then it is easy to verify that whenever A<B then no grammar rule with head |
+// A contains B in the body. By induction on the above order, each non-terminal |
+// defines a regular language: a non-terminal C is defined by a rule C := exp, |
+// where "exp" is an expression composed from character constants, non-terminals |
+// less than C, and the following closure operations of regular languages: |
+// concatenation, union and Kleene-star. By induction, all the lesser |
+// non-terminals represent regular languages, thus "exp" also represents a |
+// regular language. In particular, the root non-terminal (and thus the grammar) |
+// defines a regular language. |
+// |
+// The FormDataParserMultipart class uses a finite automaton to represent this |
+// language. It is easiest to view it in an extended form, with longer words |
+// allowed to label a single transition to keep the number of states low. |
+// Important states have full-word names, unimportant states (always with only |
+// one incoming label) have names abbreviating the incoming label, possibly |
+// with an index. |
+// |
+// Automaton for "multipart-body": |
+// Initial state = Start |
+// Final states = {End, IgnoreEpilogue} |
+// Implicit state (when a transition is missing) = Error |
+// Transition table ('*' is a label matching everything not matched by other |
+// labels leaving the same state): |
+// FROM LABEL TO |
+// Start dash-boundary DB1 |
+// CR CR1 |
+// * IgnorePreamble |
+// CR1 LF Start |
+// * IgnorePreamble |
+// IgnorePreamble CR CR1 |
+// * IgnorePreamble |
+// DB1 LWSP-char DB1 |
+// CR CR2 |
+// CR2 LF Part |
+// Part <ASCII 33-126, excluding ':'> Name |
+// CR CR3 |
+// Name <ASCII 33-126, excluding ':'> Name |
+// ':' Colon |
+// Colon LF End1 |
+// CR End2 |
+// * Colon |
+// End1 CR CR3 |
+// <ASCII 33-126, excluding ':'> Name |
+// End2 LF End3 |
+// CR CR3 |
+// <ASCII 33-126, excluding ':'> Name |
+// End3 LWSP-char Colon |
+// CR CR3 |
+// <ASCII 33-126, excluding ':'> Name |
+// CR3 LF PreData |
+// PreData dash-boundary DB2 |
+// CR CR4 |
+// * Data |
+// CR4 LF Data2 |
+// * Data |
+// Data CR CR4 |
+// * Data |
+// Data2 dash-boundary DB2 |
+// * CR4 |
+// DB2 LWSP-char DB1 |
+// CR CR2 |
+// '-' D |
+// D '-' End |
+// End LWSP-char End |
+// CR CR5 |
+// CR5 LF IgnoreEpilogue |
+// IgnoreEpilogue * IgnoreEpilogue |
+// |
+// The automaton itself only allows checking that the input is a well-formed |
+// multipart encoding of a form. To also extract the data, additional logic is |
+// added: |
+// * The header "Content-Disposition" (read between Part and PreData) contains |
+// the elements name=... and optionally filename=... The former is the name |
+// of the corresponding field of a form. The latter is only present if that |
+// field was a file-upload, and contains the path to the uploaded file. |
+// * The data of a message part is read between PreData and DB2, excluding the |
+// last CR LF dash-boundary. |
+// |
+// IMPORTANT NOTE |
+// This parser supports multiple sources, i.e., SetSource can be called multiple |
+// times if the input is spread over several byte blocks. However, the split |
+// must not occur in the middle of a transition of the automaton described |
+// above, e.g., if there is a transition StateA --dash-boundary--> StateB, then |
+// the whole string with the dash-boundary must be contained in the first |
+// source, or in the other. Also, the split must not occur in the middle of a |
+// header or of a part's body data. A message part from one source must be |
+// read via GetNextNameValue before setting up a new source. |
+class FormDataParserMultipart : public FormDataParser { |
tkent
2012/08/24 14:26:50
ditto.
vabr (Chromium)
2012/08/24 16:16:59
Done.
|
+ public: |
+ explicit FormDataParserMultipart(const std::string& boundary_separator); |
+ virtual ~FormDataParserMultipart(); |
+ |
+ // Implementation of FormDataParser. |
+ virtual bool AllDataReadOK() OVERRIDE; |
+ virtual bool GetNextNameValue(Result* result) OVERRIDE; |
+ virtual bool SetSource(const base::StringPiece& source) OVERRIDE; |
+ |
+ private: |
+ // State and Transition are numbered to make sure they form a contiguous block |
+ // of numbers for array indexing in lookup tables. If changing State or |
+ // Transition, don't forget to update k*Size and the lookup tables. |
+ enum State { |
+ kStart = 0, |
+ kCR1 = 1, |
+ kIgnorePreamble = 2, |
+ kDB1 = 3, |
+ kCR2 = 4, |
+ kPart = 5, |
+ kName = 6, |
+ kColonS = 7, // "S" to distinguish it from the transition kColonT. |
+ kEnd1 = 8, |
+ kEnd2 = 9, |
+ kEnd3 = 10, |
+ kCR3 = 11, |
+ kPreData = 12, |
+ kCR4 = 13, |
+ kData = 14, |
+ kData2 = 15, |
+ kDB2 = 16, |
+ kD = 17, |
+ kEnd = 18, |
+ kCR5 = 19, |
+ kIgnoreEpilogue = 20, |
+ kError = 21 |
+ }; |
+ enum Transition { |
+ kLF = 0, |
+ kCR = 1, |
+ kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'. |
+ kLwsp = 3, |
+ kDashBoundary = 4, |
+ kColonT = 5, // "T" to distinguish it from the state kColonS. |
+ kDash = 6, // Meaning '-', not "--". |
+ kAny = 7 // To represent '*'. |
+ }; |
+ static const size_t kStateSize = 22; |
+ static const size_t kTransitionSize = 8; |
+ |
+ // Lookup tables: |
+ // Maps transitions with one-character label to that character (else to 0). |
+ static char kTransitionToChar[]; |
+ // Indices of transitions available in state |s| in |kAvailableTransitions| |
+ // start at kStateToTransition[s] and the last transition for |s| is always |
+ // kAny. The target state corresponding to transition kAvailableTransitions[i] |
+ // is kNextState[i]. |
+ static Transition kAvailableTransitions[]; |
+ static State kNextState[]; |
+ static size_t kStateToTransition[]; |
+ |
+ // Reads the source until the next name-value pair is read. Returns true if |
+ // |next_name_| and |next_value_| were successfully updated. |
+ bool ReadNextNameValue(); |
+ // One step of the automaton, based on |state_| and the input from |source_| |
+ // to be read. Updates the |offset_| iterator. Returns true on success. |
+ bool DoStep(); |
+ // Tests whether the input pointed to by |offset_| allows reading transition |
+ // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read. |
+ size_t LookUp(Transition t); |
+ |
+ // Extracts "name" and possibly "value" from a Content-Disposition header. |
+ // Writes directly into |next_name_| and |next_value_|. Returns true on |
+ // success and false otherwise. |
+ bool ParseHeader(const base::StringPiece& header); |
+ |
+ bool InFinalState() { |
+ return state_ == kEnd || state_ == kIgnoreEpilogue; |
+ } |
+ |
+ // The parsed message can be split into multiple sources which we read |
+ // sequentially. |
+ base::StringPiece source_; |
+ const char* source_end_; |
+ const char* offset_; |
+ // The dash-boundary string is used for all sources. |
+ const std::string dash_boundary_; |
+ State state_; |
+ // The next result to be returned by GetNextNameValue. It is stored as a pair |
+ // of StringPieces instead of a Result, to avoid one copy of the data (note |
+ // that Result stores a copy of the data in std::string, whereas StringPiece |
+ // is just a pointer to the data in |source_|). |
+ base::StringPiece next_name_; |
+ base::StringPiece next_value_; |
+ bool value_name_present_; |
+ |
+ DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); |
+}; |
+ |
+} // namespace extensions |
+ |
+#endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ |