| Index: chrome/browser/extensions/api/web_request/form_data_parser.h
|
| diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.h b/chrome/browser/extensions/api/web_request/form_data_parser.h
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..e7807e8676622983dcc2387e0fb924c34bb8cc8b
|
| --- /dev/null
|
| +++ b/chrome/browser/extensions/api/web_request/form_data_parser.h
|
| @@ -0,0 +1,359 @@
|
| +// Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
|
| +#define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
|
| +
|
| +#include <string>
|
| +#include <vector>
|
| +
|
| +#include "base/basictypes.h"
|
| +#include "base/memory/scoped_ptr.h"
|
| +// Cannot forward declare StringPiece because it is a typedef.
|
| +#include "base/string_piece.h"
|
| +
|
| +namespace net {
|
| +class URLRequest;
|
| +}
|
| +
|
| +namespace extensions {
|
| +
|
| +// Abstract interface shared by all form data parsers.
|
| +class FormDataParser {
|
| + public:
|
| + class Result { // Holds its own copy of one parsed name-value pair.
|
| + public:
|
| + Result();
|
| + ~Result();
|
| + const std::string& name() const {
|
| + return name_;
|
| + }
|
| + const std::string& value() const {
|
| + return value_;
|
| + }
|
| + void set_name(const base::StringPiece& str) {
|
| + str.CopyToString(&name_);
|
| + }
|
| + void set_value(const base::StringPiece& str) {
|
| + str.CopyToString(&value_);
|
| + }
|
| + void set_name(const std::string& str) {
|
| + name_ = str;
|
| + }
|
| + void set_value(const std::string& str) {
|
| + value_ = str;
|
| + }
|
| + void Reset();
|
| +
|
| + private:
|
| + std::string name_;
|
| + std::string value_;
|
| +
|
| + DISALLOW_COPY_AND_ASSIGN(Result);
|
| + };
|
| +
|
| + virtual ~FormDataParser();
|
| +
|
| + // Creates a parser instance appropriate for |request|. Returns NULL
|
| + // on failure.
|
| + static scoped_ptr<FormDataParser> Create(const net::URLRequest* request);
|
| +
|
| + // Creates a correct parser instance based on |content_type_header|, the
|
| + // "Content-Type" request header value. If |content_type_header| is NULL, it
|
| + // defaults to "application/x-www-form-urlencoded". Returns NULL on failure.
|
| + static scoped_ptr<FormDataParser> Create(
|
| + const std::string* content_type_header);
|
| +
|
| + // Returns true if there was some data, it was well formed and all was read.
|
| + virtual bool AllDataReadOK() = 0;
|
| +
|
| + // Stores the next name-value pair in |result|. After SetSource has
|
| + // succeeded, repeated calls iterate over all pairs in the source.
|
| + // Returns true as long as a new pair was successfully found.
|
| + virtual bool GetNextNameValue(Result* result) = 0;
|
| +
|
| + // Sets the |source| of the data to be parsed. The ownership is left with the
|
| + // caller and the source should live until |this| dies or |this->SetSource()|
|
| + // is called again, whichever comes sooner. Returns true on success.
|
| + virtual bool SetSource(const std::vector<char>* source) = 0;
|
| +
|
| + protected:
|
| + FormDataParser();
|
| +
|
| + private:
|
| + DISALLOW_COPY_AND_ASSIGN(FormDataParser);
|
| +};
|
| +
|
| +// Parses URLencoded forms, see
|
| +// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
|
| +class FormDataParserUrlEncoded : public FormDataParser {
|
| + public:
|
| + FormDataParserUrlEncoded();
|
| + virtual ~FormDataParserUrlEncoded();
|
| +
|
| + // Implementation of FormDataParser.
|
| + virtual bool AllDataReadOK() OVERRIDE;
|
| + virtual bool GetNextNameValue(Result* result) OVERRIDE;
|
| + virtual bool SetSource(const std::vector<char>* source) OVERRIDE;
|
| +
|
| + private:
|
| + // Gets the next char from |source_|, seeks, and keeps count of '=' and '&'.
|
| + // Returns false if end of |source_| was reached, otherwise true.
|
| + bool GetNextChar(char* c);
|
| + // Once called, the parser gives up and treats any results so far as invalid.
|
| + void Abort();
|
| +
|
| + const std::vector<char>* source_; // Not owned (see SetSource()).
|
| + bool aborted_; // Presumably set by Abort() -- confirm in the .cc file.
|
| +
|
| + // Variables from this block are only to be written to by GetNextChar.
|
| + std::vector<char>::const_iterator offset_; // Next char to be read.
|
| + size_t equality_signs_; // How many '=' were read so far.
|
| + size_t amp_signs_; // How many '&' were read so far.
|
| + bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')?
|
| +
|
| + DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
|
| +};
|
| +
|
| +// The following class, FormDataParserMultipart, parses forms encoded as
|
| +// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
|
| +// encoding) and 822 (MIME-headers).
|
| +//
|
| +// Implementation details
|
| +//
|
| +// The original grammar from RFC 2046 is this, "multipart-body" being the root
|
| +// non-terminal:
|
| +//
|
| +// boundary := 0*69<bchars> bcharsnospace
|
| +// bchars := bcharsnospace / " "
|
| +// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
|
| +// / "-" / "." / "/" / ":" / "=" / "?"
|
| +// dash-boundary := "--" boundary
|
| +// multipart-body := [preamble CRLF]
|
| +// dash-boundary transport-padding CRLF
|
| +// body-part *encapsulation
|
| +// close-delimiter transport-padding
|
| +// [CRLF epilogue]
|
| +// transport-padding := *LWSP-char
|
| +// encapsulation := delimiter transport-padding CRLF body-part
|
| +// delimiter := CRLF dash-boundary
|
| +// close-delimiter := delimiter "--"
|
| +// preamble := discard-text
|
| +// epilogue := discard-text
|
| +// discard-text := *(*text CRLF) *text
|
| +// body-part := MIME-part-headers [CRLF *OCTET]
|
| +// OCTET := <any 0-255 octet value>
|
| +//
|
| +// Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters
|
| +// of the English alphabet, respectively.
|
| +// The non-terminal "text" is presumably just any text, excluding line breaks.
|
| +// The non-terminal "LWSP-char" is not directly defined in the original grammar
|
| +// but it means "linear whitespace", which is a space or a horizontal tab.
|
| +// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is
|
| +// defined in English in RFC 822, and can be presented as follows:
|
| +//
|
| +// MIME-part-headers := *MIME-part-header
|
| +// MIME-part-header := name ':' *(text / whitespace) linebreak
|
| +// linebreak := '\r' / '\n' / CRLF
|
| +// whitespace := LWSP-char / CRLF LWSP-char
|
| +// name := namechar *namechar
|
| +// namechar := <ASCII char between 33 and 126, excluding ':'>
|
| +//
|
| +// These rules together compose a grammar, with the root non-terminal
|
| +// "multipart-body". This grammar defines a regular language. Indeed, if the
|
| +// non-terminals are ordered in this way:
|
| +// namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace <
|
| +// linebreak < MIME-part-header < MIME-part-headers < bcharsnospace <
|
| +// bchars < boundary < dash-boundary < delimiter < close-delimiter <
|
| +// discard-text < transport-padding < OCTET < body-part < encapsulation <
|
| +// multipart-body
|
| +// then it is easy to verify that whenever A<B then no grammar rule with head
|
| +// A contains B in the body. By induction on the above order, each non-terminal
|
| +// defines a regular language: a non-terminal C is defined by a rule C := exp,
|
| +// where "exp" is an expression composed from character constants, non-terminals
|
| +// less than C, and the following closure operations of regular languages:
|
| +// concatenation, union and Kleene-star. By induction, all the lesser
|
| +// non-terminals represent regular languages, thus "exp" also represents a
|
| +// regular language. In particular, the root non-terminal (and thus the grammar)
|
| +// defines a regular language.
|
| +//
|
| +// The FormDataParserMultipart class uses a finite automaton to represent this
|
| +// language. It is easiest to view it in an extended form, with longer words
|
| +// allowed to label a single transition to keep the number of states low.
|
| +// Important states have full-word names, unimportant states (always with only
|
| +// one incoming label) have names abbreviating the incoming label, possibly
|
| +// with an index.
|
| +//
|
| +// Automaton for "multipart-body":
|
| +// Initial state = Start
|
| +// Final states = {End, IgnoreEpilogue}
|
| +// Implicit state (when a transition is missing) = Error
|
| +// Transition table ('*' is a label matching everything not matched by other
|
| +// labels leaving the same state):
|
| +// FROM LABEL TO
|
| +// Start dash-boundary DB1
|
| +// CR CR1
|
| +// * IgnorePreamble
|
| +// CR1 LF Start
|
| +// * IgnorePreamble
|
| +// IgnorePreamble CR CR1
|
| +// * IgnorePreamble
|
| +// DB1 LWSP-char DB1
|
| +// CR CR2
|
| +// CR2 LF Part
|
| +// Part <ASCII 33-126, excluding ':'> Name
|
| +// CR CR3
|
| +// Name <ASCII 33-126, excluding ':'> Name
|
| +// ':' Colon
|
| +// Colon LF End1
|
| +// CR End2
|
| +// * Colon
|
| +// End1 CR CR3
|
| +// <ASCII 33-126, excluding ':'> Name
|
| +// End2 LF End3
|
| +// CR CR3
|
| +// <ASCII 33-126, excluding ':'> Name
|
| +// End3 LWSP-char Colon
|
| +// CR CR3
|
| +// <ASCII 33-126, excluding ':'> Name
|
| +// CR3 LF PreData
|
| +// PreData dash-boundary DB2
|
| +// CR CR4
|
| +// * Data
|
| +// CR4 LF Data2
|
| +// * Data
|
| +// Data CR CR4
|
| +// * Data
|
| +// Data2 dash-boundary DB2
|
| +// * CR4
|
| +// DB2 LWSP-char DB1
|
| +// CR CR2
|
| +// '-' D
|
| +// D '-' End
|
| +// End LWSP-char End
|
| +// CR CR5
|
| +// CR5 LF IgnoreEpilogue
|
| +// IgnoreEpilogue * IgnoreEpilogue
|
| +//
|
| +// The automaton itself only allows checking that the input is a well-formed
|
| +// multipart encoding of a form. To also extract the data, additional logic is
|
| +// added:
|
| +// * The header "Content-Disposition" (read between Part and PreData) contains
|
| +// the elements name=... and optionally filename=... The former is the name
|
| +// of the corresponding field of a form. The latter is only present if that
|
| +// field was a file-upload, and contains the path to the uploaded file.
|
| +// * The data of a message part is read between PreData and DB2, excluding the
|
| +// last CR LF dash-boundary.
|
| +//
|
| +// IMPORTANT NOTE
|
| +// This parser supports multiple sources, i.e., SetSource can be called multiple
|
| +// times if the input is spread over several byte vectors. However, the split
|
| +// must not occur in the middle of a transition of the above described automaton,
|
| +// e.g., if there is a transition StateA --dash-boundary--> StateB, then the
|
| +// whole string with the dash-boundary must be contained in the first source,
|
| +// or in the other. Also, the split must not occur in the middle of a header,
|
| +// or a part body data. A message part from one source must be read via
|
| +// GetNextNameValue before setting up a new source.
|
| +class FormDataParserMultipart : public FormDataParser {
|
| + public:
|
| + explicit FormDataParserMultipart(const std::string& boundary_separator);
|
| + virtual ~FormDataParserMultipart();
|
| +
|
| + // Implementation of FormDataParser.
|
| + virtual bool AllDataReadOK() OVERRIDE;
|
| + virtual bool GetNextNameValue(Result* result) OVERRIDE;
|
| + virtual bool SetSource(const std::vector<char>* source) OVERRIDE;
|
| +
|
| + private:
|
| + // State and Transition are numbered to make sure they form a continuous block
|
| + // of numbers for array indexing in lookup tables. If changing State or
|
| + // Transition, don't forget to update k*Size and the lookup tables.
|
| + enum State {
|
| + kStart = 0,
|
| + kCR1 = 1,
|
| + kIgnorePreamble = 2,
|
| + kDB1 = 3,
|
| + kCR2 = 4,
|
| + kPart = 5,
|
| + kName = 6,
|
| + kColonS = 7, // "S" to distinguish it from the transition kColonT.
|
| + kEnd1 = 8,
|
| + kEnd2 = 9,
|
| + kEnd3 = 10,
|
| + kCR3 = 11,
|
| + kPreData = 12,
|
| + kCR4 = 13,
|
| + kData = 14,
|
| + kData2 = 15,
|
| + kDB2 = 16,
|
| + kD = 17,
|
| + kEnd = 18,
|
| + kCR5 = 19,
|
| + kIgnoreEpilogue = 20,
|
| + kError = 21
|
| + };
|
| + enum Transition {
|
| + kLF = 0,
|
| + kCR = 1,
|
| + kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'.
|
| + kLwsp = 3,
|
| + kDashBoundary = 4,
|
| + kColonT = 5, // "T" to distinguish it from the state kColonS.
|
| + kDash = 6, // Meaning '-', not "--".
|
| + kAny = 7 // To represent '*'.
|
| + };
|
| + static const size_t kStateSize = 22;
|
| + static const size_t kTransitionSize = 8;
|
| +
|
| + // Lookup tables:
|
| + // Maps transitions with one-character label to that character (else to 0).
|
| + static char kTransitionToChar[];
|
| + // Indices of transitions available in state |s| in |kAvailableTransitions|
|
| + // start at kStateToTransition[s] and the last transition for |s| is always
|
| + // kAny. The target state corresponding to transition kAvailableTransitions[i]
|
| + // is kNextState[i].
|
| + static Transition kAvailableTransitions[];
|
| + static State kNextState[];
|
| + static size_t kStateToTransition[];
|
| +
|
| + // Reads the source until the next name-value pair is read. Returns true if
|
| + // |next_name_| and |next_value_| were successfully updated.
|
| + bool ReadNextNameValue();
|
| + // One step of the automaton, based on |state_| and the input from |source_|
|
| + // to be read. Updates the |offset_| iterator. Returns true on success.
|
| + bool DoStep();
|
| + // Tests whether the input pointed to by |offset_| allows reading transition
|
| + // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read.
|
| + size_t LookUp(Transition t);
|
| +
|
| + // Extracts "name" and possibly "value" from a Content-Disposition header.
|
| + // Writes directly into |next_name_| and |next_value_|. Returns true on
|
| + // success and false otherwise.
|
| + bool ParseHeader(const base::StringPiece& header);
|
| +
|
| + bool InFinalState() { // True iff |state_| is a final automaton state.
|
| + return state_ == kEnd || state_ == kIgnoreEpilogue;
|
| + }
|
| +
|
| + // The parsed message can be split into multiple sources which we read
|
| + // sequentially.
|
| + const std::vector<char>* source_;
|
| + std::vector<char>::const_iterator offset_;
|
| + // The dash-boundary string is used for all sources.
|
| + const std::string dash_boundary_;
|
| + State state_;
|
| + // The next result to be returned by GetNextNameValue. It is stored as a pair
|
| + // of StringPieces instead of a Result, to avoid one copy of the data (note
|
| + // that Result stores a copy of the data in std::string, whereas StringPiece
|
| + // is just a pointer to source_).
|
| + base::StringPiece next_name_;
|
| + base::StringPiece next_value_;
|
| + bool value_name_present_; // NOTE(review): undocumented; confirm meaning in .cc.
|
| +
|
| + DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
|
| +};
|
| +
|
| +} // namespace extensions
|
| +
|
| +#endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
|
|
|