OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ |
| 6 #define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ |
| 7 |
| 8 #include <string> |
| 9 #include <vector> |
| 10 |
| 11 #include "base/basictypes.h" |
| 12 #include "base/memory/scoped_ptr.h" |
| 13 // Cannot forward declare StringPiece because it is a typedef. |
| 14 #include "base/string_piece.h" |
| 15 |
| 16 namespace net { |
| 17 class URLRequest; |
| 18 } |
| 19 |
| 20 namespace extensions { |
| 21 |
| 22 // Interface for the form data parsers. |
// Typical use: obtain a concrete parser from one of the Create() factories,
// hand it the raw bytes with SetSource(), then call GetNextNameValue()
// repeatedly until it returns false.
| 23 class FormDataParser { |
| 24 public: |
// One parsed form field: a name together with its value. Both are held as
// std::string, i.e. as copies of whatever was passed to the setters.
| 25 class Result { |
| 26 public: |
| 27 Result(); |
| 28 ~Result(); |
// Read-only access to the stored field name.
| 29 const std::string& name() const { |
| 30 return name_; |
| 31 } |
// Read-only access to the stored field value.
| 32 const std::string& value() const { |
| 33 return value_; |
| 34 } |
// Setters taking a StringPiece copy the referenced characters into the
// corresponding std::string member.
// NOTE(review): if base::StringPiece is implicitly constructible from
// const char*, passing a string literal would be ambiguous between these
// overloads and the std::string ones below - confirm.
| 35 void set_name(const base::StringPiece& str) { |
| 36 str.CopyToString(&name_); |
| 37 } |
| 38 void set_value(const base::StringPiece& str) { |
| 39 str.CopyToString(&value_); |
| 40 } |
// Setters taking a std::string assign it to the corresponding member.
| 41 void set_name(const std::string& str) { |
| 42 name_ = str; |
| 43 } |
| 44 void set_value(const std::string& str) { |
| 45 value_ = str; |
| 46 } |
// Defined out of line. Presumably clears both |name_| and |value_| - confirm
// against the implementation file.
| 47 void Reset(); |
| 48 |
| 49 private: |
| 50 std::string name_; |
| 51 std::string value_; |
| 52 |
| 53 DISALLOW_COPY_AND_ASSIGN(Result); |
| 54 }; |
| 55 |
| 56 virtual ~FormDataParser(); |
| 57 |
| 58 // Creates a correct parser instance based on the |request|. Returns NULL |
| 59 // on failure. |
| 60 static scoped_ptr<FormDataParser> Create(const net::URLRequest* request); |
| 61 |
| 62 // Creates a correct parser instance based on |content_type_header|, the |
| 63 // "Content-Type" request header value. If |content_type_header| is NULL, it |
| 64 // defaults to "application/x-www-form-urlencoded". Returns NULL on failure. |
| 65 static scoped_ptr<FormDataParser> Create( |
| 66 const std::string* content_type_header); |
| 67 |
| 68 // Returns true if there was some data, it was well formed and all was read. |
| 69 virtual bool AllDataReadOK() = 0; |
| 70 |
| 71 // Returns the next name-value pair as |result|. After SetSource has |
| 72 // succeeded, this allows to iterate over all pairs in the source. |
| 73 // Returns true as long as a new pair was successfully found. |
| 74 virtual bool GetNextNameValue(Result* result) = 0; |
| 75 |
| 76 // Sets the |source| of the data to be parsed. The ownership is left with the |
| 77 // caller and the source should live until |this| dies or |this->SetSource()| |
| 78 // is called again, whichever comes sooner. Returns true on success. |
| 79 virtual bool SetSource(const std::vector<char>* source) = 0; |
| 80 |
| 81 protected: |
// The constructor is protected, so only subclasses can invoke it; other code
// goes through the Create() factories or a concrete subclass.
| 82 FormDataParser(); |
| 83 |
| 84 private: |
| 85 DISALLOW_COPY_AND_ASSIGN(FormDataParser); |
| 86 }; |
| 87 |
| 88 // Parses URLencoded forms, see |
| 89 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . |
// The bytes to parse are supplied through SetSource(); per the contract on
// FormDataParser::SetSource they remain owned by the caller.
| 90 class FormDataParserUrlEncoded : public FormDataParser { |
| 91 public: |
| 92 FormDataParserUrlEncoded(); |
| 93 virtual ~FormDataParserUrlEncoded(); |
| 94 |
| 95 // Implementation of FormDataParser. |
| 96 virtual bool AllDataReadOK() OVERRIDE; |
| 97 virtual bool GetNextNameValue(Result* result) OVERRIDE; |
| 98 virtual bool SetSource(const std::vector<char>* source) OVERRIDE; |
| 99 |
| 100 private: |
| 101 // Gets next char from |source_|, seeks, and does book-keeping of = and &. |
| 102 // Returns false if end of |source_| was reached, otherwise true. |
| 103 bool GetNextChar(char* c); |
| 104 // Once called the parser gives up and claims any results so far invalid. |
| 105 void Abort(); |
| 106 |
// Data currently being parsed; not owned by the parser.
| 107 const std::vector<char>* source_; |
// NOTE(review): presumably set by Abort() and consulted when reporting
// results - confirm against the implementation file.
| 108 bool aborted_; |
| 109 |
| 110 // Variables from this block are only to be written to by GetNextChar. |
| 111 std::vector<char>::const_iterator offset_; // Next char to be read. |
| 112 size_t equality_signs_; // How many '=' were read so far. |
| 113 size_t amp_signs_; // How many '&' were read so far. |
| 114 bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')? |
| 115 |
| 116 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); |
| 117 }; |
| 118 |
| 119 // The following class, FormDataParserMultipart, parses forms encoded as |
| 120 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart |
| 121 // encoding) and 822 (MIME-headers). |
| 122 // |
| 123 // Implementation details |
| 124 // |
| 125 // The original grammar from RFC 2046 is this, "multipart-body" being the root |
| 126 // non-terminal: |
| 127 // |
| 128 // boundary := 0*69<bchars> bcharsnospace |
| 129 // bchars := bcharsnospace / " " |
| 130 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," |
| 131 // / "-" / "." / "/" / ":" / "=" / "?" |
| 132 // dash-boundary := "--" boundary |
| 133 // multipart-body := [preamble CRLF] |
| 134 // dash-boundary transport-padding CRLF |
| 135 // body-part *encapsulation |
| 136 // close-delimiter transport-padding |
| 137 // [CRLF epilogue] |
| 138 // transport-padding := *LWSP-char |
| 139 // encapsulation := delimiter transport-padding CRLF body-part |
| 140 // delimiter := CRLF dash-boundary |
| 141 // close-delimiter := delimiter "--" |
| 142 // preamble := discard-text |
| 143 // epilogue := discard-text |
| 144 // discard-text := *(*text CRLF) *text |
| 145 // body-part := MIME-part-headers [CRLF *OCTET] |
| 146 // OCTET := <any 0-255 octet value> |
| 147 // |
| 148 // Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters |
| 149 // of the English alphabet, respectively. |
| 150 // The non-terminal "text" is presumably just any text, excluding line breaks. |
| 151 // The non-terminal "LWSP-char" is not directly defined in the original grammar |
| 152 // but it means "linear whitespace", which is a space or a horizontal tab. |
| 153 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is in |
| 154 // English defined in RFC 822, and can be presented as follows: |
| 155 // |
| 156 // MIME-part-headers := *MIME-part-header |
| 157 // MIME-part-header := name ':' *(text / whitespace) linebreak |
| 158 // linebreak := '\r' / '\n' / CRLF |
| 159 // whitespace := LWSP-char / CRLF LWSP-char |
| 160 // name := namechar *namechar |
| 161 // namechar := <ASCII char between 33 and 126, excluding ':'> |
| 162 // |
| 163 // These sets of rules together compose a grammar, with the root non-terminal |
| 164 // "multipart-body". This grammar defines a regular language. Indeed, if the |
| 165 // non-terminals are ordered in this way: |
| 166 // namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace < |
| 167 // linebreak < MIME-part-header < MIME-part-headers < bcharsnospace < |
| 168 // bchars < boundary < dash-boundary < delimiter < close-delimiter < |
| 169 // discard-text < transport-padding < OCTET < body-part < encapsulation < |
| 170 // multipart-body |
| 171 // then it is easy to verify that whenever A<B then no grammar rule with head |
| 172 // A contains B in the body. By induction on the above order, each non-terminal |
| 173 // defines a regular language: a non-terminal C is defined by a rule C := exp, |
| 174 // where "exp" is an expression composed from character constants, non-terminals |
| 175 // less than C, and the following closure operations of regular languages: |
| 176 // concatenation, union and Kleene-star. By induction, all the lesser |
| 177 // non-terminals represent regular languages, thus "exp" also represents a |
| 178 // regular language. In particular, the root non-terminal (and thus the grammar) |
| 179 // defines a regular language. |
| 180 // |
| 181 // The FormDataParserMultipart class uses a finite automaton to represent this |
| 182 // language. It is easiest to view it in an extended form, with longer words |
| 183 // allowed to label a single transition to keep the number of states low. |
| 184 // Important states have full-word names, unimportant states (always with only |
| 185 // one incoming label) have names abbreviating the incoming label, possibly |
| 186 // with an index. |
| 187 // |
| 188 // Automaton for "multipart-body": |
| 189 // Initial state = Start |
| 190 // Final states = {End, IgnoreEpilogue} |
| 191 // Implicit state (when a transition is missing) = Error |
| 192 // Transition table ('*' is a label matching everything not matched by other |
| 193 // labels leaving the same state): |
| 194 // FROM LABEL TO |
| 195 // Start dash-boundary DB1 |
| 196 // CR CR1 |
| 197 // * IgnorePreamble |
| 198 // CR1 LF Start |
| 199 // * IgnorePreamble |
| 200 // IgnorePreamble CR CR1 |
| 201 // * IgnorePreamble |
| 202 // DB1 LWSP-char DB1 |
| 203 // CR CR2 |
| 204 // CR2 LF Part |
| 205 // Part <ASCII 33-126, excluding ':'> Name |
| 206 // CR CR3 |
| 207 // Name <ASCII 33-126, excluding ':'> Name |
| 208 // ':' Colon |
| 209 // Colon LF End1 |
| 210 // CR End2 |
| 211 // * Colon |
| 212 // End1 CR CR3 |
| 213 // <ASCII 33-126, excluding ':'> Name |
| 214 // End2 LF End3 |
| 215 // CR CR3 |
| 216 // <ASCII 33-126, excluding ':'> Name |
| 217 // End3 LWSP-char Colon |
| 218 // CR CR3 |
| 219 // <ASCII 33-126, excluding ':'> Name |
| 220 // CR3 LF PreData |
| 221 // PreData dash-boundary DB2 |
| 222 // CR CR4 |
| 223 // * Data |
| 224 // CR4 LF Data2 |
| 225 // * Data |
| 226 // Data CR CR4 |
| 227 // * Data |
| 228 // Data2 dash-boundary DB2 |
| 229 // * CR4 |
| 230 // DB2 LWSP-char DB1 |
| 231 // CR CR2 |
| 232 // '-' D |
| 233 // D '-' End |
| 234 // End LWSP-char End |
| 235 // CR CR5 |
| 236 // CR5 LF IgnoreEpilogue |
| 237 // IgnoreEpilogue * IgnoreEpilogue |
| 238 // |
| 239 // The automaton itself only allows to check that the input is a well-formed |
| 240 // multipart encoding of a form. To also extract the data, additional logic is |
| 241 // added: |
| 242 // * The header "Content-Disposition" (read between Part and PreData) contains |
| 243 // the elements name=... and optionally filename=... The former is the name |
| 244 // of the corresponding field of a form. The latter is only present if that |
| 245 // field was a file-upload, and contains the path to the uploaded file. |
| 246 // * The data of a message part is read between PreData and DB2, excluding the |
| 247 // last CR LF dash-boundary. |
| 248 // |
| 249 // IMPORTANT NOTE |
| 250 // This parser supports multiple sources, i.e., SetSource can be called multiple |
| 251 // times if the input is spread over several byte vectors. However, the split |
| 252 // must not occur in the middle of a transition of the above described automaton, |
| 253 // e.g., if there is a transition StateA --dash-boundary--> StateB, then the |
| 254 // whole string with the dash-boundary must be contained in the first source, |
| 255 // or in the other. Also, the split must not occur in the middle of a header, |
| 256 // or a part body data. A message part from one source must be read via |
| 257 // GetNextNameValue before setting up a new source. |
// Parser for multipart-encoded forms, driven by the finite automaton that is
// described in the comment preceding this class.
| 258 class FormDataParserMultipart : public FormDataParser { |
| 259 public: |
// NOTE(review): |boundary_separator| is presumably the boundary string taken
// from the Content-Type header, from which |dash_boundary_| is built - confirm
// against the implementation file.
| 260 explicit FormDataParserMultipart(const std::string& boundary_separator); |
| 261 virtual ~FormDataParserMultipart(); |
| 262 |
| 263 // Implementation of FormDataParser. |
| 264 virtual bool AllDataReadOK() OVERRIDE; |
| 265 virtual bool GetNextNameValue(Result* result) OVERRIDE; |
| 266 virtual bool SetSource(const std::vector<char>* source) OVERRIDE; |
| 267 |
| 268 private: |
| 269 // State and Transition are numbered to make sure they form a continuous block |
| 270 // of numbers for array indexing in lookup tables. If changing State or |
| 271 // Transition, don't forget to update k*Size and the lookup tables. |
| 272 enum State { |
| 273 kStart = 0, |
| 274 kCR1 = 1, |
| 275 kIgnorePreamble = 2, |
| 276 kDB1 = 3, |
| 277 kCR2 = 4, |
| 278 kPart = 5, |
| 279 kName = 6, |
| 280 kColonS = 7, // "S" to distinguish it from the transition kColonT. |
| 281 kEnd1 = 8, |
| 282 kEnd2 = 9, |
| 283 kEnd3 = 10, |
| 284 kCR3 = 11, |
| 285 kPreData = 12, |
| 286 kCR4 = 13, |
| 287 kData = 14, |
| 288 kData2 = 15, |
| 289 kDB2 = 16, |
| 290 kD = 17, |
| 291 kEnd = 18, |
| 292 kCR5 = 19, |
| 293 kIgnoreEpilogue = 20, |
| 294 kError = 21 |
| 295 }; |
| 296 enum Transition { |
| 297 kLF = 0, |
| 298 kCR = 1, |
| 299 kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'. |
| 300 kLwsp = 3, |
| 301 kDashBoundary = 4, |
| 302 kColonT = 5, // "T" to distinguish it from the state kColonS. |
| 303 kDash = 6, // Meaning '-', not "--". |
| 304 kAny = 7 // To represent '*'. |
| 305 }; |
// Number of enumerators in State (kStart..kError) and in Transition
// (kLF..kAny), respectively; must be kept in sync with the enums by hand.
| 306 static const size_t kStateSize = 22; |
| 307 static const size_t kTransitionSize = 8; |
| 308 |
| 309 // Lookup tables: |
// NOTE(review): the tables below are declared as mutable statics. If they are
// never written after initialization they could be declared const - confirm
// against their definitions.
| 310 // Maps transitions with one-character label to that character (else to 0). |
| 311 static char kTransitionToChar[]; |
| 312 // Indices of transitions available in state |s| in |kAvailableTransitions| |
| 313 // start at kStateToTransition[s] and the last transition for |s| is always |
| 314 // kAny. The target state corresponding to transition kAvailableTransitions[i] |
| 315 // is kNextState[i]. |
| 316 static Transition kAvailableTransitions[]; |
| 317 static State kNextState[]; |
| 318 static size_t kStateToTransition[]; |
| 319 |
| 320 // Reads the source until the next name-value pair is read. Returns true if |
| 321 // |next_name_| and |next_value_| were successfully updated. |
| 322 bool ReadNextNameValue(); |
| 323 // One step of the automaton, based on |state_| and the input from |source_| |
| 324 // to be read. Updates the |offset_| iterator. Returns true on success. |
| 325 bool DoStep(); |
| 326 // Tests whether the input pointed to by |offset_| allows to read transition |
| 327 // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read. |
| 328 size_t LookUp(Transition t); |
| 329 |
| 330 // Extracts "name" and possibly "value" from a Content-Disposition header. |
| 331 // Writes directly into |next_name_| and |next_value_|. Returns true on |
| 332 // success and false otherwise. |
| 333 bool ParseHeader(const base::StringPiece& header); |
| 334 |
// Returns true when |state_| is one of the two accepting states of the
// automaton (kEnd or kIgnoreEpilogue). Only reads |state_|, so it could be
// const-qualified.
| 335 bool InFinalState() { |
| 336 return state_ == kEnd || state_ == kIgnoreEpilogue; |
| 337 } |
| 338 |
| 339 // The parsed message can be split into multiple sources which we read |
| 340 // sequentially. |
| 341 const std::vector<char>* source_; |
// Points at the input in |source_| that the automaton reads next.
| 342 std::vector<char>::const_iterator offset_; |
| 343 // The dash-boundary string is used for all sources. |
| 344 const std::string dash_boundary_; |
// Current state of the automaton.
| 345 State state_; |
| 346 // The next result to be returned by GetNextNameValue. It is stored as a pair |
| 347 // of StringPieces instead of a Result, to avoid one copy of the data (note |
| 348 // that Result stores a copy of the data in std::string, whereas StringPiece |
| 349 // is just a pointer to source_). |
| 350 base::StringPiece next_name_; |
| 351 base::StringPiece next_value_; |
// NOTE(review): undocumented in this header. The name suggests it records
// whether the Content-Disposition header already supplied the value - confirm
// against the implementation file.
| 352 bool value_name_present_; |
| 353 |
| 354 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); |
| 355 }; |
| 356 |
| 357 } // namespace extensions |
| 358 |
| 359 #endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ |
OLD | NEW |