Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h" | |
| 6 | |
| 7 #include <vector> | |
| 8 | |
| 9 #include "base/string_util.h" | |
| 10 #include "base/values.h" | |
| 11 #include "net/base/escape.h" | |
| 12 #include "net/url_request/url_request.h" | |
| 13 | |
| 14 using base::DictionaryValue; | |
| 15 using base::ListValue; | |
| 16 using base::StringPiece; | |
| 17 | |
| 18 namespace extensions { | |
| 19 | |
| 20 // Parses URLencoded forms, see | |
| 21 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . | |
| 22 class FormDataParserUrlEncoded : public FormDataParser { | |
| 23 public: | |
| 24 FormDataParserUrlEncoded(); | |
| 25 virtual ~FormDataParserUrlEncoded(); | |
| 26 | |
| 27 // Implementation of FormDataParser. | |
| 28 virtual bool AllDataReadOK() OVERRIDE; | |
| 29 virtual bool GetNextNameValue(Result* result) OVERRIDE; | |
| 30 virtual bool SetSource(const base::StringPiece& source) OVERRIDE; | |
| 31 | |
| 32 private: | |
| 33 // Gets next char from |source_|, seeks, and does book-keeping of = and &. | |
| 34 // Returns false if end of |source_| was reached, otherwise true. | |
| 35 bool GetNextChar(char* c); | |
| 36 // Once called the parser gives up and claims any results so far invalid. | |
| 37 void Abort(); | |
| 38 | |
| 39 base::StringPiece source_; | |
| 40 const char* source_end_; | |
| 41 bool aborted_; | |
| 42 | |
| 43 // Variables from this block are only to be written to by GetNextChar. | |
| 44 const char* offset_; // Next char to be read. | |
| 45 size_t equality_signs_; // How many '=' were read so far. | |
| 46 size_t amp_signs_; // How many '&' were read so far. | |
| 47 bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')? | |
| 48 | |
| 49 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); | |
| 50 }; | |
| 51 | |
| 52 // The following class, FormDataParserMultipart, parses forms encoded as | |
| 53 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart | |
| 54 // encoding) and 822 (MIME-headers). | |
|
tkent
2012/08/27 07:09:17
Please do not refer to RFC 822, which was obsolete
vabr (Chromium)
2012/08/29 19:57:07
Done.
Thanks for making me aware of this.
| |
| 55 // | |
| 56 // Implementation details | |
| 57 // | |
| 58 // The original grammar from RFC 2046 is this, "multipart-body" being the root | |
| 59 // non-terminal: | |
| 60 // | |
| 61 // boundary := 0*69<bchars> bcharsnospace | |
| 62 // bchars := bcharsnospace / " " | |
| 63 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," | |
| 64 // / "-" / "." / "/" / ":" / "=" / "?" | |
| 65 // dash-boundary := "--" boundary | |
| 66 // multipart-body := [preamble CRLF] | |
| 67 // dash-boundary transport-padding CRLF | |
| 68 // body-part *encapsulation | |
| 69 // close-delimiter transport-padding | |
| 70 // [CRLF epilogue] | |
| 71 // transport-padding := *LWSP-char | |
| 72 // encapsulation := delimiter transport-padding CRLF body-part | |
| 73 // delimiter := CRLF dash-boundary | |
| 74 // close-delimiter := delimiter "--" | |
| 75 // preamble := discard-text | |
| 76 // epilogue := discard-text | |
| 77 // discard-text := *(*text CRLF) *text | |
| 78 // body-part := MIME-part-headers [CRLF *OCTET] | |
| 79 // OCTET := <any 0-255 octet value> | |
| 80 // | |
| 81 // Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters | |
| 82 // of the English alphabet, respectively. | |
| 83 // The non-terminal "text" is presumably just any text, excluding line breaks. | |
| 84 // The non-terminal "LWSP-char" is not directly defined in the original grammar | |
| 85 // but it means "linear whitespace", which is a space or a horizontal tab. | |
| 86 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is in | |
| 87 // English defined in RFC 822, and can be presented as follows: | |
| 88 // | |
| 89 // MIME-part-headers := *MIME-part-header | |
| 90 // MIME-part-header := name ':' *(text / whitespace) linebreak | |
| 91 // linebreak := '\r' / '\n' / CRLF | |
| 92 // whitespace := LWSP-char / CRLF LWSP-char | |
| 93 // name := namechar *namechar | |
| 94 // namechar := <ASCII char between 33 and 126, excluding ':'> | |
| 95 // | |
| 96 // These sets of rules together compose a grammar, with the root non-terminal | |
| 97 // "multipart-body". This grammar defines a regular language. Indeed, if the | |
| 98 // non-terminals are ordered in this way: | |
| 99 // namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace < | |
| 100 // linebreak < MIME-part-header < MIME-part-headers < bcharsnospace < | |
| 101 // bchars < boundary < dash-boundary < delimiter < close-delimiter < | |
| 102 // discard-text < transport-padding < OCTET < body-part < encapsulation < | |
| 103 // multipart-body | |
| 104 // then it is easy to verify that whenever A<B then no grammar rule with head | |
| 105 // A contains B in the body. By induction on the above order, each non-terminal | |
| 106 // defines a regular language: a non-terminal C is defined by a rule C := exp, | |
| 107 // where "exp" is an expression composed from character constants, non-terminals | |
| 108 // less than C, and the following closure operations of regular languages: | |
| 109 // concatenation, union and Kleene-star. By induction, all the lesser | |
| 110 // non-terminals represent regular languages, thus "exp" also represents a | |
| 111 // regular language. In particular, the root non-terminal (and thus the grammar) | |
| 112 // defines a regular language. | |
| 113 // | |
| 114 // The FormDataParserMultipart class uses a finite automaton to represent this | |
| 115 // language. It is easiest to view it in an extended form, with longer words | |
| 116 // allowed to label a single transition to keep the number of states low. | |
| 117 // Important states have full-word names, unimportant states (always with only | |
| 118 // one incoming label) have names abbreviating the incoming label, possibly | |
| 119 // with an index. | |
| 120 // | |
| 121 // Automaton for "multipart-body": | |
| 122 // Initial state = Start | |
| 123 // Final states = {End, IgnoreEpilogue} | |
| 124 // Implicit state (when a transition is missing) = Error | |
| 125 // Transition table ('*' is a label matching everything not matched by other | |
| 126 // labels leaving the same state): | |
| 127 // FROM LABEL TO | |
| 128 // Start dash-boundary DB1 | |
| 129 // CR CR1 | |
| 130 // * IgnorePreamble | |
| 131 // CR1 LF Start | |
| 132 // * IgnorePreamble | |
| 133 // IgnorePreamble CR CR1 | |
| 134 // * IgnorePreamble | |
| 135 // DB1 LWSP-char DB1 | |
| 136 // CR CR2 | |
| 137 // CR2 LF Part | |
| 138 // Part <ASCII 33-126, excluding ':'> Name | |
| 139 // CR CR3 | |
| 140 // Name <ASCII 33-126, excluding ':'> Name | |
| 141 // ':' Colon | |
| 142 // Colon LF End1 | |
| 143 // CR End2 | |
| 144 // * Colon | |
| 145 // End1 CR CR3 | |
| 146 // <ASCII 33-126, excluding ':'> Name | |
| 147 // End2 LF End3 | |
| 148 // CR CR3 | |
| 149 // <ASCII 33-126, excluding ':'> Name | |
| 150 // End3 LWSP-char Colon | |
| 151 // CR CR3 | |
| 152 // <ASCII 33-126, excluding ':'> Name | |
| 153 // CR3 LF PreData | |
| 154 // PreData dash-boundary DB2 | |
| 155 // CR CR4 | |
| 156 // * Data | |
| 157 // CR4 LF Data2 | |
| 158 // * Data | |
| 159 // Data CR CR4 | |
| 160 // * Data | |
| 161 // Data2 dash-boundary DB2 | |
| 162 // * CR4 | |
| 163 // DB2 LWSP-char DB1 | |
| 164 // CR CR2 | |
| 165 // '-' D | |
| 166 // D '-' End | |
| 167 // End LWSP-char End | |
| 168 // CR CR5 | |
| 169 // CR5 LF IgnoreEpilogue | |
| 170 // IgnoreEpilogue * IgnoreEpilogue | |
| 171 // | |
| 172 // The automaton itself only allows to check that the input is a well-formed | |
| 173 // multipart encoding of a form. To also extract the data, additional logic is | |
| 174 // added: | |
| 175 // * The header "Content-Disposition" (read between Part and PreData) contains | |
| 176 // the elements name=... and optionally filename=... The former is the name | |
| 177 // of the corresponding field of a form. The latter is only present if that | |
| 178 // field was a file-upload, and contains the path to the uploaded file. | |
| 179 // * The data of a message part is read between PreData and DB2, excluding the | |
| 180 // last CR LF dash-boundary. | |
| 181 // | |
| 182 // IMPORTANT NOTE | |
| 183 // This parser supports multiple sources, i.e., SetSource can be called multiple | |
| 184 // times if the input is spread over several byte blocks. However, the split | |
| 185 // must not occur in the middle of a transition of the above described automaton, | |
| 186 // e.g., if there is a transition StateA --dash-boundary--> StateB, then the | |
| 187 // whole string with the dash-boundary must be contained in the first source, | |
|
tkent
2012/08/27 07:09:17
bust -> must?
vabr (Chromium)
2012/08/29 19:57:07
Rewritten in the meantime.
| |
| 188 // or in the other. Also, the split must not occur in the middle of a header, | |
| 189 // or a part body data. A message part from one source must be read via | |
| 190 // GetNextNameValue before setting up a new source. | |
| 191 class FormDataParserMultipart : public FormDataParser { | |
| 192 public: | |
| 193 explicit FormDataParserMultipart(const std::string& boundary_separator); | |
| 194 virtual ~FormDataParserMultipart(); | |
| 195 | |
| 196 // Implementation of FormDataParser. | |
| 197 virtual bool AllDataReadOK() OVERRIDE; | |
| 198 virtual bool GetNextNameValue(Result* result) OVERRIDE; | |
| 199 virtual bool SetSource(const base::StringPiece& source) OVERRIDE; | |
| 200 | |
| 201 private: | |
| 202 // State and Transition are numbered to make sure they form a continuous block | |
| 203 // of numbers for array indexing in lookup tables. If changing State or | |
| 204 // Transition, don't forget to update k*Size and the lookup tables. | |
| 205 enum State { | |
| 206 kStart = 0, | |
|
tkent
2012/08/27 07:09:17
See http://www.chromium.org/developers/coding-styl
vabr (Chromium)
2012/08/29 19:57:07
Added STATE_ prefix to states, transitions disappe
| |
| 207 kCR1 = 1, | |
| 208 kIgnorePreamble = 2, | |
| 209 kDB1 = 3, | |
| 210 kCR2 = 4, | |
| 211 kPart = 5, | |
| 212 kName = 6, | |
| 213 kColonS = 7, // "S" to distinguish it from the transition kColonT. | |
| 214 kEnd1 = 8, | |
| 215 kEnd2 = 9, | |
| 216 kEnd3 = 10, | |
| 217 kCR3 = 11, | |
| 218 kPreData = 12, | |
| 219 kCR4 = 13, | |
| 220 kData = 14, | |
| 221 kData2 = 15, | |
| 222 kDB2 = 16, | |
| 223 kD = 17, | |
| 224 kEnd = 18, | |
| 225 kCR5 = 19, | |
| 226 kIgnoreEpilogue = 20, | |
| 227 kError = 21 | |
| 228 }; | |
| 229 enum Transition { | |
| 230 kLF = 0, | |
| 231 kCR = 1, | |
| 232 kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'. | |
| 233 kLwsp = 3, | |
| 234 kDashBoundary = 4, | |
| 235 kColonT = 5, // "T" to distinguish it from the state kColonS. | |
| 236 kDash = 6, // Meaning '-', not "--". | |
| 237 kAny = 7 // To represent '*'. | |
| 238 }; | |
| 239 static const size_t kStateSize = 22; | |
| 240 static const size_t kTransitionSize = 8; | |
| 241 | |
| 242 // Lookup tables: | |
| 243 // Maps transitions with one-character label to that character (else to 0). | |
| 244 static char kTransitionToChar[]; | |
| 245 // Indices of transitions available in state |s| in |kAvailableTransitions| | |
| 246 // start at kStateToTransition[s] and the last transition for |s| is always | |
| 247 // kAny. The target state corresponding to transition kAvailableTransitions[i] | |
| 248 // is kNextState[i]. | |
| 249 static Transition kAvailableTransitions[]; | |
| 250 static State kNextState[]; | |
| 251 static size_t kStateToTransition[]; | |
| 252 | |
| 253 // Reads the source until the next name-value pair is read. Returns true if | |
| 254 // |next_name_| and |next_value_| were successfully updated. | |
| 255 bool ReadNextNameValue(); | |
| 256 // One step of the automaton, based on |state_| and the input from |source_| | |
| 257 // to be read. Updates the |offset_| iterator. Returns true on success. | |
| 258 bool DoStep(); | |
| 259 // Tests whether the input pointed to by |offset_| allows to read transition | |
| 260 // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read. | |
| 261 size_t LookUp(Transition t); | |
| 262 | |
| 263 // Extracts "name" and possibly "value" from a Content-Disposition header. | |
| 264 // Writes directly into |next_name_| and |next_value_|. Returns true on | |
| 265 // success and false otherwise. | |
| 266 bool ParseHeader(const base::StringPiece& header); | |
| 267 | |
| 268 bool InFinalState() { | |
| 269 return state_ == kEnd || state_ == kIgnoreEpilogue; | |
| 270 } | |
| 271 | |
| 272 // The parsed message can be split into multiple sources which we read | |
| 273 // sequentially. | |
| 274 base::StringPiece source_; | |
| 275 const char* source_end_; | |
| 276 const char* offset_; | |
|
tkent
2012/08/27 07:09:17
The name "offset_" is confusing. It's not an offs
vabr (Chromium)
2012/08/29 19:57:07
You're right.
This disappeared after rewriting.
| |
| 277 // The dash-boundary string is used for all sources. | |
| 278 const std::string dash_boundary_; | |
| 279 State state_; | |
| 280 // The next result to be returned by GetNextNameValue. It is stored as a pair | |
| 281 // of StringPieces instead of a Result, to avoid one copy of the data (note | |
| 282 // that Result stores a copy of the data in std::string, whereas StringPiece | |
| 283 // is just a pointer to the data in |source_|). | |
| 284 base::StringPiece next_name_; | |
| 285 base::StringPiece next_value_; | |
| 286 bool value_name_present_; | |
| 287 | |
| 288 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); | |
| 289 }; | |
| 290 | |
| 291 // Implementation of FormDataParser and FormDataParser::Result . | |
| 292 | |
| 293 FormDataParser::Result::Result() {} | |
| 294 FormDataParser::Result::~Result() {} | |
| 295 | |
| 296 void FormDataParser::Result::Reset() { | |
| 297 name_.erase(); | |
| 298 value_.erase(); | |
| 299 } | |
| 300 | |
| 301 FormDataParser::~FormDataParser() {} | |
| 302 | |
| 303 // static | |
| 304 scoped_ptr<FormDataParser> FormDataParser::Create( | |
| 305 const net::URLRequest* request) { | |
| 306 std::string value; | |
| 307 const bool found = request->extra_request_headers().GetHeader( | |
| 308 net::HttpRequestHeaders::kContentType, &value); | |
| 309 return Create(found ? &value : NULL); | |
| 310 } | |
| 311 | |
| 312 // static. Picks the parser matching |content_type_header| (NULL means URL-encoded); returns an empty scoped_ptr if none fits. | |
| 313 scoped_ptr<FormDataParser> FormDataParser::Create( | |
| 314 const std::string* content_type_header) { | |
| 315 enum ParserChoice {kUrlEncoded, kMultipart, kError}; | |
| 316 ParserChoice choice = kError; | |
| 317 std::string boundary; | |
| 318 | |
| 319 if (content_type_header == NULL) { | |
| 320 choice = kUrlEncoded; | |
| 321 } else { | |
| 322 const std::string content_type( | |
| 323 content_type_header->substr(0, content_type_header->find(';'))); | |
| 324 | |
| 325 if (base::strcasecmp( | |
| 326 content_type.c_str(), "application/x-www-form-urlencoded") == 0) { | |
| 327 choice = kUrlEncoded; | |
| 328 } else if (base::strcasecmp( | |
| 329 content_type.c_str(), "multipart/form-data") == 0) { | |
| 330 static const char kBoundaryString[] = "boundary="; | |
| 331 size_t offset = content_type_header->find(kBoundaryString); | |
| 332 if (offset == std::string::npos) { | |
| 333 // Malformed header. | |
| 334 return scoped_ptr<FormDataParser>(); | |
| 335 } | |
| 336 offset += strlen(kBoundaryString); // substr() takes a length, hence "- offset" below. | |
| 337 boundary = content_type_header->substr( // If find() gives npos, the huge length is clamped by substr(). | |
| 338 offset, content_type_header->find(';', offset) - offset); | |
| 339 if (!boundary.empty()) | |
| 340 choice = kMultipart; | |
| 341 } | |
| 342 } | |
| 343 // Other cases are unparseable, including when |content_type| is "text/plain". | |
| 344 | |
| 345 switch (choice) { | |
| 346 case kUrlEncoded: | |
| 347 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded()); | |
| 348 case kMultipart: | |
| 349 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary)); | |
| 350 default: // In other words, case kError: | |
| 351 return scoped_ptr<FormDataParser>(); | |
| 352 } | |
| 353 } | |
| 354 | |
| 355 FormDataParser::FormDataParser() {} | |
| 356 | |
| 357 // Implementation of FormDataParserUrlEncoded. | |
| 358 | |
| 359 FormDataParserUrlEncoded::FormDataParserUrlEncoded() | |
| 360 : source_end_(NULL), | |
| 361 aborted_(false), | |
| 362 offset_(NULL), | |
| 363 equality_signs_(0), | |
| 364 amp_signs_(0), | |
| 365 expect_equality_(true) { | |
| 366 } | |
| 367 | |
| 368 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {} | |
| 369 | |
| 370 bool FormDataParserUrlEncoded::AllDataReadOK() { | |
| 371 return source_.data() != NULL && | |
| 372 !aborted_ && | |
| 373 offset_ == source_end_ && | |
| 374 equality_signs_ == amp_signs_ + 1; | |
|
tkent
2012/08/27 07:09:17
Why do we need to check the number of = and & ?
eq
vabr (Chromium)
2012/08/29 19:57:07
Now the parser uses a regexp which eliminates such
| |
| 375 } | |
| 376 | |
| 377 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) { | |
| 378 result->Reset(); | |
| 379 if (source_.data() == NULL || aborted_) | |
| 380 return false; | |
| 381 if (offset_ == source_end_) | |
| 382 return false; | |
| 383 const char* const name_start = &(*offset_); | |
| 384 char c; | |
| 385 bool last_read_success = GetNextChar(&c); | |
| 386 while (last_read_success && c != '=') | |
| 387 last_read_success = GetNextChar(&c); | |
| 388 if (!last_read_success) { // This means the data is malformed. | |
| 389 Abort(); | |
| 390 return false; | |
| 391 } | |
| 392 const char* const name_end = offset_ - 1; | |
| 393 const std::string encoded_name(name_start, name_end - name_start); | |
| 394 const net::UnescapeRule::Type unescape_rules = | |
| 395 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS | | |
| 396 net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE; | |
| 397 result->set_name(net::UnescapeURLComponent(encoded_name, unescape_rules)); | |
| 398 | |
| 399 const char* const value_start = offset_; | |
| 400 last_read_success = GetNextChar(&c); | |
| 401 while (last_read_success && c != '&') | |
| 402 last_read_success = GetNextChar(&c); | |
| 403 const char* const value_end = | |
| 404 last_read_success ? offset_ - 1 : offset_; | |
| 405 const std::string encoded_value(value_start, value_end - value_start); | |
| 406 result->set_value(net::UnescapeURLComponent(encoded_value, unescape_rules)); | |
| 407 return true; | |
| 408 } | |
| 409 | |
| 410 bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) { // Accepts exactly one non-NULL source. | |
| 411 if (source_.data() != NULL || source.data() == NULL || aborted_) // Test the pointer: an empty source still counts as set. | |
| 412 return false; | |
| 413 source_ = source; | |
| 414 source_end_ = source_.data() + source_.size(); | |
| 415 offset_ = source_.data(); | |
| 416 return true; | |
| 417 } | |
| 418 | |
| 419 bool FormDataParserUrlEncoded::GetNextChar(char* c) { | |
| 420 if (offset_ == source_end_ || aborted_) | |
| 421 return false; | |
| 422 *c = *offset_; | |
| 423 ++offset_; | |
| 424 | |
| 425 if (*c == '=') { | |
| 426 if (expect_equality_) { | |
| 427 ++equality_signs_; | |
| 428 expect_equality_ = false; | |
| 429 } else { | |
| 430 Abort(); | |
| 431 return false; | |
| 432 } | |
| 433 } | |
| 434 if (*c == '&' && offset_ != source_end_) { | |
| 435 if (!expect_equality_) { | |
| 436 ++amp_signs_; | |
| 437 expect_equality_ = true; | |
| 438 } else { | |
| 439 Abort(); | |
| 440 return false; | |
| 441 } | |
| 442 } | |
| 443 | |
| 444 return true; | |
| 445 } | |
| 446 | |
| 447 void FormDataParserUrlEncoded::Abort() { | |
| 448 aborted_ = true; | |
| 449 } | |
| 450 | |
| 451 // Implementation of FormDataParserMultipart. | |
| 452 | |
| 453 FormDataParserMultipart::FormDataParserMultipart( | |
| 454 const std::string& boundary_separator) | |
| 455 : source_end_(NULL), | |
| 456 offset_(NULL), | |
| 457 dash_boundary_("--" + boundary_separator), | |
| 458 state_(kStart), | |
| 459 value_name_present_(false) { | |
| 460 } | |
| 461 | |
| 462 FormDataParserMultipart::~FormDataParserMultipart() {} | |
| 463 | |
| 464 bool FormDataParserMultipart::AllDataReadOK() { | |
| 465 return source_.data() != NULL && InFinalState(); | |
| 466 } | |
| 467 | |
| 468 bool FormDataParserMultipart::GetNextNameValue(Result* result) { | |
| 469 if (!value_name_present_ || state_ == kError) | |
| 470 return false; | |
| 471 result->set_name(next_name_); | |
| 472 result->set_value(next_value_); | |
| 473 next_name_.clear(); | |
| 474 next_value_.clear(); | |
| 475 value_name_present_ = ReadNextNameValue(); | |
| 476 return true; | |
| 477 } | |
| 478 | |
| 479 bool FormDataParserMultipart::SetSource(const base::StringPiece& source) { | |
| 480 if (state_ == kError || | |
| 481 source.data() == NULL || | |
| 482 // Message part across a source split is also an error. | |
| 483 next_name_.data() != NULL || next_value_.data() != NULL) | |
| 484 return false; | |
| 485 if (source_.data() != NULL && offset_ != source_end_){ | |
| 486 // Try to seek until the end. If no name-value pair is found, this is OK. | |
| 487 value_name_present_ = ReadNextNameValue(); | |
| 488 if (!value_name_present_ || offset_ != source_end_) | |
| 489 return false; | |
| 490 } | |
| 491 source_ = source; | |
| 492 source_end_ = source_.data() + source_.size(); | |
| 493 offset_ = source_.data(); | |
| 494 value_name_present_ = ReadNextNameValue(); | |
| 495 return true; | |
| 496 } | |
| 497 | |
| 498 // static | |
| 499 char FormDataParserMultipart::kTransitionToChar[] = { | |
| 500 '\n', // For kLF. | |
| 501 '\r', // For kCR. | |
| 502 0, // For kAscii. | |
| 503 0, // For kLwsp. | |
| 504 0, // For kDashBoundary. | |
| 505 ':', // For kColonT. | |
| 506 '-', // For kDash. | |
| 507 0, // For kAny. | |
| 508 }; | |
| 509 | |
| 510 // static | |
| 511 FormDataParserMultipart::Transition | |
| 512 FormDataParserMultipart::kAvailableTransitions[] = { | |
| 513 kDashBoundary, kCR, kAny, // For kStart. | |
| 514 kLF, kAny, // For kCR1. | |
| 515 kCR, kAny, // For kIgnorePreamble. | |
| 516 kLwsp, kCR, kAny, // For kDB1. | |
| 517 kLF, kAny, // For kCR2. | |
| 518 kAscii, kCR, kAny, // For kPart. | |
| 519 kAscii, kColonT, kAny, // For kName. | |
| 520 kLF, kCR, kAny, // For kColonS. | |
| 521 kCR, kAscii, kAny, // For kEnd1. | |
| 522 kLF, kCR, kAscii, kAny, // For kEnd2. | |
| 523 kLwsp, kCR, kAscii, kAny, // For kEnd3. | |
| 524 kLF, kAny, // For kCR3. | |
| 525 kDashBoundary, kCR, kAny, // For kPreData. | |
| 526 kLF, kAny, // For kCR4. | |
| 527 kCR, kAny, // For kData. | |
| 528 kDashBoundary, kAny, // For kData2. | |
| 529 kLwsp, kCR, kDash, kAny, // For kDB2. | |
| 530 kDash, kAny, // For kD. | |
| 531 kLwsp, kCR, kAny, // For kEnd. | |
| 532 kLF, kAny, // For kCR5. | |
| 533 kAny, // For kIgnoreEpilogue. | |
| 534 kAny // For kError. | |
| 535 }; | |
| 536 | |
| 537 // static. kNextState[i] is the target state of kAvailableTransitions[i]; rows follow the automaton table. | |
| 538 FormDataParserMultipart::State FormDataParserMultipart::kNextState[] = { | |

tkent
2012/08/27 07:09:17
kAvailableTransitions and kNextState should be mer
vabr (Chromium)
2012/08/29 19:57:07
Disappeared after rewriting.
| |
| 539 kDB1, kCR1, kIgnorePreamble, // For kStart; size so far: 03. | |
| 540 kStart, kIgnorePreamble, // For kCR1; 05. | |
| 541 kCR1, kIgnorePreamble, // For kIgnorePreamble; 07. | |
| 542 kDB1, kCR2, kError, // For kDB1; 10. | |
| 543 kPart, kError, // For kCR2; 12. | |
| 544 kName, kCR3, kError, // For kPart; 15. | |
| 545 kName, kColonS, kError, // For kName; 18. | |
| 546 kEnd1, kEnd2, kColonS, // For kColonS; 21. | |
| 547 kCR3, kName, kError, // For kEnd1; 24. | |
| 548 kEnd3, kCR3, kName, kError, // For kEnd2; 28. | |
| 549 kColonS, kCR3, kName, kError, // For kEnd3; 32. | |
| 550 kPreData, kError, // For kCR3; 34. | |
| 551 kDB2, kCR4, kData, // For kPreData; 37. CR inside a body leads to kCR4, as for kData. | |
| 552 kData2, kData, // For kCR4; 39. | |
| 553 kCR4, kData, // For kData; 41. | |
| 554 kDB2, kCR4, // For kData2; 43. | |
| 555 kDB1, kCR2, kD, kError, // For kDB2; 47. | |
| 556 kEnd, kError, // For kD; 49. | |
| 557 kEnd, kCR5, kError, // For kEnd; 52. | |
| 558 kIgnoreEpilogue, kError, // For kCR5; 54. | |
| 559 kIgnoreEpilogue, // For kIgnoreEpilogue; 55. | |
| 560 kError // For kError; 56. | |
| 561 }; | |
| 562 | |
| 563 // static | |
| 564 size_t FormDataParserMultipart::kStateToTransition[] = { | |
| 565 0u, // For kStart | |
| 566 3u, // For kCR1 | |
| 567 5u, // For kIgnorePreamble | |
| 568 7u, // For kDB1 | |
| 569 10u, // For kCR2 | |
| 570 12u, // For kPart | |
| 571 15u, // For kName | |
| 572 18u, // For kColonS | |
| 573 21u, // For kEnd1 | |
| 574 24u, // For kEnd2 | |
| 575 28u, // For kEnd3 | |
| 576 32u, // For kCR3 | |
| 577 34u, // For kPreData | |
| 578 37u, // For kCR4 | |
| 579 39u, // For kData | |
| 580 41u, // For kData2 | |
| 581 43u, // For kDB2 | |
| 582 47u, // For kD | |
| 583 49u, // For kEnd | |
| 584 52u, // For kCR5 | |
| 585 54u, // For kIgnoreEpilogue | |
| 586 55u, // For kError | |
| 587 }; | |
| 588 | |
| 589 bool FormDataParserMultipart::ReadNextNameValue() { | |
| 590 if (state_ == kError || source_.data() == NULL || | |
| 591 next_name_.data() != NULL || next_value_.data() != NULL) | |
| 592 return false; | |
| 593 | |
| 594 // Seek to the next part's headers. | |
| 595 while (state_ != kPart) { | |
| 596 if (!DoStep()) | |
| 597 return false; | |
| 598 } | |
|
tkent
2012/08/27 07:09:17
We had better have a function DoStepsUntil(State).
vabr (Chromium)
2012/08/29 19:57:07
Disappeared after the rewrite.
| |
| 599 while (state_ != kPreData) { | |
| 600 const char* header = offset_; | |
| 601 while (state_ != kColonS) { | |
| 602 if (!DoStep()) | |
| 603 return false; | |
| 604 } | |
| 605 size_t header_length = 0u; | |
| 606 while (state_ != kPreData && state_ != kName) { | |
| 607 if (state_ == kEnd1 || state_ == kEnd2 || state_ == kEnd3) { | |
| 608 // The cast is safe, we know that offset only moves forward. | |
| 609 header_length = static_cast<size_t>(offset_ - header); | |
| 610 } | |
| 611 if (!DoStep()) | |
| 612 return false; | |
| 613 } | |
| 614 if (ParseHeader(base::StringPiece(header, header_length))) { | |
| 615 // Found what we were looking for, just skip to the part's body. | |
| 616 while (state_ != kPreData) { | |
| 617 if (!DoStep()) | |
| 618 return false; | |
| 619 } | |
| 620 } | |
| 621 } | |
| 622 | |
| 623 const char* body = offset_; | |
| 624 size_t body_length = 0; | |
| 625 while (state_ != kDB2 && offset_ != source_end_) { | |
| 626 if (!DoStep()) | |
| 627 return false; | |
| 628 if (state_ == kCR4) { | |
| 629 // We are in the middle of which might be the CRLF starting the part | |
| 630 // separator (see the "delimiter" non-terminal from the grammar given | |
| 631 // in the header file). The cast is safe, we know that offset only moves | |
| 632 // forward and body was assigned at least 1 transition ago. | |
| 633 body_length = static_cast<size_t>(offset_ - body - 1); | |
| 634 } | |
| 635 } | |
| 636 if (body_length > 0) | |
| 637 next_value_.set(body, body_length); | |
| 638 return true; | |
| 639 } | |
| 640 | |
| 641 bool FormDataParserMultipart::DoStep() { | |
| 642 if (state_ == kError || offset_ == source_end_) | |
| 643 return false; | |
| 644 size_t transition_index = kStateToTransition[state_]; | |
| 645 Transition t = kAvailableTransitions[transition_index]; | |
| 646 while (t != kAny) { | |
| 647 const State s = kNextState[transition_index]; | |
| 648 const size_t length = LookUp(t); | |
| 649 if (length > 0) { | |
| 650 offset_ += length; | |
| 651 state_ = s; | |
| 652 return true; | |
| 653 } | |
| 654 t = kAvailableTransitions[++transition_index]; | |
| 655 } | |
| 656 // We have kAny, the default choice. Seek by one and switch the state. | |
| 657 ++offset_; | |
| 658 state_ = kNextState[transition_index]; | |
| 659 return true; | |
| 660 } | |
| 661 | |
| 662 // Contract -- the following must be true: offset_ != source_end_ . | |
| 663 // The idea is to check this only once in the caller (DoStep()), and do not | |
| 664 // repeat it here every time, as this can be called many times from one call | |
| 665 // to DoStep(). | |
| 666 size_t FormDataParserMultipart::LookUp(FormDataParserMultipart::Transition t) { | |
| 667 const char ahead = *offset_; | |
| 668 const char first_char = kTransitionToChar[t]; | |
| 669 | |
| 670 // Easy case: labels corresponding to a single char. | |
| 671 if (first_char != 0) | |
| 672 return ahead == first_char ? 1u : 0u; | |
| 673 | |
| 674 // Harder cases. | |
| 675 switch (t) { | |
| 676 // Multiple alternatives, 1-char long: return immediately. | |
| 677 case kAscii: | |
| 678 return (ahead >= 33 && ahead <= 126 && ahead != ':') ? 1u : 0u; | |
| 679 case kLwsp: | |
| 680 return (ahead == ' ' || ahead == '\t') ? 1u : 0u; | |
| 681 | |
| 682 // Longer than 1 char: prepare work for later. | |
| 683 case kDashBoundary: { | |
| 684 const size_t length = dash_boundary_.size(); | |
| 685 // The cast below is safe, we know that the difference is not negative. | |
| 686 if (static_cast<size_t>(source_end_ - offset_) < length || | |
| 687 memcmp(dash_boundary_.c_str(), offset_, length) != 0) | |
| 688 return 0u; | |
| 689 return length; | |
| 690 } | |
| 691 case kAny: | |
| 692 // We are not supposed to be asked for kAny, but this is the right answer: | |
| 693 return 1u; | |
| 694 default: // We never get here -- the rest has already been handled above. | |
| 695 NOTREACHED(); | |
| 696 return 0u; | |
| 697 } | |
| 698 } | |
| 699 | |
| 700 bool FormDataParserMultipart::ParseHeader(const base::StringPiece& header) { // False unless |header| is a Content-Disposition header with a name. | |
| 701 static const char kContentDisposition[] = "Content-Disposition:"; // Compare the whole prefix, and only if |header| is long enough. | |
| 702 if (header.size() < strlen(kContentDisposition) || memcmp(header.data(), | |
| 703 kContentDisposition, strlen(kContentDisposition)) != 0) | |

tkent
2012/08/27 07:09:17
strlen is not needed. The length of kContentDispo
vabr (Chromium)
2012/08/29 19:57:07
Correct. Although this particular instance and tho
| |
| 704 return false; | |
| 705 static const char kNameEquals[] = " name=\""; | |
| 706 static const char kFilenameEquals[] = " filename=\""; | |
| 707 | |
| 708 // Mandatory part: find the name and set it as |next_name_|. | |
| 709 StringPiece::size_type field_offset = header.find(kNameEquals); | |
| 710 if (field_offset == StringPiece::npos) | |
| 711 return false; | |
| 712 field_offset += strlen(kNameEquals); | |

tkent
2012/08/27 07:09:17
ditto.
| |
| 713 StringPiece::size_type field_end = header.find('"', field_offset); | |
| 714 if (field_end == StringPiece::npos) | |
| 715 return false; | |
| 716 next_name_.set(header.data() + field_offset, field_end - field_offset); | |

tkent
2012/08/27 07:09:17
Need to decode the name value.
BTW, what's the ex
vabr (Chromium)
2012/08/29 19:57:07
Thanks very much for bringing this up!
On 2012/08
| |
| 717 | |
| 718 // Optional part: find the filename and set it as |next_value_|. | |
| 719 field_offset = header.find(kFilenameEquals); | |
| 720 if (field_offset == StringPiece::npos) | |
| 721 return true; // This was only optional | |
| 722 field_offset += strlen(kFilenameEquals); | |

tkent
2012/08/27 07:09:17
ditto.
| |
| 723 field_end = header.find('"', field_offset); | |
| 724 if (field_end == StringPiece::npos) | |
| 725 return false; // This is a malformed header. | |
| 726 next_value_.set(header.data() + field_offset, field_end - field_offset); | |
| 727 return true; | |
| 728 } | |
| 729 | |
| 730 } // namespace extensions | |
| OLD | NEW |