Chromium Code Reviews| Index: chrome/browser/extensions/api/web_request/form_data_parser.cc |
| diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.cc b/chrome/browser/extensions/api/web_request/form_data_parser.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..0ea100cf856a4919b653c06aabd20150b98d95de |
| --- /dev/null |
| +++ b/chrome/browser/extensions/api/web_request/form_data_parser.cc |
| @@ -0,0 +1,730 @@ |
| +// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "chrome/browser/extensions/api/web_request/form_data_parser.h" |
| + |
| +#include <vector> |
| + |
| +#include "base/string_util.h" |
| +#include "base/values.h" |
| +#include "net/base/escape.h" |
| +#include "net/url_request/url_request.h" |
| + |
| +using base::DictionaryValue; |
| +using base::ListValue; |
| +using base::StringPiece; |
| + |
| +namespace extensions { |
| + |
| +// Parses URLencoded forms, see |
| +// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . |
| +class FormDataParserUrlEncoded : public FormDataParser { |
| + public: |
| + FormDataParserUrlEncoded(); |
| + virtual ~FormDataParserUrlEncoded(); |
| + |
| + // Implementation of FormDataParser. |
| + // Returns true only if a source was set, the parser did not abort, the |
| + // whole source was consumed and one more '=' than '&' was counted. |
| + virtual bool AllDataReadOK() OVERRIDE; |
| + // Resets |result|, then fills it with the next unescaped name-value pair. |
| + // Returns false if there is no source, the parser has aborted, the source |
| + // is exhausted, or no '=' terminates the name (which aborts the parser). |
| + virtual bool GetNextNameValue(Result* result) OVERRIDE; |
| + // Stores |source| as the input to parse and rewinds |offset_| to its start. |
| + // Returns false, leaving the parser unchanged, when |source| has null data, |
| + // when the parser has aborted, or when the check against the previously |
| + // stored source fails. |
| + virtual bool SetSource(const base::StringPiece& source) OVERRIDE; |
| + |
| + private: |
| + // Gets next char from |source_|, seeks, and does book-keeping of = and &. |
| + // Returns false if end of |source_| was reached, otherwise true. |
| + // Also returns false (after calling Abort()) when a '=' or '&' appears where |
| + // the other sign was expected. |
| + bool GetNextChar(char* c); |
| + // Once called the parser gives up and claims any results so far invalid. |
| + void Abort(); |
| + |
| + // The input being parsed; a StringPiece does not own the bytes it points to. |
| + base::StringPiece source_; |
| + // One past the last byte of |source_|. |
| + const char* source_end_; |
| + // Set by Abort(); once true, all reading methods return false. |
| + bool aborted_; |
| + |
| + // Variables from this block are only to be written to by GetNextChar. |
| + const char* offset_; // Next char to be read. |
| + size_t equality_signs_; // How many '=' were read so far. |
| + size_t amp_signs_; // How many '&' were read so far. |
| + bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')? |
| + |
| + DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); |
| +}; |
| + |
| +// The following class, FormDataParserMultipart, parses forms encoded as |
| +// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart |
| +// encoding) and 822 (MIME-headers). |
|
tkent
2012/08/27 07:09:17
Please do not refer to RFC 822, which was obsolete
vabr (Chromium)
2012/08/29 19:57:07
Done.
Thanks for making me aware of this.
|
| +// |
| +// Implementation details |
| +// |
| +// The original grammar from RFC 2046 is this, "multipart-body" being the root |
| +// non-terminal: |
| +// |
| +// boundary := 0*69<bchars> bcharsnospace |
| +// bchars := bcharsnospace / " " |
| +// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," |
| +// / "-" / "." / "/" / ":" / "=" / "?" |
| +// dash-boundary := "--" boundary |
| +// multipart-body := [preamble CRLF] |
| +// dash-boundary transport-padding CRLF |
| +// body-part *encapsulation |
| +// close-delimiter transport-padding |
| +// [CRLF epilogue] |
| +// transport-padding := *LWSP-char |
| +// encapsulation := delimiter transport-padding CRLF body-part |
| +// delimiter := CRLF dash-boundary |
| +// close-delimiter := delimiter "--" |
| +// preamble := discard-text |
| +// epilogue := discard-text |
| +// discard-text := *(*text CRLF) *text |
| +// body-part := MIME-part-headers [CRLF *OCTET] |
| +// OCTET := <any 0-255 octet value> |
| +// |
| +// Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters |
| +// of the English alphabet, respectively. |
| +// The non-terminal "text" is presumably just any text, excluding line breaks. |
| +// The non-terminal "LWSP-char" is not directly defined in the original grammar |
| +// but it means "linear whitespace", which is a space or a horizontal tab. |
| +// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is in |
| +// English defined in RFC 822, and can be presented as follows: |
| +// |
| +// MIME-part-headers := *MIME-part-header |
| +// MIME-part-header := name ':' *(text / whitespace) linebreak |
| +// linebreak := '\r' / '\n' / CRLF |
| +// whitespace := LWSP-char / CRLF LWSP-char |
| +// name := namechar *namechar |
| +// namechar := <ASCII char between 33 and 126, excluding ':'> |
| +// |
| +// These sets of rules together compose a grammar, with the root non-terminal |
| +// "multipart-body". This grammar defines a regular language. Indeed, if the |
| +// non-terminals are ordered in this way: |
| +// namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace < |
| +// linebreak < MIME-part-header < MIME-part-headers < bcharsnospace < |
| +// bchars < boundary < dash-boundary < delimiter < close-delimiter < |
| +// discard-text < transport-padding < OCTET < body-part < encapsulation < |
| +// multipart-body |
| +// then it is easy to verify that whenever A<B then no grammar rule with head |
| +// A contains B in the body. By induction on the above order, each non-terminal |
| +// defines a regular language: a non-terminal C is defined by a rule C := exp, |
| +// where "exp" is an expression composed from character constants, non-terminals |
| +// less than C, and the following closure operations of regular languages: |
| +// concatenation, union and Kleene-star. By induction, all the lesser |
| +// non-terminals represent regular languages, thus "exp" also represents a |
| +// regular language. In particular, the root non-terminal (and thus the grammar) |
| +// defines a regular language. |
| +// |
| +// The FormDataParserMultipart class uses a finite automaton to represent this |
| +// language. It is easiest to view it in an extended form, with longer words |
| +// allowed to label a single transition to keep the number of states low. |
| +// Important states have full-word names, unimportant states (always with only |
| +// one incoming label) have names abbreviating the incoming label, possibly |
| +// with an index. |
| +// |
| +// Automaton for "multipart-body": |
| +// Initial state = Start |
| +// Final states = {End, IgnoreEpilogue} |
| +// Implicit state (when a transition is missing) = Error |
| +// Transition table ('*' is a label matching everything not matched by other |
| +// labels leaving the same state): |
| +// FROM LABEL TO |
| +// Start dash-boundary DB1 |
| +// CR CR1 |
| +// * IgnorePreamble |
| +// CR1 LF Start |
| +// * IgnorePreamble |
| +// IgnorePreamble CR CR1 |
| +// * IgnorePreamble |
| +// DB1 LWSP-char DB1 |
| +// CR CR2 |
| +// CR2 LF Part |
| +// Part <ASCII 33-126, excluding ':'> Name |
| +// CR CR3 |
| +// Name <ASCII 33-126, excluding ':'> Name |
| +// ':' Colon |
| +// Colon LF End1 |
| +// CR End2 |
| +// * Colon |
| +// End1 CR CR3 |
| +// <ASCII 33-126, excluding ':'> Name |
| +// End2 LF End3 |
| +// CR CR3 |
| +// <ASCII 33-126, excluding ':'> Name |
| +// End3 LWSP-char Colon |
| +// CR CR3 |
| +// <ASCII 33-126, excluding ':'> Name |
| +// CR3 LF PreData |
| +// PreData dash-boundary DB2 |
| +// CR CR4 |
| +// * Data |
| +// CR4 LF Data2 |
| +// * Data |
| +// Data CR CR4 |
| +// * Data |
| +// Data2 dash-boundary DB2 |
| +// * CR4 |
| +// DB2 LWSP-char DB1 |
| +// CR CR2 |
| +// '-' D |
| +// D '-' End |
| +// End LWSP-char End |
| +// CR CR5 |
| +// CR5 LF IgnoreEpilogue |
| +// IgnoreEpilogue * IgnoreEpilogue |
| +// |
| +// The automaton itself only allows to check that the input is a well-formed |
| +// multipart encoding of a form. To also extract the data, additional logic is |
| +// added: |
| +// * The header "Content-Disposition" (read between Part and PreData) contains |
| +// the elements name=... and optionally filename=... The former is the name |
| +// of the corresponding field of a form. The latter is only present if that |
| +// field was a file-upload, and contains the path to the uploaded file. |
| +// * The data of a message part is read between PreData and DB2, excluding the |
| +// last CR LF dash-boundary. |
| +// |
| +// IMPORTANT NOTE |
| +// This parser supports multiple sources, i.e., SetSource can be called multiple |
| +// times if the input is spread over several byte blocks. However, the split |
| +// must not occur in the middle of a transition of the automaton above, |
| +// e.g., if there is a transition StateA --dash-boundary--> StateB, then the |
| +// whole string with the dash-boundary must be contained in the first source, |
|
tkent
2012/08/27 07:09:17
bust -> must?
vabr (Chromium)
2012/08/29 19:57:07
Rewritten in the meantime.
|
| +// or in the other. Also, the split must not occur in the middle of a header, |
| +// or a part body data. A message part from one source must be read via |
| +// GetNextNameValue before setting up a new source. |
| +class FormDataParserMultipart : public FormDataParser { |
| + public: |
| + explicit FormDataParserMultipart(const std::string& boundary_separator); |
| + virtual ~FormDataParserMultipart(); |
| + |
| + // Implementation of FormDataParser. |
| + virtual bool AllDataReadOK() OVERRIDE; |
| + virtual bool GetNextNameValue(Result* result) OVERRIDE; |
| + virtual bool SetSource(const base::StringPiece& source) OVERRIDE; |
| + |
| + private: |
| + // State and Transition are numbered to make sure they form a continuous block |
| + // of numbers for array indexing in lookup tables. If changing State or |
| + // Transition, don't forget to update k*Size and the lookup tables. |
| + enum State { |
| + kStart = 0, |
|
tkent
2012/08/27 07:09:17
See http://www.chromium.org/developers/coding-styl
vabr (Chromium)
2012/08/29 19:57:07
Added STATE_ prefix to states, transitions disappe
|
| + kCR1 = 1, |
| + kIgnorePreamble = 2, |
| + kDB1 = 3, |
| + kCR2 = 4, |
| + kPart = 5, |
| + kName = 6, |
| + kColonS = 7, // "S" to distinguish it from the transition kColonT. |
| + kEnd1 = 8, |
| + kEnd2 = 9, |
| + kEnd3 = 10, |
| + kCR3 = 11, |
| + kPreData = 12, |
| + kCR4 = 13, |
| + kData = 14, |
| + kData2 = 15, |
| + kDB2 = 16, |
| + kD = 17, |
| + kEnd = 18, |
| + kCR5 = 19, |
| + kIgnoreEpilogue = 20, |
| + kError = 21 |
| + }; |
| + enum Transition { |
| + kLF = 0, |
| + kCR = 1, |
| + kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'. |
| + kLwsp = 3, |
| + kDashBoundary = 4, |
| + kColonT = 5, // "T" to distinguish it from the state kColonS. |
| + kDash = 6, // Meaning '-', not "--". |
| + kAny = 7 // To represent '*'. |
| + }; |
| + static const size_t kStateSize = 22; |
| + static const size_t kTransitionSize = 8; |
| + |
| + // Lookup tables: |
| + // Maps transitions with one-character label to that character (else to 0). |
| + static char kTransitionToChar[]; |
| + // Indices of transitions available in state |s| in |kAvailableTransitions| |
| + // start at kStateToTransition[s] and the last transition for |s| is always |
| + // kAny. The target state corresponding to transition kAvailableTransitions[i] |
| + // is kNextState[i]. |
| + static Transition kAvailableTransitions[]; |
| + static State kNextState[]; |
| + static size_t kStateToTransition[]; |
| + |
| + // Reads the source until the next name-value pair is read. Returns true if |
| + // |next_name_| and |next_value_| were successfully updated. |
| + bool ReadNextNameValue(); |
| + // One step of the automaton, based on |state_| and the input from |source_| |
| + // to be read. Updates the |offset_| iterator. Returns true on success. |
| + bool DoStep(); |
| + // Tests whether the input pointed to by |offset_| allows to read transition |
| + // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read. |
| + size_t LookUp(Transition t); |
| + |
| + // Extracts "name" and possibly "value" from a Content-Disposition header. |
| + // Writes directly into |next_name_| and |next_value_|. Returns true on |
| + // success and false otherwise. |
| + bool ParseHeader(const base::StringPiece& header); |
| + |
| + bool InFinalState() { |
| + return state_ == kEnd || state_ == kIgnoreEpilogue; |
| + } |
| + |
| + // The parsed message can be split into multiple sources which we read |
| + // sequentially. |
| + base::StringPiece source_; |
| + const char* source_end_; |
| + const char* offset_; |
|
tkent
2012/08/27 07:09:17
The name "offset_" is confusing. It's not an offs
vabr (Chromium)
2012/08/29 19:57:07
You're right.
This disappeared after rewriting.
|
| + // The dash-boundary string is used for all sources. |
| + const std::string dash_boundary_; |
| + State state_; |
| + // The next result to be returned by GetNextNameValue. It is stored as a pair |
| + // of StringPieces instead of a Result, to avoid one copy of the data (note |
| + // that Result stores a copy of the data in std::string, whereas StringPiece |
| + // is just a pointer to the data in |source_|). |
| + base::StringPiece next_name_; |
| + base::StringPiece next_value_; |
| + bool value_name_present_; |
| + |
| + DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); |
| +}; |
| + |
| +// Implementation of FormDataParser and FormDataParser::Result . |
| + |
| +FormDataParser::Result::Result() {} |
| +FormDataParser::Result::~Result() {} |
| + |
| +void FormDataParser::Result::Reset() { |
| +  // Empty both stored strings so the Result can be reused for the next pair. |
| +  value_.clear(); |
| +  name_.clear(); |
| +} |
| + |
| +FormDataParser::~FormDataParser() {} |
| + |
| +// static |
| +scoped_ptr<FormDataParser> FormDataParser::Create( |
| +    const net::URLRequest* request) { |
| +  // Fetch the Content-Type request header, if any, and delegate to the |
| +  // overload taking the header value. A missing header is passed on as NULL. |
| +  std::string content_type; |
| +  const std::string* header = NULL; |
| +  if (request->extra_request_headers().GetHeader( |
| +          net::HttpRequestHeaders::kContentType, &content_type)) { |
| +    header = &content_type; |
| +  } |
| +  return Create(header); |
| +} |
| + |
| +// static |
| +scoped_ptr<FormDataParser> FormDataParser::Create( |
| +    const std::string* content_type_header) { |
| +  // Picks a parser from the value of the Content-Type header: |
| +  //  * NULL header or "application/x-www-form-urlencoded": URL-encoded parser; |
| +  //  * "multipart/form-data" with a non-empty boundary: multipart parser; |
| +  //  * anything else (including a missing or empty boundary): a null pointer. |
| +  enum ParserChoice {kUrlEncoded, kMultipart, kError}; |
| +  ParserChoice choice = kError; |
| +  std::string boundary; |
| + |
| +  if (content_type_header == NULL) { |
| +    choice = kUrlEncoded; |
| +  } else { |
| +    // The media type is everything before the first ';' (or the whole value). |
| +    const std::string content_type( |
| +        content_type_header->substr(0, content_type_header->find(';'))); |
| + |
| +    if (base::strcasecmp( |
| +            content_type.c_str(), "application/x-www-form-urlencoded") == 0) { |
| +      choice = kUrlEncoded; |
| +    } else if (base::strcasecmp( |
| +                   content_type.c_str(), "multipart/form-data") == 0) { |
| +      static const char kBoundaryString[] = "boundary="; |
| +      size_t offset = content_type_header->find(kBoundaryString); |
| +      if (offset == std::string::npos) { |
| +        // Malformed header. |
| +        return scoped_ptr<FormDataParser>(); |
| +      } |
| +      offset += strlen(kBoundaryString); |
| +      // substr() takes a character count, not an end position, so convert the |
| +      // position of the terminating ';' (if any) into a length. Without a ';' |
| +      // the boundary runs to the end of the header. |
| +      const size_t boundary_end = content_type_header->find(';', offset); |
| +      const size_t boundary_length = |
| +          boundary_end == std::string::npos ? std::string::npos |
| +                                            : boundary_end - offset; |
| +      // NOTE(review): a quoted boundary value keeps its quotes here. |
| +      boundary = content_type_header->substr(offset, boundary_length); |
| +      if (!boundary.empty()) |
| +        choice = kMultipart; |
| +    } |
| +  } |
| +  // Other cases are unparseable, including when |content_type| is "text/plain". |
| + |
| +  switch (choice) { |
| +    case kUrlEncoded: |
| +      return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded()); |
| +    case kMultipart: |
| +      return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary)); |
| +    default:  // In other words, case kError: |
| +      return scoped_ptr<FormDataParser>(); |
| +  } |
| +} |
| + |
| +FormDataParser::FormDataParser() {} |
| + |
| +// Implementation of FormDataParserUrlEncoded. |
| + |
| +FormDataParserUrlEncoded::FormDataParserUrlEncoded() |
| + : source_end_(NULL), |
| + aborted_(false), |
| + offset_(NULL), |
| + equality_signs_(0), |
| + amp_signs_(0), |
| + expect_equality_(true) { |
| +} |
| + |
| +FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {} |
| + |
| +bool FormDataParserUrlEncoded::AllDataReadOK() { |
| + // Succeeds only when a source was set, parsing never aborted, every byte of |
| + // the source was consumed, and exactly one more '=' than '&' was counted |
| + // (a '&' that is the last byte of the source is not counted by GetNextChar). |
| + return source_.data() != NULL && |
| + !aborted_ && |
| + offset_ == source_end_ && |
| + equality_signs_ == amp_signs_ + 1; |

tkent
2012/08/27 07:09:17
Why do we need to check the number of = and & ?
eq
vabr (Chromium)
2012/08/29 19:57:07
Now the parser uses a regexp which eliminates such
|
| +} |
| + |
| +bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) { |
| +  result->Reset(); |
| +  // Nothing to read without a source, after an abort, or at the end of input. |
| +  const bool can_read = |
| +      source_.data() != NULL && !aborted_ && offset_ != source_end_; |
| +  if (!can_read) |
| +    return false; |
| + |
| +  const net::UnescapeRule::Type unescape_rules = |
| +      net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS | |
| +      net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE; |
| +  char current = 0; |
| +  bool got_char = false; |
| + |
| +  // The name is everything up to, but excluding, the next '='. |
| +  const char* const name_begin = offset_; |
| +  do { |
| +    got_char = GetNextChar(&current); |
| +  } while (got_char && current != '='); |
| +  if (!got_char) {  // No '=' was found, so the data is malformed. |
| +    Abort(); |
| +    return false; |
| +  } |
| +  const std::string raw_name(name_begin, offset_ - 1); |
| +  result->set_name(net::UnescapeURLComponent(raw_name, unescape_rules)); |
| + |
| +  // The value runs up to the next '&', or to wherever reading stopped. |
| +  const char* const value_begin = offset_; |
| +  do { |
| +    got_char = GetNextChar(&current); |
| +  } while (got_char && current != '&'); |
| +  const char* value_finish = offset_; |
| +  if (got_char) |
| +    --value_finish;  // Do not include the '&' that ended the value. |
| +  const std::string raw_value(value_begin, value_finish); |
| +  result->set_value(net::UnescapeURLComponent(raw_value, unescape_rules)); |
| +  return true; |
| +} |
| + |
| +bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) { |
| +  // Accept |source| only if no source has been stored yet, |source| actually |
| +  // points at data, and the parser has not aborted. "Already stored" is tested |
| +  // through data(), like everywhere else in this file: comparing |source_| |
| +  // with NULL would compare it against an empty StringPiece, and so would let |
| +  // an empty but already set source be replaced. |
| +  if (source_.data() != NULL || source.data() == NULL || aborted_) |
| +    return false; |
| +  source_ = source; |
| +  source_end_ = source_.data() + source_.size(); |
| +  offset_ = source_.data(); |
| +  return true; |
| +} |
| + |
| +bool FormDataParserUrlEncoded::GetNextChar(char* c) { |
| +  if (aborted_ || offset_ == source_end_) |
| +    return false; |
| +  const char read = *offset_; |
| +  ++offset_; |
| +  *c = read; |
| + |
| +  // Only '=' and '&' need book-keeping; a '&' that is the very last byte of |
| +  // the source is treated as an ordinary character. |
| +  const bool is_equality = (read == '='); |
| +  const bool is_separator = (read == '&' && offset_ != source_end_); |
| +  if (!is_equality && !is_separator) |
| +    return true; |
| + |
| +  // The two signs must alternate, starting with '='. Seeing the sign that is |
| +  // not expected right now means the input is malformed. |
| +  if (is_equality != expect_equality_) { |
| +    Abort(); |
| +    return false; |
| +  } |
| +  if (is_equality) |
| +    ++equality_signs_; |
| +  else |
| +    ++amp_signs_; |
| +  expect_equality_ = !expect_equality_; |
| +  return true; |
| +} |
| + |
| +void FormDataParserUrlEncoded::Abort() { |
| + aborted_ = true; |
| +} |
| + |
| +// Implementation of FormDataParserMultipart. |
| + |
| +FormDataParserMultipart::FormDataParserMultipart( |
| + const std::string& boundary_separator) |
| + : source_end_(NULL), |
| + offset_(NULL), |
| + dash_boundary_("--" + boundary_separator), |
| + state_(kStart), |
| + value_name_present_(false) { |
| +} |
| + |
| +FormDataParserMultipart::~FormDataParserMultipart() {} |
| + |
| +bool FormDataParserMultipart::AllDataReadOK() { |
| + return source_.data() != NULL && InFinalState(); |
| +} |
| + |
| +bool FormDataParserMultipart::GetNextNameValue(Result* result) { |
| +  // A pair can only be handed out if one was read ahead and the automaton is |
| +  // not in the error state. |
| +  const bool has_pending_pair = value_name_present_ && state_ != kError; |
| +  if (has_pending_pair) { |
| +    result->set_name(next_name_); |
| +    result->set_value(next_value_); |
| +    // Clear the read-ahead slots, then try to read the following pair. |
| +    next_name_.clear(); |
| +    next_value_.clear(); |
| +    value_name_present_ = ReadNextNameValue(); |
| +  } |
| +  return has_pending_pair; |
| +} |
| + |
| +bool FormDataParserMultipart::SetSource(const base::StringPiece& source) { |
| +  if (state_ == kError) |
| +    return false; |
| +  if (source.data() == NULL) |
| +    return false; |
| +  // A pair read ahead from the old source but not yet fetched would be split |
| +  // across sources, which is an error. |
| +  const bool pair_pending = |
| +      next_name_.data() != NULL || next_value_.data() != NULL; |
| +  if (pair_pending) |
| +    return false; |
| + |
| +  const bool old_source_unfinished = |
| +      source_.data() != NULL && offset_ != source_end_; |
| +  if (old_source_unfinished) { |
| +    // Try to seek to the end of the old source first. |
| +    // NOTE(review): the original comment said that finding no name-value pair |
| +    // here is OK, yet the code rejects the new source in that case -- confirm |
| +    // which of the two is intended. |
| +    value_name_present_ = ReadNextNameValue(); |
| +    if (!value_name_present_) |
| +      return false; |
| +    if (offset_ != source_end_) |
| +      return false; |
| +  } |
| + |
| +  // Switch to the new source and read ahead its first pair, if there is one. |
| +  source_ = source; |
| +  offset_ = source_.data(); |
| +  source_end_ = offset_ + source_.size(); |
| +  value_name_present_ = ReadNextNameValue(); |
| +  return true; |
| +} |
| + |
| +// static |
| +char FormDataParserMultipart::kTransitionToChar[] = { |
| + '\n', // For kLF. |
| + '\r', // For kCR. |
| + 0, // For kAscii. |
| + 0, // For kLwsp. |
| + 0, // For kDashBoundary. |
| + ':', // For kColonT. |
| + '-', // For kDash. |
| + 0, // For kAny. |
| +}; |
| + |
| +// static |
| +FormDataParserMultipart::Transition |
| + FormDataParserMultipart::kAvailableTransitions[] = { |
| + kDashBoundary, kCR, kAny, // For kStart. |
| + kLF, kAny, // For kCR1. |
| + kCR, kAny, // For kIgnorePreamble. |
| + kLwsp, kCR, kAny, // For kDB1. |
| + kLF, kAny, // For kCR2. |
| + kAscii, kCR, kAny, // For kPart. |
| + kAscii, kColonT, kAny, // For kName. |
| + kLF, kCR, kAny, // For kColonS. |
| + kCR, kAscii, kAny, // For kEnd1. |
| + kLF, kCR, kAscii, kAny, // For kEnd2. |
| + kLwsp, kCR, kAscii, kAny, // For kEnd3. |
| + kLF, kAny, // For kCR3. |
| + kDashBoundary, kCR, kAny, // For kPreData. |
| + kLF, kAny, // For kCR4. |
| + kCR, kAny, // For kData. |
| + kDashBoundary, kAny, // For kData2. |
| + kLwsp, kCR, kDash, kAny, // For kDB2. |
| + kDash, kAny, // For kD. |
| + kLwsp, kCR, kAny, // For kEnd. |
| + kLF, kAny, // For kCR5. |
| + kAny, // For kIgnoreEpilogue. |
| + kAny // For kError. |
| +}; |
| + |
| +// static |
| +FormDataParserMultipart::State FormDataParserMultipart::kNextState[] = { |

tkent
2012/08/27 07:09:17
kAvailableTransitions and kNextState should be mer
vabr (Chromium)
2012/08/29 19:57:07
Disappeared after rewriting.
|
| +  // Entry i is the target state of the transition kAvailableTransitions[i], |
| +  // so the rows below must stay aligned with that table. The trailing number |
| +  // in each row comment is the total count of entries up to that row. |
| +  kDB1, kCR1, kIgnorePreamble,   // For kStart; size so far: 03. |
| +  kStart, kIgnorePreamble,       // For kCR1; 05. |
| +  kCR1, kIgnorePreamble,         // For kIgnorePreamble; 07. |
| +  kDB1, kCR2, kError,            // For kDB1; 10. |
| +  kPart, kError,                 // For kCR2; 12. |
| +  kName, kCR3, kError,           // For kPart; 15. |
| +  kName, kColonS, kError,        // For kName; 18. |
| +  kEnd1, kEnd2, kColonS,         // For kColonS; 21. |
| +  kCR3, kName, kError,           // For kEnd1; 24. |
| +  kEnd3, kCR3, kName, kError,    // For kEnd2; 28. |
| +  kColonS, kCR3, kName, kError,  // For kEnd3; 32. |
| +  kPreData, kError,              // For kCR3; 34. |
| +  // A CR at the start of a part's body leads to kCR4, as in the documented |
| +  // automaton (PreData --CR--> CR4). Going to kCR3 instead would turn a body |
| +  // starting with a CR that is not followed by LF into an error. |
| +  kDB2, kCR4, kData,             // For kPreData; 37. |
| +  kData2, kData,                 // For kCR4; 39. |
| +  kCR4, kData,                   // For kData; 41. |
| +  kDB2, kCR4,                    // For kData2; 43. |
| +  kDB1, kCR2, kD, kError,        // For kDB2; 47. |
| +  kEnd, kError,                  // For kD; 49. |
| +  kEnd, kCR5, kError,            // For kEnd; 52. |
| +  kIgnoreEpilogue, kError,       // For kCR5; 54. |
| +  kIgnoreEpilogue,               // For kIgnoreEpilogue; 55. |
| +  kError                         // For kError; 56. |
| +}; |
| + |
| +// static |
| +size_t FormDataParserMultipart::kStateToTransition[] = { |
| + 0u, // For kStart |
| + 3u, // For kCR1 |
| + 5u, // For kIgnorePreamble |
| + 7u, // For kDB1 |
| + 10u, // For kCR2 |
| + 12u, // For kPart |
| + 15u, // For kName |
| + 18u, // For kColonS |
| + 21u, // For kEnd1 |
| + 24u, // For kEnd2 |
| + 28u, // For kEnd3 |
| + 32u, // For kCR3 |
| + 34u, // For kPreData |
| + 37u, // For kCR4 |
| + 39u, // For kData |
| + 41u, // For kData2 |
| + 43u, // For kDB2 |
| + 47u, // For kD |
| + 49u, // For kEnd |
| + 52u, // For kCR5 |
| + 54u, // For kIgnoreEpilogue |
| + 55u, // For kError |
| +}; |
| + |
| +bool FormDataParserMultipart::ReadNextNameValue() { |
| + // Runs the automaton over |source_| until one message part (its headers and |
| + // its body) has been read. Returns false if the parser is in the error state, |
| + // has no source, still holds an unfetched pair, or if any step fails. |
| + if (state_ == kError || source_.data() == NULL || |
| + next_name_.data() != NULL || next_value_.data() != NULL) |
| + return false; |
| + |
| + // Seek to the next part's headers. |
| + while (state_ != kPart) { |
| + if (!DoStep()) |
| + return false; |
| + } |

tkent
2012/08/27 07:09:17
We had better have a function DoStepsUntil(State).
vabr (Chromium)
2012/08/29 19:57:07
Disappeared after the rewrite.
|
| + // Read the headers one at a time until the part's body (kPreData) starts. |
| + while (state_ != kPreData) { |
| + // NOTE(review): when this loop repeats, the automaton is in kName, i.e., |
| + // the first character of the next header name has already been consumed, |
| + // so |header| then starts at the second character of that name -- confirm. |
| + const char* header = offset_; |
| + // Skip the header name, up to and including the ':'. |
| + while (state_ != kColonS) { |
| + if (!DoStep()) |
| + return false; |
| + } |
| + size_t header_length = 0u; |
| + // Read the header value. It ends when either the next header name (kName) |
| + // or the part's body (kPreData) begins. |
| + while (state_ != kPreData && state_ != kName) { |
| + if (state_ == kEnd1 || state_ == kEnd2 || state_ == kEnd3) { |
| + // A line break has just been read, so remember the length read so far. |
| + // The cast is safe, we know that offset only moves forward. |
| + header_length = static_cast<size_t>(offset_ - header); |
| + } |
| + if (!DoStep()) |
| + return false; |
| + } |
| + if (ParseHeader(base::StringPiece(header, header_length))) { |
| + // Found what we were looking for, just skip to the part's body. |
| + while (state_ != kPreData) { |
| + if (!DoStep()) |
| + return false; |
| + } |
| + } |
| + } |
| + |
| + // Read the body, until the dash-boundary ending it (kDB2) was consumed or |
| + // the source ran out. |
| + const char* body = offset_; |
| + size_t body_length = 0; |
| + while (state_ != kDB2 && offset_ != source_end_) { |
| + if (!DoStep()) |
| + return false; |
| + if (state_ == kCR4) { |
| + // We are in the middle of what might be the CRLF starting the part |
| + // separator (see the "delimiter" non-terminal from the grammar given |
| + // in the comment above the class declaration). The length recorded here |
| + // excludes the CR just read. The cast is safe, we know that offset only |
| + // moves forward and body was assigned at least 1 transition ago. |
| + body_length = static_cast<size_t>(offset_ - body - 1); |
| + } |
| + } |
| + // A non-empty body replaces whatever ParseHeader stored in |next_value_|. |
| + if (body_length > 0) |
| + next_value_.set(body, body_length); |
| + return true; |
| +} |
| + |
| +bool FormDataParserMultipart::DoStep() { |
| +  if (state_ == kError || offset_ == source_end_) |
| +    return false; |
| + |
| +  // Try the transitions listed for |state_| in order. The list always ends |
| +  // with kAny, which stops the loop. |
| +  size_t index = kStateToTransition[state_]; |
| +  for (; kAvailableTransitions[index] != kAny; ++index) { |
| +    const size_t consumed = LookUp(kAvailableTransitions[index]); |
| +    if (consumed == 0) |
| +      continue;  // This label does not match the input; try the next one. |
| +    offset_ += consumed; |
| +    state_ = kNextState[index]; |
| +    return true; |
| +  } |
| + |
| +  // Nothing else matched: take the default kAny transition, which consumes |
| +  // exactly one byte. |
| +  ++offset_; |
| +  state_ = kNextState[index]; |
| +  return true; |
| +} |
| + |
| +// Precondition: offset_ != source_end_ . The caller, DoStep(), checks this |
| +// once, so that it does not have to be re-checked on each of the possibly many |
| +// calls made here during a single step. |
| +size_t FormDataParserMultipart::LookUp(FormDataParserMultipart::Transition t) { |
| +  const char next = *offset_; |
| + |
| +  // Labels which stand for exactly one character are resolved via the table. |
| +  const char label = kTransitionToChar[t]; |
| +  if (label != 0) { |
| +    if (next == label) |
| +      return 1u; |
| +    return 0u; |
| +  } |
| + |
| +  // Character classes: still a single byte, but with several alternatives. |
| +  if (t == kAscii) { |
| +    const bool is_name_char = next >= 33 && next <= 126 && next != ':'; |
| +    return is_name_char ? 1u : 0u; |
| +  } |
| +  if (t == kLwsp) { |
| +    const bool is_lwsp = next == ' ' || next == '\t'; |
| +    return is_lwsp ? 1u : 0u; |
| +  } |
| + |
| +  // The only multi-byte label: the whole dash-boundary has to be present. |
| +  if (t == kDashBoundary) { |
| +    const size_t length = dash_boundary_.size(); |
| +    // The cast below is safe, |offset_| never moves past |source_end_|. |
| +    const size_t remaining = static_cast<size_t>(source_end_ - offset_); |
| +    if (remaining >= length && |
| +        memcmp(dash_boundary_.c_str(), offset_, length) == 0) { |
| +      return length; |
| +    } |
| +    return 0u; |
| +  } |
| + |
| +  // Callers are not expected to ask about kAny, but one byte is the answer. |
| +  if (t == kAny) |
| +    return 1u; |
| + |
| +  // All other labels have a table entry and were handled at the top. |
| +  NOTREACHED(); |
| +  return 0u; |
| +} |
| + |
| +bool FormDataParserMultipart::ParseHeader(const base::StringPiece& header) { |
| +  // Only a Content-Disposition header is of interest. The comparison covers |
| +  // the whole (case-sensitive) prefix and is skipped for headers too short to |
| +  // contain it; previously a misplaced parenthesis made memcmp() compare just |
| +  // the first byte. |
| +  static const char kContentDisposition[] = "Content-Disposition:"; |
| +  const size_t content_disposition_length = strlen(kContentDisposition); |
| +  if (header.size() < content_disposition_length || |
| +      memcmp(header.data(), kContentDisposition, |
| +             content_disposition_length) != 0) |

tkent
2012/08/27 07:09:17
strlen is not needed. The length of kContentDispo
vabr (Chromium)
2012/08/29 19:57:07
Correct. Although this particular instance and tho
|
| +    return false; |
| +  static const char kNameEquals[] = " name=\""; |
| +  static const char kFilenameEquals[] = " filename=\""; |
| + |
| +  // Mandatory part: find the name and set it as |next_name_|. |
| +  StringPiece::size_type field_offset = header.find(kNameEquals); |
| +  if (field_offset == StringPiece::npos) |
| +    return false; |
| +  field_offset += strlen(kNameEquals); |

tkent
2012/08/27 07:09:17
ditto.
|
| +  StringPiece::size_type field_end = header.find('"', field_offset); |
| +  if (field_end == StringPiece::npos) |
| +    return false; |
| +  next_name_.set(header.data() + field_offset, field_end - field_offset); |

tkent
2012/08/27 07:09:17
Need to decode the name value.
BTW, what's the ex
vabr (Chromium)
2012/08/29 19:57:07
Thanks very much for bringing this up!
On 2012/08
|
| + |
| +  // Optional part: find the filename and set it as |next_value_|. |
| +  field_offset = header.find(kFilenameEquals); |
| +  if (field_offset == StringPiece::npos) |
| +    return true;  // This was only optional |
| +  field_offset += strlen(kFilenameEquals); |

tkent
2012/08/27 07:09:17
ditto.
|
| +  field_end = header.find('"', field_offset); |
| +  if (field_end == StringPiece::npos) |
| +    return false;  // This is a malformed header. |
| +  next_value_.set(header.data() + field_offset, field_end - field_offset); |
| +  return true; |
| +} |
| + |
| +} // namespace extensions |