Chromium Code Reviews| Index: chrome/browser/extensions/api/web_request/form_data_parser.cc |
| diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.cc b/chrome/browser/extensions/api/web_request/form_data_parser.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..35ed4eb94be4b64469bcd136411ea7ebc40d24ca |
| --- /dev/null |
| +++ b/chrome/browser/extensions/api/web_request/form_data_parser.cc |
| @@ -0,0 +1,505 @@ |
| +// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "chrome/browser/extensions/api/web_request/form_data_parser.h" |
| + |
| +#include <vector> |
| + |
| +#include "base/string_util.h" |
| +#include "base/values.h" |
| +#include "net/base/escape.h" |
| +#include "net/url_request/url_request.h" |
| +#include "third_party/re2/re2/re2.h" |
| + |
| +using base::DictionaryValue; |
| +using base::ListValue; |
| +using base::StringPiece; |
| +using re2::RE2; |
| + |
| +namespace extensions { |
| + |
| +// Parses URLencoded forms, see |
| +// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . |
| +class FormDataParserUrlEncoded : public FormDataParser { |
| + public: |
| +  FormDataParserUrlEncoded(); |
| +  virtual ~FormDataParserUrlEncoded(); |
| + |
| +  // Implementation of FormDataParser. |
| +  virtual bool AllDataReadOK() OVERRIDE; |
| +  virtual bool GetNextNameValue(Result* result) OVERRIDE; |
| +  virtual bool SetSource(const base::StringPiece& source) OVERRIDE; |
| + |
| + private: |
| +  // The pattern to match a single name-value pair. Ideally this would be |
| +  // static, so that it is constructed only once, independently of how many |
| +  // parser instances we have. However, then we would run into exit-time |
| +  // destructor problems. |
| +  const RE2 pattern_; |
| + |
| +  static const size_t args_size_ = 2u; // Auxiliary constant for using RE2. |
| +  static const net::UnescapeRule::Type unescape_rules_; |
| + |
| +  re2::StringPiece source_; |
| +  bool source_set_; |
| + |
| +  // Auxiliary store for RE2: |args_| points at the Args wrapping the strings. |
| +  std::string name_; |
| +  std::string value_; |
| +  const RE2::Arg arg_name_; |
| +  const RE2::Arg arg_value_; |
| +  const RE2::Arg* args_[args_size_]; |
| + |
| +  DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); |
| +}; |
| + |
| +// The following class, FormDataParserMultipart, parses forms encoded as |
| +// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart |
| +// encoding) and 5322 (MIME-headers). |
| +// |
| +// Implementation details |
| +// |
| +// The original grammar from RFC 2046 is this, "multipart-body" being the root |
| +// non-terminal: |
| +// |
| +// boundary := 0*69<bchars> bcharsnospace |
| +// bchars := bcharsnospace / " " |
| +// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," |
| +// / "-" / "." / "/" / ":" / "=" / "?" |
| +// dash-boundary := "--" boundary |
| +// multipart-body := [preamble CRLF] |
| +// dash-boundary transport-padding CRLF |
| +// body-part *encapsulation |
| +// close-delimiter transport-padding |
| +// [CRLF epilogue] |
| +// transport-padding := *LWSP-char |
| +// encapsulation := delimiter transport-padding CRLF body-part |
| +// delimiter := CRLF dash-boundary |
| +// close-delimiter := delimiter "--" |
| +// preamble := discard-text |
| +// epilogue := discard-text |
| +// discard-text := *(*text CRLF) *text |
| +// body-part := MIME-part-headers [CRLF *OCTET] |
| +// OCTET := <any 0-255 octet value> |
| +// |
| +// Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF, |
| +// DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the |
| +// English alphabet, respectively. |
| +// The non-terminal "text" is presumably just any text, excluding line breaks. |
| +// The non-terminal "LWSP-char" is not directly defined in the original grammar |
| +// but it means "linear whitespace", which is a space or a horizontal tab. |
| +// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use |
| +// the syntax for "optional fields" from Section 3.6.8 of RFC 5322: |
| +// |
| +// MIME-part-headers := field-name ":" unstructured CRLF |
| +// field-name := 1*ftext |
| +// ftext := %d33-57 / ; Printable US-ASCII |
| +// %d59-126 ; characters not including ":". |
| +// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which |
| +// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and |
| +// "CRLF<horizontal tab>", which serve for "folding". |
| +// |
| +// The FormDataParserMultipart class reads the input source and tries to parse |
| +// it according to the grammar above, rooted at the "multipart-body" |
| +// non-terminal. This happens in stages: |
| +// |
| +// 1. The optional preamble and the initial dash-boundary with transport padding |
| +// and a CRLF are read and ignored. |
| +// |
| +// 2. Repeatedly each body part is read. The body parts can either serve to |
| +// upload a file, or just a string of bytes. |
| +// 2.a. The headers of that part are searched for the "content-disposition" |
| +// header, which contains the name of the value represented by that body |
| +// part. If the body-part is for file upload, that header also contains a |
| +// filename. |
| +// 2.b. The "*OCTET" part of the body part is then read and passed as the value |
| +// of the name-value pair for body parts representing a string of bytes. |
| +// For body parts for uploading a file the "*OCTET" part is just ignored |
| +// and the filename is used for value instead. |
| +// |
| +// 3. The final close-delimiter and epilogue are read and ignored. |
| +// |
| +// IMPORTANT NOTE |
| +// This parser supports multiple sources, i.e., SetSource can be called multiple |
| +// times if the input is spread over several byte blocks. However, the split |
| +// may only occur inside a body part, right after the trailing CRLF of headers. |
| +class FormDataParserMultipart : public FormDataParser { |
| + public: |
| +  explicit FormDataParserMultipart(const std::string& boundary_separator); |
| +  virtual ~FormDataParserMultipart(); |
| + |
| +  // Implementation of FormDataParser. |
| +  virtual bool AllDataReadOK() OVERRIDE; |
| +  virtual bool GetNextNameValue(Result* result) OVERRIDE; |
| +  virtual bool SetSource(const base::StringPiece& source) OVERRIDE; |
| + |
| + private: |
| +  enum State { |
| +    STATE_INIT,      // No input read yet. |
| +    STATE_READY,     // Ready to call GetNextNameValue. |
| +    STATE_FINISHED,  // Read the input until the end. |
| +    STATE_SUSPEND,   // Waiting until a new |source_| is set. |
| +    STATE_ERROR |
| +  }; |
| + |
| +  // Produces a regexp to match the |boundary| string. |
| +  static std::string GetDashBoundaryPattern(const std::string& boundary); |
| + |
| +  // Tests whether |input| has a prefix matching |pattern|. |
| +  static bool LookAhead(const RE2& pattern, const re2::StringPiece& input); |
| + |
| +  // If source_ starts with a header, consumes it. If the header is |
| +  // Content-Disposition, it also extracts |name| from "name=" and possibly |
| +  // |value| from "filename=" fields of that header; a Content-Disposition |
| +  // header without "name=" sets |state_| to STATE_ERROR. |name| and |value| |
| +  // are only touched if their fields are found. Returns true iff a header was |
| +  // consumed. Sets |value_assigned| to whether |value| has been assigned. |
| +  bool TryReadHeader(base::StringPiece* name, |
| +                     base::StringPiece* value, |
| +                     bool* value_assigned); |
| + |
| +  // Helper to GetNextNameValue. Reads the data portion of a body part and the |
| +  // dash-boundary after it. If |value| is not NULL, it is set to the data |
| +  // portion. Returns true on success; on failure |state_| is STATE_ERROR. |
| +  bool GetNextNameValueContinue(base::StringPiece* value); |
| + |
| +  // Ideally these would be static, so that they are constructed only once, |
| +  // independently of how many parser instances we have. However, then we would |
| +  // run into exit-time destructor problems. |
| +  const RE2 transfer_padding_pattern_; |
| +  const RE2 crlf_pattern_; |
| +  const RE2 closing_pattern_; |
| +  const RE2 epilogue_pattern_; |
| +  const RE2 crlf_free_pattern_; |
| +  const RE2 preamble_pattern_; |
| +  const RE2 header_pattern_; |
| +  const RE2 content_disposition_pattern_; |
| +  const RE2 name_pattern_; |
| +  const RE2 value_pattern_; |
| + |
| +  const RE2 dash_boundary_pattern_; |
| + |
| +  // Because of initialisation dependency, |state_| needs to be declared after |
| +  // |dash_boundary_pattern_|. |
| +  State state_; |
| + |
| +  // The parsed message can be split into multiple sources which we read |
| +  // sequentially. |
| +  re2::StringPiece source_; |
| + |
| +  DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); |
| +}; |
| + |
| +// Implementation of FormDataParser and FormDataParser::Result . |
| + |
| +// Trivial constructor and destructor. |
| +FormDataParser::Result::Result() {} |
| +FormDataParser::Result::~Result() {} |
| + |
| +// Empties both the stored value and the stored name. |
| +void FormDataParser::Result::Reset() { |
| +  value_.clear(); |
| +  name_.clear(); |
| +} |
| + |
| +FormDataParser::~FormDataParser() {} |
| + |
| +// static |
| +// Picks a parser based on the Content-Type header of |request|, if it has one. |
| +scoped_ptr<FormDataParser> FormDataParser::Create( |
| +    const net::URLRequest* request) { |
| +  std::string content_type; |
| +  // A typed pointer is needed here: a bare NULL would be ambiguous between the |
| +  // two Create() overloads. |
| +  const std::string* header = NULL; |
| +  if (request->extra_request_headers().GetHeader( |
| +          net::HttpRequestHeaders::kContentType, &content_type)) { |
| +    header = &content_type; |
| +  } |
| +  return Create(header); |
| +} |
| + |
| +// Chooses a parser from the value of the Content-Type header. A missing |
| +// header (NULL) is handled like "application/x-www-form-urlencoded". Returns |
| +// an empty scoped_ptr for every other media type, and for |
| +// "multipart/form-data" without a non-empty "boundary=" parameter. |
| +// static |
| +scoped_ptr<FormDataParser> FormDataParser::Create( |
| +    const std::string* content_type_header) { |
| +  enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE}; |
| +  ParserChoice choice = ERROR_CHOICE; |
| +  std::string boundary; |
| + |
| +  if (content_type_header == NULL) { |
| +    choice = URL_ENCODED; |
| +  } else { |
| +    // The media type is everything in front of the first ';'. |
| +    const std::string content_type( |
| +        content_type_header->substr(0, content_type_header->find(';'))); |
| + |
| +    if (base::strcasecmp( |
| +        content_type.c_str(), "application/x-www-form-urlencoded") == 0) { |
| +      choice = URL_ENCODED; |
| +    } else if (base::strcasecmp( |
| +        content_type.c_str(), "multipart/form-data") == 0) { |
| +      static const char kBoundaryString[] = "boundary="; |
| +      size_t offset = content_type_header->find(kBoundaryString); |
| +      if (offset == std::string::npos) { |
| +        // Malformed header. |
| +        return scoped_ptr<FormDataParser>(); |
| +      } |
| +      offset += sizeof(kBoundaryString) - 1; |
| +      // The boundary ends at the next ';' or at the end of the header. |
| +      // substr() takes a length, not an end position, so the position of the |
| +      // ';' has to be converted before it is passed on. |
| +      const size_t boundary_end = content_type_header->find(';', offset); |
| +      const size_t boundary_length = (boundary_end == std::string::npos) ? |
| +          std::string::npos : boundary_end - offset; |
| +      boundary = content_type_header->substr(offset, boundary_length); |
| +      if (!boundary.empty()) |
| +        choice = MULTIPART; |
| +    } |
| +  } |
| +  // Other cases are unparseable, including when |content_type| is "text/plain". |
| + |
| +  switch (choice) { |
| +    case URL_ENCODED: |
| +      return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded()); |
| +    case MULTIPART: |
| +      return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary)); |
| +    default:  // In other words, case ERROR_CHOICE: |
| +      return scoped_ptr<FormDataParser>(); |
| +  } |
| +} |
| + |
| +FormDataParser::FormDataParser() {} |
| + |
| +// Implementation of FormDataParserUrlEncoded. |
| + |
| +// The unescaping applied to every name and value this parser returns. |
| +const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ = |
| +    net::UnescapeRule::REPLACE_PLUS_WITH_SPACE | net::UnescapeRule::SPACES | |
| +    net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::URL_SPECIAL_CHARS; |
| + |
| +// Compiles the name-value pattern and wires |args_| up to the RE2::Arg |
| +// wrappers around |name_| and |value_|. The parser starts without a source. |
| +FormDataParserUrlEncoded::FormDataParserUrlEncoded() |
| +    : pattern_("([^=]*)=([^&]*)&?"), |
| +      source_(), |
| +      source_set_(false), |
| +      arg_name_(&name_), |
| +      arg_value_(&value_) { |
| +  args_[1] = &arg_value_; |
| +  args_[0] = &arg_name_; |
| +} |
| + |
| +FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {} |
| + |
| +// Returns true once a source has been set and has been consumed completely. |
| +bool FormDataParserUrlEncoded::AllDataReadOK() { |
| +  if (!source_set_) |
| +    return false; |
| +  return source_.size() == 0; |
| +} |
| + |
| +// Consumes the next name-value pair from |source_| and stores its unescaped |
| +// name and value in |result|. Returns false, leaving |result| alone, if no |
| +// source was set or the remaining input does not start with a pair. |
| +bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) { |
| +  if (!source_set_) |
| +    return false; |
| + |
| +  // On a match RE2 fills |name_| and |value_| through |args_|. |
| +  if (!RE2::ConsumeN(&source_, pattern_, args_, args_size_)) |
| +    return false; |
| + |
| +  result->set_name(net::UnescapeURLComponent(name_, unescape_rules_)); |
| +  result->set_value(net::UnescapeURLComponent(value_, unescape_rules_)); |
| +  return true; |
| +} |
| + |
| +// Points the parser at |source|. Only the first call succeeds, because this |
| +// parser does not allow multiple sources. |
| +bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) { |
| +  const bool first_source = !source_set_; |
| +  if (first_source) { |
| +    source_set_ = true; |
| +    source_.set(source.data(), source.size()); |
| +  } |
| +  return first_source; |
| +} |
| + |
| +// Implementation of FormDataParserMultipart. |
| + |
| +// static |
| +// Builds a regular expression matching "--" followed by |boundary|, taken |
| +// literally. The text is wrapped in \Q...\E quoting. A "\E" occurring inside |
| +// |boundary| would end the quoting early, so after each occurrence an escaped |
| +// "\E" (to match it literally) and a new "\Q" are inserted. |
| +std::string FormDataParserMultipart::GetDashBoundaryPattern( |
| +    const std::string& boundary) { |
| +  static const char escape_closing_quote[] = "\\\\E"; |
| +  // Deliberately not static: RE2 has a non-trivial destructor, and a static |
| +  // instance would bring the exit-time destructor problem this file avoids. |
| +  const RE2 unquote_pattern(escape_closing_quote); |

vabr (Chromium)
2012/08/30 12:26:48
Note to myself -- make this a non-static data memb
|
| +  static const char opening_quote[] = "\\Q"; |
| +  static const char closing_quote[] = "\\E"; |
| + |
| +  std::string output("\\Q--");  // Let us start with the "--". |
| +  re2::StringPiece seek_unquote(boundary); |
| +  const char* copy_start = boundary.data(); |
| +  size_t copy_length = boundary.size(); |
| +  while (RE2::FindAndConsume(&seek_unquote, unquote_pattern)) { |
| +    // Copy everything up to and including the "\E" just found; in the output |
| +    // that "\E" closes the current quoted section. |
| +    copy_length = seek_unquote.data() - copy_start; |
| +    output.append(copy_start, copy_length); |
| +    // Match the "\E" itself literally, then resume quoting. |
| +    output.append(escape_closing_quote); |
| +    output.append(opening_quote); |
| +    copy_start = seek_unquote.data(); |
| +  } |
| +  // Copy the remainder of |boundary| and close the last quoted section. |
| +  copy_length = (boundary.data() + boundary.size()) - copy_start; |
| +  output.append(copy_start, copy_length); |
| +  output.append(closing_quote); |
| +  return output; |
| +} |
| + |
| +// Reports whether |input| begins with a match of |pattern|. Nothing is |
| +// consumed and no submatches are extracted. |
| +// static |
| +bool FormDataParserMultipart::LookAhead(const RE2& pattern, |
| +                                        const re2::StringPiece& input) { |
| +  const bool has_matching_prefix = pattern.Match( |
| +      input, 0, input.size(), RE2::ANCHOR_START, NULL, 0); |
| +  return has_matching_prefix; |
| +} |
| + |
| +#define CONTENT_DISPOSITION "content-disposition:" |
| +// Compiles all patterns used by the parser. The dash-boundary pattern is |
| +// derived from |boundary_separator| by GetDashBoundaryPattern; if RE2 rejects |
| +// it, the parser starts out in STATE_ERROR instead of STATE_INIT. |
| +FormDataParserMultipart::FormDataParserMultipart( |
| +    const std::string& boundary_separator) |
| +    : transfer_padding_pattern_("[ \\t]*\\r\\n"), |
| +      crlf_pattern_("\\r\\n"), |
| +      closing_pattern_("--[ \\t]*"), |
| +      epilogue_pattern_("|\\r\\n(?s:.)*"), |
| +      crlf_free_pattern_("(?:[^\\r]|\\r+[^\\r\\n])*"), |
| +      preamble_pattern_(".*?"), |
| +      header_pattern_("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"), |
| +      content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"), |
| +      name_pattern_("\\bname=\"([^\"]*)\""), |
| +      value_pattern_("\\bfilename=\"([^\"]*)\""), |
| +      dash_boundary_pattern_(GetDashBoundaryPattern(boundary_separator)), |
| +      state_(STATE_INIT) { |
| +  if (!dash_boundary_pattern_.ok()) |
| +    state_ = STATE_ERROR; |
| +} |
| + |
| +FormDataParserMultipart::~FormDataParserMultipart() {} |
| + |
| +// Returns true only when the parser has reached STATE_FINISHED. |
| +bool FormDataParserMultipart::AllDataReadOK() { |
| +  const bool finished = (STATE_FINISHED == state_); |
| +  return finished; |
| +} |
| + |
| +// Reads the data portion of the current body part, i.e., everything up to the |
| +// CRLF in front of the next dash-boundary, and then the boundary itself. If |
| +// |value| is not NULL, it is set to the data portion without that CRLF. After |
| +// the boundary the parser either finishes (close-delimiter and epilogue) or |
| +// is positioned at the next body part. Returns false, with |state_| set to |
| +// STATE_ERROR, if the input does not have this shape. |
| +bool FormDataParserMultipart::GetNextNameValueContinue( |
| +    base::StringPiece* value) { |
| +  const char* data_start = source_.data(); |
| +  // Advance line by line until a line starts with the dash-boundary. |
| +  while (!LookAhead(dash_boundary_pattern_, source_)) { |
| +    if (!RE2::Consume(&source_, crlf_free_pattern_)) { |
| +      state_ = STATE_ERROR; |
| +      return false; |
| +    } |
| +    // |crlf_free_pattern_| also stops in front of a run of several CRs |
| +    // followed by LF ("\r\r\n"). Only the last CR of such a run belongs to the |
| +    // CRLF; the ones before it are data and are skipped here. |
| +    while (source_.size() >= 2 && source_[0] == '\r' && source_[1] == '\r') |
| +      source_.remove_prefix(1); |
| +    if (!RE2::Consume(&source_, crlf_pattern_)) { |
| +      state_ = STATE_ERROR; |
| +      return false; |
| +    } |
| +  } |
| +  if (value != NULL) { |
| +    if (source_.data() == data_start) { |
| +      // No data in this body part. |
| +      state_ = STATE_ERROR; |
| +      return false; |
| +    } |
| +    // Subtract 2u for the trailing "\r\n". |
| +    value->set(data_start, source_.data() - data_start - 2u); |
| +  } |
| + |
| +  // Finally, read the dash-boundary and either skip to the next body part, or |
| +  // finish reading the source. |
| +  CHECK(RE2::Consume(&source_, dash_boundary_pattern_)); |
| +  if (LookAhead(closing_pattern_, source_)) { |
| +    CHECK(RE2::Consume(&source_, closing_pattern_)); |
| +    if (RE2::Consume(&source_, epilogue_pattern_)) |
| +      state_ = STATE_FINISHED; |
| +    else |
| +      state_ = STATE_ERROR; |
| +  } else {  // Next body part ahead. |
| +    if (!RE2::Consume(&source_, transfer_padding_pattern_)) |
| +      state_ = STATE_ERROR; |
| +  } |
| +  return state_ != STATE_ERROR; |
| +} |
| + |
| +// Parses the next body part and stores its name and value in |result|. The |
| +// value is the "filename=" field of the Content-Disposition header if there |
| +// is one, otherwise the data portion of the body part. Returns false if the |
| +// parser is not in STATE_READY, if there is no input left, or if the body |
| +// part is malformed; in the last case |state_| becomes STATE_ERROR. |
| +bool FormDataParserMultipart::GetNextNameValue(Result* result) { |
| +  if (source_.size() == 0 || state_ != STATE_READY) |
| +    return false; |
| + |
| +  // 1. Read body-part headers. |
| +  base::StringPiece name; |
| +  base::StringPiece value; |
| +  bool value_assigned = false; |
| +  bool value_assigned_temp; |
| +  while (TryReadHeader(&name, &value, &value_assigned_temp)) |
| +    value_assigned |= value_assigned_temp; |
| +  // TryReadHeader reports a malformed Content-Disposition header only through |
| +  // |state_|. Stop here, so that the error cannot be overwritten below when |
| +  // the rest of the input happens to parse. |
| +  if (state_ == STATE_ERROR) |
| +    return false; |
| +  if (name.size() == 0) { |
| +    state_ = STATE_ERROR; |
| +    return false; |
| +  } |
| + |
| +  // 2. Read the trailing CRLF after headers. |
| +  if (!RE2::Consume(&source_, crlf_pattern_)) { |
| +    state_ = STATE_ERROR; |
| +    return false; |
| +  } |
| + |
| +  // 3. Read the data of this body part, i.e., everything until the first |
| +  // dash-boundary. |
| +  bool return_value = true; |
| +  if (value_assigned && source_.size() == 0)  // Wait for a new source? |
| +    state_ = STATE_SUSPEND; |
| +  else |
| +    return_value = GetNextNameValueContinue(value_assigned ? NULL : &value); |
| + |
| +  result->set_name(name); |
| +  result->set_value(value); |
| + |
| +  return return_value; |
| +} |
| + |
| +// Sets the next block of input. Fails if |source| has no data or if the |
| +// previous source has not been read completely. In STATE_INIT this also skips |
| +// the preamble and the first dash-boundary line; in STATE_SUSPEND it resumes |
| +// reading the body part that was interrupted by the end of the previous |
| +// source. Returns false if the parser ends up in STATE_ERROR. |
| +bool FormDataParserMultipart::SetSource(const base::StringPiece& source) { |
| +  if (source.data() == NULL || source_.size() != 0) |
| +    return false; |
| +  source_.set(source.data(), source.size()); |
| + |
| +  switch (state_) { |
| +    case STATE_INIT: |
| +      // Seek behind the preamble by skipping whole lines until one starts with |
| +      // the dash-boundary. |preamble_pattern_| is not used for this: it also |
| +      // matches the empty string, so consuming it would never advance |
| +      // |source_| and the loop would not terminate. Here every iteration |
| +      // consumes at least a CRLF, and running out of lines is an error. |
| +      while (!LookAhead(dash_boundary_pattern_, source_)) { |
| +        if (!RE2::Consume(&source_, crlf_free_pattern_) || |
| +            !RE2::Consume(&source_, crlf_pattern_)) { |
| +          state_ = STATE_ERROR; |
| +          break; |
| +        } |
| +      } |
| +      // Read dash-boundary, transfer padding, and CRLF. |
| +      if (state_ != STATE_ERROR) { |
| +        if (!RE2::Consume(&source_, dash_boundary_pattern_) || |
| +            !RE2::Consume(&source_, transfer_padding_pattern_)) |
| +          state_ = STATE_ERROR; |
| +        else |
| +          state_ = STATE_READY; |
| +      } |
| +      break; |
| +    case STATE_READY:  // Nothing to do. |
| +      break; |
| +    case STATE_SUSPEND: |
| +      state_ = GetNextNameValueContinue(NULL) ? STATE_READY : STATE_ERROR; |
| +      break; |
| +    default: |
| +      state_ = STATE_ERROR; |
| +  } |
| +  return state_ != STATE_ERROR; |
| +} |
| + |
| +// Consumes one header from the front of |source_|, if there is one, and |
| +// returns whether it did. For a Content-Disposition header, |name| is set |
| +// from its "name=" field and, if present, |value| from its "filename=" field. |
| +// |value_assigned| tells whether |value| was set. A Content-Disposition |
| +// header without "name=" puts the parser into STATE_ERROR. |
| +bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name, |
| +                                            base::StringPiece* value, |
| +                                            bool* value_assigned) { |
| +  // Length of the header name; the fields are searched for behind it. |
| +  static const size_t content_disposition_value_offset = |
| +      sizeof(CONTENT_DISPOSITION) - 1; |
| +#undef CONTENT_DISPOSITION |
| + |
| +  *value_assigned = false; |
| +  const char* header_start = source_.data(); |
| +  if (!RE2::Consume(&source_, header_pattern_)) |
| +    return false; |
| +  // A header has been consumed, so every exit below returns true. |
| + |
| +  // The header without its trailing "\r\n" (hence the 2u). |
| +  const re2::StringPiece header(header_start, |
| +                                source_.data() - header_start - 2u); |
| + |
| +  // Headers other than Content-Disposition carry nothing of interest. |
| +  if (!LookAhead(content_disposition_pattern_, header)) |
| +    return true; |
| + |
| +  re2::StringPiece groups[2u]; |
| +  const bool has_name = name_pattern_.Match( |
| +      header, content_disposition_value_offset, header.size(), |
| +      RE2::UNANCHORED, groups, 2); |
| +  if (!has_name) { |
| +    state_ = STATE_ERROR; |
| +    return true; |
| +  } |
| +  name->set(groups[1].data(), groups[1].size()); |
| + |
| +  // The filename is optional; when present it is used as the value. |
| +  const bool has_filename = value_pattern_.Match( |
| +      header, content_disposition_value_offset, header.size(), |
| +      RE2::UNANCHORED, groups, 2); |
| +  if (has_filename) { |
| +    value->set(groups[1].data(), groups[1].size()); |
| +    *value_assigned = true; |
| +  } |
| +  return true; |
| +} |
| + |
| +} // namespace extensions |