Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(359)

Unified Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: One more static RE2 object made non-static Created 8 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/extensions/api/web_request/form_data_parser.cc
diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.cc b/chrome/browser/extensions/api/web_request/form_data_parser.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20cfc53d9033df5bd4bc4a9a4ed7c7c8161e8267
--- /dev/null
+++ b/chrome/browser/extensions/api/web_request/form_data_parser.cc
@@ -0,0 +1,513 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/extensions/api/web_request/form_data_parser.h"
+
+#include <vector>
+
+#include "base/string_util.h"
+#include "base/values.h"
+#include "net/base/escape.h"
+#include "net/url_request/url_request.h"
+#include "third_party/re2/re2/re2.h"
+
+using base::DictionaryValue;
+using base::ListValue;
+using base::StringPiece;
+using re2::RE2;
+
+namespace extensions {
+
+// Parses URLencoded forms, see
+// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
+// The parser accepts a single source only: the name-value pairs are consumed
+// from the front of that source one at a time, and each name and value is
+// URL-unescaped before being handed to the caller.
+class FormDataParserUrlEncoded : public FormDataParser {
+ public:
+ FormDataParserUrlEncoded();
+ virtual ~FormDataParserUrlEncoded();
+
+ // Implementation of FormDataParser.
+ // AllDataReadOK: true once a source was set and it has been fully consumed.
+ // GetNextNameValue: consumes one "name=value" pair from the source.
+ // SetSource: succeeds only on the first call.
+ virtual bool AllDataReadOK() OVERRIDE;
+ virtual bool GetNextNameValue(Result* result) OVERRIDE;
+ virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
+
+ private:
+ // The pattern to match a single name-value pair. Ideally this would be
+ // static, so that it is constructed only once regardless of how many
+ // parser instances we have. However, a static RE2 would run into problems
+ // with exit-time destructors.
+ const RE2 pattern_;
+
+ static const size_t args_size_ = 2u; // Auxiliary constant for using RE2.
+ // Unescape rules applied to both the name and the value of each pair.
+ static const net::UnescapeRule::Type unescape_rules_;
+
+ // The not yet consumed rest of the input, and whether SetSource was called.
+ re2::StringPiece source_;
+ bool source_set_;
+
+ // Auxiliary store for using RE2: |name_| and |value_| receive the two
+ // captured groups through |arg_name_| and |arg_value_|, and |args_| is the
+ // pointer array handed to RE2::ConsumeN.
+ std::string name_;
+ std::string value_;
+ const RE2::Arg arg_name_;
+ const RE2::Arg arg_value_;
+ const RE2::Arg* args_[args_size_];
+
+ DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
+};
+
+// The following class, FormDataParserMultipart, parses forms encoded as
+// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
+// encoding) and 5322 (MIME-headers).
+//
+// Implementation details
+//
+// The original grammar from RFC 2046 is this, "multipart-body" being the root
+// non-terminal:
+//
+// boundary := 0*69<bchars> bcharsnospace
+// bchars := bcharsnospace / " "
+// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
+// / "-" / "." / "/" / ":" / "=" / "?"
+// dash-boundary := "--" boundary
+// multipart-body := [preamble CRLF]
+// dash-boundary transport-padding CRLF
+// body-part *encapsulation
+// close-delimiter transport-padding
+// [CRLF epilogue]
+// transport-padding := *LWSP-char
+// encapsulation := delimiter transport-padding CRLF body-part
+// delimiter := CRLF dash-boundary
+// close-delimiter := delimiter "--"
+// preamble := discard-text
+// epilogue := discard-text
+// discard-text := *(*text CRLF) *text
+// body-part := MIME-part-headers [CRLF *OCTET]
+// OCTET := <any 0-255 octet value>
+//
+// Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
+// DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
+// English alphabet, respectively.
+// The non-terminal "text" is presumably just any text, excluding line breaks.
+// The non-terminal "LWSP-char" is not directly defined in the original grammar
+// but it means "linear whitespace", which is a space or a horizontal tab.
+// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
+// the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
+//
+// MIME-part-headers := field-name ":" unstructured CRLF
+// field-name := 1*ftext
+// ftext := %d33-57 / ; Printable US-ASCII
+// %d59-126 ; characters not including ":".
+// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
+// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
+// "CRLF<horizontal tab>", which serve for "folding".
+//
+// The FormDataParserMultipart class reads the input source and tries to parse
+// it according to the grammar above, rooted at the "multipart-body"
+// non-terminal. This happens in stages:
+//
+// 1. The optional preamble and the initial dash-boundary with transport padding
+// and a CRLF are read and ignored.
+//
+// 2. Repeatedly each body part is read. The body parts can either serve to
+// upload a file, or just a string of bytes.
+// 2.a. The headers of that part are searched for the "content-disposition"
+// header, which contains the name of the value represented by that body
+// part. If the body-part is for file upload, that header also contains a
+// filename.
+// 2.b. The "*OCTET" part of the body part is then read and passed as the value
+// of the name-value pair for body parts representing a string of bytes.
+// For body parts for uploading a file the "*OCTET" part is just ignored
+// and the filename is used for value instead.
+//
+// 3. The final close-delimiter and epilogue are read and ignored.
+//
+// IMPORTANT NOTE
+// This parser supports multiple sources, i.e., SetSource can be called multiple
+// times if the input is spread over several byte blocks. However, the split
+// may only occur inside a body part, right after the trailing CRLF of headers.
+class FormDataParserMultipart : public FormDataParser {
+ public:
+ // |boundary_separator| is the boundary string without the leading "--".
+ explicit FormDataParserMultipart(const std::string& boundary_separator);
+ virtual ~FormDataParserMultipart();
+
+ // Implementation of FormDataParser.
+ virtual bool AllDataReadOK() OVERRIDE;
+ virtual bool GetNextNameValue(Result* result) OVERRIDE;
+ virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
+
+ private:
+ enum State {
+ STATE_INIT, // No input read yet.
+ STATE_READY, // Ready to call GetNextNameValue.
+ STATE_FINISHED, // Read the input until the end.
+ STATE_SUSPEND, // Waiting until a new |source_| is set.
+ STATE_ERROR
+ };
+
+ // Produces a regexp to match the |boundary| string.
+ static std::string GetDashBoundaryPattern(const std::string& boundary);
+
+ // Tests whether |input| has a prefix matching |pattern|.
+ static bool LookAhead(const RE2& pattern, const re2::StringPiece& input);
+
+ // If source_ starts with a header, consumes it. If the header is
+ // Content-Disposition, it also extracts |name| from "name=" and possibly
+ // |value| from "filename=" fields of that header. It only touches |name| or
+ // |value| if it finds the respective fields for them. Returns true if it
+ // consumed a header, false if it did not. Sets |value_assigned| to true if it
+ // has assigned to value, otherwise it sets it to false.
+ // A Content-Disposition header without a "name=" field is reported by
+ // setting |state_| to STATE_ERROR; the return value is still true then.
+ bool TryReadHeader(base::StringPiece* name,
+ base::StringPiece* value,
+ bool* value_assigned);
+
+ // Helper to GetNextNameValue. Attempts to read the data portion of a body
+ // part, up to and including the following dash-boundary. If |value| is not
+ // NULL, it is set to the data portion without its trailing CRLF. Returns
+ // true when the reading was successful; on failure |state_| is STATE_ERROR.
+ bool GetNextNameValueContinue(base::StringPiece* value);
+
+ // Ideally these would be static, so that they are constructed only once
+ // regardless of how many parser instances we have. However, static RE2
+ // objects would run into problems with exit-time destructors.
+ const RE2 transfer_padding_pattern_;
+ const RE2 crlf_pattern_;
+ const RE2 closing_pattern_;
+ const RE2 epilogue_pattern_;
+ const RE2 crlf_free_pattern_;
+ const RE2 preamble_pattern_;
+ const RE2 header_pattern_;
+ const RE2 content_disposition_pattern_;
+ const RE2 name_pattern_;
+ const RE2 value_pattern_;
+
+ // Matches "--" followed by the literal boundary of this particular message.
+ const RE2 dash_boundary_pattern_;
+
+ // Because of initialisation dependency, |state_| needs to be declared after
+ // |dash_boundary_pattern_|.
+ State state_;
+
+ // The parsed message can be split into multiple sources which we read
+ // sequentially.
+ re2::StringPiece source_;
+
+ DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
+};
+
+// Implementation of FormDataParser and FormDataParser::Result .
+
+// Construction and destruction of Result are defined out of line; both bodies
+// are empty.
+FormDataParser::Result::Result() {}
+FormDataParser::Result::~Result() {}
+
+// Empties both the stored name and the stored value.
+void FormDataParser::Result::Reset() {
+  name_.clear();
+  value_.clear();
+}
+
+// Out-of-line destructor of the parser base class; the body is empty.
+FormDataParser::~FormDataParser() {}
+
+// Creates a parser for |request| based on its Content-Type request header.
+// When the header is missing, the string-based overload is called with NULL.
+// static
+scoped_ptr<FormDataParser> FormDataParser::Create(
+    const net::URLRequest* request) {
+  std::string content_type;
+  // Typed pointer rather than a bare NULL, so that the call below resolves to
+  // the overload taking a header string.
+  const std::string* header = NULL;
+  if (request->extra_request_headers().GetHeader(
+          net::HttpRequestHeaders::kContentType, &content_type)) {
+    header = &content_type;
+  }
+  return Create(header);
+}
+
+// Creates the parser matching the raw Content-Type header value.
+//  * NULL header or "application/x-www-form-urlencoded": URL-encoded parser.
+//  * "multipart/form-data" with a non-empty "boundary=" parameter: multipart
+//    parser for that boundary.
+//  * Anything else (including "text/plain" or a multipart type without a
+//    usable boundary): a NULL scoped_ptr.
+// static
+scoped_ptr<FormDataParser> FormDataParser::Create(
+    const std::string* content_type_header) {
+  enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
+  ParserChoice choice = ERROR_CHOICE;
+  std::string boundary;
+
+  if (content_type_header == NULL) {
+    choice = URL_ENCODED;
+  } else {
+    // The media type is everything in front of the first ';'.
+    const std::string content_type(
+        content_type_header->substr(0, content_type_header->find(';')));
+
+    if (base::strcasecmp(
+            content_type.c_str(), "application/x-www-form-urlencoded") == 0) {
+      choice = URL_ENCODED;
+    } else if (base::strcasecmp(
+                   content_type.c_str(), "multipart/form-data") == 0) {
+      static const char kBoundaryString[] = "boundary=";
+      size_t offset = content_type_header->find(kBoundaryString);
+      if (offset == std::string::npos) {
+        // Malformed header.
+        return scoped_ptr<FormDataParser>();
+      }
+      offset += sizeof(kBoundaryString) - 1;
+      // The boundary extends up to the next ';' or to the end of the header.
+      // std::string::substr takes a length, not an end position, so the
+      // position of the ';' has to be converted before it is passed on.
+      const size_t end = content_type_header->find(';', offset);
+      const size_t length =
+          (end == std::string::npos) ? std::string::npos : end - offset;
+      boundary = content_type_header->substr(offset, length);
+      if (!boundary.empty())
+        choice = MULTIPART;
+    }
+  }
+  // Other cases are unparseable, including when |content_type| is "text/plain".
+
+  switch (choice) {
+    case URL_ENCODED:
+      return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());
+    case MULTIPART:
+      return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));
+    default:  // In other words, case ERROR_CHOICE:
+      return scoped_ptr<FormDataParser>();
+  }
+}
+
+// Out-of-line constructor of the parser base class; the body is empty.
+FormDataParser::FormDataParser() {}
+
+// Implementation of FormDataParserUrlEncoded.
+
+// The combination of unescape rules which GetNextNameValue passes to
+// net::UnescapeURLComponent for both the name and the value of a pair.
+const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =
+ net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS |
+ net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
+
+// Sets up the pair pattern and wires the RE2 argument array: the first
+// captured group (everything before '=') is stored into |name_|, the second
+// (everything up to the next '&') into |value_|. A trailing '&' is consumed
+// together with the pair when present. No source is set yet.
+FormDataParserUrlEncoded::FormDataParserUrlEncoded()
+ : pattern_("([^=]*)=([^&]*)&?"),
+ source_(NULL),
+ source_set_(false),
+ arg_name_(&name_),
+ arg_value_(&value_) {
+ args_[0] = &arg_name_;
+ args_[1] = &arg_value_;
+}
+
+// Out-of-line destructor; the body is empty.
+FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}
+
+// Reports success only if a source was supplied and every byte of it has been
+// consumed by GetNextNameValue.
+bool FormDataParserUrlEncoded::AllDataReadOK() {
+  if (!source_set_)
+    return false;
+  return source_.empty();
+}
+
+// Consumes the next "name=value" pair from the front of the source and stores
+// its URL-unescaped name and value in |result|. Returns false, leaving
+// |result| untouched, if no source was set or no pair could be matched.
+bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
+  if (!source_set_)
+    return false;
+
+  // On a match the raw captures land in |name_| and |value_| via |args_|.
+  if (!RE2::ConsumeN(&source_, pattern_, args_, args_size_))
+    return false;
+
+  result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));
+  result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));
+  return true;
+}
+
+// Installs |source| as the input to parse. This parser does not allow
+// multiple sources, so only the first call succeeds; later calls return false
+// and leave the current source in place.
+bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {
+  const bool first_source = !source_set_;
+  if (first_source) {
+    source_.set(source.data(), source.size());
+    source_set_ = true;
+  }
+  return first_source;
+}
+
+// Implementation of FormDataParserMultipart.
+
+// Builds a regular expression matching "--" followed by the literal text of
+// |boundary|. The text is wrapped in \Q...\E so that it is taken verbatim.
+// Because a "\E" inside |boundary| would end the quoting early, each such
+// occurrence is followed by an escaped "\\E" (which matches a literal "\E")
+// and a fresh "\Q" that reopens the quoting for the rest of the boundary.
+// static
+std::string FormDataParserMultipart::GetDashBoundaryPattern(
tkent 2012/09/04 07:43:53 nit: The function name doesn't represent what it d
vabr (Chromium) 2012/09/04 11:45:25 Good point. I chose "GetBoundaryPatternFromLiteral
+ const std::string& boundary) {
+ // Used both as a regexp finding "\E" in |boundary| and as the text appended
+ // to the output to match a literal "\E".
+ static const char escape_closing_quote[] = "\\\\E";
+ // The following should be ideally static, to spare execution time. See the
+ // comment at const RE2 data members of FormDataParserMultipart. Note that
+ // this method is only called once for each instance of
+ // FormDataParserMultipart, so we keep |unquote_pattern| local even though
+ // non-static.
+ const RE2 unquote_pattern(escape_closing_quote);
+#define OPEN_QUOTE "\\Q"
+ static const char opening_quote[] = OPEN_QUOTE;
+ static const char closing_quote[] = "\\E";
+
+ std::string output(OPEN_QUOTE "--"); // Let us start with the "--".
+#undef OPEN_QUOTE
+ re2::StringPiece seek_unquote(boundary);
+ const char* copy_start = boundary.data();
+ size_t copy_length = boundary.size();
+ // Each iteration handles one "\E" found in |boundary|; FindAndConsume leaves
+ // |seek_unquote| pointing right behind it.
+ while (RE2::FindAndConsume(&seek_unquote, unquote_pattern)) {
+ // Copy the chunk up to and including the "\E"; in the output that "\E"
+ // closes the current quoted section.
+ copy_length = seek_unquote.data() - copy_start;
+ output.append(copy_start, copy_length);
+ // Match the literal "\E" itself, then resume quoting.
+ output.append(escape_closing_quote);
+ output.append(opening_quote);
+ copy_start = seek_unquote.data();
+ }
+ // Copy whatever follows the last "\E" (or the whole boundary if there was
+ // none) and close the quoting.
+ copy_length = (boundary.data() + boundary.size()) - copy_start;
+ output.append(copy_start, copy_length);
+ output.append(closing_quote);
+ return output;
+}
+
+// Returns true iff |pattern| matches some prefix of |input|. Nothing is
+// consumed and no sub-matches are extracted.
+// static
+bool FormDataParserMultipart::LookAhead(const RE2& pattern,
+                                        const re2::StringPiece& input) {
+  const int kNoCaptures = 0;
+  const bool prefix_matches = pattern.Match(
+      input, 0, input.size(), RE2::ANCHOR_START, NULL, kNoCaptures);
+  return prefix_matches;
+}
+
+#define CONTENT_DISPOSITION "content-disposition:"
+// Compiles all patterns used while parsing. The only message-specific one is
+// |dash_boundary_pattern_|, derived from |boundary_separator|; if it fails to
+// compile, the parser starts out in STATE_ERROR.
+FormDataParserMultipart::FormDataParserMultipart(
+    const std::string& boundary_separator)
+    : transfer_padding_pattern_("[ \\t]*\\r\\n"),
+      crlf_pattern_("\\r\\n"),
+      closing_pattern_("--[ \\t]*"),
+      epilogue_pattern_("|\\r\\n(?s:.)*"),
+      crlf_free_pattern_("(?:[^\\r]|\\r+[^\\r\\n])*"),
+      // Exactly one character per match, newlines included. SetSource skips
+      // the preamble by consuming this pattern repeatedly until it sees the
+      // dash-boundary, so the pattern must never match the empty string: a
+      // non-greedy ".*?" always matches zero characters at the start of the
+      // input, which would make that loop spin forever.
+      preamble_pattern_("(?s:.)"),
+      header_pattern_("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
+      content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"),
+      name_pattern_("\\bname=\"([^\"]*)\""),
+      value_pattern_("\\bfilename=\"([^\"]*)\""),
+      dash_boundary_pattern_(GetDashBoundaryPattern(boundary_separator)),
+      state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR) {}
+
+// Out-of-line destructor; the body is empty.
+FormDataParserMultipart::~FormDataParserMultipart() {}
+
+// The input counts as completely and correctly read only once the parser has
+// reached STATE_FINISHED.
+bool FormDataParserMultipart::AllDataReadOK() {
+  const bool finished = (state_ == STATE_FINISHED);
+  return finished;
+}
+
+// Reads the data of the current body part, i.e., everything up to the next
+// dash-boundary, and then the boundary itself. If |value| is not NULL it is
+// pointed at the data without its trailing CRLF. After the boundary either the
+// close-delimiter and epilogue follow (|state_| becomes STATE_FINISHED) or the
+// transport padding of the next body part is consumed (|state_| is left as it
+// was). Returns false and sets |state_| to STATE_ERROR on malformed input.
+bool FormDataParserMultipart::GetNextNameValueContinue(
tkent 2012/09/04 07:43:53 nit: Again, the function name isn't good. The name
vabr (Chromium) 2012/09/04 11:45:25 Changed to "FinishReadingPart", and the argument n
+ base::StringPiece* value) {
+ const char* data_start = source_.data();
+ // Advance line by line (CRLF-free text followed by a CRLF) until the rest of
+ // the source starts with the dash-boundary.
+ // NOTE(review): for data containing "\r\r\n", |crlf_free_pattern_| appears to
+ // stop in front of the first '\r', after which |crlf_pattern_| cannot match
+ // and the part is rejected -- confirm whether this is intended.
+ while (!LookAhead(dash_boundary_pattern_, source_)) {
+ if (!RE2::Consume(&source_, crlf_free_pattern_) ||
+ !RE2::Consume(&source_, crlf_pattern_)) {
+ state_ = STATE_ERROR;
+ return false;
+ }
+ }
+ if (value != NULL) {
+ // Not even a CRLF was read in front of the boundary, so there is nothing
+ // to subtract the CRLF length from below.
+ if (source_.data() == data_start) {
+ // No data in this body part.
+ state_ = STATE_ERROR;
+ return false;
+ }
+ // Subtract 2u for the trailing "\r\n".
+ value->set(data_start, source_.data() - data_start - 2u);
+ }
+
+ // Finally, read the dash-boundary and either skip to the next body part, or
+ // finish reading the source.
+ // The loop above only ends once the look-ahead for the boundary succeeded,
+ // so consuming the boundary here is expected to succeed as well.
+ CHECK(RE2::Consume(&source_, dash_boundary_pattern_));
+ if (LookAhead(closing_pattern_, source_)) {
+ CHECK(RE2::Consume(&source_, closing_pattern_));
+ if (RE2::Consume(&source_, epilogue_pattern_))
+ state_ = STATE_FINISHED;
+ else
+ state_ = STATE_ERROR;
+ } else { // Next body part ahead.
+ if (!RE2::Consume(&source_, transfer_padding_pattern_))
+ state_ = STATE_ERROR;
+ }
+ return state_ != STATE_ERROR;
+}
+
+// Reads one body part from the source and stores its name and value in
+// |result|. The name comes from the "name=" field of the Content-Disposition
+// header. The value is the "filename=" field of that header if present, and
+// the data of the body part otherwise. Returns false if the parser is not in
+// STATE_READY, the source is empty, or the body part is malformed; in the
+// last case |state_| is STATE_ERROR afterwards.
+bool FormDataParserMultipart::GetNextNameValue(Result* result) {
+  if (source_.size() == 0 || state_ != STATE_READY)
+    return false;
+
+  // 1. Read body-part headers.
+  base::StringPiece name;
+  base::StringPiece value;
+  bool value_assigned = false;
+  bool value_assigned_temp = false;
+  while (TryReadHeader(&name, &value, &value_assigned_temp))
+    value_assigned |= value_assigned_temp;
+  // TryReadHeader signals a Content-Disposition header without a name only
+  // through |state_|, while still returning true. Check for that here, so
+  // that the error is neither ignored nor later overwritten by
+  // STATE_FINISHED.
+  if (state_ == STATE_ERROR)
+    return false;
+  if (name.size() == 0) {
+    state_ = STATE_ERROR;
+    return false;
+  }
+
+  // 2. Read the trailing CRLF after headers.
+  if (!RE2::Consume(&source_, crlf_pattern_)) {
+    state_ = STATE_ERROR;
+    return false;
+  }
+
+  // 3. Read the data of this body part, i.e., everything until the first
+  // dash-boundary.
+  bool return_value = true;
+  if (value_assigned && source_.size() == 0) {
+    // The value (a file name) is already known and the data of this part is
+    // in a source still to come; SetSource skips over it later.
+    state_ = STATE_SUSPEND;
+  } else {
+    // The data is only captured when the headers did not provide the value.
+    return_value = GetNextNameValueContinue(value_assigned ? NULL : &value);
+  }
+
+  std::string unescaped_name = net::UnescapeURLComponent(
+      name.as_string(),
+      net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS);
+  result->set_name(unescaped_name);
+  result->set_value(value);
+
+  return return_value;
+}
+
+// Supplies the next block of input. Fails if |source| has no data pointer or
+// if the previous source has not been consumed completely. What happens next
+// depends on the state:
+//  * STATE_INIT: skips the preamble and reads the first dash-boundary line.
+//  * STATE_READY: nothing, the block is read by GetNextNameValue.
+//  * STATE_SUSPEND: skips the rest of the body part whose headers ended the
+//    previous source.
+//  * any other state: error.
+// Returns false iff the parser is in STATE_ERROR afterwards.
+bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {
+  if (source.data() == NULL || source_.size() != 0)
+    return false;
+  source_.set(source.data(), source.size());
+
+  switch (state_) {
+    case STATE_INIT:
+      // Seek behind the preamble.
+      while (!LookAhead(dash_boundary_pattern_, source_)) {
+        // Besides a failed match, a successful match which consumed nothing
+        // must end the loop as well; it would otherwise repeat forever.
+        const char* const before = source_.data();
+        if (!RE2::Consume(&source_, preamble_pattern_) ||
+            source_.data() == before) {
+          state_ = STATE_ERROR;
+          break;
+        }
+      }
+      // Read dash-boundary, transfer padding, and CRLF.
+      if (state_ != STATE_ERROR) {
+        if (!RE2::Consume(&source_, dash_boundary_pattern_) ||
+            !RE2::Consume(&source_, transfer_padding_pattern_))
+          state_ = STATE_ERROR;
+        else
+          state_ = STATE_READY;
+      }
+      break;
+    case STATE_READY:  // Nothing to do.
+      break;
+    case STATE_SUSPEND:
+      state_ = GetNextNameValueContinue(NULL) ? STATE_READY : STATE_ERROR;
+      break;
+    default:
+      state_ = STATE_ERROR;
+  }
+  return state_ != STATE_ERROR;
+}
+
+// Consumes one header from the front of |source_|, if there is one, and
+// returns whether it did. For a Content-Disposition header, |name| is set
+// from the "name=" field and, when a "filename=" field exists, |value| is set
+// from it and |*value_assigned| becomes true. A Content-Disposition header
+// lacking "name=" puts the parser into STATE_ERROR.
+bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,
+                                            base::StringPiece* value,
+                                            bool* value_assigned) {
+  // Where the fields start within a Content-Disposition header, i.e., the
+  // length of the header name including the colon.
+  static const size_t fields_offset = sizeof(CONTENT_DISPOSITION) - 1;
+#undef CONTENT_DISPOSITION
+
+  *value_assigned = false;
+  const char* const header_begin = source_.data();
+  if (!RE2::Consume(&source_, header_pattern_))
+    return false;
+  // A header has been consumed, so every exit below this line returns true.
+
+  // The header text without the two bytes of the terminating "\r\n".
+  re2::StringPiece header(header_begin,
+                          source_.data() - header_begin - 2u);
+
+  // Headers other than Content-Disposition carry nothing of interest.
+  if (!LookAhead(content_disposition_pattern_, header))
+    return true;
+
+  // captures[0] receives the whole match, captures[1] the quoted content.
+  re2::StringPiece captures[2u];
+
+  const bool has_name = name_pattern_.Match(
+      header, fields_offset, header.size(), RE2::UNANCHORED, captures, 2);
+  if (!has_name) {
+    state_ = STATE_ERROR;
+    return true;
+  }
+  name->set(captures[1].data(), captures[1].size());
+
+  const bool has_filename = value_pattern_.Match(
+      header, fields_offset, header.size(), RE2::UNANCHORED, captures, 2);
+  if (has_filename) {
+    value->set(captures[1].data(), captures[1].size());
+    *value_assigned = true;
+  }
+  return true;
+}
+
+} // namespace extensions

Powered by Google App Engine
This is Rietveld 408576698