chrome/browser/extensions/api/web_request/form_data_parser.cc - Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest

Unified Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: One more static RE2 object made non-static Created 8 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « chrome/browser/extensions/api/web_request/form_data_parser.h ('k') | chrome/browser/extensions/api/web_request/form_data_parser_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/browser/extensions/api/web_request/form_data_parser.cc

diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.cc b/chrome/browser/extensions/api/web_request/form_data_parser.cc

new file mode 100644

index 0000000000000000000000000000000000000000..20cfc53d9033df5bd4bc4a9a4ed7c7c8161e8267

--- /dev/null

+++ b/chrome/browser/extensions/api/web_request/form_data_parser.cc

@@ -0,0 +1,513 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "chrome/browser/extensions/api/web_request/form_data_parser.h"

+#include <vector>

+#include "base/string_util.h"

+#include "base/values.h"

+#include "net/base/escape.h"

+#include "net/url_request/url_request.h"

+#include "third_party/re2/re2/re2.h"

+using base::DictionaryValue;

+using base::ListValue;

+using base::StringPiece;

+using re2::RE2;

+namespace extensions {

+// Parses URLencoded forms, see

+// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .

+class FormDataParserUrlEncoded : public FormDataParser {

+ public:

+ FormDataParserUrlEncoded();

+ virtual ~FormDataParserUrlEncoded();

+ // Implementation of FormDataParser.

+ virtual bool AllDataReadOK() OVERRIDE;

+ virtual bool GetNextNameValue(Result* result) OVERRIDE;

+ virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

+ private:

+ // The pattern to match a single name-value pair. Ideally this should be

+ // static, so that it is constructed only once, independently of how many

+ // parser instances we have. However, then we would run into exit-time

+ // destructor problems.

+ const RE2 pattern_;

+ static const size_t args_size_ = 2u; // Auxiliary constant for using RE2.

+ static const net::UnescapeRule::Type unescape_rules_; // Applied to both names and values.

+ re2::StringPiece source_; // The part of the input not yet consumed.

+ bool source_set_; // True once SetSource() has succeeded.

+ // Auxiliary store for using RE2.

+ std::string name_;

+ std::string value_;

+ const RE2::Arg arg_name_;

+ const RE2::Arg arg_value_;

+ const RE2::Arg* args_[args_size_];

+ DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);

+};

+// The following class, FormDataParserMultipart, parses forms encoded as

+// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart

+// encoding) and 5322 (MIME-headers).

+//

+// Implementation details

+//

+// The original grammar from RFC 2046 is this, "multipart-body" being the root

+// non-terminal:

+//

+// boundary := 0*69<bchars> bcharsnospace

+// bchars := bcharsnospace / " "

+// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","

+// / "-" / "." / "/" / ":" / "=" / "?"

+// dash-boundary := "--" boundary

+// multipart-body := [preamble CRLF]

+// dash-boundary transport-padding CRLF

+// body-part *encapsulation

+// close-delimiter transport-padding

+// [CRLF epilogue]

+// transport-padding := *LWSP-char

+// encapsulation := delimiter transport-padding CRLF body-part

+// delimiter := CRLF dash-boundary

+// close-delimiter := delimiter "--"

+// preamble := discard-text

+// epilogue := discard-text

+// discard-text := *(*text CRLF) *text

+// body-part := MIME-part-headers [CRLF *OCTET]

+// OCTET := <any 0-255 octet value>

+//

+// Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,

+// DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the

+// English alphabet, respectively.

+// The non-terminal "text" is presumably just any text, excluding line breaks.

+// The non-terminal "LWSP-char" is not directly defined in the original grammar

+// but it means "linear whitespace", which is a space or a horizontal tab.

+// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use

+// the syntax for "optional fields" from Section 3.6.8 of RFC 5322:

+//

+// MIME-part-headers := field-name ":" unstructured CRLF

+// field-name := 1*ftext

+// ftext := %d33-57 / ; Printable US-ASCII

+// %d59-126 ; characters not including ":".

+// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which

+// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and

+// "CRLF<horizontal tab>", which serve for "folding".

+//

+// The FormDataParserMultipart class reads the input source and tries to parse it

+// according to the grammar above, rooted at the "multipart-body" non-terminal.

+// This happens in stages:

+//

+// 1. The optional preamble and the initial dash-boundary with transport padding

+// and a CRLF are read and ignored.

+//

+// 2. Repeatedly each body part is read. The body parts can either serve to

+// upload a file, or just a string of bytes.

+// 2.a. The headers of that part are searched for the "content-disposition"

+// header, which contains the name of the value represented by that body

+// part. If the body-part is for file upload, that header also contains a

+// filename.

+// 2.b. The "*OCTET" part of the body part is then read and passed as the value

+// of the name-value pair for body parts representing a string of bytes.

+// For body parts for uploading a file the "*OCTET" part is just ignored

+// and the filename is used for value instead.

+//

+// 3. The final close-delimiter and epilogue are read and ignored.

+//

+// IMPORTANT NOTE

+// This parser supports multiple sources, i.e., SetSource can be called multiple

+// times if the input is spread over several byte blocks. However, the split

+// may only occur inside a body part, right after the trailing CRLF of headers.

+class FormDataParserMultipart : public FormDataParser {

+ public:

+ explicit FormDataParserMultipart(const std::string& boundary_separator);

+ virtual ~FormDataParserMultipart();

+ // Implementation of FormDataParser.

+ virtual bool AllDataReadOK() OVERRIDE;

+ virtual bool GetNextNameValue(Result* result) OVERRIDE;

+ virtual bool SetSource(const base::StringPiece& source) OVERRIDE;

+ private:

+ enum State {

+ STATE_INIT, // No input read yet.

+ STATE_READY, // Ready to call GetNextNameValue.

+ STATE_FINISHED, // Read the input until the end.

+ STATE_SUSPEND, // Waiting until a new |source_| is set.

+ STATE_ERROR // The input could not be parsed.

+ };

+ // Produces a regexp to match the |boundary| string.

+ static std::string GetDashBoundaryPattern(const std::string& boundary);

+ // Tests whether |input| has a prefix matching |pattern|.

+ static bool LookAhead(const RE2& pattern, const re2::StringPiece& input);

+ // If source_ starts with a header, consumes it. If the header is

+ // Content-Disposition, it also extracts |name| from "name=" and possibly

+ // |value| from "filename=" fields of that header. It only touches |name| or

+ // |value| if it finds the respective fields for them. Returns true if it

+ // consumed a header, false if it did not. Sets |value_assigned| to true if it

+ // has assigned to value, otherwise it sets it to false.

+ bool TryReadHeader(base::StringPiece* name,

+ base::StringPiece* value,

+ bool* value_assigned);

+ // Helper to GetNextNameValue. Attempts to read the data portion of a body

+ // part. If |value| is not NULL, it sets it to contain the data portion

+ // without the trailing CRLF. Returns true when the reading was successful.

+ bool GetNextNameValueContinue(base::StringPiece* value);

+ // Ideally these should be static, so that they are constructed only once,

+ // independently of how many parser instances we have. However, then we would

+ // run into exit-time destructor problems.

+ const RE2 transfer_padding_pattern_; // "transport-padding CRLF" of the grammar.

+ const RE2 crlf_pattern_;

+ const RE2 closing_pattern_; // The "--" ending a close-delimiter, plus padding.

+ const RE2 epilogue_pattern_;

+ const RE2 crlf_free_pattern_; // Text which contains no CRLF.

+ const RE2 preamble_pattern_;

+ const RE2 header_pattern_; // One complete header, including its CRLF.

+ const RE2 content_disposition_pattern_;

+ const RE2 name_pattern_; // Captures the quoted value of name="...".

+ const RE2 value_pattern_; // Captures the quoted value of filename="...".

+ const RE2 dash_boundary_pattern_;

+ // Because of initialisation dependency, |state_| needs to be declared after

+ // |dash_boundary_pattern_|.

+ State state_;

+ // The parsed message can be split into multiple sources which we read

+ // sequentially.

+ re2::StringPiece source_;

+ DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);

+};

+// Implementation of FormDataParser and FormDataParser::Result .

+FormDataParser::Result::Result() {}
+
+FormDataParser::Result::~Result() {}
+
+// Empties both the stored name and the stored value.
+void FormDataParser::Result::Reset() {
+  name_.erase();
+  value_.erase();
+}
+
+FormDataParser::~FormDataParser() {}

+// static

+scoped_ptr<FormDataParser> FormDataParser::Create(
+    const net::URLRequest* request) {
+  // Look up the Content-Type request header. When the request has none, NULL
+  // is passed on, which the string-based overload treats as URL-encoded data.
+  std::string value;
+  const bool found = request->extra_request_headers().GetHeader(
+      net::HttpRequestHeaders::kContentType, &value);
+  return Create(found ? &value : NULL);
+}

+// static

+// Chooses a parser based on the value of the Content-Type header.
+// |content_type_header| may be NULL, in which case the URL-encoded parser is
+// used. Returns a NULL scoped_ptr if the content type is not supported or if
+// a multipart header carries no usable boundary.
+scoped_ptr<FormDataParser> FormDataParser::Create(
+    const std::string* content_type_header) {
+  enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
+  ParserChoice choice = ERROR_CHOICE;
+  std::string boundary;
+
+  if (content_type_header == NULL) {
+    choice = URL_ENCODED;
+  } else {
+    // The media type is everything before the first ';'.
+    const std::string content_type(
+        content_type_header->substr(0, content_type_header->find(';')));
+
+    if (base::strcasecmp(
+            content_type.c_str(), "application/x-www-form-urlencoded") == 0) {
+      choice = URL_ENCODED;
+    } else if (base::strcasecmp(
+                   content_type.c_str(), "multipart/form-data") == 0) {
+      static const char kBoundaryString[] = "boundary=";
+      size_t offset = content_type_header->find(kBoundaryString);
+      if (offset == std::string::npos) {
+        // Malformed header.
+        return scoped_ptr<FormDataParser>();
+      }
+      offset += sizeof(kBoundaryString) - 1;
+      // std::string::substr() takes a length, not an end position, so the
+      // position of the terminating ';' has to be converted to a length.
+      // Otherwise parameters following the boundary would become part of it.
+      const size_t end = content_type_header->find(';', offset);
+      const size_t length =
+          (end == std::string::npos) ? std::string::npos : end - offset;
+      boundary = content_type_header->substr(offset, length);
+      if (!boundary.empty())
+        choice = MULTIPART;
+    }
+  }
+  // Other cases are unparseable, including when |content_type| is "text/plain".
+
+  switch (choice) {
+    case URL_ENCODED:
+      return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());
+    case MULTIPART:
+      return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));
+    default:  // In other words, case ERROR_CHOICE:
+      return scoped_ptr<FormDataParser>();
+  }
+}

+FormDataParser::FormDataParser() {}

+// Implementation of FormDataParserUrlEncoded.

+const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =
+    net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS |
+    net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
+
+// |pattern_| matches one "name=value" pair followed by an optional '&'.
+// |args_| makes RE2 store the two captured groups into |name_| and |value_|.
+FormDataParserUrlEncoded::FormDataParserUrlEncoded()
+    : pattern_("([^=]*)=([^&]*)&?"),
+      source_(NULL),
+      source_set_(false),
+      arg_name_(&name_),
+      arg_value_(&value_) {
+  args_[0] = &arg_name_;
+  args_[1] = &arg_value_;
+}
+
+FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}

+// Returns true if a source was set and it has been consumed completely.
+bool FormDataParserUrlEncoded::AllDataReadOK() {
+  // All OK means we read the whole source.
+  return source_set_ && source_.size() == 0;
+}

+// Consumes the next "name=value" pair from the front of |source_| and stores
+// the unescaped name and value in |result|. Returns false, leaving |result|
+// untouched, if no source was set or if the remaining input does not start
+// with a pair.
+bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
+  if (!source_set_)
+    return false;
+
+  // On success RE2 has filled |name_| and |value_| through |args_|.
+  const bool success = RE2::ConsumeN(&source_, pattern_, args_, args_size_);
+  if (success) {
+    result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));
+    result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));
+  }
+  return success;
+}

+// Sets the input to parse. Only the first call succeeds; later calls return
+// false and leave the parser unchanged.
+bool FormDataParserUrlEncoded::SetSource(const base::StringPiece& source) {
+  if (source_set_)
+    return false;  // We do not allow multiple sources for this parser.
+  source_.set(source.data(), source.size());
+  source_set_ = true;
+  return true;
+}

+// Implementation of FormDataParserMultipart.

+// static

+std::string FormDataParserMultipart::GetDashBoundaryPattern(

tkent 2012/09/04 07:43:53 nit: The function name doesn't represent what it d

vabr (Chromium) 2012/09/04 11:45:25 Good point. I chose "GetBoundaryPatternFromLiteral

+    const std::string& boundary) {
+  // The dash-boundary is "--" followed by the boundary taken literally.
+  // RE2::QuoteMeta() escapes every character which has a special meaning in
+  // a regular expression, so the result matches exactly that string, whatever
+  // characters |boundary| contains.
+  return RE2::QuoteMeta("--" + boundary);
+}

+// static

+bool FormDataParserMultipart::LookAhead(const RE2& pattern,
+                                        const re2::StringPiece& input) {
+  // The match is anchored at the start of |input| only and nothing is
+  // captured or consumed.
+  return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);
+}

+#define CONTENT_DISPOSITION "content-disposition:"
+
+// Compiles all patterns used by the parser. The parser starts in STATE_ERROR
+// if the pattern built from |boundary_separator| is not a valid regexp.
+FormDataParserMultipart::FormDataParserMultipart(
+    const std::string& boundary_separator)
+    // Optional spaces and tabs, then CRLF.
+    : transfer_padding_pattern_("[ \\t]*\\r\\n"),
+      crlf_pattern_("\\r\\n"),
+      // The "--" which turns a delimiter into a close-delimiter, with padding.
+      closing_pattern_("--[ \\t]*"),
+      // Either nothing, or CRLF followed by arbitrary bytes.
+      epilogue_pattern_("|\\r\\n(?s:.)*"),
+      crlf_free_pattern_("(?:[^\\r]|\\r+[^\\r\\n])*"),
+      // Exactly one character, line breaks included. The pattern must not be
+      // able to match the empty string: the preamble is skipped by consuming
+      // this pattern repeatedly, and an empty match would never advance.
+      preamble_pattern_("(?s:.)"),
+      // Field name, ':', then the header body with folded lines, then CRLF.
+      header_pattern_("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
+      content_disposition_pattern_("(?i:" CONTENT_DISPOSITION ")"),
+      name_pattern_("\\bname=\"([^\"]*)\""),
+      value_pattern_("\\bfilename=\"([^\"]*)\""),
+      dash_boundary_pattern_(GetDashBoundaryPattern(boundary_separator)),
+      state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR) {}
+
+FormDataParserMultipart::~FormDataParserMultipart() {}

+// Returns true once the close-delimiter and the epilogue have been read.
+bool FormDataParserMultipart::AllDataReadOK() {
+  return state_ == STATE_FINISHED;
+}

+bool FormDataParserMultipart::GetNextNameValueContinue(

tkent 2012/09/04 07:43:53 nit: Again, the function name isn't good. The name

vabr (Chromium) 2012/09/04 11:45:25 Changed to "FinishReadingPart", and the argument n

+    base::StringPiece* value) {
+  const char* data_start = source_.data();
+  // Skip the data line by line until a line starts with the dash-boundary.
+  // Running out of input before that is an error.
+  while (!LookAhead(dash_boundary_pattern_, source_)) {
+    if (!RE2::Consume(&source_, crlf_free_pattern_) ||
+        !RE2::Consume(&source_, crlf_pattern_)) {
+      state_ = STATE_ERROR;
+      return false;
+    }
+  }
+  if (value != NULL) {
+    if (source_.data() == data_start) {
+      // No data in this body part.
+      state_ = STATE_ERROR;
+      return false;
+    }
+    // Subtract 2u for the trailing "\r\n".
+    value->set(data_start, source_.data() - data_start - 2u);
+  }
+
+  // Finally, read the dash-boundary and either skip to the next body part, or
+  // finish reading the source. Both CHECKs repeat a match which the preceding
+  // LookAhead has already confirmed.
+  CHECK(RE2::Consume(&source_, dash_boundary_pattern_));
+  if (LookAhead(closing_pattern_, source_)) {
+    CHECK(RE2::Consume(&source_, closing_pattern_));
+    if (RE2::Consume(&source_, epilogue_pattern_))
+      state_ = STATE_FINISHED;
+    else
+      state_ = STATE_ERROR;
+  } else {  // Next body part ahead.
+    if (!RE2::Consume(&source_, transfer_padding_pattern_))
+      state_ = STATE_ERROR;
+  }
+  return state_ != STATE_ERROR;
+}

+// Reads one body part and stores its name and value in |result|. The value
+// is the filename if the Content-Disposition header has one, otherwise the
+// data of the body part. Returns false if the parser is not in STATE_READY,
+// if there is no input left, or if the body part is malformed.
+bool FormDataParserMultipart::GetNextNameValue(Result* result) {
+  if (source_.size() == 0 || state_ != STATE_READY)
+    return false;
+
+  // 1. Read body-part headers.
+  base::StringPiece name;
+  base::StringPiece value;
+  bool value_assigned = false;
+  bool value_assigned_temp;
+  while (TryReadHeader(&name, &value, &value_assigned_temp))
+    value_assigned |= value_assigned_temp;
+  // TryReadHeader reports a Content-Disposition header without a name only
+  // through |state_|, so the state has to be checked as well.
+  if (name.size() == 0 || state_ == STATE_ERROR) {
+    state_ = STATE_ERROR;
+    return false;
+  }
+
+  // 2. Read the trailing CRLF after headers.
+  if (!RE2::Consume(&source_, crlf_pattern_)) {
+    state_ = STATE_ERROR;
+    return false;
+  }
+
+  // 3. Read the data of this body part, i.e., everything until the first
+  // dash-boundary.
+  bool return_value = true;
+  if (value_assigned && source_.size() == 0)  // Wait for a new source?
+    state_ = STATE_SUSPEND;
+  else
+    return_value = GetNextNameValueContinue(value_assigned ? NULL : &value);
+
+  std::string unescaped_name = net::UnescapeURLComponent(
+      name.as_string(),
+      net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS);
+  result->set_name(unescaped_name);
+  result->set_value(value);
+
+  return return_value;
+}

+// Sets the next block of input. Fails if |source| has no data or if the
+// previous block has not been consumed completely. For the first block the
+// preamble and the first dash-boundary line are skipped. If the parser was
+// suspended in the middle of a body part, the rest of that part is skipped.
+// Returns false if the parser ends up in STATE_ERROR.
+bool FormDataParserMultipart::SetSource(const base::StringPiece& source) {
+  if (source.data() == NULL || source_.size() != 0)
+    return false;
+  source_.set(source.data(), source.size());
+
+  switch (state_) {
+    case STATE_INIT:
+      // Seek behind the preamble.
+      while (!LookAhead(dash_boundary_pattern_, source_)) {
+        const char* const before = source_.data();
+        // A match which consumes nothing would make this loop run forever,
+        // so it is treated as a failure as well.
+        if (!RE2::Consume(&source_, preamble_pattern_) ||
+            source_.data() == before) {
+          state_ = STATE_ERROR;
+          break;
+        }
+      }
+      // Read dash-boundary, transfer padding, and CRLF.
+      if (state_ != STATE_ERROR) {
+        if (!RE2::Consume(&source_, dash_boundary_pattern_) ||
+            !RE2::Consume(&source_, transfer_padding_pattern_))
+          state_ = STATE_ERROR;
+        else
+          state_ = STATE_READY;
+      }
+      break;
+    case STATE_READY:  // Nothing to do.
+      break;
+    case STATE_SUSPEND:
+      // GetNextNameValueContinue switches to STATE_FINISHED when it reads the
+      // close-delimiter. That state must be kept, so STATE_READY is entered
+      // only if the state has not been changed.
+      if (!GetNextNameValueContinue(NULL))
+        state_ = STATE_ERROR;
+      else if (state_ == STATE_SUSPEND)
+        state_ = STATE_READY;
+      break;
+    default:
+      state_ = STATE_ERROR;
+  }
+  return state_ != STATE_ERROR;
+}

+// Consumes one header from the front of |source_|, if there is one. For a
+// Content-Disposition header, |name| is set from its name="..." field and
+// |value| from its filename="..." field, if present. |value_assigned| tells
+// whether |value| has been set. A Content-Disposition header without a name
+// puts the parser into STATE_ERROR. Returns true if a header was consumed.
+bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,
+                                            base::StringPiece* value,
+                                            bool* value_assigned) {
+  static const size_t content_disposition_value_offset =
+      sizeof(CONTENT_DISPOSITION) - 1;
+#undef CONTENT_DISPOSITION
+
+  *value_assigned = false;
+  const char* header_start = source_.data();
+  if (!RE2::Consume(&source_, header_pattern_))
+    return false;
+  // (*) After this point we must return true, because we consumed one header.
+
+  // Subtract 2u for the trailing "\r\n".
+  re2::StringPiece header(header_start, source_.data() - header_start - 2u);
+
+  // Now we check whether |header| is a Content-Disposition header, and try
+  // to extract name and possibly value from it.
+  if (LookAhead(content_disposition_pattern_, header)) {
+    // groups[0] is the whole match, groups[1] the text between the quotes.
+    re2::StringPiece groups[2u];
+
+    if (!name_pattern_.Match(header,
+                             content_disposition_value_offset, header.size(),
+                             RE2::UNANCHORED, groups, 2)) {
+      state_ = STATE_ERROR;
+      return true;  // See (*) for why true.
+    }
+    name->set(groups[1].data(), groups[1].size());
+
+    if (!value_pattern_.Match(header,
+                              content_disposition_value_offset, header.size(),
+                              RE2::UNANCHORED, groups, 2))
+      return true;  // See (*) for why true.
+    value->set(groups[1].data(), groups[1].size());
+    *value_assigned = true;
+  }
+  return true;
+}

+} // namespace extensions