chrome/browser/extensions/api/web_request/form_data_parser.cc - Issue 584163004: Move web_request directory to //extensions.

Unified Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 584163004: Move web_request directory to //extensions. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Rebase again Created 6 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « chrome/browser/extensions/api/web_request/form_data_parser.h ('k') | chrome/browser/extensions/api/web_request/form_data_parser_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/browser/extensions/api/web_request/form_data_parser.cc

diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.cc b/chrome/browser/extensions/api/web_request/form_data_parser.cc

deleted file mode 100644

index 57156976b78bd9554076b7fa3d0a7ba5f6e155a9..0000000000000000000000000000000000000000

--- a/chrome/browser/extensions/api/web_request/form_data_parser.cc

+++ /dev/null

@@ -1,595 +0,0 @@

-// Use of this source code is governed by a BSD-style license that can be

-// found in the LICENSE file.

-#include "chrome/browser/extensions/api/web_request/form_data_parser.h"

-#include <vector>

-#include "base/lazy_instance.h"

-#include "base/logging.h"

-#include "base/macros.h"

-#include "base/strings/string_util.h"

-#include "base/values.h"

-#include "net/base/escape.h"

-#include "net/url_request/url_request.h"

-#include "third_party/re2/re2/re2.h"

-using base::DictionaryValue;

-using base::ListValue;

-using base::StringPiece;

-using re2::RE2;

-namespace extensions {

-namespace {

-const char kContentDisposition[] = "content-disposition:";

-const size_t kContentDispositionLength = arraysize(kContentDisposition) - 1;

-// kCharacterPattern is an allowed character in a URL encoding. Definition is

-// from RFC 1738, end of section 2.2.

-const char kCharacterPattern[] =

- "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))";

-const char kEscapeClosingQuote[] = "\\\\E";

-// A wrapper struct for static RE2 objects to be held as LazyInstance.

-struct Patterns {

- Patterns();

- ~Patterns();

- const RE2 transfer_padding_pattern;

- const RE2 crlf_pattern;

- const RE2 closing_pattern;

- const RE2 epilogue_pattern;

- const RE2 crlf_free_pattern;

- const RE2 preamble_pattern;

- const RE2 header_pattern;

- const RE2 content_disposition_pattern;

- const RE2 name_pattern;

- const RE2 value_pattern;

- const RE2 unquote_pattern;

- const RE2 url_encoded_pattern;

-};

-Patterns::Patterns()

- : transfer_padding_pattern("[ \\t]*\\r\\n"),

- crlf_pattern("\\r\\n"),

- closing_pattern("--[ \\t]*"),

- epilogue_pattern("|\\r\\n(?s:.)*"),

- crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),

- preamble_pattern(".+?"),

- header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),

- content_disposition_pattern(std::string("(?i:") + kContentDisposition +

- ")"),

- name_pattern("\\bname=\"([^\"]*)\""),

- value_pattern("\\bfilename=\"([^\"]*)\""),

- unquote_pattern(kEscapeClosingQuote),

- url_encoded_pattern(std::string("(") + kCharacterPattern + "*)=(" +

- kCharacterPattern +

- "*)") {

-Patterns::~Patterns() {}

-base::LazyInstance<Patterns>::Leaky g_patterns = LAZY_INSTANCE_INITIALIZER;

-} // namespace

-// Parses URLencoded forms, see

-// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .

-class FormDataParserUrlEncoded : public FormDataParser {

- public:

- FormDataParserUrlEncoded();

- virtual ~FormDataParserUrlEncoded();

- // Implementation of FormDataParser.

- virtual bool AllDataReadOK() OVERRIDE;

- virtual bool GetNextNameValue(Result* result) OVERRIDE;

- virtual bool SetSource(base::StringPiece source) OVERRIDE;

- private:

- // Returns the pattern to match a single name-value pair. This could be even

- // static, but then we would have to spend more code on initializing the

- // cached pointer to g_patterns.Get().

- const RE2& pattern() const {

- return patterns_->url_encoded_pattern;

- }

- // Auxiliary constant for using RE2. Number of arguments for parsing

- // name-value pairs (one for name, one for value).

- static const size_t args_size_ = 2u;

- static const net::UnescapeRule::Type unescape_rules_;

- re2::StringPiece source_;

- bool source_set_;

- bool source_malformed_;

- // Auxiliary store for using RE2.

- std::string name_;

- std::string value_;

- const RE2::Arg arg_name_;

- const RE2::Arg arg_value_;

- const RE2::Arg* args_[args_size_];

- // Caching the pointer to g_patterns.Get().

- const Patterns* patterns_;

- DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);

-};

-// The following class, FormDataParserMultipart, parses forms encoded as

-// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart

-// encoding) and 5322 (MIME-headers).

-//

-// Implementation details

-//

-// The original grammar from RFC 2046 is this, "multipart-body" being the root

-// non-terminal:

-//

-// boundary := 0*69<bchars> bcharsnospace

-// bchars := bcharsnospace / " "

-// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","

-// / "-" / "." / "/" / ":" / "=" / "?"

-// dash-boundary := "--" boundary

-// multipart-body := [preamble CRLF]

-// dash-boundary transport-padding CRLF

-// body-part *encapsulation

-// close-delimiter transport-padding

-// [CRLF epilogue]

-// transport-padding := *LWSP-char

-// encapsulation := delimiter transport-padding CRLF body-part

-// delimiter := CRLF dash-boundary

-// close-delimiter := delimiter "--"

-// preamble := discard-text

-// epilogue := discard-text

-// discard-text := *(*text CRLF) *text

-// body-part := MIME-part-headers [CRLF *OCTET]

-// OCTET := <any 0-255 octet value>

-//

-// Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,

-// DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the

-// English alphabet, respectively.

-// The non-terminal "text" is presumably just any text, excluding line breaks.

-// The non-terminal "LWSP-char" is not directly defined in the original grammar

-// but it means "linear whitespace", which is a space or a horizontal tab.

-// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use

-// the syntax for "optional fields" from Section 3.6.8 of RFC 5322:

-//

-// MIME-part-headers := field-name ":" unstructured CRLF

-// field-name := 1*ftext

-// ftext := %d33-57 / ; Printable US-ASCII

-// %d59-126 ; characters not including ":".

-// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which

-// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and

-// "CRLF<horizontal tab>", which serve for "folding".

-//

-// The FormDataParserMultipart class reads the input source and tries to parse it

-// according to the grammar above, rooted at the "multipart-body" non-terminal.

-// This happens in stages:

-//

-// 1. The optional preamble and the initial dash-boundary with transport padding

-// and a CRLF are read and ignored.

-//

-// 2. Repeatedly each body part is read. The body parts can either serve to

-// upload a file, or just a string of bytes.

-// 2.a. The headers of that part are searched for the "content-disposition"

-// header, which contains the name of the value represented by that body

-// part. If the body-part is for file upload, that header also contains a

-// filename.

-// 2.b. The "*OCTET" part of the body part is then read and passed as the value

-// of the name-value pair for body parts representing a string of bytes.

-// For body parts for uploading a file the "*OCTET" part is just ignored

-// and the filename is used for value instead.

-//

-// 3. The final close-delimiter and epilogue are read and ignored.

-//

-// IMPORTANT NOTE

-// This parser supports sources split into multiple chunks. Therefore SetSource

-// can be called multiple times if the source is spread over several chunks.

-// However, the split may only occur inside a body part, right after the

-// trailing CRLF of headers.

-class FormDataParserMultipart : public FormDataParser {

- public:

- explicit FormDataParserMultipart(const std::string& boundary_separator);

- virtual ~FormDataParserMultipart();

- // Implementation of FormDataParser.

- virtual bool AllDataReadOK() OVERRIDE;

- virtual bool GetNextNameValue(Result* result) OVERRIDE;

- virtual bool SetSource(base::StringPiece source) OVERRIDE;

- private:

- enum State {

- STATE_INIT, // No input read yet.

- STATE_READY, // Ready to call GetNextNameValue.

- STATE_FINISHED, // Read the input until the end.

- STATE_SUSPEND, // Waiting until a new |source_| is set.

- STATE_ERROR

- };

- // Produces a regexp to match the string "--" + |literal|. The idea is to

- // represent "--" + |literal| as a "quoted pattern", a verbatim copy enclosed

- // in "\\Q" and "\\E". The only catch is to watch out for occurrences of "\\E"

- // inside |literal|. Those must be excluded from the quote and the backslash

- // doubly escaped. For example, for literal == "abc\\Edef" the result is

- // "\\Q--abc\\E\\\\E\\Qdef\\E".

- static std::string CreateBoundaryPatternFromLiteral(

- const std::string& literal);

- // Tests whether |input| has a prefix matching |pattern|.

- static bool StartsWithPattern(const re2::StringPiece& input,

- const RE2& pattern);

- // If |source_| starts with a header, seeks |source_| beyond the header. If

- // the header is Content-Disposition, extracts |name| from "name=" and

- // possibly |value| from "filename=" fields of that header. Only if the

- // "name" or "filename" fields are found, then |name| or |value| are touched.

- // Returns true iff |source_| is seeked forward. Sets |value_assigned|

- // to true iff |value| has been assigned to.

- bool TryReadHeader(base::StringPiece* name,

- base::StringPiece* value,

- bool* value_assigned);

- // Helper to GetNextNameValue. Expects that the input starts with a data

- // portion of a body part. An attempt is made to read the input until the end

- // of that body part. If |data| is not NULL, it is set to contain the data

- // portion. Returns true iff the reading was successful.

- bool FinishReadingPart(base::StringPiece* data);

- // These methods could be even static, but then we would have to spend more

- // code on initializing the cached pointer to g_patterns.Get().

- const RE2& transfer_padding_pattern() const {

- return patterns_->transfer_padding_pattern;

- }

- const RE2& crlf_pattern() const {

- return patterns_->crlf_pattern;

- }

- const RE2& closing_pattern() const {

- return patterns_->closing_pattern;

- }

- const RE2& epilogue_pattern() const {

- return patterns_->epilogue_pattern;

- }

- const RE2& crlf_free_pattern() const {

- return patterns_->crlf_free_pattern;

- }

- const RE2& preamble_pattern() const {

- return patterns_->preamble_pattern;

- }

- const RE2& header_pattern() const {

- return patterns_->header_pattern;

- }

- const RE2& content_disposition_pattern() const {

- return patterns_->content_disposition_pattern;

- }

- const RE2& name_pattern() const {

- return patterns_->name_pattern;

- }

- const RE2& value_pattern() const {

- return patterns_->value_pattern;

- }

- // However, this is used in a static method so it needs to be static.

- static const RE2& unquote_pattern() {

- return g_patterns.Get().unquote_pattern; // No caching g_patterns here.

- }

- const RE2 dash_boundary_pattern_;

- // Because of initialisation dependency, |state_| needs to be declared after

- // |dash_boundary_pattern_|.

- State state_;

- // The parsed message can be split into multiple sources which we read

- // sequentially.

- re2::StringPiece source_;

- // Caching the pointer to g_patterns.Get().

- const Patterns* patterns_;

- DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);

-};

-FormDataParser::Result::Result() {}

-FormDataParser::Result::~Result() {}

-FormDataParser::~FormDataParser() {}

-// static

-scoped_ptr<FormDataParser> FormDataParser::Create(

- const net::URLRequest& request) {

- std::string value;

- const bool found = request.extra_request_headers().GetHeader(

- net::HttpRequestHeaders::kContentType, &value);

- return CreateFromContentTypeHeader(found ? &value : NULL);

-// static
-// Chooses a parser implementation from the value of a Content-Type header.
-//
-// |content_type_header| may be NULL, in which case a URL-encoded parser is
-// returned. Otherwise the media type (the text before the first ';') is
-// compared case-insensitively:
-//  * "application/x-www-form-urlencoded" yields a URL-encoded parser;
-//  * "multipart/form-data" yields a multipart parser, provided a non-empty
-//    "boundary=" parameter is present.
-// In every other case a null scoped_ptr is returned.
-scoped_ptr<FormDataParser> FormDataParser::CreateFromContentTypeHeader(
-    const std::string* content_type_header) {
-  enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
-  ParserChoice choice = ERROR_CHOICE;
-  std::string boundary;
-  if (content_type_header == NULL) {
-    choice = URL_ENCODED;
-  } else {
-    // The media type ends at the first ';'; find() returning npos makes
-    // substr() take the whole header, which is what we want in that case.
-    const std::string content_type(
-        content_type_header->substr(0, content_type_header->find(';')));
-    if (base::strcasecmp(
-            content_type.c_str(), "application/x-www-form-urlencoded") == 0) {
-      choice = URL_ENCODED;
-    } else if (base::strcasecmp(
-                   content_type.c_str(), "multipart/form-data") == 0) {
-      static const char kBoundaryString[] = "boundary=";
-      size_t offset = content_type_header->find(kBoundaryString);
-      if (offset == std::string::npos) {
-        // Malformed header.
-        return scoped_ptr<FormDataParser>();
-      }
-      offset += sizeof(kBoundaryString) - 1;
-      // The boundary runs up to the next ';' or to the end of the header.
-      // substr() takes a *length*, so the end position found below has to be
-      // converted to a length relative to |offset|; passing the position
-      // itself would pull any following parameters into the boundary.
-      const size_t boundary_end = content_type_header->find(';', offset);
-      const size_t boundary_length =
-          (boundary_end == std::string::npos) ? std::string::npos
-                                              : boundary_end - offset;
-      boundary = content_type_header->substr(offset, boundary_length);
-      if (!boundary.empty())
-        choice = MULTIPART;
-    }
-  }
-  // Other cases are unparseable, including when |content_type| is "text/plain".
-  switch (choice) {
-    case URL_ENCODED:
-      return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());
-    case MULTIPART:
-      return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));
-    case ERROR_CHOICE:
-      return scoped_ptr<FormDataParser>();
-  }
-  NOTREACHED();  // Some compilers do not believe this is unreachable.
-  return scoped_ptr<FormDataParser>();
-}

-FormDataParser::FormDataParser() {}

-const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =

- net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS |

- net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;

-FormDataParserUrlEncoded::FormDataParserUrlEncoded()

- : source_(NULL),

- source_set_(false),

- source_malformed_(false),

- arg_name_(&name_),

- arg_value_(&value_),

- patterns_(g_patterns.Pointer()) {

- args_[0] = &arg_name_;

- args_[1] = &arg_value_;

-FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}

-bool FormDataParserUrlEncoded::AllDataReadOK() {

- // All OK means we read the whole source.

- return source_set_ && source_.empty() && !source_malformed_;

-bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {

- if (!source_set_ || source_malformed_)

- return false;

- bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_);

- if (success) {

- result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));

- result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));

- }

- if (source_.length() > 0) {

- if (source_[0] == '&')

- source_.remove_prefix(1); // Remove the leading '&'.

- else

- source_malformed_ = true; // '&' missing between two name-value pairs.

- }

- return success && !source_malformed_;

-bool FormDataParserUrlEncoded::SetSource(base::StringPiece source) {

- if (source_set_)

- return false; // We do not allow multiple sources for this parser.

- source_.set(source.data(), source.size());

- source_set_ = true;

- source_malformed_ = false;

- return true;

-// static

-std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral(

- const std::string& literal) {

- static const char quote[] = "\\Q";

- static const char unquote[] = "\\E";

- // The result always starts with opening the quote and then "--".

- std::string result("\\Q--");

- // This StringPiece is used below to record the next occurrence of "\\E" in

- // |literal|.

- re2::StringPiece seek_unquote(literal);

- const char* copy_start = literal.data();

- size_t copy_length = literal.size();

- // Find all "\\E" in |literal| and exclude them from the \Q...\E quote.

- while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) {

- copy_length = seek_unquote.data() - copy_start;

- result.append(copy_start, copy_length);

- result.append(kEscapeClosingQuote);

- result.append(quote);

- copy_start = seek_unquote.data();

- }

- // Finish the last \Q...\E quote.

- copy_length = (literal.data() + literal.size()) - copy_start;

- result.append(copy_start, copy_length);

- result.append(unquote);

- return result;

-// static

-bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input,

- const RE2& pattern) {

- return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);

-FormDataParserMultipart::FormDataParserMultipart(

- const std::string& boundary_separator)

- : dash_boundary_pattern_(

- CreateBoundaryPatternFromLiteral(boundary_separator)),

- state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR),

- patterns_(g_patterns.Pointer()) {}

-FormDataParserMultipart::~FormDataParserMultipart() {}

-bool FormDataParserMultipart::AllDataReadOK() {

- return state_ == STATE_FINISHED;

-bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) {

- const char* data_start = source_.data();

- while (!StartsWithPattern(source_, dash_boundary_pattern_)) {

- if (!RE2::Consume(&source_, crlf_free_pattern()) ||

- !RE2::Consume(&source_, crlf_pattern())) {

- state_ = STATE_ERROR;

- return false;

- }

- if (data != NULL) {

- if (source_.data() == data_start) {

- // No data in this body part.

- state_ = STATE_ERROR;

- return false;

- }

- // Subtract 2 for the trailing "\r\n".

- data->set(data_start, source_.data() - data_start - 2);

- }

- // Finally, read the dash-boundary and either skip to the next body part, or

- // finish reading the source.

- CHECK(RE2::Consume(&source_, dash_boundary_pattern_));

- if (StartsWithPattern(source_, closing_pattern())) {

- CHECK(RE2::Consume(&source_, closing_pattern()));

- if (RE2::Consume(&source_, epilogue_pattern()))

- state_ = STATE_FINISHED;

- else

- state_ = STATE_ERROR;

- } else { // Next body part ahead.

- if (!RE2::Consume(&source_, transfer_padding_pattern()))

- state_ = STATE_ERROR;

- }

- return state_ != STATE_ERROR;

-bool FormDataParserMultipart::GetNextNameValue(Result* result) {

- if (source_.empty() || state_ != STATE_READY)

- return false;

- // 1. Read body-part headers.

- base::StringPiece name;

- base::StringPiece value;

- bool value_assigned = false;

- bool value_assigned_temp;

- while (TryReadHeader(&name, &value, &value_assigned_temp))

- value_assigned |= value_assigned_temp;

- if (name.empty() || state_ == STATE_ERROR) {

- state_ = STATE_ERROR;

- return false;

- }

- // 2. Read the trailing CRLF after headers.

- if (!RE2::Consume(&source_, crlf_pattern())) {

- state_ = STATE_ERROR;

- return false;

- }

- // 3. Read the data of this body part, i.e., everything until the first

- // dash-boundary.

- bool return_value;

- if (value_assigned && source_.empty()) { // Wait for a new source?

- return_value = true;

- state_ = STATE_SUSPEND;

- } else {

- return_value = FinishReadingPart(value_assigned ? NULL : &value);

- }

- std::string unescaped_name = net::UnescapeURLComponent(

- name.as_string(),

- net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS);

- result->set_name(unescaped_name);

- result->set_value(value);

- return return_value;

-bool FormDataParserMultipart::SetSource(base::StringPiece source) {

- if (source.data() == NULL || !source_.empty())

- return false;

- source_.set(source.data(), source.size());

- switch (state_) {

- case STATE_INIT:

- // Seek behind the preamble.

- while (!StartsWithPattern(source_, dash_boundary_pattern_)) {

- if (!RE2::Consume(&source_, preamble_pattern())) {

- state_ = STATE_ERROR;

- break;

- }

- // Read dash-boundary, transfer padding, and CRLF.

- if (state_ != STATE_ERROR) {

- if (!RE2::Consume(&source_, dash_boundary_pattern_) ||

- !RE2::Consume(&source_, transfer_padding_pattern()))

- state_ = STATE_ERROR;

- else

- state_ = STATE_READY;

- }

- break;

- case STATE_READY: // Nothing to do.

- break;

- case STATE_SUSPEND:

- state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR;

- break;

- default:

- state_ = STATE_ERROR;

- }

- return state_ != STATE_ERROR;

-bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,

- base::StringPiece* value,

- bool* value_assigned) {

- *value_assigned = false;

- const char* header_start = source_.data();

- if (!RE2::Consume(&source_, header_pattern()))

- return false;

- // (*) After this point we must return true, because we consumed one header.

- // Subtract 2 for the trailing "\r\n".

- re2::StringPiece header(header_start, source_.data() - header_start - 2);

- if (!StartsWithPattern(header, content_disposition_pattern()))

- return true; // Skip headers that don't describe the content-disposition.

- re2::StringPiece groups[2];

- if (!name_pattern().Match(header,

- kContentDispositionLength, header.size(),

- RE2::UNANCHORED, groups, 2)) {

- state_ = STATE_ERROR;

- return true; // See (*) for why true.

- }

- name->set(groups[1].data(), groups[1].size());

- if (value_pattern().Match(header,

- kContentDispositionLength, header.size(),

- RE2::UNANCHORED, groups, 2)) {

- value->set(groups[1].data(), groups[1].size());

- *value_assigned = true;

- }

- return true;

-} // namespace extensions