Index: chrome/browser/extensions/api/web_request/form_data_parser.cc |
diff --git a/chrome/browser/extensions/api/web_request/form_data_parser.cc b/chrome/browser/extensions/api/web_request/form_data_parser.cc |
deleted file mode 100644 |
index 57156976b78bd9554076b7fa3d0a7ba5f6e155a9..0000000000000000000000000000000000000000 |
--- a/chrome/browser/extensions/api/web_request/form_data_parser.cc |
+++ /dev/null |
@@ -1,595 +0,0 @@ |
-// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
-// Use of this source code is governed by a BSD-style license that can be |
-// found in the LICENSE file. |
- |
-#include "chrome/browser/extensions/api/web_request/form_data_parser.h" |
- |
-#include <vector> |
- |
-#include "base/lazy_instance.h" |
-#include "base/logging.h" |
-#include "base/macros.h" |
-#include "base/strings/string_util.h" |
-#include "base/values.h" |
-#include "net/base/escape.h" |
-#include "net/url_request/url_request.h" |
-#include "third_party/re2/re2/re2.h" |
- |
-using base::DictionaryValue; |
-using base::ListValue; |
-using base::StringPiece; |
-using re2::RE2; |
- |
-namespace extensions { |
- |
-namespace { |
- |
-// Lower-case name of the Content-Disposition header, including the trailing |
-// colon. |
-const char kContentDisposition[] = "content-disposition:"; |
-// Length of |kContentDisposition| without the terminating NUL character. |
-const size_t kContentDispositionLength = sizeof(kContentDisposition) - 1; |
-// Pattern for a single character allowed in a URL encoding (RFC 1738, end of |
-// section 2.2): one of the listed literal characters or a "%XX" hex escape. |
-const char kCharacterPattern[] = |
-    "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))"; |
-// Pattern matching the two literal characters backslash and 'E', i.e. the |
-// sequence that would close a \Q...\E quote inside a regular expression. |
-const char kEscapeClosingQuote[] = "\\\\E"; |
- |
-// A wrapper struct for static RE2 objects to be held as LazyInstance. |
-// Bundles all RE2 objects with a fixed pattern, so that a single LazyInstance |
-// can hold them. |
-struct Patterns { |
-  Patterns(); |
-  ~Patterns(); |
- |
-  // Optional spaces and tabs followed by CRLF. |
-  const RE2 transfer_padding_pattern; |
-  // A single CRLF. |
-  const RE2 crlf_pattern; |
-  // "--" followed by optional spaces and tabs. |
-  const RE2 closing_pattern; |
-  // Either nothing, or CRLF followed by arbitrary characters. |
-  const RE2 epilogue_pattern; |
-  // A possibly empty run of characters which contains no CRLF. |
-  const RE2 crlf_free_pattern; |
-  // One or more characters, matched non-greedily. |
-  const RE2 preamble_pattern; |
-  // A header line: field name, ':', possibly folded value, and final CRLF. |
-  const RE2 header_pattern; |
-  // Case-insensitive match of |kContentDisposition|. |
-  const RE2 content_disposition_pattern; |
-  // name="..." with the quoted text as capturing group. |
-  const RE2 name_pattern; |
-  // filename="..." with the quoted text as capturing group. |
-  const RE2 value_pattern; |
-  // The literal sequence backslash + 'E'. |
-  const RE2 unquote_pattern; |
-  // "name=value" with the name and the value as capturing groups. |
-  const RE2 url_encoded_pattern; |
-}; |
- |
-// Compiles all patterns. The initializers are listed in the order in which |
-// the members are declared in Patterns. |
-Patterns::Patterns() |
-    : transfer_padding_pattern("[ \\t]*\\r\\n"), |
-      crlf_pattern("\\r\\n"), |
-      closing_pattern("--[ \\t]*"), |
-      epilogue_pattern("|\\r\\n(?s:.)*"), |
-      crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"), |
-      preamble_pattern(".+?"), |
-      header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"), |
-      // "(?i:" ... ")" makes the header name match case-insensitively. |
-      content_disposition_pattern( |
-          std::string("(?i:") + kContentDisposition + ")"), |
-      name_pattern("\\bname=\"([^\"]*)\""), |
-      value_pattern("\\bfilename=\"([^\"]*)\""), |
-      unquote_pattern(kEscapeClosingQuote), |
-      // Two capturing groups: the name and the value of one pair. |
-      url_encoded_pattern( |
-          std::string("(") + kCharacterPattern + "*)=(" + kCharacterPattern + |
-          "*)") {} |
- |
-// Out-of-line definition of the destructor; there is nothing to do here. |
-Patterns::~Patterns() { |
-} |
- |
-// The single, lazily created and leaky instance of Patterns shared by the |
-// parsers in this file. |
-base::LazyInstance<Patterns>::Leaky g_patterns = |
-    LAZY_INSTANCE_INITIALIZER; |
- |
-} // namespace |
- |
-// Parses URLencoded forms, see |
-// http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . |
-class FormDataParserUrlEncoded : public FormDataParser { |
- public: |
-  FormDataParserUrlEncoded(); |
-  virtual ~FormDataParserUrlEncoded(); |
- |
-  // FormDataParser: |
-  virtual bool AllDataReadOK() OVERRIDE; |
-  virtual bool GetNextNameValue(Result* result) OVERRIDE; |
-  virtual bool SetSource(base::StringPiece source) OVERRIDE; |
- |
- private: |
-  // The pattern matching one name-value pair. It is not static because a |
-  // static version could not use the cached |patterns_| pointer and caching |
-  // g_patterns.Get() statically would need more initialization code. |
-  const RE2& pattern() const { return patterns_->url_encoded_pattern; } |
- |
-  // Number of RE2 arguments needed to parse a name-value pair: one for the |
-  // name and one for the value. |
-  static const size_t args_size_ = 2u; |
-  // Rules passed to net::UnescapeURLComponent for names and values. |
-  static const net::UnescapeRule::Type unescape_rules_; |
- |
-  // The not yet consumed rest of the input. |
-  re2::StringPiece source_; |
-  // Whether SetSource has been called already. |
-  bool source_set_; |
-  // Whether a missing '&' between two name-value pairs has been detected. |
-  bool source_malformed_; |
- |
-  // Storage which RE2 fills with the captured name and value. |name_| and |
-  // |value_| are wrapped by |arg_name_| and |arg_value_|, which in turn are |
-  // referenced from |args_|. |
-  std::string name_; |
-  std::string value_; |
-  const RE2::Arg arg_name_; |
-  const RE2::Arg arg_value_; |
-  const RE2::Arg* args_[args_size_]; |
- |
-  // Cached pointer to g_patterns.Get(). |
-  const Patterns* patterns_; |
- |
-  DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); |
-}; |
- |
-// The following class, FormDataParserMultipart, parses forms encoded as |
-// multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart |
-// encoding) and 5322 (MIME-headers). |
-// |
-// Implementation details |
-// |
-// The original grammar from RFC 2046 is this, "multipart-body" being the root |
-// non-terminal: |
-// |
-// boundary := 0*69<bchars> bcharsnospace |
-// bchars := bcharsnospace / " " |
-// bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," |
-// / "-" / "." / "/" / ":" / "=" / "?" |
-// dash-boundary := "--" boundary |
-// multipart-body := [preamble CRLF] |
-// dash-boundary transport-padding CRLF |
-// body-part *encapsulation |
-// close-delimiter transport-padding |
-// [CRLF epilogue] |
-// transport-padding := *LWSP-char |
-// encapsulation := delimiter transport-padding CRLF body-part |
-// delimiter := CRLF dash-boundary |
-// close-delimiter := delimiter "--" |
-// preamble := discard-text |
-// epilogue := discard-text |
-// discard-text := *(*text CRLF) *text |
-// body-part := MIME-part-headers [CRLF *OCTET] |
-// OCTET := <any 0-255 octet value> |
-// |
-// Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF, |
-// DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the |
-// English alphabet, respectively. |
-// The non-terminal "text" is presumably just any text, excluding line breaks. |
-// The non-terminal "LWSP-char" is not directly defined in the original grammar |
-// but it means "linear whitespace", which is a space or a horizontal tab. |
-// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use |
-// the syntax for "optional fields" from Section 3.6.8 of RFC 5322: |
-// |
-// MIME-part-headers := field-name ":" unstructured CRLF |
-// field-name := 1*ftext |
-// ftext := %d33-57 / ; Printable US-ASCII |
-// %d59-126 ; characters not including ":". |
-// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which |
-// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and |
-// "CRLF<horizontal tab>", which serve for "folding". |
-// |
-// The FormDataParserMultipart class reads the input source and tries to parse |
-// it according to the grammar above, rooted at the "multipart-body" |
-// non-terminal. |
-// This happens in stages: |
-// |
-// 1. The optional preamble and the initial dash-boundary with transport padding |
-// and a CRLF are read and ignored. |
-// |
-// 2. Repeatedly each body part is read. The body parts can either serve to |
-// upload a file, or just a string of bytes. |
-// 2.a. The headers of that part are searched for the "content-disposition" |
-// header, which contains the name of the value represented by that body |
-// part. If the body-part is for file upload, that header also contains a |
-// filename. |
-// 2.b. The "*OCTET" part of the body part is then read and passed as the value |
-// of the name-value pair for body parts representing a string of bytes. |
-// For body parts for uploading a file the "*OCTET" part is just ignored |
-// and the filename is used for value instead. |
-// |
-// 3. The final close-delimiter and epilogue are read and ignored. |
-// |
-// IMPORTANT NOTE |
-// This parser supports sources split into multiple chunks. Therefore SetSource |
-// can be called multiple times if the source is spread over several chunks. |
-// However, the split may only occur inside a body part, right after the |
-// trailing CRLF of headers. |
-class FormDataParserMultipart : public FormDataParser { |
- public: |
-  explicit FormDataParserMultipart(const std::string& boundary_separator); |
-  virtual ~FormDataParserMultipart(); |
- |
-  // FormDataParser: |
-  virtual bool AllDataReadOK() OVERRIDE; |
-  virtual bool GetNextNameValue(Result* result) OVERRIDE; |
-  virtual bool SetSource(base::StringPiece source) OVERRIDE; |
- |
- private: |
-  enum State { |
-    STATE_INIT,      // Nothing has been read from the input so far. |
-    STATE_READY,     // GetNextNameValue may be called. |
-    STATE_FINISHED,  // The input has been read until the end. |
-    STATE_SUSPEND,   // A new |source_| has to be set before continuing. |
-    STATE_ERROR |
-  }; |
- |
-  // Builds a regexp which matches the string "--" + |literal| verbatim. To |
-  // achieve that, "--" + |literal| is emitted as a "quoted pattern", i.e., a |
-  // copy enclosed in "\\Q" and "\\E". Occurrences of "\\E" inside |literal| |
-  // need special care: they are kept out of the quote and their backslash is |
-  // escaped once more. For example, for literal == "abc\\Edef" the result is |
-  // "\\Q--abc\\E\\\\E\\Qdef\\E". |
-  static std::string CreateBoundaryPatternFromLiteral( |
-      const std::string& literal); |
- |
-  // Returns true iff a prefix of |input| matches |pattern|. |
-  static bool StartsWithPattern(const re2::StringPiece& input, |
-                                const RE2& pattern); |
- |
-  // Seeks |source_| beyond the header it starts with, if there is one. For |
-  // the Content-Disposition header, |name| is extracted from the "name=" |
-  // field and, if present, |value| from the "filename=" field. |name| and |
-  // |value| are only touched if the respective field is found. Returns true |
-  // iff |source_| has been seeked forward. |value_assigned| is set to true |
-  // iff |value| has been assigned to. |
-  bool TryReadHeader(base::StringPiece* name, |
-                     base::StringPiece* value, |
-                     bool* value_assigned); |
- |
-  // Helper of GetNextNameValue. The input is expected to start with the data |
-  // portion of a body part, and an attempt is made to read it until the end |
-  // of that body part. A non-NULL |data| is set to contain the data portion. |
-  // Returns true iff reading succeeded. |
-  bool FinishReadingPart(base::StringPiece* data); |
- |
-  // Accessors for the shared patterns. They are not static because static |
-  // versions could not use the cached |patterns_| pointer, and caching |
-  // g_patterns.Get() statically would need more initialization code. |
-  const RE2& transfer_padding_pattern() const { |
-    return patterns_->transfer_padding_pattern; |
-  } |
-  const RE2& crlf_pattern() const { return patterns_->crlf_pattern; } |
-  const RE2& closing_pattern() const { return patterns_->closing_pattern; } |
-  const RE2& epilogue_pattern() const { return patterns_->epilogue_pattern; } |
-  const RE2& crlf_free_pattern() const { |
-    return patterns_->crlf_free_pattern; |
-  } |
-  const RE2& preamble_pattern() const { return patterns_->preamble_pattern; } |
-  const RE2& header_pattern() const { return patterns_->header_pattern; } |
-  const RE2& content_disposition_pattern() const { |
-    return patterns_->content_disposition_pattern; |
-  } |
-  const RE2& name_pattern() const { return patterns_->name_pattern; } |
-  const RE2& value_pattern() const { return patterns_->value_pattern; } |
-  // This accessor is called from a static method and therefore has to be |
-  // static itself; it reads g_patterns directly instead of |patterns_|. |
-  static const RE2& unquote_pattern() { |
-    return g_patterns.Get().unquote_pattern; |
-  } |
- |
-  // Matches "--" followed by the boundary passed to the constructor. |
-  const RE2 dash_boundary_pattern_; |
- |
-  // |state_| is initialised from |dash_boundary_pattern_| and therefore has |
-  // to be declared after it. |
-  State state_; |
- |
-  // The rest of the chunk of the message which is currently being read. The |
-  // message may be split into several chunks, which are read one by one. |
-  re2::StringPiece source_; |
- |
-  // Cached pointer to g_patterns.Get(). |
-  const Patterns* patterns_; |
- |
-  DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); |
-}; |
- |
-// Out-of-line definitions of the empty constructor and destructor of Result |
-// and of the empty destructor of FormDataParser. |
-FormDataParser::Result::Result() { |
-} |
- |
-FormDataParser::Result::~Result() { |
-} |
- |
-FormDataParser::~FormDataParser() { |
-} |
- |
-// Looks up the Content-Type header among the extra request headers of |
-// |request| and delegates to CreateFromContentTypeHeader. NULL is passed on |
-// if the header is not present. |
-// static |
-scoped_ptr<FormDataParser> FormDataParser::Create( |
-    const net::URLRequest& request) { |
-  std::string content_type; |
-  if (request.extra_request_headers().GetHeader( |
-          net::HttpRequestHeaders::kContentType, &content_type)) { |
-    return CreateFromContentTypeHeader(&content_type); |
-  } |
-  return CreateFromContentTypeHeader(NULL); |
-} |
- |
-// Creates the parser matching the value of a Content-Type header. |
-// |
-// |content_type_header| is the complete header value, or NULL if the header |
-// is missing. A missing header and "application/x-www-form-urlencoded" both |
-// select the URL-encoded parser. "multipart/form-data" selects the multipart |
-// parser, which is given the value of the "boundary=" parameter. |
-// |
-// Returns a null scoped_ptr for every other content type (including |
-// "text/plain"), and for "multipart/form-data" headers whose boundary is |
-// missing or empty. |
-// static |
-scoped_ptr<FormDataParser> FormDataParser::CreateFromContentTypeHeader( |
-    const std::string* content_type_header) { |
-  if (content_type_header == NULL) |
-    return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded()); |
- |
-  // The content type is everything in front of the first ';'. |
-  const std::string content_type( |
-      content_type_header->substr(0, content_type_header->find(';'))); |
- |
-  if (base::strcasecmp(content_type.c_str(), |
-                       "application/x-www-form-urlencoded") == 0) { |
-    return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded()); |
-  } |
- |
-  if (base::strcasecmp(content_type.c_str(), "multipart/form-data") != 0) { |
-    // Other content types are unparseable, including "text/plain". |
-    return scoped_ptr<FormDataParser>(); |
-  } |
- |
-  static const char kBoundaryString[] = "boundary="; |
-  const size_t parameter_start = content_type_header->find(kBoundaryString); |
-  if (parameter_start == std::string::npos) |
-    return scoped_ptr<FormDataParser>();  // Malformed header. |
- |
-  // The boundary starts right behind "boundary=" and ends in front of the |
-  // next ';', or at the end of the header if there is no further ';'. |
-  // std::string::substr() expects a length as its second argument, so the |
-  // position of the ';' has to be converted to a length first. Passing the |
-  // position itself would append the ';' and the parameters following it to |
-  // the boundary. |
-  const size_t boundary_start = |
-      parameter_start + sizeof(kBoundaryString) - 1; |
-  const size_t boundary_end = content_type_header->find(';', boundary_start); |
-  const size_t boundary_length = |
-      boundary_end == std::string::npos ? std::string::npos |
-                                        : boundary_end - boundary_start; |
-  const std::string boundary( |
-      content_type_header->substr(boundary_start, boundary_length)); |
-  if (boundary.empty()) |
-    return scoped_ptr<FormDataParser>(); |
- |
-  return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary)); |
-} |
- |
-// Out-of-line definition of the empty constructor. |
-FormDataParser::FormDataParser() { |
-} |
- |
-// Names and values are unescaped with all of these rules combined. |
-const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ = |
-    net::UnescapeRule::REPLACE_PLUS_WITH_SPACE | net::UnescapeRule::SPACES | |
-    net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::URL_SPECIAL_CHARS; |
- |
-// Starts without a source. |arg_name_| and |arg_value_| are bound to |name_| |
-// and |value_|, and |args_| is filled with pointers to these two arguments, |
-// name first. |
-FormDataParserUrlEncoded::FormDataParserUrlEncoded() |
-    : source_(NULL), |
-      source_set_(false), |
-      source_malformed_(false), |
-      arg_name_(&name_), |
-      arg_value_(&value_), |
-      patterns_(g_patterns.Pointer()) { |
-  const RE2::Arg* const name_argument = &arg_name_; |
-  const RE2::Arg* const value_argument = &arg_value_; |
-  args_[0] = name_argument; |
-  args_[1] = value_argument; |
-} |
- |
-// Out-of-line definition of the empty destructor. |
-FormDataParserUrlEncoded::~FormDataParserUrlEncoded() { |
-} |
- |
-// Returns true iff a source has been set, it has not been found malformed, |
-// and it has been consumed completely. |
-bool FormDataParserUrlEncoded::AllDataReadOK() { |
-  if (!source_set_ || source_malformed_) |
-    return false; |
-  return source_.empty(); |
-} |
- |
-// Consumes one name-value pair from the front of |source_| and stores its |
-// unescaped name and value in |result|. A '&' following the pair is consumed |
-// as well. Any other character following the pair marks the source as |
-// malformed. Returns true iff a pair has been read and the source has not |
-// been found malformed. |
-bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) { |
-  if (source_malformed_ || !source_set_) |
-    return false; |
- |
-  const bool pair_found = |
-      RE2::ConsumeN(&source_, pattern(), args_, args_size_); |
-  if (pair_found) { |
-    result->set_name(net::UnescapeURLComponent(name_, unescape_rules_)); |
-    result->set_value(net::UnescapeURLComponent(value_, unescape_rules_)); |
-  } |
- |
-  if (source_.length() > 0) { |
-    if (source_[0] != '&') { |
-      // The '&' between two name-value pairs is missing. |
-      source_malformed_ = true; |
-      return false; |
-    } |
-    source_.remove_prefix(1);  // Skip the '&'. |
-  } |
-  return pair_found; |
-} |
- |
-// Sets the one and only source of this parser. Returns false, leaving the |
-// parser untouched, if a source has been set before. |
-bool FormDataParserUrlEncoded::SetSource(base::StringPiece source) { |
-  // This parser does not support multiple sources. |
-  if (source_set_) |
-    return false; |
- |
-  source_malformed_ = false; |
-  source_set_ = true; |
-  source_.set(source.data(), source.size()); |
-  return true; |
-} |
- |
-// Returns a regexp for "--" + |literal|, quoted with \Q...\E. Every "\E" |
-// which is contained in |literal| ends the current quote, is followed by a |
-// pattern for a literal "\E", and by the start of a new quote. |
-// static |
-std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral( |
-    const std::string& literal) { |
-  static const char kOpenQuote[] = "\\Q"; |
-  static const char kCloseQuote[] = "\\E"; |
- |
-  // Open the first quote and add the two leading dashes. |
-  std::string pattern("\\Q--"); |
- |
-  // |remaining| is the part of |literal| which has not been searched for |
-  // "\E" yet, and |segment_start| is the first character of |literal| which |
-  // has not been copied into |pattern| yet. |
-  re2::StringPiece remaining(literal); |
-  const char* segment_start = literal.data(); |
-  const char* const literal_end = literal.data() + literal.size(); |
- |
-  while (RE2::FindAndConsume(&remaining, unquote_pattern())) { |
-    // |remaining| now starts right behind the "\E" just found. Therefore the |
-    // copied segment ends with that "\E", which closes the current quote. |
-    const char* const segment_end = remaining.data(); |
-    pattern.append(segment_start, segment_end - segment_start); |
-    pattern.append(kEscapeClosingQuote); |
-    pattern.append(kOpenQuote); |
-    segment_start = segment_end; |
-  } |
- |
-  // Copy the rest of |literal| and close the last quote. |
-  pattern.append(segment_start, literal_end - segment_start); |
-  pattern.append(kCloseQuote); |
-  return pattern; |
-} |
- |
-// Matches |pattern| against |input| with the match anchored at the start of |
-// |input|. No submatches are requested. |
-// static |
-bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input, |
-                                                const RE2& pattern) { |
-  const bool has_matching_prefix = pattern.Match( |
-      input, 0, input.size(), RE2::ANCHOR_START, NULL, 0); |
-  return has_matching_prefix; |
-} |
- |
-// Compiles the pattern for "--" + |boundary_separator|. The parser starts in |
-// STATE_INIT if that pattern is OK, and in STATE_ERROR otherwise. |
-FormDataParserMultipart::FormDataParserMultipart( |
-    const std::string& boundary_separator) |
-    : dash_boundary_pattern_( |
-          CreateBoundaryPatternFromLiteral(boundary_separator)), |
-      state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR), |
-      patterns_(g_patterns.Pointer()) { |
-} |
- |
-// Out-of-line definition of the empty destructor. |
-FormDataParserMultipart::~FormDataParserMultipart() { |
-} |
- |
-// Returns true iff the parser has reached STATE_FINISHED. |
-bool FormDataParserMultipart::AllDataReadOK() { |
-  const bool finished = (state_ == STATE_FINISHED); |
-  return finished; |
-} |
- |
-// Reads |source_| up to and including the next dash-boundary and what |
-// follows it: either the transfer padding in front of the next body part, or |
-// the closing "--" and the epilogue. If |data| is not NULL, it is set to the |
-// data which precedes the CRLF in front of the dash-boundary. On failure |
-// |state_| is set to STATE_ERROR and false is returned. |
-bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) { |
-  const char* const part_begin = source_.data(); |
- |
-  // Skip the data line by line until a line starts with the dash-boundary. |
-  while (!StartsWithPattern(source_, dash_boundary_pattern_)) { |
-    const bool line_skipped = RE2::Consume(&source_, crlf_free_pattern()) && |
-                              RE2::Consume(&source_, crlf_pattern()); |
-    if (!line_skipped) { |
-      state_ = STATE_ERROR; |
-      return false; |
-    } |
-  } |
- |
-  if (data != NULL) { |
-    const char* const part_end = source_.data(); |
-    if (part_end == part_begin) { |
-      // This body part contains no data. |
-      state_ = STATE_ERROR; |
-      return false; |
-    } |
-    // The trailing "\r\n" is not part of the data, hence the -2. |
-    data->set(part_begin, part_end - part_begin - 2); |
-  } |
- |
-  // The loop above has stopped in front of the dash-boundary. Read it and |
-  // then either finish reading the source, or skip to the next body part. |
-  CHECK(RE2::Consume(&source_, dash_boundary_pattern_)); |
-  if (!StartsWithPattern(source_, closing_pattern())) { |
-    // Another body part follows. |
-    if (!RE2::Consume(&source_, transfer_padding_pattern())) |
-      state_ = STATE_ERROR; |
-  } else { |
-    CHECK(RE2::Consume(&source_, closing_pattern())); |
-    state_ = RE2::Consume(&source_, epilogue_pattern()) ? STATE_FINISHED |
-                                                        : STATE_ERROR; |
-  } |
-  return state_ != STATE_ERROR; |
-} |
- |
-// Reads one body part from |source_| and stores its name and value in |
-// |result|. The name is taken from the Content-Disposition header and is |
-// unescaped. The value is the filename from that header if there is one, and |
-// the data of the body part otherwise. Returns false if the parser is not in |
-// STATE_READY, if |source_| is empty, or if the body part cannot be parsed. |
-bool FormDataParserMultipart::GetNextNameValue(Result* result) { |
-  if (state_ != STATE_READY || source_.empty()) |
-    return false; |
- |
-  // 1. Read all headers of the body part. |has_filename| records whether any |
-  // of them assigned |value|. |
-  base::StringPiece name; |
-  base::StringPiece value; |
-  bool has_filename = false; |
-  for (;;) { |
-    bool assigned_by_this_header; |
-    if (!TryReadHeader(&name, &value, &assigned_by_this_header)) |
-      break; |
-    has_filename |= assigned_by_this_header; |
-  } |
-  if (state_ == STATE_ERROR || name.empty()) { |
-    state_ = STATE_ERROR; |
-    return false; |
-  } |
- |
-  // 2. Read the CRLF which terminates the headers. |
-  if (!RE2::Consume(&source_, crlf_pattern())) { |
-    state_ = STATE_ERROR; |
-    return false; |
-  } |
- |
-  // 3. Read the data of the body part, i.e., everything in front of the next |
-  // dash-boundary. If the value is known already and the current source is |
-  // exhausted, the rest of the body part is expected in the next source. |
-  bool part_read; |
-  if (has_filename && source_.empty()) { |
-    state_ = STATE_SUSPEND; |
-    part_read = true; |
-  } else { |
-    base::StringPiece* const data = has_filename ? NULL : &value; |
-    part_read = FinishReadingPart(data); |
-  } |
- |
-  const std::string unescaped_name = net::UnescapeURLComponent( |
-      name.as_string(), |
-      net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS); |
-  result->set_name(unescaped_name); |
-  result->set_value(value); |
- |
-  return part_read; |
-} |
- |
-// Sets the next chunk of the input. Fails if |source| has no data or if the |
-// previous source has not been consumed completely. Depending on |state_|, |
-// the start of the new source is read right away: in STATE_INIT the preamble |
-// and the first dash-boundary line, in STATE_SUSPEND the rest of the body |
-// part which has been started in the previous source. Returns true iff the |
-// parser is not in STATE_ERROR afterwards. |
-bool FormDataParserMultipart::SetSource(base::StringPiece source) { |
-  if (source.data() == NULL || !source_.empty()) |
-    return false; |
-  source_.set(source.data(), source.size()); |
- |
-  if (state_ == STATE_INIT) { |
-    // Seek behind the preamble. |
-    bool preamble_skipped = true; |
-    while (!StartsWithPattern(source_, dash_boundary_pattern_)) { |
-      if (!RE2::Consume(&source_, preamble_pattern())) { |
-        preamble_skipped = false; |
-        break; |
-      } |
-    } |
-    // Read the dash-boundary, the transfer padding, and the CRLF. |
-    if (preamble_skipped && |
-        RE2::Consume(&source_, dash_boundary_pattern_) && |
-        RE2::Consume(&source_, transfer_padding_pattern())) { |
-      state_ = STATE_READY; |
-    } else { |
-      state_ = STATE_ERROR; |
-    } |
-  } else if (state_ == STATE_SUSPEND) { |
-    state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR; |
-  } else if (state_ != STATE_READY) { |
-    // A new source is not acceptable in any of the remaining states. In |
-    // STATE_READY there is nothing to do. |
-    state_ = STATE_ERROR; |
-  } |
-  return state_ != STATE_ERROR; |
-} |
- |
-// Consumes one header from the front of |source_|. Returns false, without |
-// consuming anything, if |source_| does not start with a header. Once a |
-// header has been consumed, true is returned in any case; a |
-// Content-Disposition header without a "name" field is reported by setting |
-// |state_| to STATE_ERROR. |name| and |value| are set from the "name" and |
-// "filename" fields of a Content-Disposition header, and |value_assigned| |
-// tells whether |value| has been set. |
-bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name, |
-                                            base::StringPiece* value, |
-                                            bool* value_assigned) { |
-  *value_assigned = false; |
- |
-  const char* const header_begin = source_.data(); |
-  if (!RE2::Consume(&source_, header_pattern())) |
-    return false; |
-  // From here on one header has been consumed, so the result has to be true. |
- |
-  // The trailing "\r\n" is not part of the header, hence the -2. |
-  const re2::StringPiece header(header_begin, |
-                                source_.data() - header_begin - 2); |
- |
-  // Only the Content-Disposition header is of interest, others are skipped. |
-  if (StartsWithPattern(header, content_disposition_pattern())) { |
-    // captures[0] receives the whole match, captures[1] the quoted text. |
-    re2::StringPiece captures[2]; |
-    if (name_pattern().Match(header, |
-                             kContentDispositionLength, header.size(), |
-                             RE2::UNANCHORED, captures, 2)) { |
-      name->set(captures[1].data(), captures[1].size()); |
-      if (value_pattern().Match(header, |
-                                kContentDispositionLength, header.size(), |
-                                RE2::UNANCHORED, captures, 2)) { |
-        value->set(captures[1].data(), captures[1].size()); |
-        *value_assigned = true; |
-      } |
-    } else { |
-      state_ = STATE_ERROR; |
-    } |
-  } |
-  return true; |
-} |
- |
-} // namespace extensions |