chrome/browser/extensions/api/web_request/form_data_parser.cc - Issue 584163004: Move web_request directory to //extensions.

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.cc

Issue 584163004: Move web_request directory to //extensions. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Rebase again Created 6 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « chrome/browser/extensions/api/web_request/form_data_parser.h ('k') | chrome/browser/extensions/api/web_request/form_data_parser_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "chrome/browser/extensions/api/web_request/form_data_parser.h"

6

7 #include <vector>

8

9 #include "base/lazy_instance.h"

10 #include "base/logging.h"

11 #include "base/macros.h"

12 #include "base/strings/string_util.h"

13 #include "base/values.h"

14 #include "net/base/escape.h"

15 #include "net/url_request/url_request.h"

16 #include "third_party/re2/re2/re2.h"

17

18 using base::DictionaryValue;

19 using base::ListValue;

20 using base::StringPiece;

21 using re2::RE2;

22

23 namespace extensions {

24

25 namespace {

26

// Name of the header (with its colon) that carries the "name" and "filename"
// fields of a multipart body part. It is matched case-insensitively.
const char kContentDisposition[] = "content-disposition:";
// Number of characters in kContentDisposition, excluding the terminating NUL.
const size_t kContentDispositionLength = sizeof(kContentDisposition) - 1;

// Regexp for a single character allowed in a URL encoding: either one of the
// permitted literal characters or a %XX escape. The definition comes from
// RFC 1738, end of section 2.2.
const char kCharacterPattern[] =
    "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))";

// Regexp source that matches the literal two-character text backslash + 'E',
// i.e. the token that would otherwise close a \Q...\E quotation.
const char kEscapeClosingQuote[] = "\\\\E";

34

35 // A wrapper struct for static RE2 objects to be held as LazyInstance.

36 struct Patterns {

37 Patterns();

38 ~Patterns();

39 const RE2 transfer_padding_pattern;

40 const RE2 crlf_pattern;

41 const RE2 closing_pattern;

42 const RE2 epilogue_pattern;

43 const RE2 crlf_free_pattern;

44 const RE2 preamble_pattern;

45 const RE2 header_pattern;

46 const RE2 content_disposition_pattern;

47 const RE2 name_pattern;

48 const RE2 value_pattern;

49 const RE2 unquote_pattern;

50 const RE2 url_encoded_pattern;

51 };

52

53 Patterns::Patterns()

54 : transfer_padding_pattern("[ \\t]*\\r\\n"),

55 crlf_pattern("\\r\\n"),

56 closing_pattern("--[ \\t]*"),

57 epilogue_pattern("\|\\r\\n(?s:.)*"),

58 crlf_free_pattern("(?:[^\\r]\|\\r+[^\\r\\n])*"),

59 preamble_pattern(".+?"),

60 header_pattern("[!-9;-~]+:(.\|\\r\\n[\\t ])*\\r\\n"),

61 content_disposition_pattern(std::string("(?i:") + kContentDisposition +

62 ")"),

63 name_pattern("\\bname=\"([^\"]*)\""),

64 value_pattern("\\bfilename=\"([^\"]*)\""),

65 unquote_pattern(kEscapeClosingQuote),

66 url_encoded_pattern(std::string("(") + kCharacterPattern + "*)=(" +

67 kCharacterPattern +

68 "*)") {

69 }

70

71 Patterns::~Patterns() {}

72

73 base::LazyInstance<Patterns>::Leaky g_patterns = LAZY_INSTANCE_INITIALIZER;

74

75 } // namespace

76

77 // Parses URLencoded forms, see

78 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .

79 class FormDataParserUrlEncoded : public FormDataParser {

80 public:

81 FormDataParserUrlEncoded();

82 virtual ~FormDataParserUrlEncoded();

83

84 // Implementation of FormDataParser.

85 virtual bool AllDataReadOK() OVERRIDE;

86 virtual bool GetNextNameValue(Result* result) OVERRIDE;

87 virtual bool SetSource(base::StringPiece source) OVERRIDE;

88

89 private:

90 // Returns the pattern to match a single name-value pair. This could be even

91 // static, but then we would have to spend more code on initializing the

92 // cached pointer to g_patterns.Get().

93 const RE2& pattern() const {

94 return patterns_->url_encoded_pattern;

95 }

96

97 // Auxiliary constant for using RE2. Number of arguments for parsing

98 // name-value pairs (one for name, one for value).

99 static const size_t args_size_ = 2u;

100 static const net::UnescapeRule::Type unescape_rules_;

101

102 re2::StringPiece source_;

103 bool source_set_;

104 bool source_malformed_;

105

106 // Auxiliary store for using RE2.

107 std::string name_;

108 std::string value_;

109 const RE2::Arg arg_name_;

110 const RE2::Arg arg_value_;

111 const RE2::Arg* args_[args_size_];

112

113 // Caching the pointer to g_patterns.Get().

114 const Patterns* patterns_;

115

116 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);

117 };

118

119 // The following class, FormDataParserMultipart, parses forms encoded as

120 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart

121 // encoding) and 5322 (MIME-headers).

122 //

123 // Implementation details

124 //

125 // The original grammar from RFC 2046 is this, "multipart-body" being the root

126 // non-terminal:

127 //

128 // boundary := 0*69<bchars> bcharsnospace

129 // bchars := bcharsnospace / " "

130 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","

131 // / "-" / "." / "/" / ":" / "=" / "?"

132 // dash-boundary := "--" boundary

133 // multipart-body := [preamble CRLF]

134 // dash-boundary transport-padding CRLF

135 // body-part *encapsulation

136 // close-delimiter transport-padding

137 // [CRLF epilogue]

138 // transport-padding := *LWSP-char

139 // encapsulation := delimiter transport-padding CRLF body-part

140 // delimiter := CRLF dash-boundary

141 // close-delimiter := delimiter "--"

142 // preamble := discard-text

143 // epilogue := discard-text

144 // discard-text := *(*text CRLF) *text

145 // body-part := MIME-part-headers [CRLF *OCTET]

146 // OCTET := <any 0-255 octet value>

147 //

148 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,

149 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the

150 // English alphabet, respectively.

151 // The non-terminal "text" is presumably just any text, excluding line breaks.

152 // The non-terminal "LWSP-char" is not directly defined in the original grammar

153 // but it means "linear whitespace", which is a space or a horizontal tab.

154 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use

155 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:

156 //

157 // MIME-part-headers := field-name ":" unstructured CRLF

158 // field-name := 1*ftext

159 // ftext := %d33-57 / ; Printable US-ASCII

160 // %d59-126 ; characters not including ":".

161 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which

162 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and

163 // "CRLF<horizontal tab>", which serve for "folding".

164 //

165 // The FormDataParserMultipart class reads the input source and tries to parse it

166 // according to the grammar above, rooted at the "multipart-body" non-terminal.

167 // This happens in stages:

168 //

169 // 1. The optional preamble and the initial dash-boundary with transport padding

170 // and a CRLF are read and ignored.

171 //

172 // 2. Repeatedly each body part is read. The body parts can either serve to

173 // upload a file, or just a string of bytes.

174 // 2.a. The headers of that part are searched for the "content-disposition"

175 // header, which contains the name of the value represented by that body

176 // part. If the body-part is for file upload, that header also contains a

177 // filename.

178 // 2.b. The "*OCTET" part of the body part is then read and passed as the value

179 // of the name-value pair for body parts representing a string of bytes.

180 // For body parts for uploading a file the "*OCTET" part is just ignored

181 // and the filename is used for value instead.

182 //

183 // 3. The final close-delimiter and epilogue are read and ignored.

184 //

185 // IMPORTANT NOTE

186 // This parser supports sources split into multiple chunks. Therefore SetSource

187 // can be called multiple times if the source is spread over several chunks.

188 // However, the split may only occur inside a body part, right after the

189 // trailing CRLF of headers.

190 class FormDataParserMultipart : public FormDataParser {

191 public:

192 explicit FormDataParserMultipart(const std::string& boundary_separator);

193 virtual ~FormDataParserMultipart();

194

195 // Implementation of FormDataParser.

196 virtual bool AllDataReadOK() OVERRIDE;

197 virtual bool GetNextNameValue(Result* result) OVERRIDE;

198 virtual bool SetSource(base::StringPiece source) OVERRIDE;

199

200 private:

201 enum State {

202 STATE_INIT, // No input read yet.

203 STATE_READY, // Ready to call GetNextNameValue.

204 STATE_FINISHED, // Read the input until the end.

205 STATE_SUSPEND, // Waiting until a new \|source_\| is set.

206 STATE_ERROR

207 };

208

209 // Produces a regexp to match the string "--" + \|literal\|. The idea is to

210 // represent "--" + \|literal\| as a "quoted pattern", a verbatim copy enclosed

211 // in "\\Q" and "\\E". The only catch is to watch out for occurences of "\\E"

212 // inside \|literal\|. Those must be excluded from the quote and the backslash

213 // doubly escaped. For example, for literal == "abc\\Edef" the result is

214 // "\\Q--abc\\E\\\\E\\Qdef\\E".

215 static std::string CreateBoundaryPatternFromLiteral(

216 const std::string& literal);

217

218 // Tests whether \|input\| has a prefix matching \|pattern\|.

219 static bool StartsWithPattern(const re2::StringPiece& input,

220 const RE2& pattern);

221

222 // If \|source_\| starts with a header, seeks \|source_\| beyond the header. If

223 // the header is Content-Disposition, extracts \|name\| from "name=" and

224 // possibly \|value\| from "filename=" fields of that header. Only if the

225 // "name" or "filename" fields are found, then \|name\| or \|value\| are touched.

226 // Returns true iff \|source_\| is seeked forward. Sets \|value_assigned\|

227 // to true iff \|value\| has been assigned to.

228 bool TryReadHeader(base::StringPiece* name,

229 base::StringPiece* value,

230 bool* value_assigned);

231

232 // Helper to GetNextNameValue. Expects that the input starts with a data

233 // portion of a body part. An attempt is made to read the input until the end

234 // of that body part. If \|data\| is not NULL, it is set to contain the data

235 // portion. Returns true iff the reading was successful.

236 bool FinishReadingPart(base::StringPiece* data);

237

238 // These methods could be even static, but then we would have to spend more

239 // code on initializing the cached pointer to g_patterns.Get().

240 const RE2& transfer_padding_pattern() const {

241 return patterns_->transfer_padding_pattern;

242 }

243 const RE2& crlf_pattern() const {

244 return patterns_->crlf_pattern;

245 }

246 const RE2& closing_pattern() const {

247 return patterns_->closing_pattern;

248 }

249 const RE2& epilogue_pattern() const {

250 return patterns_->epilogue_pattern;

251 }

252 const RE2& crlf_free_pattern() const {

253 return patterns_->crlf_free_pattern;

254 }

255 const RE2& preamble_pattern() const {

256 return patterns_->preamble_pattern;

257 }

258 const RE2& header_pattern() const {

259 return patterns_->header_pattern;

260 }

261 const RE2& content_disposition_pattern() const {

262 return patterns_->content_disposition_pattern;

263 }

264 const RE2& name_pattern() const {

265 return patterns_->name_pattern;

266 }

267 const RE2& value_pattern() const {

268 return patterns_->value_pattern;

269 }

270 // However, this is used in a static method so it needs to be static.

271 static const RE2& unquote_pattern() {

272 return g_patterns.Get().unquote_pattern; // No caching g_patterns here.

273 }

274

275 const RE2 dash_boundary_pattern_;

276

277 // Because of initialisation dependency, \|state_\| needs to be declared after

278 // \|dash_boundary_pattern_\|.

279 State state_;

280

281 // The parsed message can be split into multiple sources which we read

282 // sequentially.

283 re2::StringPiece source_;

284

285 // Caching the pointer to g_patterns.Get().

286 const Patterns* patterns_;

287

288 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);

289 };

290

291 FormDataParser::Result::Result() {}

292 FormDataParser::Result::~Result() {}

293

294 FormDataParser::~FormDataParser() {}

295

296 // static

297 scoped_ptr<FormDataParser> FormDataParser::Create(

298 const net::URLRequest& request) {

299 std::string value;

300 const bool found = request.extra_request_headers().GetHeader(

301 net::HttpRequestHeaders::kContentType, &value);

302 return CreateFromContentTypeHeader(found ? &value : NULL);

303 }

304

305 // static

306 scoped_ptr<FormDataParser> FormDataParser::CreateFromContentTypeHeader(

307 const std::string* content_type_header) {

308 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};

309 ParserChoice choice = ERROR_CHOICE;

310 std::string boundary;

311

312 if (content_type_header == NULL) {

313 choice = URL_ENCODED;

314 } else {

315 const std::string content_type(

316 content_type_header->substr(0, content_type_header->find(';')));

317

318 if (base::strcasecmp(

319 content_type.c_str(), "application/x-www-form-urlencoded") == 0) {

320 choice = URL_ENCODED;

321 } else if (base::strcasecmp(

322 content_type.c_str(), "multipart/form-data") == 0) {

323 static const char kBoundaryString[] = "boundary=";

324 size_t offset = content_type_header->find(kBoundaryString);

325 if (offset == std::string::npos) {

326 // Malformed header.

327 return scoped_ptr<FormDataParser>();

328 }

329 offset += sizeof(kBoundaryString) - 1;

330 boundary = content_type_header->substr(

331 offset, content_type_header->find(';', offset));

332 if (!boundary.empty())

333 choice = MULTIPART;

334 }

335 }

336 // Other cases are unparseable, including when \|content_type\| is "text/plain".

337

338 switch (choice) {

339 case URL_ENCODED:

340 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());

341 case MULTIPART:

342 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));

343 case ERROR_CHOICE:

344 return scoped_ptr<FormDataParser>();

345 }

346 NOTREACHED(); // Some compilers do not believe this is unreachable.

347 return scoped_ptr<FormDataParser>();

348 }

349

350 FormDataParser::FormDataParser() {}

351

352 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =

353 net::UnescapeRule::URL_SPECIAL_CHARS \| net::UnescapeRule::CONTROL_CHARS \|

354 net::UnescapeRule::SPACES \| net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;

355

356 FormDataParserUrlEncoded::FormDataParserUrlEncoded()

357 : source_(NULL),

358 source_set_(false),

359 source_malformed_(false),

360 arg_name_(&name_),

361 arg_value_(&value_),

362 patterns_(g_patterns.Pointer()) {

363 args_[0] = &arg_name_;

364 args_[1] = &arg_value_;

365 }

366

367 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}

368

369 bool FormDataParserUrlEncoded::AllDataReadOK() {

370 // All OK means we read the whole source.

371 return source_set_ && source_.empty() && !source_malformed_;

372 }

373

374 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {

375 if (!source_set_ \|\| source_malformed_)

376 return false;

377

378 bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_);

379 if (success) {

380 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));

381 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));

382 }

383 if (source_.length() > 0) {

384 if (source_[0] == '&')

385 source_.remove_prefix(1); // Remove the leading '&'.

386 else

387 source_malformed_ = true; // '&' missing between two name-value pairs.

388 }

389 return success && !source_malformed_;

390 }

391

392 bool FormDataParserUrlEncoded::SetSource(base::StringPiece source) {

393 if (source_set_)

394 return false; // We do not allow multiple sources for this parser.

395 source_.set(source.data(), source.size());

396 source_set_ = true;

397 source_malformed_ = false;

398 return true;

399 }

400

401 // static

402 std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral(

403 const std::string& literal) {

404 static const char quote[] = "\\Q";

405 static const char unquote[] = "\\E";

406

407 // The result always starts with opening the qoute and then "--".

408 std::string result("\\Q--");

409

410 // This StringPiece is used below to record the next occurrence of "\\E" in

411 // \|literal\|.

412 re2::StringPiece seek_unquote(literal);

413 const char* copy_start = literal.data();

414 size_t copy_length = literal.size();

415

416 // Find all "\\E" in \|literal\| and exclude them from the \Q...\E quote.

417 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) {

418 copy_length = seek_unquote.data() - copy_start;

419 result.append(copy_start, copy_length);

420 result.append(kEscapeClosingQuote);

421 result.append(quote);

422 copy_start = seek_unquote.data();

423 }

424

425 // Finish the last \Q...\E quote.

426 copy_length = (literal.data() + literal.size()) - copy_start;

427 result.append(copy_start, copy_length);

428 result.append(unquote);

429 return result;

430 }

431

432 // static

433 bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input,

434 const RE2& pattern) {

435 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);

436 }

437

438 FormDataParserMultipart::FormDataParserMultipart(

439 const std::string& boundary_separator)

440 : dash_boundary_pattern_(

441 CreateBoundaryPatternFromLiteral(boundary_separator)),

442 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR),

443 patterns_(g_patterns.Pointer()) {}

444

445 FormDataParserMultipart::~FormDataParserMultipart() {}

446

447 bool FormDataParserMultipart::AllDataReadOK() {

448 return state_ == STATE_FINISHED;

449 }

450

451 bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) {

452 const char* data_start = source_.data();

453 while (!StartsWithPattern(source_, dash_boundary_pattern_)) {

454 if (!RE2::Consume(&source_, crlf_free_pattern()) \|\|

455 !RE2::Consume(&source_, crlf_pattern())) {

456 state_ = STATE_ERROR;

457 return false;

458 }

459 }

460 if (data != NULL) {

461 if (source_.data() == data_start) {

462 // No data in this body part.

463 state_ = STATE_ERROR;

464 return false;

465 }

466 // Subtract 2 for the trailing "\r\n".

467 data->set(data_start, source_.data() - data_start - 2);

468 }

469

470 // Finally, read the dash-boundary and either skip to the next body part, or

471 // finish reading the source.

472 CHECK(RE2::Consume(&source_, dash_boundary_pattern_));

473 if (StartsWithPattern(source_, closing_pattern())) {

474 CHECK(RE2::Consume(&source_, closing_pattern()));

475 if (RE2::Consume(&source_, epilogue_pattern()))

476 state_ = STATE_FINISHED;

477 else

478 state_ = STATE_ERROR;

479 } else { // Next body part ahead.

480 if (!RE2::Consume(&source_, transfer_padding_pattern()))

481 state_ = STATE_ERROR;

482 }

483 return state_ != STATE_ERROR;

484 }

485

486 bool FormDataParserMultipart::GetNextNameValue(Result* result) {

487 if (source_.empty() \|\| state_ != STATE_READY)

488 return false;

489

490 // 1. Read body-part headers.

491 base::StringPiece name;

492 base::StringPiece value;

493 bool value_assigned = false;

494 bool value_assigned_temp;

495 while (TryReadHeader(&name, &value, &value_assigned_temp))

496 value_assigned \|= value_assigned_temp;

497 if (name.empty() \|\| state_ == STATE_ERROR) {

498 state_ = STATE_ERROR;

499 return false;

500 }

501

502 // 2. Read the trailing CRLF after headers.

503 if (!RE2::Consume(&source_, crlf_pattern())) {

504 state_ = STATE_ERROR;

505 return false;

506 }

507

508 // 3. Read the data of this body part, i.e., everything until the first

509 // dash-boundary.

510 bool return_value;

511 if (value_assigned && source_.empty()) { // Wait for a new source?

512 return_value = true;

513 state_ = STATE_SUSPEND;

514 } else {

515 return_value = FinishReadingPart(value_assigned ? NULL : &value);

516 }

517

518 std::string unescaped_name = net::UnescapeURLComponent(

519 name.as_string(),

520 net::UnescapeRule::URL_SPECIAL_CHARS \| net::UnescapeRule::CONTROL_CHARS);

521 result->set_name(unescaped_name);

522 result->set_value(value);

523

524 return return_value;

525 }

526

527 bool FormDataParserMultipart::SetSource(base::StringPiece source) {

528 if (source.data() == NULL \|\| !source_.empty())

529 return false;

530 source_.set(source.data(), source.size());

531

532 switch (state_) {

533 case STATE_INIT:

534 // Seek behind the preamble.

535 while (!StartsWithPattern(source_, dash_boundary_pattern_)) {

536 if (!RE2::Consume(&source_, preamble_pattern())) {

537 state_ = STATE_ERROR;

538 break;

539 }

540 }

541 // Read dash-boundary, transfer padding, and CRLF.

542 if (state_ != STATE_ERROR) {

543 if (!RE2::Consume(&source_, dash_boundary_pattern_) \|\|

544 !RE2::Consume(&source_, transfer_padding_pattern()))

545 state_ = STATE_ERROR;

546 else

547 state_ = STATE_READY;

548 }

549 break;

550 case STATE_READY: // Nothing to do.

551 break;

552 case STATE_SUSPEND:

553 state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR;

554 break;

555 default:

556 state_ = STATE_ERROR;

557 }

558 return state_ != STATE_ERROR;

559 }

560

561 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,

562 base::StringPiece* value,

563 bool* value_assigned) {

564 *value_assigned = false;

565 const char* header_start = source_.data();

566 if (!RE2::Consume(&source_, header_pattern()))

567 return false;

568 // (*) After this point we must return true, because we consumed one header.

569

570 // Subtract 2 for the trailing "\r\n".

571 re2::StringPiece header(header_start, source_.data() - header_start - 2);

572

573 if (!StartsWithPattern(header, content_disposition_pattern()))

574 return true; // Skip headers that don't describe the content-disposition.

575

576 re2::StringPiece groups[2];

577

578 if (!name_pattern().Match(header,

579 kContentDispositionLength, header.size(),

580 RE2::UNANCHORED, groups, 2)) {

581 state_ = STATE_ERROR;

582 return true; // See (*) for why true.

583 }

584 name->set(groups[1].data(), groups[1].size());

585

586 if (value_pattern().Match(header,

587 kContentDispositionLength, header.size(),

588 RE2::UNANCHORED, groups, 2)) {

589 value->set(groups[1].data(), groups[1].size());

590 *value_assigned = true;

591 }

592 return true;

593 }

594

595 } // namespace extensions

OLD	NEW