Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(220)

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.h

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Corrected the multipart parser + parsedForm->formData Created 8 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
6 #define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
7
8 #include <string>
9 #include <vector>
10
11 #include "base/memory/scoped_ptr.h"
12 // Cannot forward declare StringPiece because it is a typedef.
13 #include "base/string_piece.h"
14
15 namespace net {
16 class URLRequest;
17 }
18
19 namespace extensions {
20
21 // Interface for the form data parsers.
22 class FormDataParser {
23 public:
// A single name-value pair handed out by GetNextNameValue(). The setters copy
// their argument into the std::string members, so a Result keeps its own copy
// of the data rather than pointing into the parsed source.
24 class Result {
25 public:
26 Result();
27 ~Result();
// Read-only access to the stored name.
28 const std::string& name() const {
29 return name_;
30 }
// Read-only access to the stored value.
31 const std::string& value() const {
32 return value_;
33 }
// StringPiece overloads: copy the referenced characters into the member.
34 void set_name(const base::StringPiece& str) {
35 str.CopyToString(&name_);
36 }
37 void set_value(const base::StringPiece& str) {
38 str.CopyToString(&value_);
39 }
// std::string overloads: assign a copy of |str| to the member.
40 void set_name(const std::string& str) {
41 name_ = str;
42 }
43 void set_value(const std::string& str) {
44 value_ = str;
45 }
// NOTE(review): presumably clears |name_| and |value_|; the definition is not
// part of this header -- confirm in the .cc file.
46 void Reset();
47
battre 2012/08/16 19:18:03 nit: -1 new line
vabr (Chromium) 2012/08/17 18:29:57 Done.
48
49 private:
50 std::string name_;
51 std::string value_;
battre 2012/08/16 19:18:03 DISALLOW_COPY_AND_ASSIGN(Result); + #include "bas
vabr (Chromium) 2012/08/17 18:29:57 Done.
52 };
53
54 virtual ~FormDataParser();
55
56 // Creates a correct parser instance based on the |request|. Returns NULL
57 // on failure.
58 static scoped_ptr<FormDataParser> Create(const net::URLRequest* request);
59
60 // Creates a correct parser instance based on |content_type_header|, the
61 // "Content-Type" request header value. If |content_type_header| is NULL, it
62 // defaults to "application/x-www-form-urlencoded". Returns NULL on failure.
63 static scoped_ptr<FormDataParser> Create(
64 const std::string* content_type_header);
65
66 // Returns true if there was some data, it was well formed and all was read.
67 virtual bool AllDataReadOK() = 0;
68
69 // Returns the next name-value pair as |result|. After SetSource has
70 // succeeded, this allows iterating over all pairs in the source.
71 // Returns true as long as a new pair was successfully found.
72 virtual bool GetNextNameValue(Result* result) = 0;
73
74 // Sets the |source| of the data to be parsed. The ownership is left with the
75 // caller and the source should live until |this| dies or |this->SetSource()|
76 // is called again, whichever comes sooner. Returns true on success.
77 virtual bool SetSource(const std::vector<char>* source) = 0;
78
79 protected:
// Protected: instances are obtained through the static Create() functions or
// by constructing a concrete subclass.
80 FormDataParser();
81
82 private:
83 DISALLOW_COPY_AND_ASSIGN(FormDataParser);
84 };
85
86 // Parses URLencoded forms, see
87 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
88 class FormDataParserUrlEncoded : public FormDataParser {
89 public:
90 FormDataParserUrlEncoded();
91 virtual ~FormDataParserUrlEncoded();
92
93 // Implementation of FormDataParser.
94 virtual bool AllDataReadOK() OVERRIDE;
95 virtual bool GetNextNameValue(Result* result) OVERRIDE;
96 virtual bool SetSource(const std::vector<char>* source) OVERRIDE;
97
98 private:
99 // Reads the next char from |source_| (presumably into |*c|), advances
// |offset_|, and keeps the counts of '=' and '&' seen so far up to date.
100 // Returns false if end of |source_| was reached, otherwise true.
101 bool GetNextChar(char* c);
102 // Once called the parser gives up and claims any results so far invalid.
103 void Abort();
104
// The data being parsed. Not owned: the SetSource() contract of the base
// class leaves ownership with the caller.
105 const std::vector<char>* source_;
// NOTE(review): presumably set by Abort(); the definition is not visible in
// this header -- confirm in the .cc file.
106 bool aborted_;
107
108 // Variables from this block are only to be written to by GetNextChar.
109 std::vector<char>::const_iterator offset_; // Next char to be read.
110 size_t equality_signs_; // How many '=' were read so far.
111 size_t amp_signs_; // How many '&' were read so far.
112 bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')?
113
114 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
115 };
116
117 // The following class, FormDataParserMultipart, parses forms encoded as
118 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
119 // encoding) and 822 (MIME-headers).
120 //
121 // Implementation details
122 //
123 // The original grammar from RFC 2046 is this, "multipart-body" being the root
124 // non-terminal:
125 //
126 // boundary := 0*69<bchars> bcharsnospace
127 // bchars := bcharsnospace / " "
128 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
129 // / "-" / "." / "/" / ":" / "=" / "?"
130 // dash-boundary := "--" boundary
131 // multipart-body := [preamble CRLF]
132 // dash-boundary transport-padding CRLF
133 // body-part *encapsulation
134 // close-delimiter transport-padding
135 // [CRLF epilogue]
136 // transport-padding := *LWSP-char
137 // encapsulation := delimiter transport-padding CRLF body-part
138 // delimiter := CRLF dash-boundary
139 // close-delimiter := delimiter "--"
140 // preamble := discard-text
141 // epilogue := discard-text
142 // discard-text := *(*text CRLF) *text
143 // body-part := MIME-part-headers [CRLF *OCTET]
144 // OCTET := <any 0-255 octet value>
145 //
146 // Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters
147 // of the English alphabet, respectively.
148 // The non-terminal "text" is presumably just any text, excluding line breaks.
149 // The non-terminal "LWSP-char" is not directly defined in the original grammar
150 // but it means "linear whitespace", which is a space or a horizontal tab.
151 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is
152 // defined in English in RFC 822, and can be presented as follows:
153 //
154 // MIME-part-headers := *MIME-part-header
155 // MIME-part-header := name ':' *(text / whitespace) linebreak
156 // linebreak := '\r' / '\n' / CRLF
157 // whitespace := LWSP-char / CRLF LWSP-char
158 // name := namechar *namechar
159 // namechar := <ASCII char between 33 and 126, excluding ':'>
160 //
161 // These sets of rules together compose a grammar, with the root non-terminal
162 // "multipart-body". This grammar defines a regular language. Indeed, if the
163 // non-terminals are ordered in this way:
164 // namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace <
165 // linebreak < MIME-part-header < MIME-part-headers < bcharsnospace <
166 // bchars < boundary < dash-boundary < delimiter < close-delimiter <
167 // discard-text < transport-padding < OCTET < body-part < encapsulation <
168 // multipart-body
169 // then it is easy to verify that whenever A<B then no grammar rule with head
170 // A contains B in the body. By induction on the above order, each non-terminal
171 // defines a regular language: a non-terminal C is defined by a rule C := exp,
172 // where "exp" is an expression composed from character constants, non-terminals
173 // less than C, and the following closure operations of regular languages:
174 // concatenation, union and Kleene-star. By induction, all the lesser
175 // non-terminals represent regular languages, thus "exp" also represents a
176 // regular language. In particular, the root non-terminal (and thus the grammar)
177 // defines a regular language.
178 //
179 // The FormDataParserMultipart class uses a finite automaton to represent this
180 // language. It is easiest to view it in an extended form, with longer words
181 // allowed to label a single transition to keep the number of states low.
182 // Important states have full-word names, unimportant states (always with only
183 // one incoming label) have names abbreviating the incoming label, possibly
184 // with an index.
185 //
186 // Automaton for "multipart-body":
vabr (Chromium) 2012/08/16 08:00:59 An alternative to hand-writing the automaton would
187 // Initial state = Start
188 // Final states = {End, IgnoreEpilogue}
189 // Implicit state (when a transition is missing) = Error
190 // Transition table ('*' is a label matching everything not matched by other
191 // labels leaving the same state):
192 // FROM LABEL TO
193 // Start dash-boundary DB1
194 // CR CR1
195 // * IgnorePreamble
196 // CR1 LF Start
197 // * IgnorePreamble
198 // IgnorePreamble CR CR1
199 // * IgnorePreamble
200 // DB1 LWSP-char DB1
201 // CR CR2
202 // CR2 LF Part
203 // Part <ASCII 33-126, excluding ':'> Name
204 // CR CR3
205 // Name <ASCII 33-126, excluding ':'> Name
206 // ':' Colon
207 // Colon LF End1
208 // CR End2
209 // * Colon
210 // End1 CR CR3
211 // <ASCII 33-126, excluding ':'> Name
212 // End2 LF End3
213 // CR CR3
214 // <ASCII 33-126, excluding ':'> Name
215 // End3 LWSP-char Colon
216 // CR CR3
217 // <ASCII 33-126, excluding ':'> Name
218 // CR3 LF PreData
219 // PreData dash-boundary DB2
220 // CR CR4
221 // * Data
222 // CR4 LF Data2
223 // * Data
224 // Data CR CR4
225 // * Data
226 // Data2 dash-boundary DB2
227 // * CR4
228 // DB2 LWSP-char DB1
229 // CR CR2
230 // '-' D
231 // D '-' End
232 // End LWSP-char End
233 // CR CR5
234 // CR5 LF IgnoreEpilogue
235 // IgnoreEpilogue * IgnoreEpilogue
236 //
237 // The automaton itself only allows checking that the input is a well-formed
238 // multipart encoding of a form. To also extract the data, additional logic is
239 // added:
240 // * The header "Content-Disposition" (read between Part and PreData) contains
241 // the elements name=... and optionally filename=... The former is the name
242 // of the corresponding field of a form. The latter is only present if that
243 // field was a file-upload, and contains the path to the uploaded file.
244 // * The data of a message part is read between PreData and DB2, excluding the
245 // last CR LF dash-boundary.
246 //
247 // IMPORTANT NOTE
248 // This parser supports multiple sources, i.e., SetSource can be called multiple
249 // times if the input is spread over several byte vectors. However, the split
250 // must not occur in the middle of a transition of the above described automaton,
251 // e.g., if there is a transition StateA --dash-boundary--> StateB, then the
252 // whole string with the dash-boundary must be contained in the first source,
253 // or in the other. Also, the split must not occur in the middle of a header,
254 // or a part body data. A message part from one source must be read via
255 // GetNextNameValue before setting up a new source.
256 class FormDataParserMultipart : public FormDataParser {
257 public:
// NOTE(review): |boundary_separator| is presumably the "boundary" parameter of
// the Content-Type header, from which |dash_boundary_| is derived -- confirm
// in the .cc file.
258 explicit FormDataParserMultipart(const std::string& boundary_separator);
259 virtual ~FormDataParserMultipart();
260
261 // Implementation of FormDataParser.
262 virtual bool AllDataReadOK() OVERRIDE;
263 virtual bool GetNextNameValue(Result* result) OVERRIDE;
264 virtual bool SetSource(const std::vector<char>* source) OVERRIDE;
265
266 private:
267 // State and Transition are numbered to make sure they form a contiguous block
268 // of numbers for array indexing in lookup tables. If changing State or
269 // Transition, don't forget to update k*Size and the lookup tables.
// The states mirror the automaton described in the comment above this class.
270 enum State {
271 kStart = 0,
272 kCR1 = 1,
273 kIgnorePreamble = 2,
274 kDB1 = 3,
275 kCR2 = 4,
276 kPart = 5,
277 kName = 6,
278 kColonS = 7, // "S" to distinguish it from the transition kColonT.
279 kEnd1 = 8,
280 kEnd2 = 9,
281 kEnd3 = 10,
282 kCR3 = 11,
283 kPreData = 12,
284 kCR4 = 13,
285 kData = 14,
286 kData2 = 15,
287 kDB2 = 16,
288 kD = 17,
289 kEnd = 18,
290 kCR5 = 19,
291 kIgnoreEpilogue = 20,
292 kError = 21 // The implicit state entered when a transition is missing.
293 };
// The labels that can appear on a transition of the automaton.
294 enum Transition {
295 kLF = 0,
296 kCR = 1,
297 kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'.
298 kLwsp = 3,
299 kDashBoundary = 4,
300 kColonT = 5, // "T" to distinguish it from the state kColonS.
301 kDash = 6, // Meaning '-', not "--".
302 kAny = 7 // To represent '*'.
303 };
// Number of enumerators in State (0..21) and in Transition (0..7).
304 static const size_t kStateSize = 22;
305 static const size_t kTransitionSize = 8;
306
307 // Lookup tables:
308 // Maps transitions with one-character label to that character (else to 0).
309 static char kTransitionToChar[];
310 // Indices of transitions available in state |s| in |kAvailableTransitions|
311 // start at kStateToTransition[s] and the last transition for |s| is always
312 // kAny. The target state corresponding to transition kAvailableTransitions[i]
313 // is kNextState[i].
314 static Transition kAvailableTransitions[];
315 static State kNextState[];
316 static size_t kStateToTransition[];
317
318 // Reads the source until the next name-value pair is read. Returns true if
319 // |next_name_| and |next_value_| were successfully updated.
320 bool ReadNextNameValue();
321 // One step of the automaton, based on |state_| and the input from |source_|
322 // to be read. Updates the |offset_| iterator. Returns true on success.
323 bool DoStep();
324 // Tests whether the input pointed to by |offset_| allows reading transition
325 // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read.
326 size_t LookUp(Transition t);
327
328 // Extracts "name" and possibly "value" from a Content-Disposition header.
329 // Writes directly into |next_name_| and |next_value_|. Returns true on
330 // success and false otherwise.
331 bool ParseHeader(const base::StringPiece& header);
332
// Returns true if |state_| is one of the final states of the automaton
// (kEnd or kIgnoreEpilogue).
333 bool InFinalState() {
334 return state_ == kEnd || state_ == kIgnoreEpilogue;
335 }
336
337 // The parsed message can be split into multiple sources which we read
338 // sequentially.
339 const std::vector<char>* source_;
340 std::vector<char>::const_iterator offset_;
341 // The dash-boundary string is used for all sources.
342 const std::string dash_boundary_;
343 State state_;
344 // The next result to be returned by GetNextNameValue. It is stored as a pair
345 // of StringPieces instead of a Result, to avoid one copy of the data (note
346 // that Result stores a copy of the data in std::string, whereas StringPiece
347 // is just a pointer to source_).
348 base::StringPiece next_name_;
349 base::StringPiece next_value_;
// NOTE(review): undocumented in the original; presumably records whether the
// Content-Disposition header supplied the value (e.g. via filename=...) --
// confirm against ParseHeader in the .cc file.
350 bool value_name_present_;
351
352 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
353 };
354
355 } // namespace extensions
356
357 #endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698