Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(84)

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.h

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Dominic's comments + adjusting to the recent move of UploadElement out of UploadData. Created 8 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
6 #define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
7
8 #include <string>
9 #include <vector>
10
11 #include "base/basictypes.h"
12 #include "base/memory/scoped_ptr.h"
13 // Cannot forward declare StringPiece because it is a typedef.
14 #include "base/string_piece.h"
15
16 namespace net {
17 class URLRequest;
18 }
19
20 namespace extensions {
21
22 // Interface for the form data parsers: obtain an instance via Create(), feed it with SetSource(), then iterate with GetNextNameValue().
23 class FormDataParser {
24 public:
25 class Result {  // One name-value pair; holds its own std::string copies of both.
26 public:
27 Result();
28 ~Result();
29 const std::string& name() const {
30 return name_;
31 }
32 const std::string& value() const {
33 return value_;
34 }
35 void set_name(const base::StringPiece& str) {  // Copies |str| into |name_|.
36 str.CopyToString(&name_);
37 }
38 void set_value(const base::StringPiece& str) {  // Copies |str| into |value_|.
39 str.CopyToString(&value_);
40 }
41 void set_name(const std::string& str) {
42 name_ = str;
43 }
44 void set_value(const std::string& str) {
45 value_ = str;
46 }
47 void Reset();  // NOTE(review): defined out of line; presumably clears both fields -- confirm.
48
49 private:
50 std::string name_;
51 std::string value_;
52
53 DISALLOW_COPY_AND_ASSIGN(Result);
54 };
55
56 virtual ~FormDataParser();
57
58 // Creates a correct parser instance based on the |request|. Returns NULL
59 // on failure.
60 static scoped_ptr<FormDataParser> Create(const net::URLRequest* request);
61
62 // Creates a correct parser instance based on |content_type_header|, the
63 // "Content-Type" request header value. If |content_type_header| is NULL, it
64 // defaults to "application/x-www-form-urlencoded". Returns NULL on failure.
65 static scoped_ptr<FormDataParser> Create(
66 const std::string* content_type_header);
67
68 // Returns true if there was some data, it was well-formed, and all of it was read.
69 virtual bool AllDataReadOK() = 0;
70
71 // Returns the next name-value pair as |result|. After SetSource has
72 // succeeded, this allows iterating over all pairs in the source.
73 // Returns true as long as a new pair was successfully found.
74 virtual bool GetNextNameValue(Result* result) = 0;
75
76 // Sets the |source| of the data to be parsed. The ownership is left with the
77 // caller and the source should live until |this| dies or |this->SetSource()|
78 // is called again, whichever comes sooner. Returns true on success.
79 virtual bool SetSource(const base::StringPiece& source) = 0;
80
81 protected:
82 FormDataParser();  // Protected: only subclasses can construct the base.
83
84 private:
85 DISALLOW_COPY_AND_ASSIGN(FormDataParser);
86 };
87
88 // Parses URL-encoded forms ("application/x-www-form-urlencoded"), see
89 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
90 class FormDataParserUrlEncoded : public FormDataParser {
tkent 2012/08/24 14:26:50 It seems this class is not referred by files other
vabr (Chromium) 2012/08/24 16:16:59 Done, thanks for spotting this.
91 public:
92 FormDataParserUrlEncoded();
93 virtual ~FormDataParserUrlEncoded();
94
95 // Implementation of FormDataParser.
96 virtual bool AllDataReadOK() OVERRIDE;
97 virtual bool GetNextNameValue(Result* result) OVERRIDE;
98 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
99
100 private:
101 // Gets the next char from |source_|, advances |offset_|, and does the book-keeping of '=' and '&'.
102 // Returns false if end of |source_| was reached, otherwise true.
103 bool GetNextChar(char* c);
104 // Once called, the parser gives up and treats any results so far as invalid.
105 void Abort();
106
107 base::StringPiece source_;  // Data being parsed; not owned (see FormDataParser::SetSource).
108 const char* source_end_;  // NOTE(review): presumably one past the last char of |source_| -- confirm.
109 bool aborted_;  // NOTE(review): presumably set by Abort() -- confirm in the .cc file.
110
111 // Variables from this block are only to be written to by GetNextChar.
112 const char* offset_; // Next char to be read.
113 size_t equality_signs_; // How many '=' were read so far.
114 size_t amp_signs_; // How many '&' were read so far.
115 bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')?
116
117 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
118 };
119
120 // The following class, FormDataParserMultipart, parses forms encoded as
121 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
122 // encoding) and 822 (MIME-headers).
123 //
124 // Implementation details
125 //
126 // The original grammar from RFC 2046 is this, "multipart-body" being the root
127 // non-terminal:
128 //
129 // boundary := 0*69<bchars> bcharsnospace
130 // bchars := bcharsnospace / " "
131 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
132 // / "-" / "." / "/" / ":" / "=" / "?"
133 // dash-boundary := "--" boundary
134 // multipart-body := [preamble CRLF]
135 // dash-boundary transport-padding CRLF
136 // body-part *encapsulation
137 // close-delimiter transport-padding
138 // [CRLF epilogue]
139 // transport-padding := *LWSP-char
140 // encapsulation := delimiter transport-padding CRLF body-part
141 // delimiter := CRLF dash-boundary
142 // close-delimiter := delimiter "--"
143 // preamble := discard-text
144 // epilogue := discard-text
145 // discard-text := *(*text CRLF) *text
146 // body-part := MIME-part-headers [CRLF *OCTET]
147 // OCTET := <any 0-255 octet value>
148 //
149 // Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters
150 // of the English alphabet, respectively.
151 // The non-terminal "text" is presumably just any text, excluding line breaks.
152 // The non-terminal "LWSP-char" is not directly defined in the original grammar
153 // but it means "linear whitespace", which is a space or a horizontal tab.
154 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is in
155 // English defined in RFC 822, and can be presented as follows:
156 //
157 // MIME-part-headers := *MIME-part-header
158 // MIME-part-header := name ':' *(text / whitespace) linebreak
159 // linebreak := '\r' / '\n' / CRLF
160 // whitespace := LWSP-char / CRLF LWSP-char
161 // name := namechar *namechar
162 // namechar := <ASCII char between 33 and 126, excluding ':'>
163 //
164 // These sets of rules together compose a grammar, with the root non-terminal
165 // "multipart-body". This grammar defines a regular language. Indeed, if the
166 // non-terminals are ordered in this way:
167 // namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace <
168 // linebreak < MIME-part-header < MIME-part-headers < bcharsnospace <
169 // bchars < boundary < dash-boundary < delimiter < close-delimiter <
170 // discard-text < transport-padding < OCTET < body-part < encapsulation <
171 // multipart-body
172 // then it is easy to verify that whenever A<B then no grammar rule with head
173 // A contains B in the body. By induction on the above order, each non-terminal
174 // defines a regular language: a non-terminal C is defined by a rule C := exp,
175 // where "exp" is an expression composed from character constants, non-terminals
176 // less than C, and the following closure operations of regular languages:
177 // concatenation, union and Kleene-star. By induction, all the lesser
178 // non-terminals represent regular languages, thus "exp" also represents a
179 // regular language. In particular, the root non-terminal (and thus the grammar)
180 // defines a regular language.
181 //
182 // The FormDataParserMultipart class uses a finite automaton to represent this
183 // language. It is easiest to view it in an extended form, with longer words
184 // allowed to label a single transition to keep the number of states low.
185 // Important states have full-word names, unimportant states (always with only
186 // one incoming label) have names abbreviating the incoming label, possibly
187 // with an index.
188 //
189 // Automaton for "multipart-body":
190 // Initial state = Start
191 // Final states = {End, IgnoreEpilogue}
192 // Implicit state (when a transition is missing) = Error
193 // Transition table ('*' is a label matching everything not matched by other
194 // labels leaving the same state):
195 // FROM LABEL TO
196 // Start dash-boundary DB1
197 // CR CR1
198 // * IgnorePreamble
199 // CR1 LF Start
200 // * IgnorePreamble
201 // IgnorePreamble CR CR1
202 // * IgnorePreamble
203 // DB1 LWSP-char DB1
204 // CR CR2
205 // CR2 LF Part
206 // Part <ASCII 33-126, excluding ':'> Name
207 // CR CR3
208 // Name <ASCII 33-126, excluding ':'> Name
209 // ':' Colon
210 // Colon LF End1
211 // CR End2
212 // * Colon
213 // End1 CR CR3
214 // <ASCII 33-126, excluding ':'> Name
215 // End2 LF End3
216 // CR CR3
217 // <ASCII 33-126, excluding ':'> Name
218 // End3 LWSP-char Colon
219 // CR CR3
220 // <ASCII 33-126, excluding ':'> Name
221 // CR3 LF PreData
222 // PreData dash-boundary DB2
223 // CR CR4
224 // * Data
225 // CR4 LF Data2
226 // * Data
227 // Data CR CR4
228 // * Data
229 // Data2 dash-boundary DB2
230 // * CR4
231 // DB2 LWSP-char DB1
232 // CR CR2
233 // '-' D
234 // D '-' End
235 // End LWSP-char End
236 // CR CR5
237 // CR5 LF IgnoreEpilogue
238 // IgnoreEpilogue * IgnoreEpilogue
239 //
240 // The automaton itself only allows checking that the input is a well-formed
241 // multipart encoding of a form. To also extract the data, additional logic is
242 // added:
243 // * The header "Content-Disposition" (read between Part and PreData) contains
244 // the elements name=... and optionally filename=... The former is the name
245 // of the corresponding field of a form. The latter is only present if that
246 // field was a file-upload, and contains the path to the uploaded file.
247 // * The data of a message part is read between PreData and DB2, excluding the
248 // last CR LF dash-boundary.
249 //
250 // IMPORTANT NOTE
251 // This parser supports multiple sources, i.e., SetSource can be called multiple
252 // times if the input is spread over several byte blocks. However, the split
253 // must not occur in the middle of a transition of the above-described automaton,
254 // e.g., if there is a transition StateA --dash-boundary--> StateB, then the
255 // whole dash-boundary string must be contained either in the first source
256 // or in the other. Also, the split must not occur in the middle of a header
257 // or of a part's body data. A message part from one source must be read via
258 // GetNextNameValue before setting up a new source.
259 class FormDataParserMultipart : public FormDataParser {
tkent 2012/08/24 14:26:50 ditto.
vabr (Chromium) 2012/08/24 16:16:59 Done.
260 public:
261 explicit FormDataParserMultipart(const std::string& boundary_separator);
262 virtual ~FormDataParserMultipart();
263
264 // Implementation of FormDataParser.
265 virtual bool AllDataReadOK() OVERRIDE;
266 virtual bool GetNextNameValue(Result* result) OVERRIDE;
267 virtual bool SetSource(const base::StringPiece& source) OVERRIDE;
268
269 private:
270 // State and Transition are numbered to make sure they form a contiguous block
271 // of numbers for array indexing in lookup tables. If changing State or
272 // Transition, don't forget to update k*Size and the lookup tables.
273 enum State {
274 kStart = 0,
275 kCR1 = 1,
276 kIgnorePreamble = 2,
277 kDB1 = 3,
278 kCR2 = 4,
279 kPart = 5,
280 kName = 6,
281 kColonS = 7, // "S" to distinguish it from the transition kColonT.
282 kEnd1 = 8,
283 kEnd2 = 9,
284 kEnd3 = 10,
285 kCR3 = 11,
286 kPreData = 12,
287 kCR4 = 13,
288 kData = 14,
289 kData2 = 15,
290 kDB2 = 16,
291 kD = 17,
292 kEnd = 18,
293 kCR5 = 19,
294 kIgnoreEpilogue = 20,
295 kError = 21  // The implicit "Error" state of the automaton (taken when a transition is missing).
296 };
297 enum Transition {
298 kLF = 0,
299 kCR = 1,
300 kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'.
301 kLwsp = 3,
302 kDashBoundary = 4,
303 kColonT = 5, // "T" to distinguish it from the state kColonS.
304 kDash = 6, // Meaning '-', not "--".
305 kAny = 7 // To represent '*'.
306 };
307 static const size_t kStateSize = 22;  // Number of State values (kStart..kError).
308 static const size_t kTransitionSize = 8;  // Number of Transition values (kLF..kAny).
309
310 // Lookup tables (NOTE(review): declared non-const; consider const if the .cc never writes to them):
311 // Maps transitions with a one-character label to that character (else to 0).
312 static char kTransitionToChar[];
313 // Indices of transitions available in state |s| in |kAvailableTransitions|
314 // start at kStateToTransition[s] and the last transition for |s| is always
315 // kAny. The target state corresponding to transition kAvailableTransitions[i]
316 // is kNextState[i].
317 static Transition kAvailableTransitions[];
318 static State kNextState[];
319 static size_t kStateToTransition[];
320
321 // Reads the source until the next name-value pair is read. Returns true if
322 // |next_name_| and |next_value_| were successfully updated.
323 bool ReadNextNameValue();
324 // One step of the automaton, based on |state_| and the input from |source_|
325 // to be read. Updates the |offset_| iterator. Returns true on success.
326 bool DoStep();
327 // Tests whether the input pointed to by |offset_| allows reading transition
328 // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read.
329 size_t LookUp(Transition t);
330
331 // Extracts "name" and possibly "value" from a Content-Disposition header.
332 // Writes directly into |next_name_| and |next_value_|. Returns true on
333 // success and false otherwise.
334 bool ParseHeader(const base::StringPiece& header);
335
336 bool InFinalState() {  // True in the automaton's final states (End, IgnoreEpilogue).
337 return state_ == kEnd || state_ == kIgnoreEpilogue;
338 }
339
340 // The parsed message can be split into multiple sources which we read
341 // sequentially.
342 base::StringPiece source_;
343 const char* source_end_;  // NOTE(review): presumably one past the last char of |source_| -- confirm.
344 const char* offset_;  // Current read position in |source_|; advanced by DoStep().
345 // The dash-boundary string is used for all sources.
346 const std::string dash_boundary_;
347 State state_;
348 // The next result to be returned by GetNextNameValue. It is stored as a pair
349 // of StringPieces instead of a Result, to avoid one copy of the data (note
350 // that Result stores a copy of the data in std::string, whereas StringPiece
351 // is just a pointer to the data in |source_|).
352 base::StringPiece next_name_;
353 base::StringPiece next_value_;
354 bool value_name_present_;  // NOTE(review): meaning not evident from this header; document it or confirm in the .cc.
355
356 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
357 };
358
359 } // namespace extensions
360
361 #endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698