Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(391)

Side by Side Diff: chrome/browser/extensions/api/web_request/form_data_parser.h

Issue 10694055: Add read-only access to POST data for webRequest's onBeforeRequest (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Dominic's comments Created 8 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
6 #define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
7
8 #include <string>
9 #include <vector>
10
11 #include "base/basictypes.h"
12 #include "base/memory/scoped_ptr.h"
13 // Cannot forward declare StringPiece because it is a typedef.
14 #include "base/string_piece.h"
15
16 namespace net {
17 class URLRequest;
18 }
19
20 namespace extensions {
21
22 // Interface for the form data parsers; concrete parsers subclass it.
23 class FormDataParser {
24 public:
25 class Result {  // One name-value pair, as filled in by GetNextNameValue().
26 public:
27 Result();
28 ~Result();
29 const std::string& name() const {  // Read-only access to the stored name.
30 return name_;
31 }
32 const std::string& value() const {  // Read-only access to the stored value.
33 return value_;
34 }
35 void set_name(const base::StringPiece& str) {  // Copies |str| into |name_|.
36 str.CopyToString(&name_);
37 }
38 void set_value(const base::StringPiece& str) {  // Copies |str| into |value_|.
39 str.CopyToString(&value_);
40 }
41 void set_name(const std::string& str) {  // std::string overload; also copies.
42 name_ = str;
43 }
44 void set_value(const std::string& str) {  // std::string overload; also copies.
45 value_ = str;
46 }
47 void Reset();  // Defined out of line; presumably clears both fields (TODO confirm).
48
49 private:
50 std::string name_;
51 std::string value_;
52
53 DISALLOW_COPY_AND_ASSIGN(Result);
54 };
55
56 virtual ~FormDataParser();
57
58 // Creates a correct parser instance based on the |request|. Returns NULL
59 // on failure.
60 static scoped_ptr<FormDataParser> Create(const net::URLRequest* request);
61
62 // Creates a correct parser instance based on |content_type_header|, the
63 // "Content-Type" request header value. If |content_type_header| is NULL, it
64 // defaults to "application/x-www-form-urlencoded". Returns NULL on failure.
65 static scoped_ptr<FormDataParser> Create(
66 const std::string* content_type_header);
67
68 // Returns true if there was some data, it was well formed and all was read.
69 virtual bool AllDataReadOK() = 0;
70
71 // Writes the next name-value pair into |result|. After SetSource has
72 // succeeded, call this repeatedly to iterate over all pairs in the source.
73 // Returns true as long as a new pair was successfully found.
74 virtual bool GetNextNameValue(Result* result) = 0;
75
76 // Sets the |source| of the data to be parsed. Ownership stays with the
77 // caller, and the source must stay alive until |this| dies or SetSource()
78 // is called again, whichever comes sooner. Returns true on success.
79 virtual bool SetSource(const std::vector<char>* source) = 0;
80
81 protected:
82 FormDataParser();  // Protected: only subclasses construct the base.
83
84 private:
85 DISALLOW_COPY_AND_ASSIGN(FormDataParser);
86 };
87
88 // Parses URL-encoded forms, see
89 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
90 class FormDataParserUrlEncoded : public FormDataParser {
91 public:
92 FormDataParserUrlEncoded();
93 virtual ~FormDataParserUrlEncoded();
94
95 // Implementation of FormDataParser.
96 virtual bool AllDataReadOK() OVERRIDE;
97 virtual bool GetNextNameValue(Result* result) OVERRIDE;
98 virtual bool SetSource(const std::vector<char>* source) OVERRIDE;
99
100 private:
101 // Gets the next char from |source_|, seeks, and keeps count of '=' and '&'.
102 // Returns false if the end of |source_| was reached, otherwise true.
103 bool GetNextChar(char* c);
104 // Once called, the parser gives up and treats any results so far as invalid.
105 void Abort();
106
107 const std::vector<char>* source_;  // Not owned; see the SetSource() contract.
108 bool aborted_;  // Presumably set by Abort() (TODO confirm in the .cc file).
109
110 // Variables from this block are only to be written to by GetNextChar.
111 std::vector<char>::const_iterator offset_; // Next char to be read.
112 size_t equality_signs_; // How many '=' were read so far.
113 size_t amp_signs_; // How many '&' were read so far.
114 bool expect_equality_; // Is the next delimiter expected to be '=' (not '&')?
115
116 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
117 };
118
119 // The following class, FormDataParserMultipart, parses forms encoded as
120 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
121 // encoding) and 822 (MIME-headers).
122 //
123 // Implementation details
124 //
125 // The original grammar from RFC 2046 is this, "multipart-body" being the root
126 // non-terminal:
127 //
128 // boundary := 0*69<bchars> bcharsnospace
129 // bchars := bcharsnospace / " "
130 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
131 // / "-" / "." / "/" / ":" / "=" / "?"
132 // dash-boundary := "--" boundary
133 // multipart-body := [preamble CRLF]
134 // dash-boundary transport-padding CRLF
135 // body-part *encapsulation
136 // close-delimiter transport-padding
137 // [CRLF epilogue]
138 // transport-padding := *LWSP-char
139 // encapsulation := delimiter transport-padding CRLF body-part
140 // delimiter := CRLF dash-boundary
141 // close-delimiter := delimiter "--"
142 // preamble := discard-text
143 // epilogue := discard-text
144 // discard-text := *(*text CRLF) *text
145 // body-part := MIME-part-headers [CRLF *OCTET]
146 // OCTET := <any 0-255 octet value>
147 //
148 // Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters
149 // of the English alphabet, respectively.
150 // The non-terminal "text" is presumably just any text, excluding line breaks.
151 // The non-terminal "LWSP-char" is not directly defined in the original grammar
152 // but it means "linear whitespace", which is a space or a horizontal tab.
153 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is in
154 // English defined in RFC 822, and can be presented as follows:
155 //
156 // MIME-part-headers := *MIME-part-header
157 // MIME-part-header := name ':' *(text / whitespace) linebreak
158 // linebreak := '\r' / '\n' / CRLF
159 // whitespace := LWSP-char / CRLF LWSP-char
160 // name := namechar *namechar
161 // namechar := <ASCII char between 33 and 126, excluding ':'>
162 //
163 // These sets of rules together compose a grammar, with the root non-terminal
164 // "multipart-body". This grammar defines a regular language. Indeed, if the
165 // non-terminals are ordered in this way:
166 // namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace <
167 // linebreak < MIME-part-header < MIME-part-headers < bcharsnospace <
168 // bchars < boundary < dash-boundary < delimiter < close-delimiter <
169 // discard-text < transport-padding < OCTET < body-part < encapsulation <
170 // multipart-body
171 // then it is easy to verify that whenever A<B then no grammar rule with head
172 // A contains B in the body. By induction on the above order, each non-terminal
173 // defines a regular language: a non-terminal C is defined by a rule C := exp,
174 // where "exp" is an expression composed from character constants, non-terminals
175 // less than C, and the following closure operations of regular languages:
176 // concatenation, union and Kleene-star. By induction, all the lesser
177 // non-terminals represent regular languages, thus "exp" also represents a
178 // regular language. In particular, the root non-terminal (and thus the grammar)
179 // defines a regular language.
180 //
181 // The FormDataParserMultipart class uses a finite automaton to represent this
182 // language. It is easiest to view it in an extended form, with longer words
183 // allowed to label a single transition to keep the number of states low.
184 // Important states have full-word names, unimportant states (always with only
185 // one incoming label) have names abbreviating the incoming label, possibly
186 // with an index.
187 //
188 // Automaton for "multipart-body":
189 // Initial state = Start
190 // Final states = {End, IgnoreEpilogue}
191 // Implicit state (when a transition is missing) = Error
192 // Transition table ('*' is a label matching everything not matched by other
193 // labels leaving the same state):
194 // FROM LABEL TO
195 // Start dash-boundary DB1
196 // CR CR1
197 // * IgnorePreamble
198 // CR1 LF Start
199 // * IgnorePreamble
200 // IgnorePreamble CR CR1
201 // * IgnorePreamble
202 // DB1 LWSP-char DB1
203 // CR CR2
204 // CR2 LF Part
205 // Part <ASCII 33-126, excluding ':'> Name
206 // CR CR3
207 // Name <ASCII 33-126, excluding ':'> Name
208 // ':' Colon
209 // Colon LF End1
210 // CR End2
211 // * Colon
212 // End1 CR CR3
213 // <ASCII 33-126, excluding ':'> Name
214 // End2 LF End3
215 // CR CR3
216 // <ASCII 33-126, excluding ':'> Name
217 // End3 LWSP-char Colon
218 // CR CR3
219 // <ASCII 33-126, excluding ':'> Name
220 // CR3 LF PreData
221 // PreData dash-boundary DB2
222 // CR CR4
223 // * Data
224 // CR4 LF Data2
225 // * Data
226 // Data CR CR4
227 // * Data
228 // Data2 dash-boundary DB2
229 // * CR4
230 // DB2 LWSP-char DB1
231 // CR CR2
232 // '-' D
233 // D '-' End
234 // End LWSP-char End
235 // CR CR5
236 // CR5 LF IgnoreEpilogue
237 // IgnoreEpilogue * IgnoreEpilogue
238 //
239 // The automaton itself only allows checking that the input is a well-formed
240 // multipart encoding of a form. To also extract the data, additional logic is
241 // added:
242 // * The header "Content-Disposition" (read between Part and PreData) contains
243 // the elements name=... and optionally filename=... The former is the name
244 // of the corresponding field of a form. The latter is only present if that
245 // field was a file-upload, and contains the path to the uploaded file.
246 // * The data of a message part is read between PreData and DB2, excluding the
247 // last CR LF dash-boundary.
248 //
249 // IMPORTANT NOTE
250 // This parser supports multiple sources, i.e., SetSource can be called multiple
251 // times if the input is spread over several byte vectors. However, the split
252 // must not occur in the middle of a transition of the automaton described
253 // above, e.g., if there is a transition StateA --dash-boundary--> StateB, then
254 // the whole dash-boundary string must be contained in one source or in the
255 // other. Also, the split must not occur in the middle of a header or of a
256 // part's body data. A message part from one source must be read via
257 // GetNextNameValue before setting up a new source.
258 class FormDataParserMultipart : public FormDataParser {
259 public:
260 explicit FormDataParserMultipart(const std::string& boundary_separator);  // Presumably the multipart boundary string (TODO confirm).
261 virtual ~FormDataParserMultipart();
262
263 // Implementation of FormDataParser.
264 virtual bool AllDataReadOK() OVERRIDE;
265 virtual bool GetNextNameValue(Result* result) OVERRIDE;
266 virtual bool SetSource(const std::vector<char>* source) OVERRIDE;
267
268 private:
269 // State and Transition are numbered to make sure they form a continuous block
270 // of numbers for array indexing in lookup tables. If changing State or
271 // Transition, don't forget to update k*Size and the lookup tables.
272 enum State {
273 kStart = 0,
274 kCR1 = 1,
275 kIgnorePreamble = 2,
276 kDB1 = 3,
277 kCR2 = 4,
278 kPart = 5,
279 kName = 6,
280 kColonS = 7, // "S" to distinguish it from the transition kColonT.
281 kEnd1 = 8,
282 kEnd2 = 9,
283 kEnd3 = 10,
284 kCR3 = 11,
285 kPreData = 12,
286 kCR4 = 13,
287 kData = 14,
288 kData2 = 15,
289 kDB2 = 16,
290 kD = 17,
291 kEnd = 18,
292 kCR5 = 19,
293 kIgnoreEpilogue = 20,
294 kError = 21  // Implicit target state when a transition is missing.
295 };
296 enum Transition {
297 kLF = 0,
298 kCR = 1,
299 kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'.
300 kLwsp = 3,
301 kDashBoundary = 4,
302 kColonT = 5, // "T" to distinguish it from the state kColonS.
303 kDash = 6, // Meaning '-', not "--".
304 kAny = 7 // To represent '*'.
305 };
306 static const size_t kStateSize = 22;  // Number of State values (kStart..kError).
307 static const size_t kTransitionSize = 8;  // Number of Transition values (kLF..kAny).
308
309 // Lookup tables:
310 // Maps transitions with one-character label to that character (else to 0).
311 static char kTransitionToChar[];
312 // Indices of transitions available in state |s| in |kAvailableTransitions|
313 // start at kStateToTransition[s] and the last transition for |s| is always
314 // kAny. The target state corresponding to transition kAvailableTransitions[i]
315 // is kNextState[i].
316 static Transition kAvailableTransitions[];
317 static State kNextState[];
318 static size_t kStateToTransition[];
319
320 // Reads the source until the next name-value pair is read. Returns true if
321 // |next_name_| and |next_value_| were successfully updated.
322 bool ReadNextNameValue();
323 // One step of the automaton, based on |state_| and the input from |source_|
324 // to be read. Updates the |offset_| iterator. Returns true on success.
325 bool DoStep();
326 // Tests whether the input pointed to by |offset_| matches transition |t|.
327 // Returns the number of bytes to be read, or 0 if |t| cannot be read.
328 size_t LookUp(Transition t);
329
330 // Extracts "name" and possibly "value" from a Content-Disposition header.
331 // Writes directly into |next_name_| and |next_value_|. Returns true on
332 // success and false otherwise.
333 bool ParseHeader(const base::StringPiece& header);
334
335 bool InFinalState() {  // True when |state_| is kEnd or kIgnoreEpilogue.
336 return state_ == kEnd || state_ == kIgnoreEpilogue;
337 }
338
339 // The parsed message can be split into multiple sources which we read
340 // sequentially.
341 const std::vector<char>* source_;
342 std::vector<char>::const_iterator offset_;
343 // The dash-boundary string is used for all sources.
344 const std::string dash_boundary_;
345 State state_;
346 // The next result to be returned by GetNextNameValue. It is stored as a pair
347 // of StringPieces instead of a Result, to avoid one copy of the data (note
348 // that Result stores a copy of the data in std::string, whereas StringPiece
349 // is just a pointer to source_).
350 base::StringPiece next_name_;
351 base::StringPiece next_value_;
352 bool value_name_present_;  // NOTE(review): undocumented here; confirm its meaning in the .cc file.
353
354 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
355 };
356
357 } // namespace extensions
358
359 #endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698