OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #ifndef CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ | |
6 #define CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ | |
7 | |
8 #include <string> | |
9 #include <vector> | |
10 | |
11 #include "base/memory/scoped_ptr.h" | |
12 // Cannot forward declare StringPiece because it is a typedef. | |
13 #include "base/string_piece.h" | |
14 | |
15 namespace net { | |
16 class URLRequest; | |
17 } | |
18 | |
19 namespace extensions { | |
20 | |
21 // Interface for the form data parsers. | |
22 class FormDataParser { | |
23 public: | |
24 class Result { | |
25 public: | |
26 Result(); | |
27 ~Result(); | |
28 const std::string& name() const { | |
29 return name_; | |
30 } | |
31 const std::string& value() const { | |
32 return value_; | |
33 } | |
34 void set_name(const base::StringPiece& str) { | |
35 str.CopyToString(&name_); | |
36 } | |
37 void set_value(const base::StringPiece& str) { | |
38 str.CopyToString(&value_); | |
39 } | |
40 void set_name(const std::string& str) { | |
41 name_ = str; | |
42 } | |
43 void set_value(const std::string& str) { | |
44 value_ = str; | |
45 } | |
46 void Reset(); | |
47 | |
battre
2012/08/16 19:18:03
nit: -1 new line
vabr (Chromium)
2012/08/17 18:29:57
Done.
| |
48 | |
49 private: | |
50 std::string name_; | |
51 std::string value_; | |
battre
2012/08/16 19:18:03
DISALLOW_COPY_AND_ASSIGN(Result);
+ #include "bas
vabr (Chromium)
2012/08/17 18:29:57
Done.
| |
52 }; | |
53 | |
54 virtual ~FormDataParser(); | |
55 | |
56 // Creates a correct parser instance based on the |request|. Returns NULL | |
57 // on failure. | |
58 static scoped_ptr<FormDataParser> Create(const net::URLRequest* request); | |
59 | |
60 // Creates a correct parser instance based on |content_type_header|, the | |
61 // "Content-Type" request header value. If |content_type_header| is NULL, it | |
62 // defaults to "application/x-www-form-urlencoded". Returns NULL on failure. | |
63 static scoped_ptr<FormDataParser> Create( | |
64 const std::string* content_type_header); | |
65 | |
66 // Returns true if there was some data, it was well formed and all was read. | |
67 virtual bool AllDataReadOK() = 0; | |
68 | |
69 // Returns the next name-value pair as |result|. After SetSource has | |
70 // succeeded, this allows to iterate over all pairs in the source. | |
71 // Returns true as long as a new pair was successfully found. | |
72 virtual bool GetNextNameValue(Result* result) = 0; | |
73 | |
74 // Sets the |source| of the data to be parsed. The ownership is left with the | |
75 // caller and the source should live until |this| dies or |this->SetSource()| | |
76 // is called again, whichever comes sooner. Returns true on success. | |
77 virtual bool SetSource(const std::vector<char>* source) = 0; | |
78 | |
79 protected: | |
80 FormDataParser(); | |
81 | |
82 private: | |
83 DISALLOW_COPY_AND_ASSIGN(FormDataParser); | |
84 }; | |
85 | |
86 // Parses URLencoded forms, see | |
87 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . | |
88 class FormDataParserUrlEncoded : public FormDataParser { | |
89 public: | |
90 FormDataParserUrlEncoded(); | |
91 virtual ~FormDataParserUrlEncoded(); | |
92 | |
93 // Implementation of FormDataParser. | |
94 virtual bool AllDataReadOK() OVERRIDE; | |
95 virtual bool GetNextNameValue(Result* result) OVERRIDE; | |
96 virtual bool SetSource(const std::vector<char>* source) OVERRIDE; | |
97 | |
98 private: | |
99 // Gets next char from |source_|, seeks, and does book-keeping of = and &. | |
100 // Returns false if end of |source_| was reached, otherwise true. | |
101 bool GetNextChar(char* c); | |
102 // Once called the parser gives up and claims any results so far invalid. | |
103 void Abort(); | |
104 | |
105 const std::vector<char>* source_; | |
106 bool aborted_; | |
107 | |
108 // Variables from this block are only to be written to by GetNextChar. | |
109 std::vector<char>::const_iterator offset_; // Next char to be read. | |
110 size_t equality_signs_; // How many '=' were read so far. | |
111 size_t amp_signs_; // How many '&' were read so far. | |
112 bool expect_equality_; // Is the next trailing sign '=' (as opposed to '&')? | |
113 | |
114 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); | |
115 }; | |
116 | |
117 // The following class, FormDataParserMultipart, parses forms encoded as | |
118 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart | |
119 // encoding) and 822 (MIME-headers). | |
120 // | |
121 // Implementation details | |
122 // | |
123 // The original grammar from RFC 2046 is this, "multipart-body" being the root | |
124 // non-terminal: | |
125 // | |
126 // boundary := 0*69<bchars> bcharsnospace | |
127 // bchars := bcharsnospace / " " | |
128 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," | |
129 // / "-" / "." / "/" / ":" / "=" / "?" | |
130 // dash-boundary := "--" boundary | |
131 // multipart-body := [preamble CRLF] | |
132 // dash-boundary transport-padding CRLF | |
133 // body-part *encapsulation | |
134 // close-delimiter transport-padding | |
135 // [CRLF epilogue] | |
136 // transport-padding := *LWSP-char | |
137 // encapsulation := delimiter transport-padding CRLF body-part | |
138 // delimiter := CRLF dash-boundary | |
139 // close-delimiter := delimiter "--" | |
140 // preamble := discard-text | |
141 // epilogue := discard-text | |
142 // discard-text := *(*text CRLF) *text | |
143 // body-part := MIME-part-headers [CRLF *OCTET] | |
144 // OCTET := <any 0-255 octet value> | |
145 // | |
146 // Here, CRLF, DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters | |
147 // of the English alphabet, respectively. | |
148 // The non-terminal "text" is presumably just any text, excluding line breaks. | |
149 // The non-terminal "LWSP-char" is not directly defined in the original grammar | |
150 // but it means "linear whitespace", which is a space or a horizontal tab. | |
151 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, but is in | |
152 // English defined in RFC 822, and can be presented as follows: | |
153 // | |
154 // MIME-part-headers := *MIME-part-header | |
155 // MIME-part-header := name ':' *(text / whitespace) linebreak | |
156 // linebreak := '\r' / '\n' / CRLF | |
157 // whitespace := LWSP-char / CRLF LWSP-char | |
158 // name := namechar *namechar | |
159 // namechar := <ASCII char between 33 and 126, excluding ':'> | |
160 // | |
161 // These sets of rules together compose a grammar, with the root non-terminal | |
162 // "multipart-body". This grammar defines a regular language. Indeed, if the | |
163 // non-terminals are ordered in this way: | |
164 // namechar < name < CRLF < DIGIT < ALPHA < text < LWSP-char < whitespace < | |
165 // linebreak < MIME-part-header < MIME-part-headers < bcharsnospace < | |
166 // bchars < boundary < dash-boundary < delimiter < close-delimiter < | |
167 // discard-text < transport-padding < OCTET < body-part < encapsulation < | |
168 // multipart-body | |
169 // then it is easy to verify that whenever A<B then no grammar rule with head | |
170 // A contains B in the body. By induction on the above order, each non-terminal | |
171 // defines a regular language: a non-terminal C is defined by a rule C := exp, | |
172 // where "exp" is an expression composed from character constants, non-terminals | |
173 // less than C, and the following closure operations of regular languages: | |
174 // concatenation, union and Kleene-star. By induction, all the lesser | |
175 // non-terminals represent regular languages, thus "exp" also represents a | |
176 // regular language. In particular, the root non-terminal (and thus the grammar) | |
177 // defines a regular language. | |
178 // | |
179 // The FormDataParserMultipart class uses a finite automaton to represent this | |
180 // language. It is easiest to view it in an extended form, with longer words | |
181 // allowed to label a single transition to keep the number of states low. | |
182 // Important states have full-word names, unimportant states (always with only | |
183 // one incoming label) have names abbreviating the incoming label, possibly | |
184 // with an index. | |
185 // | |
186 // Automaton for "multipart-body": | |
vabr (Chromium)
2012/08/16 08:00:59
An alternative to hand-writing the automaton would
| |
187 // Initial state = Start | |
188 // Final states = {End, IgnoreEpilogue} | |
189 // Implicit state (when a transition is missing) = Error | |
190 // Transition table ('*' is a label matching everything not matched by other | |
191 // labels leaving the same state): | |
192 // FROM LABEL TO | |
193 // Start dash-boundary DB1 | |
194 // CR CR1 | |
195 // * IgnorePreamble | |
196 // CR1 LF Start | |
197 // * IgnorePreamble | |
198 // IgnorePreamble CR CR1 | |
199 // * IgnorePreamble | |
200 // DB1 LWSP-char DB1 | |
201 // CR CR2 | |
202 // CR2 LF Part | |
203 // Part <ASCII 33-126, excluding ':'> Name | |
204 // CR CR3 | |
205 // Name <ASCII 33-126, excluding ':'> Name | |
206 // ':' Colon | |
207 // Colon LF End1 | |
208 // CR End2 | |
209 // * Colon | |
210 // End1 CR CR3 | |
211 // <ASCII 33-126, excluding ':'> Name | |
212 // End2 LF End3 | |
213 // CR CR3 | |
214 // <ASCII 33-126, excluding ':'> Name | |
215 // End3 LWSP-char Colon | |
216 // CR CR3 | |
217 // <ASCII 33-126, excluding ':'> Name | |
218 // CR3 LF PreData | |
219 // PreData dash-boundary DB2 | |
220 // CR CR4 | |
221 // * Data | |
222 // CR4 LF Data2 | |
223 // * Data | |
224 // Data CR CR4 | |
225 // * Data | |
226 // Data2 dash-boundary DB2 | |
227 // * CR4 | |
228 // DB2 LWSP-char DB1 | |
229 // CR CR2 | |
230 // '-' D | |
231 // D '-' End | |
232 // End LWSP-char End | |
233 // CR CR5 | |
234 // CR5 LF IgnoreEpilogue | |
235 // IgnoreEpilogue * IgnoreEpilogue | |
236 // | |
237 // The automaton itself only allows to check that the input is a well-formed | |
238 // multipart encoding of a form. To also extract the data, additional logic is | |
239 // added: | |
240 // * The header "Content-Disposition" (read between Part and PreData) contains | |
241 // the elements name=... and optionally filename=... The former is the name | |
242 // of the corresponding field of a form. The latter is only present if that | |
243 // field was a file-upload, and contains the path to the uploaded file. | |
244 // * The data of a message part is read between PreData and DB2, excluding the | |
245 // last CR LF dash-boundary. | |
246 // | |
247 // IMPORTANT NOTE | |
248 // This parser supports multiple sources, i.e., SetSource can be called multiple | |
249 // times if the input is spread over several byte vectors. However, the split | |
250 // must not occur in the middle of a transition of the above-described automaton, | |
251 // e.g., if there is a transition StateA --dash-boundary--> StateB, then the | |
252 // whole string with the dash-boundary must be contained in the first source, | |
253 // or in the other. Also, the split must not occur in the middle of a header, | |
254 // or a part body data. A message part from one source must be read via | |
255 // GetNextNameValue before setting up a new source. | |
256 class FormDataParserMultipart : public FormDataParser { | |
257 public: | |
258 explicit FormDataParserMultipart(const std::string& boundary_separator); | |
259 virtual ~FormDataParserMultipart(); | |
260 | |
261 // Implementation of FormDataParser. | |
262 virtual bool AllDataReadOK() OVERRIDE; | |
263 virtual bool GetNextNameValue(Result* result) OVERRIDE; | |
264 virtual bool SetSource(const std::vector<char>* source) OVERRIDE; | |
265 | |
266 private: | |
267 // State and Transition are numbered to make sure they form a continuous block | |
268 // of numbers for array indexing in lookup tables. If changing State or | |
269 // Transition, don't forget to update k*Size and the lookup tables. | |
270 enum State { | |
271 kStart = 0, | |
272 kCR1 = 1, | |
273 kIgnorePreamble = 2, | |
274 kDB1 = 3, | |
275 kCR2 = 4, | |
276 kPart = 5, | |
277 kName = 6, | |
278 kColonS = 7, // "S" to distinguish it from the transition kColonT. | |
279 kEnd1 = 8, | |
280 kEnd2 = 9, | |
281 kEnd3 = 10, | |
282 kCR3 = 11, | |
283 kPreData = 12, | |
284 kCR4 = 13, | |
285 kData = 14, | |
286 kData2 = 15, | |
287 kDB2 = 16, | |
288 kD = 17, | |
289 kEnd = 18, | |
290 kCR5 = 19, | |
291 kIgnoreEpilogue = 20, | |
292 kError = 21 | |
293 }; | |
294 enum Transition { | |
295 kLF = 0, | |
296 kCR = 1, | |
297 kAscii = 2, // A "shorthand" for ASCII 33-126 without ':'. | |
298 kLwsp = 3, | |
299 kDashBoundary = 4, | |
300 kColonT = 5, // "T" to distinguish it from the state kColonS. | |
301 kDash = 6, // Meaning '-', not "--". | |
302 kAny = 7 // To represent '*'. | |
303 }; | |
304 static const size_t kStateSize = 22; | |
305 static const size_t kTransitionSize = 8; | |
306 | |
307 // Lookup tables: | |
308 // Maps transitions with one-character label to that character (else to 0). | |
309 static char kTransitionToChar[]; | |
310 // Indices of transitions available in state |s| in |kAvailableTransitions| | |
311 // start at kStateToTransition[s] and the last transition for |s| is always | |
312 // kAny. The target state corresponding to transition kAvailableTransitions[i] | |
313 // is kNextState[i]. | |
314 static Transition kAvailableTransitions[]; | |
315 static State kNextState[]; | |
316 static size_t kStateToTransition[]; | |
317 | |
318 // Reads the source until the next name-value pair is read. Returns true if | |
319 // |next_name_| and |next_value_| were successfully updated. | |
320 bool ReadNextNameValue(); | |
321 // One step of the automaton, based on |state_| and the input from |source_| | |
322 // to be read. Updates the |offset_| iterator. Returns true on success. | |
323 bool DoStep(); | |
324 // Tests whether the input pointed to by |offset_| allows to read transition | |
325 // |t|. It returns the number of bytes to be read, or 0 if |t| cannot be read. | |
326 size_t LookUp(Transition t); | |
327 | |
328 // Extracts "name" and possibly "value" from a Content-Disposition header. | |
329 // Writes directly into |next_name_| and |next_value_|. Returns true on | |
330 // success and false otherwise. | |
331 bool ParseHeader(const base::StringPiece& header); | |
332 | |
333 bool InFinalState() { | |
334 return state_ == kEnd || state_ == kIgnoreEpilogue; | |
335 } | |
336 | |
337 // The parsed message can be split into multiple sources which we read | |
338 // sequentially. | |
339 const std::vector<char>* source_; | |
340 std::vector<char>::const_iterator offset_; | |
341 // The dash-boundary string is used for all sources. | |
342 const std::string dash_boundary_; | |
343 State state_; | |
344 // The next result to be returned by GetNextNameValue. It is stored as a pair | |
345 // of StringPieces instead of a Result, to avoid one copy of the data (note | |
346 // that Result stores a copy of the data in std::string, whereas StringPiece | |
347 // is just a pointer to source_). | |
348 base::StringPiece next_name_; | |
349 base::StringPiece next_value_; | |
350 bool value_name_present_; | |
351 | |
352 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); | |
353 }; | |
354 | |
355 } // namespace extensions | |
356 | |
357 #endif // CHROME_BROWSER_EXTENSIONS_API_WEB_REQUEST_FORM_DATA_PARSER_H_ | |
OLD | NEW |