src/url_parse.h - Issue 14090005: Modify the headers in src/ to forward to url/*.h

Side by Side Diff: src/url_parse.h

Issue 14090005: Modify the headers in src/ to forward to url/*.h (Closed) Base URL: http://google-url.googlecode.com/svn/trunk

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2007, Google Inc.	1 // Copyright 2007, Google Inc.

2 // All rights reserved.	2 // All rights reserved.

3 //	3 //

4 // Redistribution and use in source and binary forms, with or without	4 // Redistribution and use in source and binary forms, with or without

5 // modification, are permitted provided that the following conditions are	5 // modification, are permitted provided that the following conditions are

6 // met:	6 // met:

7 //	7 //

8 // * Redistributions of source code must retain the above copyright	8 // * Redistributions of source code must retain the above copyright

9 // notice, this list of conditions and the following disclaimer.	9 // notice, this list of conditions and the following disclaimer.

10 // * Redistributions in binary form must reproduce the above	10 // * Redistributions in binary form must reproduce the above

(...skipping 12 matching lines...) Expand all Loading...
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT	23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29	29

30 #ifndef GOOGLEURL_SRC_URL_PARSE_H__	30 #ifndef GOOGLEURL_SRC_URL_PARSE_H__

31 #define GOOGLEURL_SRC_URL_PARSE_H__	31 #define GOOGLEURL_SRC_URL_PARSE_H__

32	32

33 #include <string>	33 #include "url/url_parse.h"

34

35 #include "base/basictypes.h"

36 #include "base/string16.h"

37 #include "googleurl/src/url_common.h"

38

39 namespace url_parse {

40

41 // Deprecated, but WebKit/WebCore/platform/KURLGooglePrivate.h and

42 // KURLGoogle.cpp still rely on this type.

43 typedef char16 UTF16Char;

44

45 // Component ------------------------------------------------------------------

46

47 // Represents a substring for URL parsing.

48 struct Component {

49 Component() : begin(0), len(-1) {}

50

51 // Normal constructor: takes an offset and a length.

52 Component(int b, int l) : begin(b), len(l) {}

53

54 int end() const {

55 return begin + len;

56 }

57

58 // Returns true if this component is valid, meaning the length is given. Even

59 // valid components may be empty to record the fact that they exist.

60 bool is_valid() const {

61 return (len != -1);

62 }

63

64 // Returns true if the given component is specified and non-empty; on false, the component

65 // is either empty or invalid.

66 bool is_nonempty() const {

67 return (len > 0);

68 }

69

70 void reset() {

71 begin = 0;

72 len = -1;

73 }

74

75 bool operator==(const Component& other) const {

76 return begin == other.begin && len == other.len;

77 }

78

79 int begin; // Byte offset in the string of this component.

80 int len; // Will be -1 if the component is unspecified.

81 };

82

83 // Helper that returns a component created with the given begin and ending

84 // points. The ending point is non-inclusive.

85 inline Component MakeRange(int begin, int end) {

86 return Component(begin, end - begin);

87 }

88

89 // Parsed ---------------------------------------------------------------------

90

91 // A structure that holds the identified parts of an input URL. This structure

92 // does NOT store the URL itself. The caller will have to store the URL text

93 // and its corresponding Parsed structure separately.

94 //

95 // Typical usage would be:

96 //

97 // url_parse::Parsed parsed;

98 // url_parse::Component scheme;

99 // if (!url_parse::ExtractScheme(url, url_len, &scheme))

100 // return I_CAN_NOT_FIND_THE_SCHEME_DUDE;

101 //

102 // if (IsStandardScheme(url, scheme)) // Not provided by this component

103 // url_parse::ParseStandardURL(url, url_len, &parsed);

104 // else if (IsFileURL(url, scheme)) // Not provided by this component

105 // url_parse::ParseFileURL(url, url_len, &parsed);

106 // else

107 // url_parse::ParsePathURL(url, url_len, &parsed);

108 //

109 struct Parsed {

110 // Identifies different components.

111 enum ComponentType {

112 SCHEME,

113 USERNAME,

114 PASSWORD,

115 HOST,

116 PORT,

117 PATH,

118 QUERY,

119 REF,

120 };

121

122 // The default constructor is sufficient for the components, but inner_parsed_

123 // requires special handling.

124 GURL_API Parsed();

125 GURL_API Parsed(const Parsed&);

126 GURL_API Parsed& operator=(const Parsed&);

127 GURL_API ~Parsed();

128

129 // Returns the length of the URL (the end of the last component).

130 //

131 // Note that for some invalid, non-canonical URLs, this may not be the length

132 // of the string. For example "http://": the parsed structure will only

133 // contain an entry for the four-character scheme, and it doesn't know about

134 // the "://". For all other last-components, it will return the real length.

135 GURL_API int Length() const;

136

137 // Returns the number of characters before the given component if it exists,

138 // or where the component would be if it did exist. This will return the

139 // string length if the component would be appended to the end.

140 //

141 // Note that this can get a little funny for the port, query, and ref

142 // components which have a delimiter that is not counted as part of the

143 // component. The |include_delimiter| flag controls if you want this counted

144 // as part of the component or not when the component exists.

145 //

146 // This example shows the difference between the two flags for two of these

147 // delimited components that are present (the port and query) and one that

148 // isn't (the reference). The components that this flag affects are marked

149 // with a *.

150 // 0 1 2

151 // 012345678901234567890

152 // Example input: http://foo:80/?query

153 // include_delim=true, ...=false ("<-" indicates different)

154 // SCHEME: 0 0

155 // USERNAME: 5 5

156 // PASSWORD: 5 5

157 // HOST: 7 7

158 // *PORT: 10 11 <-

159 // PATH: 13 13

160 // *QUERY: 14 15 <-

161 // *REF: 20 20

162 //

163 GURL_API int CountCharactersBefore(ComponentType type,

164 bool include_delimiter) const;

165

166 // Scheme without the colon: "http://foo/" would have a scheme of "http".

167 // The length will be -1 if no scheme is specified ("foo.com"), or 0 if there

168 // is a colon but no scheme (":foo"). Note that the scheme is not guaranteed

169 // to start at the beginning of the string if there are preceding whitespace

170 // or control characters.

171 Component scheme;

172

173 // Username. Specified in URLs with an @ sign before the host. See |password|.

174 Component username;

175

176 // Password. The length will be -1 if unspecified, 0 if specified but empty.

177 // Not all URLs with a username have a password, as in "http://me@host/".

178 // The password is separated from the username with a colon, as in

179 // "http://me:secret@host/"

180 Component password;

181

182 // Host name.

183 Component host;

184

185 // Port number.

186 Component port;

187

188 // Path, this is everything following the host name. Length will be -1 if

189 // unspecified. This includes the preceding slash, so the path on

190 // "http://www.google.com/asdf" is "/asdf". As a result, it is impossible to

191 // have a 0 length path, it will be -1 in cases like "http://host?foo".

192 // Note that we treat backslashes the same as slashes.

193 Component path;

194

195 // Stuff between the ? and the # after the path. This does not include the

196 // preceding ? character. Length will be -1 if unspecified, 0 if there is

197 // a question mark but no query string.

198 Component query;

199

200 // Indicated by a #, this is everything following the hash sign (not

201 // including it). If there are multiple hash signs, we'll use the last one.

202 // Length will be -1 if there is no hash sign, or 0 if there is one but

203 // nothing follows it.

204 Component ref;

205

206 // This is used for nested URL types, currently only filesystem. If you

207 // parse a filesystem URL, the resulting Parsed will have a nested

208 // inner_parsed_ to hold the parsed inner URL's component information.

209 // For all other url types [including the inner URL], it will be NULL.

210 Parsed* inner_parsed() const {

211 return inner_parsed_;

212 }

213

214 void set_inner_parsed(const Parsed& inner_parsed) {

215 if (!inner_parsed_)

216 inner_parsed_ = new Parsed(inner_parsed);

217 else

218 *inner_parsed_ = inner_parsed;

219 }

220

221 void clear_inner_parsed() {

222 if (inner_parsed_) {

223 delete inner_parsed_;

224 inner_parsed_ = NULL;

225 }

226 }

227

228 private:

229 Parsed* inner_parsed_; // This object is owned and managed by this struct.

230 };

231

232 // Initialization functions ---------------------------------------------------

233 //

234 // These functions parse the given URL, filling in all of the structure's

235 // components. These functions can not fail, they will always do their best

236 // at interpreting the input given.

237 //

238 // The string length of the URL MUST be specified, we do not check for NULLs

239 // at any point in the process, and will actually handle embedded NULLs.

240 //

241 // IMPORTANT: These functions do NOT hang on to the given pointer or copy it

242 // in any way. See the comment above the struct.

243 //

244 // The 8-bit versions require UTF-8 encoding.

245

246 // StandardURL is for when the scheme is known to be one that has an

247 // authority (host) like "http". This function will not handle weird ones

248 // like "about:" and "javascript:", or do the right thing for "file:" URLs.

249 GURL_API void ParseStandardURL(const char* url, int url_len, Parsed* parsed);

250 GURL_API void ParseStandardURL(const char16* url, int url_len, Parsed* parsed);

251

252 // PathURL is for when the scheme is known not to have an authority (host)

253 // section but that aren't file URLs either. The scheme is parsed, and

254 // everything after the scheme is considered as the path. This is used for

255 // things like "about:" and "javascript:"

256 GURL_API void ParsePathURL(const char* url, int url_len, Parsed* parsed);

257 GURL_API void ParsePathURL(const char16* url, int url_len, Parsed* parsed);

258

259 // FileURL is for file URLs. There are some special rules for interpreting

260 // these.

261 GURL_API void ParseFileURL(const char* url, int url_len, Parsed* parsed);

262 GURL_API void ParseFileURL(const char16* url, int url_len, Parsed* parsed);

263

264 // Filesystem URLs are structured differently than other URLs.

265 GURL_API void ParseFileSystemURL(const char* url,

266 int url_len,

267 Parsed* parsed);

268 GURL_API void ParseFileSystemURL(const char16* url,

269 int url_len,

270 Parsed* parsed);

271

272 // MailtoURL is for mailto: URLs. They are made up of scheme, path, and query.

273 GURL_API void ParseMailtoURL(const char* url, int url_len, Parsed* parsed);

274 GURL_API void ParseMailtoURL(const char16* url, int url_len, Parsed* parsed);

275

276 // Helper functions -----------------------------------------------------------

277

278 // Locates the scheme according to the URL parser's rules. This function is

279 // designed so the caller can find the scheme and call the correct Init*

280 // function according to their known scheme types.

281 //

282 // It also does not perform any validation on the scheme.

283 //

284 // This function will return true if the scheme is found and will put the

285 // scheme's range into *scheme. False means no scheme could be found. Note

286 // that a URL beginning with a colon has a scheme, but it is empty, so this

287 // function will return true but *scheme will = (0,0).

288 //

289 // The scheme is found by skipping spaces and control characters at the

290 // beginning, and taking everything from there to the first colon to be the

291 // scheme. The character at scheme.end() will be the colon (we may enhance

292 // this to handle full width colons or something, so don't count on the

293 // actual character value). The character at scheme.end()+1 will be the

294 // beginning of the rest of the URL, be it the authority or the path (or the

295 // end of the string).

296 //

297 // The 8-bit version requires UTF-8 encoding.

298 GURL_API bool ExtractScheme(const char* url, int url_len, Component* scheme);

299 GURL_API bool ExtractScheme(const char16* url, int url_len, Component* scheme);

300

301 // Returns true if ch is a character that terminates the authority segment

302 // of a URL.

303 GURL_API bool IsAuthorityTerminator(char16 ch);

304

305 // Does a best effort parse of input |spec|, in range |auth|. If a particular

306 // component is not found, it will be set to invalid.

307 GURL_API void ParseAuthority(const char* spec,

308 const Component& auth,

309 Component* username,

310 Component* password,

311 Component* hostname,

312 Component* port_num);

313 GURL_API void ParseAuthority(const char16* spec,

314 const Component& auth,

315 Component* username,

316 Component* password,

317 Component* hostname,

318 Component* port_num);

319

320 // Computes the integer port value from the given port component. The port

321 // component should have been identified by one of the init functions on

322 // |Parsed| for the given input url.

323 //

324 // The return value will be a positive integer between 0 and 64K, or one of

325 // the two special values below.

326 enum SpecialPort { PORT_UNSPECIFIED = -1, PORT_INVALID = -2 };

327 GURL_API int ParsePort(const char* url, const Component& port);

328 GURL_API int ParsePort(const char16* url, const Component& port);

329

330 // Extracts the range of the file name in the given url. The path must

331 // already have been computed by the parse function, and the matching URL

332 // and extracted path are provided to this function. The filename is

333 // defined as being everything from the last slash/backslash of the path

334 // to the end of the path.

335 //

336 // The file name will be empty if the path is empty or there is nothing

337 // following the last slash.

338 //

339 // The 8-bit version requires UTF-8 encoding.

340 GURL_API void ExtractFileName(const char* url,

341 const Component& path,

342 Component* file_name);

343 GURL_API void ExtractFileName(const char16* url,

344 const Component& path,

345 Component* file_name);

346

347 // Extract the first key/value from the range defined by |*query|. Updates

348 // |*query| to start at the end of the extracted key/value pair. This is

349 // designed for use in a loop: you can keep calling it with the same query

350 // object and it will iterate over all items in the query.

351 //

352 // Some key/value pairs may have the key, the value, or both be empty (for

353 // example, the query string "?&"). These will be returned. Note that an empty

354 // last parameter "foo.com?" or "foo.com?a&" will not be returned, this case

355 // is the same as "done."

356 //

357 // The initial query component should not include the '?' (this is the default

358 // for parsed URLs).

359 //

360 // If no key/value are found |key| and |value| will be unchanged and it will

361 // return false.

362 GURL_API bool ExtractQueryKeyValue(const char* url,

363 Component* query,

364 Component* key,

365 Component* value);

366 GURL_API bool ExtractQueryKeyValue(const char16* url,

367 Component* query,

368 Component* key,

369 Component* value);

370

371 } // namespace url_parse

372	34

373 #endif // GOOGLEURL_SRC_URL_PARSE_H__	35 #endif // GOOGLEURL_SRC_URL_PARSE_H__

OLD	NEW

« no previous file with comments | « src/url_file.h ('k') | src/url_parse_internal.h » ('j') | no next file with comments »