src/url_canon.h - Issue 14090005: Modify the headers in src/ to forward to url/*.h

Side by Side Diff: src/url_canon.h

Issue 14090005: Modify the headers in src/ to forward to url/*.h (Closed) Base URL: http://google-url.googlecode.com/svn/trunk

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2007, Google Inc.	1 // Copyright 2007, Google Inc.

2 // All rights reserved.	2 // All rights reserved.

3 //	3 //

4 // Redistribution and use in source and binary forms, with or without	4 // Redistribution and use in source and binary forms, with or without

5 // modification, are permitted provided that the following conditions are	5 // modification, are permitted provided that the following conditions are

6 // met:	6 // met:

7 //	7 //

8 // * Redistributions of source code must retain the above copyright	8 // * Redistributions of source code must retain the above copyright

9 // notice, this list of conditions and the following disclaimer.	9 // notice, this list of conditions and the following disclaimer.

10 // * Redistributions in binary form must reproduce the above	10 // * Redistributions in binary form must reproduce the above

(...skipping 11 matching lines...) Expand all Loading...
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,	22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT	23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29 #ifndef GOOGLEURL_SRC_URL_CANON_H__	29 #ifndef GOOGLEURL_SRC_URL_CANON_H__

30 #define GOOGLEURL_SRC_URL_CANON_H__	30 #define GOOGLEURL_SRC_URL_CANON_H__

31	31

32 #include <string.h>	32 #include "url/url_canon.h"

33 #include <stdlib.h>

34

35 #include "base/string16.h"

36 #include "googleurl/src/url_common.h"

37 #include "googleurl/src/url_parse.h"

38

39 namespace url_canon {

40

41 // Canonicalizer output -------------------------------------------------------

42

43 // Base class for the canonicalizer output, this maintains a buffer and

44 // supports simple resizing and append operations on it.

45 //

46 // It is VERY IMPORTANT that no virtual function calls be made on the common

47 // code path. We only have two virtual function calls, the destructor and a

48 // resize function that is called when the existing buffer is not big enough.

49 // The derived class is then in charge of setting up our buffer which we will

50 // manage.

51 template<typename T>

52 class CanonOutputT {

53 public:

54 CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {

55 }

56 virtual ~CanonOutputT() {

57 }

58

59 // Implemented to resize the buffer. This function should update the buffer

60 // pointer to point to the new buffer, and any old data up to \|cur_len_\| in

61 // the buffer must be copied over.

62 //

63 // The new size \|sz\| must be larger than buffer_len_.

64 virtual void Resize(int sz) = 0;

65

66 // Accessor for returning a character at a given position. The input offset

67 // must be in the valid range.

68 inline char at(int offset) const {

69 return buffer_[offset];

70 }

71

72 // Sets the character at the given position. The given position MUST be less

73 // than the length().

74 inline void set(int offset, int ch) {

75 buffer_[offset] = ch;

76 }

77

78 // Returns the number of characters currently in the buffer.

79 inline int length() const {

80 return cur_len_;

81 }

82

83 // Returns the current capacity of the buffer. The length() is the number of

84 // characters that have been declared to be written, but the capacity() is

85 // the number that can be written without reallocation. If the caller must

86 // write many characters at once, it can make sure there is enough capacity,

87 // write the data, then use set_length() to declare the new length().

88 int capacity() const {

89 return buffer_len_;

90 }

91

92 // Called by the user of this class to get the output. The output will NOT

93 // be NULL-terminated. Call length() to get the

94 // length.

95 const T* data() const {

96 return buffer_;

97 }

98 T* data() {

99 return buffer_;

100 }

101

102 // Shortens the URL to the new length. Used for "backing up" when processing

103 // relative paths. This can also be used if an external function writes a lot

104 // of data to the buffer (when using the "Raw" version below) beyond the end,

105 // to declare the new length.

106 //

107 // This MUST NOT be used to expand the size of the buffer beyond capacity().

108 void set_length(int new_len) {

109 cur_len_ = new_len;

110 }

111

112 // This is the most performance critical function, since it is called for

113 // every character.

114 void push_back(T ch) {

115 // In VC2005, putting this common case first speeds up execution

116 // dramatically because this branch is predicted as taken.

117 if (cur_len_ < buffer_len_) {

118 buffer_[cur_len_] = ch;

119 cur_len_++;

120 return;

121 }

122

123 // Grow the buffer to hold at least one more item. Hopefully we won't have

124 // to do this very often.

125 if (!Grow(1))

126 return;

127

128 // Actually do the insertion.

129 buffer_[cur_len_] = ch;

130 cur_len_++;

131 }

132

133 // Appends the given string to the output.

134 void Append(const T* str, int str_len) {

135 if (cur_len_ + str_len > buffer_len_) {

136 if (!Grow(cur_len_ + str_len - buffer_len_))

137 return;

138 }

139 for (int i = 0; i < str_len; i++)

140 buffer_[cur_len_ + i] = str[i];

141 cur_len_ += str_len;

142 }

143

144 protected:

145 // Grows the given buffer so that it can fit at least \|min_additional\|

146 // characters. Returns true if the buffer could be resized, false on OOM.

147 bool Grow(int min_additional) {

148 static const int kMinBufferLen = 16;

149 int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;

150 do {

151 if (new_len >= (1 << 30)) // Prevent overflow below.

152 return false;

153 new_len *= 2;

154 } while (new_len < buffer_len_ + min_additional);

155 Resize(new_len);

156 return true;

157 }

158

159 T* buffer_;

160 int buffer_len_;

161

162 // Used characters in the buffer.

163 int cur_len_;

164 };

165

166 // Simple implementation of the CanonOutput using new[]. This class

167 // also supports a static buffer so if it is allocated on the stack, most

168 // URLs can be canonicalized with no heap allocations.

169 template<typename T, int fixed_capacity = 1024>

170 class RawCanonOutputT : public CanonOutputT<T> {

171 public:

172 RawCanonOutputT() : CanonOutputT<T>() {

173 this->buffer_ = fixed_buffer_;

174 this->buffer_len_ = fixed_capacity;

175 }

176 virtual ~RawCanonOutputT() {

177 if (this->buffer_ != fixed_buffer_)

178 delete[] this->buffer_;

179 }

180

181 virtual void Resize(int sz) {

182 T* new_buf = new T[sz];

183 memcpy(new_buf, this->buffer_,

184 sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz));

185 if (this->buffer_ != fixed_buffer_)

186 delete[] this->buffer_;

187 this->buffer_ = new_buf;

188 this->buffer_len_ = sz;

189 }

190

191 protected:

192 T fixed_buffer_[fixed_capacity];

193 };

194

195 // Normally, all canonicalization output is in narrow characters. We support

196 // the templates so it can also be used internally if a wide buffer is

197 // required.

198 typedef CanonOutputT<char> CanonOutput;

199 typedef CanonOutputT<char16> CanonOutputW;

200

201 template<int fixed_capacity>

202 class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};

203 template<int fixed_capacity>

204 class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {};

205

206 // Character set converter ----------------------------------------------------

207 //

208 // Converts query strings into a custom encoding. The embedder can supply an

209 // implementation of this class to interface with their own character set

210 // conversion libraries.

211 //

212 // Embedders will want to see the unit test for the ICU version.

213

214 class CharsetConverter {

215 public:

216 CharsetConverter() {}

217 virtual ~CharsetConverter() {}

218

219 // Converts the given input string from UTF-16 to whatever output format the

220 // converter supports. This is used only for the query encoding conversion,

221 // which does not fail. Instead, the converter should insert "invalid

222 // character" characters in the output for invalid sequences, and do the

223 // best it can.

224 //

225 // If the input contains a character not representable in the output

226 // character set, the converter should append the HTML entity sequence in

227 // decimal, (such as "&#20320;") with escaping of the ampersand, number

228 // sign, and semicolon (in the previous example it would be

229 // "%26%2320320%3B"). This rule is based on what IE does in this situation.

230 virtual void ConvertFromUTF16(const char16* input,

231 int input_len,

232 CanonOutput* output) = 0;

233 };

234

235 // Whitespace -----------------------------------------------------------------

236

237 // Searches for whitespace that should be removed from the middle of URLs, and

238 // removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces

239 // are preserved, which is what most browsers do. A pointer to the output will

240 // be returned, and the length of that output will be in \|output_len\|.

241 //

242 // This should be called before parsing if whitespace removal is desired (which

243 // it normally is when you are canonicalizing).

244 //

245 // If no whitespace is removed, this function will not use the buffer and will

246 // return a pointer to the input, to avoid the extra copy. If modification is

247 // required, the given \|buffer\| will be used and the returned pointer will

248 // point to the beginning of the buffer.

249 //

250 // Therefore, callers should not use the buffer, since it may actually be empty,

251 // use the computed pointer and \|*output_len\| instead.

252 GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,

253 CanonOutputT<char>* buffer,

254 int* output_len);

255 GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,

256 CanonOutputT<char16>* buffer,

257 int* output_len);

258

259 // IDN ------------------------------------------------------------------------

260

261 // Converts the Unicode input representing a hostname to ASCII using IDN rules.

262 // The output must fall in the ASCII range, but will be encoded in UTF-16.

263 //

264 // On success, the output will be filled with the ASCII host name and it will

265 // return true. Unlike most other canonicalization functions, this assumes that

266 // the output is empty. The beginning of the host will be at offset 0, and

267 // the length of the output will be set to the length of the new host name.

268 //

269 // On error, returns false. The output in this case is undefined.

270 GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);

271

272 // Piece-by-piece canonicalizers ----------------------------------------------

273 //

274 // These individual canonicalizers append the canonicalized versions of the

275 // corresponding URL component to the given std::string. The spec and the

276 // previously-identified range of that component are the input. The range of

277 // the canonicalized component will be written to the output component.

278 //

279 // These functions all append to the output so they can be chained. Make sure

280 // the output is empty when you start.

281 //

282 // These functions return boolean values indicating success. On failure, they

283 // will attempt to write something reasonable to the output so that, if

284 // displayed to the user, they will recognise it as something that's messed up.

285 // Nothing more should ever be done with these invalid URLs, however.

286

287 // Scheme: Appends the scheme and colon to the URL. The output component will

288 // indicate the range of characters up to but not including the colon.

289 //

290 // Canonical URLs always have a scheme. If the scheme is not present in the

291 // input, this will just write the colon to indicate an empty scheme. Does not

292 // append slashes which will be needed before any authority components for most

293 // URLs.

294 //

295 // The 8-bit version requires UTF-8 encoding.

296 GURL_API bool CanonicalizeScheme(const char* spec,

297 const url_parse::Component& scheme,

298 CanonOutput* output,

299 url_parse::Component* out_scheme);

300 GURL_API bool CanonicalizeScheme(const char16* spec,

301 const url_parse::Component& scheme,

302 CanonOutput* output,

303 url_parse::Component* out_scheme);

304

305 // User info: username/password. If present, this will add the delimiters so

306 // the output will be "<username>:<password>@" or "<username>@". Empty

307 // username/password pairs, or empty passwords, will get converted to

308 // nonexistent in the canonical version.

309 //

310 // The components for the username and password refer to ranges in the

311 // respective source strings. Usually, these will be the same string, which

312 // is legal as long as the two components don't overlap.

313 //

314 // The 8-bit version requires UTF-8 encoding.

315 GURL_API bool CanonicalizeUserInfo(const char* username_source,

316 const url_parse::Component& username,

317 const char* password_source,

318 const url_parse::Component& password,

319 CanonOutput* output,

320 url_parse::Component* out_username,

321 url_parse::Component* out_password);

322 GURL_API bool CanonicalizeUserInfo(const char16* username_source,

323 const url_parse::Component& username,

324 const char16* password_source,

325 const url_parse::Component& password,

326 CanonOutput* output,

327 url_parse::Component* out_username,

328 url_parse::Component* out_password);

329

330

331 // This structure holds detailed state exported from the IP/Host canonicalizers.

332 // Additional fields may be added as callers require them.

333 struct CanonHostInfo {

334 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}

335

336 // Convenience function to test if family is an IP address.

337 bool IsIPAddress() const { return family == IPV4 \|\| family == IPV6; }

338

339 // This field summarizes how the input was classified by the canonicalizer.

340 enum Family {

341 NEUTRAL, // - Doesn't resemble an IP address. As far as the IP

342 // canonicalizer is concerned, it should be treated as a

343 // hostname.

344 BROKEN, // - Almost an IP, but was not canonicalized. This could be an

345 // IPv4 address where truncation occurred, or something

346 // containing the special characters :[] which did not parse

347 // as an IPv6 address. Never attempt to connect to this

348 // address, because it might actually succeed!

349 IPV4, // - Successfully canonicalized as an IPv4 address.

350 IPV6, // - Successfully canonicalized as an IPv6 address.

351 };

352 Family family;

353

354 // If \|family\| is IPV4, then this is the number of nonempty dot-separated

355 // components in the input text, from 1 to 4. If \|family\| is not IPV4,

356 // this value is undefined.

357 int num_ipv4_components;

358

359 // Location of host within the canonicalized output.

360 // CanonicalizeIPAddress() only sets this field if \|family\| is IPV4 or IPV6.

361 // CanonicalizeHostVerbose() always sets it.

362 url_parse::Component out_host;

363

364 // \|address\| contains the parsed IP Address (if any) in its first

365 // AddressLength() bytes, in network order. If IsIPAddress() is false

366 // AddressLength() will return zero and the content of \|address\| is undefined.

367 unsigned char address[16];

368

369 // Convenience function to calculate the length of an IP address corresponding

370 // to the current IP version in \|family\|, if any. For use with \|address\|.

371 int AddressLength() const {

372 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0);

373 }

374 };

375

376

377 // Host.

378 //

379 // The 8-bit version requires UTF-8 encoding. Use this version when you only

380 // need to know whether canonicalization succeeded.

381 GURL_API bool CanonicalizeHost(const char* spec,

382 const url_parse::Component& host,

383 CanonOutput* output,

384 url_parse::Component* out_host);

385 GURL_API bool CanonicalizeHost(const char16* spec,

386 const url_parse::Component& host,

387 CanonOutput* output,

388 url_parse::Component* out_host);

389

390 // Extended version of CanonicalizeHost, which returns additional information.

391 // Use this when you need to know whether the hostname was an IP address.

392 // A successful return is indicated by host_info->family != BROKEN. See the

393 // definition of CanonHostInfo above for details.

394 GURL_API void CanonicalizeHostVerbose(const char* spec,

395 const url_parse::Component& host,

396 CanonOutput* output,

397 CanonHostInfo* host_info);

398 GURL_API void CanonicalizeHostVerbose(const char16* spec,

399 const url_parse::Component& host,

400 CanonOutput* output,

401 CanonHostInfo* host_info);

402

403

404 // IP addresses.

405 //

406 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is

407 // an IP address, it will canonicalize it as such, appending it to \|output\|.

408 // Additional status information is returned via the \|*host_info\| parameter.

409 // See the definition of CanonHostInfo above for details.

410 //

411 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that

412 // the input is unescaped and name-prepped, etc. It should not normally be

413 // necessary or wise to call this directly.

414 GURL_API void CanonicalizeIPAddress(const char* spec,

415 const url_parse::Component& host,

416 CanonOutput* output,

417 CanonHostInfo* host_info);

418 GURL_API void CanonicalizeIPAddress(const char16* spec,

419 const url_parse::Component& host,

420 CanonOutput* output,

421 CanonHostInfo* host_info);

422

423 // Port: this function will add the colon for the port if a port is present.

424 // The caller can pass url_parse::PORT_UNSPECIFIED as the

425 // default_port_for_scheme argument if there is no default port.

426 //

427 // The 8-bit version requires UTF-8 encoding.

428 GURL_API bool CanonicalizePort(const char* spec,

429 const url_parse::Component& port,

430 int default_port_for_scheme,

431 CanonOutput* output,

432 url_parse::Component* out_port);

433 GURL_API bool CanonicalizePort(const char16* spec,

434 const url_parse::Component& port,

435 int default_port_for_scheme,

436 CanonOutput* output,

437 url_parse::Component* out_port);

438

439 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED

440 // if the scheme is unknown.

441 GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);

442

443 // Path. If the input does not begin in a slash (including if the input is

444 // empty), we'll prepend a slash to the path to make it canonical.

445 //

446 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity

447 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid

448 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't

449 // an issue. Somebody giving us an 8-bit path is responsible for generating

450 // the path that the server expects (we'll escape high-bit characters), so

451 // if something is invalid, it's their problem.

452 GURL_API bool CanonicalizePath(const char* spec,

453 const url_parse::Component& path,

454 CanonOutput* output,

455 url_parse::Component* out_path);

456 GURL_API bool CanonicalizePath(const char16* spec,

457 const url_parse::Component& path,

458 CanonOutput* output,

459 url_parse::Component* out_path);

460

461 // Canonicalizes the input as a file path. This is like CanonicalizePath except

462 // that it also handles Windows drive specs. For example, the path can begin

463 // with "c\|\" and it will get properly canonicalized to "C:/".

464 // The string will be appended to \|output\| and \|out_path\| will be updated.

465 //

466 // The 8-bit version requires UTF-8 encoding.

467 GURL_API bool FileCanonicalizePath(const char* spec,

468 const url_parse::Component& path,

469 CanonOutput* output,

470 url_parse::Component* out_path);

471 GURL_API bool FileCanonicalizePath(const char16* spec,

472 const url_parse::Component& path,

473 CanonOutput* output,

474 url_parse::Component* out_path);

475

476 // Query: Prepends the ? if needed.

477 //

478 // The 8-bit version requires the input to be UTF-8 encoding. Incorrectly

479 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode

480 // "invalid character." This function can not fail, we always just try to do

481 // our best for crazy input here since web pages can set it themselves.

482 //

483 // This will convert the given input into the output encoding that the given

484 // character set converter object provides. The converter will only be called

485 // if necessary, for ASCII input, no conversions are necessary.

486 //

487 // The converter can be NULL. In this case, the output encoding will be UTF-8.

488 GURL_API void CanonicalizeQuery(const char* spec,

489 const url_parse::Component& query,

490 CharsetConverter* converter,

491 CanonOutput* output,

492 url_parse::Component* out_query);

493 GURL_API void CanonicalizeQuery(const char16* spec,

494 const url_parse::Component& query,

495 CharsetConverter* converter,

496 CanonOutput* output,

497 url_parse::Component* out_query);

498

499 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only

500 // canonicalizer that does not produce ASCII output). The output is

501 // guaranteed to be valid UTF-8.

502 //

503 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use

504 // the "Unicode replacement character" for the confusing bits and copy the rest.

505 GURL_API void CanonicalizeRef(const char* spec,

506 const url_parse::Component& path,

507 CanonOutput* output,

508 url_parse::Component* out_path);

509 GURL_API void CanonicalizeRef(const char16* spec,

510 const url_parse::Component& path,

511 CanonOutput* output,

512 url_parse::Component* out_path);

513

514 // Full canonicalizer ---------------------------------------------------------

515 //

516 // These functions replace any string contents, rather than append as above.

517 // See the above piece-by-piece functions for information specific to

518 // canonicalizing individual components.

519 //

520 // The output will be ASCII except the reference fragment, which may be UTF-8.

521 //

522 // The 8-bit versions require UTF-8 encoding.

523

524 // Use for standard URLs with authorities and paths.

525 GURL_API bool CanonicalizeStandardURL(const char* spec,

526 int spec_len,

527 const url_parse::Parsed& parsed,

528 CharsetConverter* query_converter,

529 CanonOutput* output,

530 url_parse::Parsed* new_parsed);

531 GURL_API bool CanonicalizeStandardURL(const char16* spec,

532 int spec_len,

533 const url_parse::Parsed& parsed,

534 CharsetConverter* query_converter,

535 CanonOutput* output,

536 url_parse::Parsed* new_parsed);

537

538 // Use for file URLs.

539 GURL_API bool CanonicalizeFileURL(const char* spec,

540 int spec_len,

541 const url_parse::Parsed& parsed,

542 CharsetConverter* query_converter,

543 CanonOutput* output,

544 url_parse::Parsed* new_parsed);

545 GURL_API bool CanonicalizeFileURL(const char16* spec,

546 int spec_len,

547 const url_parse::Parsed& parsed,

548 CharsetConverter* query_converter,

549 CanonOutput* output,

550 url_parse::Parsed* new_parsed);

551

552 // Use for filesystem URLs.

553 GURL_API bool CanonicalizeFileSystemURL(const char* spec,

554 int spec_len,

555 const url_parse::Parsed& parsed,

556 CharsetConverter* query_converter,

557 CanonOutput* output,

558 url_parse::Parsed* new_parsed);

559 GURL_API bool CanonicalizeFileSystemURL(const char16* spec,

560 int spec_len,

561 const url_parse::Parsed& parsed,

562 CharsetConverter* query_converter,

563 CanonOutput* output,

564 url_parse::Parsed* new_parsed);

565

566 // Use for path URLs such as javascript. This does not modify the path in any

567 // way, for example, by escaping it.

568 GURL_API bool CanonicalizePathURL(const char* spec,

569 int spec_len,

570 const url_parse::Parsed& parsed,

571 CanonOutput* output,

572 url_parse::Parsed* new_parsed);

573 GURL_API bool CanonicalizePathURL(const char16* spec,

574 int spec_len,

575 const url_parse::Parsed& parsed,

576 CanonOutput* output,

577 url_parse::Parsed* new_parsed);

578

579 // Use for mailto URLs. This "canonicalizes" the url into a path and query

580 // component. It does not attempt to merge "to" fields. It uses UTF-8 for

581 // the query encoding if there is a query. This is because a mailto URL is

582 // really intended for an external mail program, and the encoding of a page,

583 // etc. which would influence a query encoding normally are irrelevant.

584 GURL_API bool CanonicalizeMailtoURL(const char* spec,

585 int spec_len,

586 const url_parse::Parsed& parsed,

587 CanonOutput* output,

588 url_parse::Parsed* new_parsed);

589 GURL_API bool CanonicalizeMailtoURL(const char16* spec,

590 int spec_len,

591 const url_parse::Parsed& parsed,

592 CanonOutput* output,

593 url_parse::Parsed* new_parsed);

594

595 // Part replacer --------------------------------------------------------------

596

597 // Internal structure used for storing separate strings for each component.

598 // The basic canonicalization functions use this structure internally so that

599 // component replacement (different strings for different components) can be

600 // treated on the same code path as regular canonicalization (the same string

601 // for each component).

602 //

603 // A url_parse::Parsed structure usually goes along with this. Those

604 // components identify offsets within these strings, so that they can all be

605 // in the same string, or spread arbitrarily across different ones.

606 //

607 // This structure does not own any data. It is the caller's responsibility to

608 // ensure that the data the pointers point to stays in scope and is not

609 // modified.

610 template<typename CHAR>

611 struct URLComponentSource {

612 // Constructor normally used by callers wishing to replace components. This

613 // will make them all NULL, which is no replacement. The caller would then

614 // override the components they want to replace.

615 URLComponentSource()

616 : scheme(NULL),

617 username(NULL),

618 password(NULL),

619 host(NULL),

620 port(NULL),

621 path(NULL),

622 query(NULL),

623 ref(NULL) {

624 }

625

626 // Constructor normally used internally to initialize all the components to

627 // point to the same spec.

628 explicit URLComponentSource(const CHAR* default_value)

629 : scheme(default_value),

630 username(default_value),

631 password(default_value),

632 host(default_value),

633 port(default_value),

634 path(default_value),

635 query(default_value),

636 ref(default_value) {

637 }

638

639 const CHAR* scheme;

640 const CHAR* username;

641 const CHAR* password;

642 const CHAR* host;

643 const CHAR* port;

644 const CHAR* path;

645 const CHAR* query;

646 const CHAR* ref;

647 };

648

649 // This structure encapsulates information on modifying a URL. Each component

650 // may either be left unchanged, replaced, or deleted.

651 //

652 // By default, each component is unchanged. For those components that should be

653 // modified, call either Set* or Clear* to modify it.

654 //

655 // The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT

656 // IN SCOPE BY THE CALLER for as long as this object exists!

657 //

658 // Prefer the 8-bit replacement version if possible since it is more efficient.

659 template<typename CHAR>

660 class Replacements {

661 public:

662 Replacements() {

663 }

664

665 // Scheme

666 void SetScheme(const CHAR* s, const url_parse::Component& comp) {

667 sources_.scheme = s;

668 components_.scheme = comp;

669 }

670 // Note: we don't have a ClearScheme since this doesn't make any sense.

671 bool IsSchemeOverridden() const { return sources_.scheme != NULL; }

672

673 // Username

674 void SetUsername(const CHAR* s, const url_parse::Component& comp) {

675 sources_.username = s;

676 components_.username = comp;

677 }

678 void ClearUsername() {

679 sources_.username = Placeholder();

680 components_.username = url_parse::Component();

681 }

682 bool IsUsernameOverridden() const { return sources_.username != NULL; }

683

684 // Password

685 void SetPassword(const CHAR* s, const url_parse::Component& comp) {

686 sources_.password = s;

687 components_.password = comp;

688 }

689 void ClearPassword() {

690 sources_.password = Placeholder();

691 components_.password = url_parse::Component();

692 }

693 bool IsPasswordOverridden() const { return sources_.password != NULL; }

694

695 // Host

696 void SetHost(const CHAR* s, const url_parse::Component& comp) {

697 sources_.host = s;

698 components_.host = comp;

699 }

700 void ClearHost() {

701 sources_.host = Placeholder();

702 components_.host = url_parse::Component();

703 }

704 bool IsHostOverridden() const { return sources_.host != NULL; }

705

706 // Port

707 void SetPort(const CHAR* s, const url_parse::Component& comp) {

708 sources_.port = s;

709 components_.port = comp;

710 }

711 void ClearPort() {

712 sources_.port = Placeholder();

713 components_.port = url_parse::Component();

714 }

715 bool IsPortOverridden() const { return sources_.port != NULL; }

716

717 // Path

718 void SetPath(const CHAR* s, const url_parse::Component& comp) {

719 sources_.path = s;

720 components_.path = comp;

721 }

722 void ClearPath() {

723 sources_.path = Placeholder();

724 components_.path = url_parse::Component();

725 }

726 bool IsPathOverridden() const { return sources_.path != NULL; }

727

728 // Query

729 void SetQuery(const CHAR* s, const url_parse::Component& comp) {

730 sources_.query = s;

731 components_.query = comp;

732 }

733 void ClearQuery() {

734 sources_.query = Placeholder();

735 components_.query = url_parse::Component();

736 }

737 bool IsQueryOverridden() const { return sources_.query != NULL; }

738

739 // Ref

740 void SetRef(const CHAR* s, const url_parse::Component& comp) {

741 sources_.ref = s;

742 components_.ref = comp;

743 }

744 void ClearRef() {

745 sources_.ref = Placeholder();

746 components_.ref = url_parse::Component();

747 }

748 bool IsRefOverridden() const { return sources_.ref != NULL; }

749

750 // Getters for the internal data. See the variables below for how the

751 // information is encoded.

752 const URLComponentSource<CHAR>& sources() const { return sources_; }

753 const url_parse::Parsed& components() const { return components_; }

754

755 private:

756 // Returns a pointer to a static empty string that is used as a placeholder

757 // to indicate a component should be deleted (see below).

758 const CHAR* Placeholder() {

759 static const CHAR empty_string = 0;

760 return &empty_string;

761 }

762

763 // We support three states:

764 //

765 // Action \| Source Component

766 // -----------------------+--------------------------------------------------

767 // Don't change component \| NULL (unused)

768 // Replace component \| (replacement string) (replacement component)

769 // Delete component \| (non-NULL) (invalid component: (0,-1))

770 //

771 // We use a pointer to the empty string for the source when the component

772 // should be deleted.

773 URLComponentSource<CHAR> sources_;

774 url_parse::Parsed components_;

775 };

776

777 // The base must be an 8-bit canonical URL.

778 GURL_API bool ReplaceStandardURL(const char* base,

779 const url_parse::Parsed& base_parsed,

780 const Replacements<char>& replacements,

781 CharsetConverter* query_converter,

782 CanonOutput* output,

783 url_parse::Parsed* new_parsed);

784 GURL_API bool ReplaceStandardURL(const char* base,

785 const url_parse::Parsed& base_parsed,

786 const Replacements<char16>& replacements,

787 CharsetConverter* query_converter,

788 CanonOutput* output,

789 url_parse::Parsed* new_parsed);

790

791 // Filesystem URLs can only have the path, query, or ref replaced.

792 // All other components will be ignored.

793 GURL_API bool ReplaceFileSystemURL(const char* base,

794 const url_parse::Parsed& base_parsed,

795 const Replacements<char>& replacements,

796 CharsetConverter* query_converter,

797 CanonOutput* output,

798 url_parse::Parsed* new_parsed);

799 GURL_API bool ReplaceFileSystemURL(const char* base,

800 const url_parse::Parsed& base_parsed,

801 const Replacements<char16>& replacements,

802 CharsetConverter* query_converter,

803 CanonOutput* output,

804 url_parse::Parsed* new_parsed);

805

806 // Replacing some parts of a file URL is not permitted. Everything except

807 // the host, path, query, and ref will be ignored.

808 GURL_API bool ReplaceFileURL(const char* base,

809 const url_parse::Parsed& base_parsed,

810 const Replacements<char>& replacements,

811 CharsetConverter* query_converter,

812 CanonOutput* output,

813 url_parse::Parsed* new_parsed);

814 GURL_API bool ReplaceFileURL(const char* base,

815 const url_parse::Parsed& base_parsed,

816 const Replacements<char16>& replacements,

817 CharsetConverter* query_converter,

818 CanonOutput* output,

819 url_parse::Parsed* new_parsed);

820

821 // Path URLs can only have the scheme and path replaced. All other components

822 // will be ignored.

823 GURL_API bool ReplacePathURL(const char* base,

824 const url_parse::Parsed& base_parsed,

825 const Replacements<char>& replacements,

826 CanonOutput* output,

827 url_parse::Parsed* new_parsed);

828 GURL_API bool ReplacePathURL(const char* base,

829 const url_parse::Parsed& base_parsed,

830 const Replacements<char16>& replacements,

831 CanonOutput* output,

832 url_parse::Parsed* new_parsed);

833

834 // Mailto URLs can only have the scheme, path, and query replaced.

835 // All other components will be ignored.

836 GURL_API bool ReplaceMailtoURL(const char* base,

837 const url_parse::Parsed& base_parsed,

838 const Replacements<char>& replacements,

839 CanonOutput* output,

840 url_parse::Parsed* new_parsed);

841 GURL_API bool ReplaceMailtoURL(const char* base,

842 const url_parse::Parsed& base_parsed,

843 const Replacements<char16>& replacements,

844 CanonOutput* output,

845 url_parse::Parsed* new_parsed);

846

847 // Relative URL ---------------------------------------------------------------

848

849 // Given an input URL or URL fragment \|fragment\|, determines if it is a

850 // relative or absolute URL and places the result into \|*is_relative\|. If it is

851 // relative, the relevant portion of the URL will be placed into

852 // \|*relative_component\| (there may have been trimmed whitespace, for example).

853 // This value is passed to ResolveRelativeURL. If the input is not relative,

854 // this value is UNDEFINED (it may be changed by the function).

855 //

856 // Returns true on success (we successfully determined the URL is relative or

857 // not). Failure means that the combination of URLs doesn't make any sense.

858 //

859 // The base URL should always be canonical, therefore is ASCII.

860 GURL_API bool IsRelativeURL(const char* base,

861 const url_parse::Parsed& base_parsed,

862 const char* fragment,

863 int fragment_len,

864 bool is_base_hierarchical,

865 bool* is_relative,

866 url_parse::Component* relative_component);

867 GURL_API bool IsRelativeURL(const char* base,

868 const url_parse::Parsed& base_parsed,

869 const char16* fragment,

870 int fragment_len,

871 bool is_base_hierarchical,

872 bool* is_relative,

873 url_parse::Component* relative_component);

874

875 // Given a canonical parsed source URL, a URL fragment known to be relative,

876 // and the identified relevant portion of the relative URL (computed by

877 // IsRelativeURL), this produces a new parsed canonical URL in \|output\| and

878 // \|out_parsed\|.

879 //

880 // It also requires a flag indicating whether the base URL is a file: URL

881 // which triggers additional logic.

882 //

883 // The base URL should be canonical and have a host (may be empty for file

884 // URLs) and a path. If it doesn't have these, we can't resolve relative

885 // URLs off of it and will return the base as the output with an error flag.

886 // Because it is canonical, it should also be ASCII.

887 //

888 // The query charset converter follows the same rules as CanonicalizeQuery.

889 //

890 // Returns true on success. On failure, the output will be "something

891 // reasonable" that will be consistent and valid, just probably not what

892 // was intended by the web page author or caller.

893 GURL_API bool ResolveRelativeURL(const char* base_url,

894 const url_parse::Parsed& base_parsed,

895 bool base_is_file,

896 const char* relative_url,

897 const url_parse::Component& relative_component,

898 CharsetConverter* query_converter,

899 CanonOutput* output,

900 url_parse::Parsed* out_parsed);

901 GURL_API bool ResolveRelativeURL(const char* base_url,

902 const url_parse::Parsed& base_parsed,

903 bool base_is_file,

904 const char16* relative_url,

905 const url_parse::Component& relative_component,

906 CharsetConverter* query_converter,

907 CanonOutput* output,

908 url_parse::Parsed* out_parsed);

909

910 } // namespace url_canon

911	33

912 #endif // GOOGLEURL_SRC_URL_CANON_H__	34 #endif // GOOGLEURL_SRC_URL_CANON_H__

OLD	NEW

« no previous file with comments | « src/gurl.h ('k') | src/url_canon_icu.h » ('j') | no next file with comments »