OLD | NEW |
1 // Copyright 2007, Google Inc. | 1 // Copyright 2007, Google Inc. |
2 // All rights reserved. | 2 // All rights reserved. |
3 // | 3 // |
4 // Redistribution and use in source and binary forms, with or without | 4 // Redistribution and use in source and binary forms, with or without |
5 // modification, are permitted provided that the following conditions are | 5 // modification, are permitted provided that the following conditions are |
6 // met: | 6 // met: |
7 // | 7 // |
8 // * Redistributions of source code must retain the above copyright | 8 // * Redistributions of source code must retain the above copyright |
9 // notice, this list of conditions and the following disclaimer. | 9 // notice, this list of conditions and the following disclaimer. |
10 // * Redistributions in binary form must reproduce the above | 10 // * Redistributions in binary form must reproduce the above |
11 // copyright notice, this list of conditions and the following disclaimer | 11 // copyright notice, this list of conditions and the following disclaimer |
12 // in the documentation and/or other materials provided with the | 12 // in the documentation and/or other materials provided with the |
13 // distribution. | 13 // distribution. |
14 // * Neither the name of Google Inc. nor the names of its | 14 // * Neither the name of Google Inc. nor the names of its |
15 // contributors may be used to endorse or promote products derived from | 15 // contributors may be used to endorse or promote products derived from |
16 // this software without specific prior written permission. | 16 // this software without specific prior written permission. |
17 // | 17 // |
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | 18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | 19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | 20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | 21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | 22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | 23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 #ifndef GOOGLEURL_SRC_URL_CANON_H__ | |
30 #define GOOGLEURL_SRC_URL_CANON_H__ | |
31 | 29 |
| 30 #ifndef URL_URL_CANON_H_ |
| 31 #define URL_URL_CANON_H_ |
| 32 |
| 33 #include <stdlib.h> |
32 #include <string.h> | 34 #include <string.h> |
33 #include <stdlib.h> | |
34 | 35 |
35 #include "base/string16.h" | 36 #include "base/string16.h" |
36 #include "googleurl/src/url_common.h" | 37 #include "url/url_parse.h" |
37 #include "googleurl/src/url_parse.h" | |
38 | 38 |
39 namespace url_canon { | 39 namespace url_canon { |
40 | 40 |
41 // Canonicalizer output ------------------------------------------------------- | 41 // Canonicalizer output ------------------------------------------------------- |
42 | 42 |
43 // Base class for the canonicalizer output, this maintains a buffer and | 43 // Base class for the canonicalizer output, this maintains a buffer and |
44 // supports simple resizing and append operations on it. | 44 // supports simple resizing and append operations on it. |
45 // | 45 // |
46 // It is VERY IMPORTANT that no virtual function calls be made on the common | 46 // It is VERY IMPORTANT that no virtual function calls be made on the common |
47 // code path. We only have two virtual function calls, the destructor and a | 47 // code path. We only have two virtual function calls, the destructor and a |
(...skipping 194 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
242 // This should be called before parsing if whitespace removal is desired (which | 242 // This should be called before parsing if whitespace removal is desired (which |
243 // it normally is when you are canonicalizing). | 243 // it normally is when you are canonicalizing). |
244 // | 244 // |
245 // If no whitespace is removed, this function will not use the buffer and will | 245 // If no whitespace is removed, this function will not use the buffer and will |
246 // return a pointer to the input, to avoid the extra copy. If modification is | 246 // return a pointer to the input, to avoid the extra copy. If modification is |
247 // required, the given |buffer| will be used and the returned pointer will | 247 // required, the given |buffer| will be used and the returned pointer will |
248 // point to the beginning of the buffer. | 248 // point to the beginning of the buffer. |
249 // | 249 // |
250 // Therefore, callers should not use the buffer, since it may actually be empty; | 250 // Therefore, callers should not use the buffer, since it may actually be empty; |
251 // use the computed pointer and |*output_len| instead. | 251 // use the computed pointer and |*output_len| instead. |
252 GURL_API const char* RemoveURLWhitespace(const char* input, int input_len, | 252 const char* RemoveURLWhitespace(const char* input, int input_len, |
253 CanonOutputT<char>* buffer, | 253 CanonOutputT<char>* buffer, |
254 int* output_len); | 254 int* output_len); |
255 GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len, | 255 const char16* RemoveURLWhitespace(const char16* input, int input_len, |
256 CanonOutputT<char16>* buffer, | 256 CanonOutputT<char16>* buffer, |
257 int* output_len); | 257 int* output_len); |
258 | 258 |
259 // IDN ------------------------------------------------------------------------ | 259 // IDN ------------------------------------------------------------------------ |
260 | 260 |
261 // Converts the Unicode input representing a hostname to ASCII using IDN rules. | 261 // Converts the Unicode input representing a hostname to ASCII using IDN rules. |
262 // The output must fall in the ASCII range, but will be encoded in UTF-16. | 262 // The output must fall in the ASCII range, but will be encoded in UTF-16. |
263 // | 263 // |
264 // On success, the output will be filled with the ASCII host name and it will | 264 // On success, the output will be filled with the ASCII host name and it will |
265 // return true. Unlike most other canonicalization functions, this assumes that | 265 // return true. Unlike most other canonicalization functions, this assumes that |
266 // the output is empty. The beginning of the host will be at offset 0, and | 266 // the output is empty. The beginning of the host will be at offset 0, and |
267 // the length of the output will be set to the length of the new host name. | 267 // the length of the output will be set to the length of the new host name. |
268 // | 268 // |
269 // On error, returns false. The output in this case is undefined. | 269 // On error, returns false. The output in this case is undefined. |
270 GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); | 270 bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); |
271 | 271 |
272 // Piece-by-piece canonicalizers ---------------------------------------------- | 272 // Piece-by-piece canonicalizers ---------------------------------------------- |
273 // | 273 // |
274 // These individual canonicalizers append the canonicalized versions of the | 274 // These individual canonicalizers append the canonicalized versions of the |
275 // corresponding URL component to the given std::string. The spec and the | 275 // corresponding URL component to the given std::string. The spec and the |
276 // previously-identified range of that component are the input. The range of | 276 // previously-identified range of that component are the input. The range of |
277 // the canonicalized component will be written to the output component. | 277 // the canonicalized component will be written to the output component. |
278 // | 278 // |
279 // These functions all append to the output so they can be chained. Make sure | 279 // These functions all append to the output so they can be chained. Make sure |
280 // the output is empty when you start. | 280 // the output is empty when you start. |
281 // | 281 // |
282 // These functions return boolean values indicating success. On failure, they | 282 // These functions return boolean values indicating success. On failure, they |
283 // will attempt to write something reasonable to the output so that, if | 283 // will attempt to write something reasonable to the output so that, if |
284 // displayed to the user, they will recognise it as something that's messed up. | 284 // displayed to the user, they will recognise it as something that's messed up. |
285 // Nothing more should ever be done with these invalid URLs, however. | 285 // Nothing more should ever be done with these invalid URLs, however. |
286 | 286 |
287 // Scheme: Appends the scheme and colon to the URL. The output component will | 287 // Scheme: Appends the scheme and colon to the URL. The output component will |
288 // indicate the range of characters up to but not including the colon. | 288 // indicate the range of characters up to but not including the colon. |
289 // | 289 // |
290 // Canonical URLs always have a scheme. If the scheme is not present in the | 290 // Canonical URLs always have a scheme. If the scheme is not present in the |
291 // input, this will just write the colon to indicate an empty scheme. Does not | 291 // input, this will just write the colon to indicate an empty scheme. Does not |
292 // append slashes which will be needed before any authority components for most | 292 // append slashes which will be needed before any authority components for most |
293 // URLs. | 293 // URLs. |
294 // | 294 // |
295 // The 8-bit version requires UTF-8 encoding. | 295 // The 8-bit version requires UTF-8 encoding. |
296 GURL_API bool CanonicalizeScheme(const char* spec, | 296 bool CanonicalizeScheme(const char* spec, |
297 const url_parse::Component& scheme, | 297 const url_parse::Component& scheme, |
298 CanonOutput* output, | 298 CanonOutput* output, |
299 url_parse::Component* out_scheme); | 299 url_parse::Component* out_scheme); |
300 GURL_API bool CanonicalizeScheme(const char16* spec, | 300 bool CanonicalizeScheme(const char16* spec, |
301 const url_parse::Component& scheme, | 301 const url_parse::Component& scheme, |
302 CanonOutput* output, | 302 CanonOutput* output, |
303 url_parse::Component* out_scheme); | 303 url_parse::Component* out_scheme); |
304 | 304 |
305 // User info: username/password. If present, this will add the delimiters so | 305 // User info: username/password. If present, this will add the delimiters so |
306 // the output will be "<username>:<password>@" or "<username>@". Empty | 306 // the output will be "<username>:<password>@" or "<username>@". Empty |
307 // username/password pairs, or empty passwords, will get converted to | 307 // username/password pairs, or empty passwords, will get converted to |
308 // nonexistent in the canonical version. | 308 // nonexistent in the canonical version. |
309 // | 309 // |
310 // The components for the username and password refer to ranges in the | 310 // The components for the username and password refer to ranges in the |
311 // respective source strings. Usually, these will be the same string, which | 311 // respective source strings. Usually, these will be the same string, which |
312 // is legal as long as the two components don't overlap. | 312 // is legal as long as the two components don't overlap. |
313 // | 313 // |
314 // The 8-bit version requires UTF-8 encoding. | 314 // The 8-bit version requires UTF-8 encoding. |
315 GURL_API bool CanonicalizeUserInfo(const char* username_source, | 315 bool CanonicalizeUserInfo(const char* username_source, |
316 const url_parse::Component& username, | 316 const url_parse::Component& username, |
317 const char* password_source, | 317 const char* password_source, |
318 const url_parse::Component& password, | 318 const url_parse::Component& password, |
319 CanonOutput* output, | 319 CanonOutput* output, |
320 url_parse::Component* out_username, | 320 url_parse::Component* out_username, |
321 url_parse::Component* out_password); | 321 url_parse::Component* out_password); |
322 GURL_API bool CanonicalizeUserInfo(const char16* username_source, | 322 bool CanonicalizeUserInfo(const char16* username_source, |
323 const url_parse::Component& username, | 323 const url_parse::Component& username, |
324 const char16* password_source, | 324 const char16* password_source, |
325 const url_parse::Component& password, | 325 const url_parse::Component& password, |
326 CanonOutput* output, | 326 CanonOutput* output, |
327 url_parse::Component* out_username, | 327 url_parse::Component* out_username, |
328 url_parse::Component* out_password); | 328 url_parse::Component* out_password); |
329 | 329 |
330 | 330 |
331 // This structure holds detailed state exported from the IP/Host canonicalizers. | 331 // This structure holds detailed state exported from the IP/Host canonicalizers. |
332 // Additional fields may be added as callers require them. | 332 // Additional fields may be added as callers require them. |
333 struct CanonHostInfo { | 333 struct CanonHostInfo { |
334 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} | 334 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} |
335 | 335 |
336 // Convenience function to test if family is an IP address. | 336 // Convenience function to test if family is an IP address. |
337 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } | 337 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } |
338 | 338 |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
371 int AddressLength() const { | 371 int AddressLength() const { |
372 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); | 372 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); |
373 } | 373 } |
374 }; | 374 }; |
375 | 375 |
376 | 376 |
377 // Host. | 377 // Host. |
378 // | 378 // |
379 // The 8-bit version requires UTF-8 encoding. Use this version when you only | 379 // The 8-bit version requires UTF-8 encoding. Use this version when you only |
380 // need to know whether canonicalization succeeded. | 380 // need to know whether canonicalization succeeded. |
381 GURL_API bool CanonicalizeHost(const char* spec, | 381 bool CanonicalizeHost(const char* spec, |
382 const url_parse::Component& host, | 382 const url_parse::Component& host, |
383 CanonOutput* output, | 383 CanonOutput* output, |
384 url_parse::Component* out_host); | 384 url_parse::Component* out_host); |
385 GURL_API bool CanonicalizeHost(const char16* spec, | 385 bool CanonicalizeHost(const char16* spec, |
386 const url_parse::Component& host, | 386 const url_parse::Component& host, |
387 CanonOutput* output, | 387 CanonOutput* output, |
388 url_parse::Component* out_host); | 388 url_parse::Component* out_host); |
389 | 389 |
390 // Extended version of CanonicalizeHost, which returns additional information. | 390 // Extended version of CanonicalizeHost, which returns additional information. |
391 // Use this when you need to know whether the hostname was an IP address. | 391 // Use this when you need to know whether the hostname was an IP address. |
392 // A successful return is indicated by host_info->family != BROKEN. See the | 392 // A successful return is indicated by host_info->family != BROKEN. See the |
393 // definition of CanonHostInfo above for details. | 393 // definition of CanonHostInfo above for details. |
394 GURL_API void CanonicalizeHostVerbose(const char* spec, | 394 void CanonicalizeHostVerbose(const char* spec, |
395 const url_parse::Component& host, | 395 const url_parse::Component& host, |
396 CanonOutput* output, | 396 CanonOutput* output, |
397 CanonHostInfo* host_info); | 397 CanonHostInfo* host_info); |
398 GURL_API void CanonicalizeHostVerbose(const char16* spec, | 398 void CanonicalizeHostVerbose(const char16* spec, |
399 const url_parse::Component& host, | 399 const url_parse::Component& host, |
400 CanonOutput* output, | 400 CanonOutput* output, |
401 CanonHostInfo* host_info); | 401 CanonHostInfo* host_info); |
402 | 402 |
403 | 403 |
404 // IP addresses. | 404 // IP addresses. |
405 // | 405 // |
406 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is | 406 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is |
407 // an IP address, it will canonicalize it as such, appending it to |output|. | 407 // an IP address, it will canonicalize it as such, appending it to |output|. |
408 // Additional status information is returned via the |*host_info| parameter. | 408 // Additional status information is returned via the |*host_info| parameter. |
409 // See the definition of CanonHostInfo above for details. | 409 // See the definition of CanonHostInfo above for details. |
410 // | 410 // |
411 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that | 411 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that |
412 // the input is unescaped and name-prepped, etc. It should not normally be | 412 // the input is unescaped and name-prepped, etc. It should not normally be |
413 // necessary or wise to call this directly. | 413 // necessary or wise to call this directly. |
414 GURL_API void CanonicalizeIPAddress(const char* spec, | 414 void CanonicalizeIPAddress(const char* spec, |
415 const url_parse::Component& host, | 415 const url_parse::Component& host, |
416 CanonOutput* output, | 416 CanonOutput* output, |
417 CanonHostInfo* host_info); | 417 CanonHostInfo* host_info); |
418 GURL_API void CanonicalizeIPAddress(const char16* spec, | 418 void CanonicalizeIPAddress(const char16* spec, |
419 const url_parse::Component& host, | 419 const url_parse::Component& host, |
420 CanonOutput* output, | 420 CanonOutput* output, |
421 CanonHostInfo* host_info); | 421 CanonHostInfo* host_info); |
422 | 422 |
423 // Port: this function will add the colon for the port if a port is present. | 423 // Port: this function will add the colon for the port if a port is present. |
424 // The caller can pass url_parse::PORT_UNSPECIFIED as the | 424 // The caller can pass url_parse::PORT_UNSPECIFIED as the |
425 // default_port_for_scheme argument if there is no default port. | 425 // default_port_for_scheme argument if there is no default port. |
426 // | 426 // |
427 // The 8-bit version requires UTF-8 encoding. | 427 // The 8-bit version requires UTF-8 encoding. |
428 GURL_API bool CanonicalizePort(const char* spec, | 428 bool CanonicalizePort(const char* spec, |
429 const url_parse::Component& port, | 429 const url_parse::Component& port, |
430 int default_port_for_scheme, | 430 int default_port_for_scheme, |
431 CanonOutput* output, | 431 CanonOutput* output, |
432 url_parse::Component* out_port); | 432 url_parse::Component* out_port); |
433 GURL_API bool CanonicalizePort(const char16* spec, | 433 bool CanonicalizePort(const char16* spec, |
434 const url_parse::Component& port, | 434 const url_parse::Component& port, |
435 int default_port_for_scheme, | 435 int default_port_for_scheme, |
436 CanonOutput* output, | 436 CanonOutput* output, |
437 url_parse::Component* out_port); | 437 url_parse::Component* out_port); |
438 | 438 |
439 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED | 439 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED |
440 // if the scheme is unknown. | 440 // if the scheme is unknown. |
441 GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len); | 441 int DefaultPortForScheme(const char* scheme, int scheme_len); |
442 | 442 |
443 // Path. If the input does not begin in a slash (including if the input is | 443 // Path. If the input does not begin in a slash (including if the input is |
444 // empty), we'll prepend a slash to the path to make it canonical. | 444 // empty), we'll prepend a slash to the path to make it canonical. |
445 // | 445 // |
446 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity | 446 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity |
447 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid | 447 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid |
448 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't | 448 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't |
449 // an issue. Somebody giving us an 8-bit path is responsible for generating | 449 // an issue. Somebody giving us an 8-bit path is responsible for generating |
450 // the path that the server expects (we'll escape high-bit characters), so | 450 // the path that the server expects (we'll escape high-bit characters), so |
451 // if something is invalid, it's their problem. | 451 // if something is invalid, it's their problem. |
452 GURL_API bool CanonicalizePath(const char* spec, | 452 bool CanonicalizePath(const char* spec, |
453 const url_parse::Component& path, | 453 const url_parse::Component& path, |
454 CanonOutput* output, | 454 CanonOutput* output, |
455 url_parse::Component* out_path); | 455 url_parse::Component* out_path); |
456 GURL_API bool CanonicalizePath(const char16* spec, | 456 bool CanonicalizePath(const char16* spec, |
457 const url_parse::Component& path, | 457 const url_parse::Component& path, |
458 CanonOutput* output, | 458 CanonOutput* output, |
459 url_parse::Component* out_path); | 459 url_parse::Component* out_path); |
460 | 460 |
461 // Canonicalizes the input as a file path. This is like CanonicalizePath except | 461 // Canonicalizes the input as a file path. This is like CanonicalizePath except |
462 // that it also handles Windows drive specs. For example, the path can begin | 462 // that it also handles Windows drive specs. For example, the path can begin |
463 // with "c|\" and it will get properly canonicalized to "C:/". | 463 // with "c|\" and it will get properly canonicalized to "C:/". |
464 // The string will be appended to |*output| and |*out_path| will be updated. | 464 // The string will be appended to |*output| and |*out_path| will be updated. |
465 // | 465 // |
466 // The 8-bit version requires UTF-8 encoding. | 466 // The 8-bit version requires UTF-8 encoding. |
467 GURL_API bool FileCanonicalizePath(const char* spec, | 467 bool FileCanonicalizePath(const char* spec, |
468 const url_parse::Component& path, | 468 const url_parse::Component& path, |
469 CanonOutput* output, | 469 CanonOutput* output, |
470 url_parse::Component* out_path); | 470 url_parse::Component* out_path); |
471 GURL_API bool FileCanonicalizePath(const char16* spec, | 471 bool FileCanonicalizePath(const char16* spec, |
472 const url_parse::Component& path, | 472 const url_parse::Component& path, |
473 CanonOutput* output, | 473 CanonOutput* output, |
474 url_parse::Component* out_path); | 474 url_parse::Component* out_path); |
475 | 475 |
476 // Query: Prepends the ? if needed. | 476 // Query: Prepends the ? if needed. |
477 // | 477 // |
478 // The 8-bit version requires the input to be UTF-8 encoded. Incorrectly | 478 // The 8-bit version requires the input to be UTF-8 encoded. Incorrectly |
479 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode | 479 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode |
480 // "invalid character." This function cannot fail; we always just try to do | 480 // "invalid character." This function cannot fail; we always just try to do |
481 // our best for crazy input here since web pages can set it themselves. | 481 // our best for crazy input here since web pages can set it themselves. |
482 // | 482 // |
483 // This will convert the given input into the output encoding that the given | 483 // This will convert the given input into the output encoding that the given |
484 // character set converter object provides. The converter will only be called | 484 // character set converter object provides. The converter will only be called |
485 // if necessary; for ASCII input, no conversions are necessary. | 485 // if necessary; for ASCII input, no conversions are necessary. |
486 // | 486 // |
487 // The converter can be NULL. In this case, the output encoding will be UTF-8. | 487 // The converter can be NULL. In this case, the output encoding will be UTF-8. |
488 GURL_API void CanonicalizeQuery(const char* spec, | 488 void CanonicalizeQuery(const char* spec, |
489 const url_parse::Component& query, | 489 const url_parse::Component& query, |
490 CharsetConverter* converter, | 490 CharsetConverter* converter, |
491 CanonOutput* output, | 491 CanonOutput* output, |
492 url_parse::Component* out_query); | 492 url_parse::Component* out_query); |
493 GURL_API void CanonicalizeQuery(const char16* spec, | 493 void CanonicalizeQuery(const char16* spec, |
494 const url_parse::Component& query, | 494 const url_parse::Component& query, |
495 CharsetConverter* converter, | 495 CharsetConverter* converter, |
496 CanonOutput* output, | 496 CanonOutput* output, |
497 url_parse::Component* out_query); | 497 url_parse::Component* out_query); |
498 | 498 |
499 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only | 499 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only |
500 // canonicalizer that does not produce ASCII output). The output is | 500 // canonicalizer that does not produce ASCII output). The output is |
501 // guaranteed to be valid UTF-8. | 501 // guaranteed to be valid UTF-8. |
502 // | 502 // |
503 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use | 503 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use |
504 // the "Unicode replacement character" for the confusing bits and copy the rest. | 504 // the "Unicode replacement character" for the confusing bits and copy the rest. |
505 GURL_API void CanonicalizeRef(const char* spec, | 505 void CanonicalizeRef(const char* spec, |
506 const url_parse::Component& path, | 506 const url_parse::Component& path, |
507 CanonOutput* output, | 507 CanonOutput* output, |
508 url_parse::Component* out_path); | 508 url_parse::Component* out_path); |
509 GURL_API void CanonicalizeRef(const char16* spec, | 509 void CanonicalizeRef(const char16* spec, |
510 const url_parse::Component& path, | 510 const url_parse::Component& path, |
511 CanonOutput* output, | 511 CanonOutput* output, |
512 url_parse::Component* out_path); | 512 url_parse::Component* out_path); |
513 | 513 |
514 // Full canonicalizer --------------------------------------------------------- | 514 // Full canonicalizer --------------------------------------------------------- |
515 // | 515 // |
516 // These functions replace any string contents, rather than append as above. | 516 // These functions replace any string contents, rather than append as above. |
517 // See the above piece-by-piece functions for information specific to | 517 // See the above piece-by-piece functions for information specific to |
518 // canonicalizing individual components. | 518 // canonicalizing individual components. |
519 // | 519 // |
520 // The output will be ASCII except the reference fragment, which may be UTF-8. | 520 // The output will be ASCII except the reference fragment, which may be UTF-8. |
521 // | 521 // |
522 // The 8-bit versions require UTF-8 encoding. | 522 // The 8-bit versions require UTF-8 encoding. |
523 | 523 |
524 // Use for standard URLs with authorities and paths. | 524 // Use for standard URLs with authorities and paths. |
525 GURL_API bool CanonicalizeStandardURL(const char* spec, | 525 bool CanonicalizeStandardURL(const char* spec, |
526 int spec_len, | 526 int spec_len, |
527 const url_parse::Parsed& parsed, | 527 const url_parse::Parsed& parsed, |
528 CharsetConverter* query_converter, | 528 CharsetConverter* query_converter, |
529 CanonOutput* output, | 529 CanonOutput* output, |
530 url_parse::Parsed* new_parsed); | 530 url_parse::Parsed* new_parsed); |
531 GURL_API bool CanonicalizeStandardURL(const char16* spec, | 531 bool CanonicalizeStandardURL(const char16* spec, |
532 int spec_len, | 532 int spec_len, |
533 const url_parse::Parsed& parsed, | 533 const url_parse::Parsed& parsed, |
534 CharsetConverter* query_converter, | 534 CharsetConverter* query_converter, |
535 CanonOutput* output, | 535 CanonOutput* output, |
536 url_parse::Parsed* new_parsed); | 536 url_parse::Parsed* new_parsed); |
537 | 537 |
538 // Use for file URLs. | 538 // Use for file URLs. |
539 GURL_API bool CanonicalizeFileURL(const char* spec, | 539 bool CanonicalizeFileURL(const char* spec, |
540 int spec_len, | 540 int spec_len, |
541 const url_parse::Parsed& parsed, | 541 const url_parse::Parsed& parsed, |
542 CharsetConverter* query_converter, | 542 CharsetConverter* query_converter, |
543 CanonOutput* output, | 543 CanonOutput* output, |
544 url_parse::Parsed* new_parsed); | 544 url_parse::Parsed* new_parsed); |
545 GURL_API bool CanonicalizeFileURL(const char16* spec, | 545 bool CanonicalizeFileURL(const char16* spec, |
546 int spec_len, | 546 int spec_len, |
547 const url_parse::Parsed& parsed, | 547 const url_parse::Parsed& parsed, |
548 CharsetConverter* query_converter, | 548 CharsetConverter* query_converter, |
549 CanonOutput* output, | 549 CanonOutput* output, |
550 url_parse::Parsed* new_parsed); | 550 url_parse::Parsed* new_parsed); |
551 | 551 |
552 // Use for filesystem URLs. | 552 // Use for filesystem URLs. |
553 GURL_API bool CanonicalizeFileSystemURL(const char* spec, | 553 bool CanonicalizeFileSystemURL(const char* spec, |
554 int spec_len, | 554 int spec_len, |
555 const url_parse::Parsed& parsed, | 555 const url_parse::Parsed& parsed, |
556 CharsetConverter* query_converter, | 556 CharsetConverter* query_converter, |
557 CanonOutput* output, | 557 CanonOutput* output, |
558 url_parse::Parsed* new_parsed); | 558 url_parse::Parsed* new_parsed); |
559 GURL_API bool CanonicalizeFileSystemURL(const char16* spec, | 559 bool CanonicalizeFileSystemURL(const char16* spec, |
560 int spec_len, | 560 int spec_len, |
561 const url_parse::Parsed& parsed, | 561 const url_parse::Parsed& parsed, |
562 CharsetConverter* query_converter, | 562 CharsetConverter* query_converter, |
563 CanonOutput* output, | 563 CanonOutput* output, |
564 url_parse::Parsed* new_parsed); | 564 url_parse::Parsed* new_parsed); |
565 | 565 |
566 // Use for path URLs such as javascript. This does not modify the path in any | 566 // Use for path URLs such as javascript. This does not modify the path in any |
567 // way, for example, by escaping it. | 567 // way, for example, by escaping it. |
568 GURL_API bool CanonicalizePathURL(const char* spec, | 568 bool CanonicalizePathURL(const char* spec, |
569 int spec_len, | 569 int spec_len, |
570 const url_parse::Parsed& parsed, | 570 const url_parse::Parsed& parsed, |
571 CanonOutput* output, | 571 CanonOutput* output, |
572 url_parse::Parsed* new_parsed); | 572 url_parse::Parsed* new_parsed); |
573 GURL_API bool CanonicalizePathURL(const char16* spec, | 573 bool CanonicalizePathURL(const char16* spec, |
574 int spec_len, | 574 int spec_len, |
575 const url_parse::Parsed& parsed, | 575 const url_parse::Parsed& parsed, |
576 CanonOutput* output, | 576 CanonOutput* output, |
577 url_parse::Parsed* new_parsed); | 577 url_parse::Parsed* new_parsed); |
578 | 578 |
579 // Use for mailto URLs. This "canonicalizes" the url into a path and query | 579 // Use for mailto URLs. This "canonicalizes" the url into a path and query |
580 // component. It does not attempt to merge "to" fields. It uses UTF-8 for | 580 // component. It does not attempt to merge "to" fields. It uses UTF-8 for |
581 // the query encoding if there is a query. This is because a mailto URL is | 581 // the query encoding if there is a query. This is because a mailto URL is |
582 // really intended for an external mail program, and the encoding of a page, | 582 // really intended for an external mail program, and the encoding of a page, |
583 // etc. which would influence a query encoding normally are irrelevant. | 583 // etc. which would influence a query encoding normally are irrelevant. |
584 GURL_API bool CanonicalizeMailtoURL(const char* spec, | 584 bool CanonicalizeMailtoURL(const char* spec, |
585 int spec_len, | 585 int spec_len, |
586 const url_parse::Parsed& parsed, | 586 const url_parse::Parsed& parsed, |
587 CanonOutput* output, | 587 CanonOutput* output, |
588 url_parse::Parsed* new_parsed); | 588 url_parse::Parsed* new_parsed); |
589 GURL_API bool CanonicalizeMailtoURL(const char16* spec, | 589 bool CanonicalizeMailtoURL(const char16* spec, |
590 int spec_len, | 590 int spec_len, |
591 const url_parse::Parsed& parsed, | 591 const url_parse::Parsed& parsed, |
592 CanonOutput* output, | 592 CanonOutput* output, |
593 url_parse::Parsed* new_parsed); | 593 url_parse::Parsed* new_parsed); |
594 | 594 |
595 // Part replacer -------------------------------------------------------------- | 595 // Part replacer -------------------------------------------------------------- |
596 | 596 |
597 // Internal structure used for storing separate strings for each component. | 597 // Internal structure used for storing separate strings for each component. |
598 // The basic canonicalization functions use this structure internally so that | 598 // The basic canonicalization functions use this structure internally so that |
599 // component replacement (different strings for different components) can be | 599 // component replacement (different strings for different components) can be |
600 // treated on the same code path as regular canonicalization (the same string | 600 // treated on the same code path as regular canonicalization (the same string |
601 // for each component). | 601 // for each component). |
602 // | 602 // |
603 // A url_parse::Parsed structure usually goes along with this. Those | 603 // A url_parse::Parsed structure usually goes along with this. Those |
(...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
768 // Replace component | (replacement string) (replacement component) | 768 // Replace component | (replacement string) (replacement component) |
769 // Delete component | (non-NULL) (invalid component: (0,-1)) | 769 // Delete component | (non-NULL) (invalid component: (0,-1)) |
770 // | 770 // |
771 // We use a pointer to the empty string for the source when the component | 771 // We use a pointer to the empty string for the source when the component |
772 // should be deleted. | 772 // should be deleted. |
773 URLComponentSource<CHAR> sources_; | 773 URLComponentSource<CHAR> sources_; |
774 url_parse::Parsed components_; | 774 url_parse::Parsed components_; |
775 }; | 775 }; |
776 | 776 |
777 // The base must be an 8-bit canonical URL. | 777 // The base must be an 8-bit canonical URL. |
778 GURL_API bool ReplaceStandardURL(const char* base, | 778 bool ReplaceStandardURL(const char* base, |
779 const url_parse::Parsed& base_parsed, | 779 const url_parse::Parsed& base_parsed, |
780 const Replacements<char>& replacements, | 780 const Replacements<char>& replacements, |
781 CharsetConverter* query_converter, | 781 CharsetConverter* query_converter, |
782 CanonOutput* output, | 782 CanonOutput* output, |
783 url_parse::Parsed* new_parsed); | 783 url_parse::Parsed* new_parsed); |
784 GURL_API bool ReplaceStandardURL(const char* base, | 784 bool ReplaceStandardURL(const char* base, |
785 const url_parse::Parsed& base_parsed, | 785 const url_parse::Parsed& base_parsed, |
786 const Replacements<char16>& replacements, | 786 const Replacements<char16>& replacements, |
787 CharsetConverter* query_converter, | 787 CharsetConverter* query_converter, |
788 CanonOutput* output, | 788 CanonOutput* output, |
789 url_parse::Parsed* new_parsed); | 789 url_parse::Parsed* new_parsed); |
790 | 790 |
791 // Filesystem URLs can only have the path, query, or ref replaced. | 791 // Filesystem URLs can only have the path, query, or ref replaced. |
792 // All other components will be ignored. | 792 // All other components will be ignored. |
793 GURL_API bool ReplaceFileSystemURL(const char* base, | 793 bool ReplaceFileSystemURL(const char* base, |
794 const url_parse::Parsed& base_parsed, | 794 const url_parse::Parsed& base_parsed, |
795 const Replacements<char>& replacements, | 795 const Replacements<char>& replacements, |
796 CharsetConverter* query_converter, | 796 CharsetConverter* query_converter, |
797 CanonOutput* output, | 797 CanonOutput* output, |
798 url_parse::Parsed* new_parsed); | 798 url_parse::Parsed* new_parsed); |
799 GURL_API bool ReplaceFileSystemURL(const char* base, | 799 bool ReplaceFileSystemURL(const char* base, |
800 const url_parse::Parsed& base_parsed, | 800 const url_parse::Parsed& base_parsed, |
801 const Replacements<char16>& replacements, | 801 const Replacements<char16>& replacements, |
802 CharsetConverter* query_converter, | 802 CharsetConverter* query_converter, |
803 CanonOutput* output, | 803 CanonOutput* output, |
804 url_parse::Parsed* new_parsed); | 804 url_parse::Parsed* new_parsed); |
805 | 805 |
806 // Replacing some parts of a file URL is not permitted. Everything except | 806 // Replacing some parts of a file URL is not permitted. Everything except |
807 // the host, path, query, and ref will be ignored. | 807 // the host, path, query, and ref will be ignored. |
808 GURL_API bool ReplaceFileURL(const char* base, | 808 bool ReplaceFileURL(const char* base, |
809 const url_parse::Parsed& base_parsed, | 809 const url_parse::Parsed& base_parsed, |
810 const Replacements<char>& replacements, | 810 const Replacements<char>& replacements, |
811 CharsetConverter* query_converter, | 811 CharsetConverter* query_converter, |
812 CanonOutput* output, | 812 CanonOutput* output, |
813 url_parse::Parsed* new_parsed); | 813 url_parse::Parsed* new_parsed); |
814 GURL_API bool ReplaceFileURL(const char* base, | 814 bool ReplaceFileURL(const char* base, |
815 const url_parse::Parsed& base_parsed, | 815 const url_parse::Parsed& base_parsed, |
816 const Replacements<char16>& replacements, | 816 const Replacements<char16>& replacements, |
817 CharsetConverter* query_converter, | 817 CharsetConverter* query_converter, |
818 CanonOutput* output, | 818 CanonOutput* output, |
819 url_parse::Parsed* new_parsed); | 819 url_parse::Parsed* new_parsed); |
820 | 820 |
821 // Path URLs can only have the scheme and path replaced. All other components | 821 // Path URLs can only have the scheme and path replaced. All other components |
822 // will be ignored. | 822 // will be ignored. |
823 GURL_API bool ReplacePathURL(const char* base, | 823 bool ReplacePathURL(const char* base, |
824 const url_parse::Parsed& base_parsed, | 824 const url_parse::Parsed& base_parsed, |
825 const Replacements<char>& replacements, | 825 const Replacements<char>& replacements, |
826 CanonOutput* output, | 826 CanonOutput* output, |
827 url_parse::Parsed* new_parsed); | 827 url_parse::Parsed* new_parsed); |
828 GURL_API bool ReplacePathURL(const char* base, | 828 bool ReplacePathURL(const char* base, |
829 const url_parse::Parsed& base_parsed, | 829 const url_parse::Parsed& base_parsed, |
830 const Replacements<char16>& replacements, | 830 const Replacements<char16>& replacements, |
831 CanonOutput* output, | 831 CanonOutput* output, |
832 url_parse::Parsed* new_parsed); | 832 url_parse::Parsed* new_parsed); |
833 | 833 |
834 // Mailto URLs can only have the scheme, path, and query replaced. | 834 // Mailto URLs can only have the scheme, path, and query replaced. |
835 // All other components will be ignored. | 835 // All other components will be ignored. |
836 GURL_API bool ReplaceMailtoURL(const char* base, | 836 bool ReplaceMailtoURL(const char* base, |
837 const url_parse::Parsed& base_parsed, | 837 const url_parse::Parsed& base_parsed, |
838 const Replacements<char>& replacements, | 838 const Replacements<char>& replacements, |
839 CanonOutput* output, | 839 CanonOutput* output, |
840 url_parse::Parsed* new_parsed); | 840 url_parse::Parsed* new_parsed); |
841 GURL_API bool ReplaceMailtoURL(const char* base, | 841 bool ReplaceMailtoURL(const char* base, |
842 const url_parse::Parsed& base_parsed, | 842 const url_parse::Parsed& base_parsed, |
843 const Replacements<char16>& replacements, | 843 const Replacements<char16>& replacements, |
844 CanonOutput* output, | 844 CanonOutput* output, |
845 url_parse::Parsed* new_parsed); | 845 url_parse::Parsed* new_parsed); |
846 | 846 |
847 // Relative URL --------------------------------------------------------------- | 847 // Relative URL --------------------------------------------------------------- |
848 | 848 |
849 // Given an input URL or URL fragment |fragment|, determines if it is a | 849 // Given an input URL or URL fragment |fragment|, determines if it is a |
850 // relative or absolute URL and places the result into |*is_relative|. If it is | 850 // relative or absolute URL and places the result into |*is_relative|. If it is |
851 // relative, the relevant portion of the URL will be placed into | 851 // relative, the relevant portion of the URL will be placed into |
852 // |*relative_component| (there may have been trimmed whitespace, for example). | 852 // |*relative_component| (there may have been trimmed whitespace, for example). |
853 // This value is passed to ResolveRelativeURL. If the input is not relative, | 853 // This value is passed to ResolveRelativeURL. If the input is not relative, |
854 // this value is UNDEFINED (it may be changed by the function). | 854 // this value is UNDEFINED (it may be changed by the function). |
855 // | 855 // |
856 // Returns true on success (we successfully determined the URL is relative or | 856 // Returns true on success (we successfully determined the URL is relative or |
857 // not). Failure means that the combination of URLs doesn't make any sense. | 857 // not). Failure means that the combination of URLs doesn't make any sense. |
858 // | 858 // |
859 // The base URL should always be canonical, therefore is ASCII. | 859 // The base URL should always be canonical, therefore is ASCII. |
860 GURL_API bool IsRelativeURL(const char* base, | 860 bool IsRelativeURL(const char* base, |
861 const url_parse::Parsed& base_parsed, | 861 const url_parse::Parsed& base_parsed, |
862 const char* fragment, | 862 const char* fragment, |
863 int fragment_len, | 863 int fragment_len, |
864 bool is_base_hierarchical, | 864 bool is_base_hierarchical, |
865 bool* is_relative, | 865 bool* is_relative, |
866 url_parse::Component* relative_component); | 866 url_parse::Component* relative_component); |
867 GURL_API bool IsRelativeURL(const char* base, | 867 bool IsRelativeURL(const char* base, |
868 const url_parse::Parsed& base_parsed, | 868 const url_parse::Parsed& base_parsed, |
869 const char16* fragment, | 869 const char16* fragment, |
870 int fragment_len, | 870 int fragment_len, |
871 bool is_base_hierarchical, | 871 bool is_base_hierarchical, |
872 bool* is_relative, | 872 bool* is_relative, |
873 url_parse::Component* relative_component); | 873 url_parse::Component* relative_component); |
874 | 874 |
875 // Given a canonical parsed source URL, a URL fragment known to be relative, | 875 // Given a canonical parsed source URL, a URL fragment known to be relative, |
876 // and the identified relevant portion of the relative URL (computed by | 876 // and the identified relevant portion of the relative URL (computed by |
877 // IsRelativeURL), this produces a new parsed canonical URL in |output| and | 877 // IsRelativeURL), this produces a new parsed canonical URL in |output| and |
878 // |out_parsed|. | 878 // |out_parsed|. |
879 // | 879 // |
880 // It also requires a flag indicating whether the base URL is a file: URL | 880 // It also requires a flag indicating whether the base URL is a file: URL |
881 // which triggers additional logic. | 881 // which triggers additional logic. |
882 // | 882 // |
883 // The base URL should be canonical and have a host (may be empty for file | 883 // The base URL should be canonical and have a host (may be empty for file |
884 // URLs) and a path. If it doesn't have these, we can't resolve relative | 884 // URLs) and a path. If it doesn't have these, we can't resolve relative |
885 // URLs off of it and will return the base as the output with an error flag. | 885 // URLs off of it and will return the base as the output with an error flag. |
886 // Because it is canonical, it should also be ASCII. | 886 // Because it is canonical, it should also be ASCII. |
887 // | 887 // |
888 // The query charset converter follows the same rules as CanonicalizeQuery. | 888 // The query charset converter follows the same rules as CanonicalizeQuery. |
889 // | 889 // |
890 // Returns true on success. On failure, the output will be "something | 890 // Returns true on success. On failure, the output will be "something |
891 // reasonable" that will be consistent and valid, just probably not what | 891 // reasonable" that will be consistent and valid, just probably not what |
892 // was intended by the web page author or caller. | 892 // was intended by the web page author or caller. |
893 GURL_API bool ResolveRelativeURL(const char* base_url, | 893 bool ResolveRelativeURL(const char* base_url, |
894 const url_parse::Parsed& base_parsed, | 894 const url_parse::Parsed& base_parsed, |
895 bool base_is_file, | 895 bool base_is_file, |
896 const char* relative_url, | 896 const char* relative_url, |
897 const url_parse::Component& relative_component, | 897 const url_parse::Component& relative_component, |
898 CharsetConverter* query_converter, | 898 CharsetConverter* query_converter, |
899 CanonOutput* output, | 899 CanonOutput* output, |
900 url_parse::Parsed* out_parsed); | 900 url_parse::Parsed* out_parsed); |
901 GURL_API bool ResolveRelativeURL(const char* base_url, | 901 bool ResolveRelativeURL(const char* base_url, |
902 const url_parse::Parsed& base_parsed, | 902 const url_parse::Parsed& base_parsed, |
903 bool base_is_file, | 903 bool base_is_file, |
904 const char16* relative_url, | 904 const char16* relative_url, |
905 const url_parse::Component& relative_component, | 905 const url_parse::Component& relative_component, |
906 CharsetConverter* query_converter, | 906 CharsetConverter* query_converter, |
907 CanonOutput* output, | 907 CanonOutput* output, |
908 url_parse::Parsed* out_parsed); | 908 url_parse::Parsed* out_parsed); |
909 | 909 |
910 } // namespace url_canon | 910 } // namespace url_canon |
911 | 911 |
912 #endif // GOOGLEURL_SRC_URL_CANON_H__ | 912 #endif // URL_URL_CANON_H_ |
OLD | NEW |