| OLD | NEW |
| 1 // Copyright 2007, Google Inc. | 1 // Copyright 2007, Google Inc. |
| 2 // All rights reserved. | 2 // All rights reserved. |
| 3 // | 3 // |
| 4 // Redistribution and use in source and binary forms, with or without | 4 // Redistribution and use in source and binary forms, with or without |
| 5 // modification, are permitted provided that the following conditions are | 5 // modification, are permitted provided that the following conditions are |
| 6 // met: | 6 // met: |
| 7 // | 7 // |
| 8 // * Redistributions of source code must retain the above copyright | 8 // * Redistributions of source code must retain the above copyright |
| 9 // notice, this list of conditions and the following disclaimer. | 9 // notice, this list of conditions and the following disclaimer. |
| 10 // * Redistributions in binary form must reproduce the above | 10 // * Redistributions in binary form must reproduce the above |
| (...skipping 15 matching lines...) Expand all Loading... |
| 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 #ifndef GOOGLEURL_SRC_URL_CANON_H__ | 29 #ifndef GOOGLEURL_SRC_URL_CANON_H__ |
| 30 #define GOOGLEURL_SRC_URL_CANON_H__ | 30 #define GOOGLEURL_SRC_URL_CANON_H__ |
| 31 | 31 |
| 32 #include <memory.h> | 32 #include <memory.h> |
| 33 #include <stdlib.h> | 33 #include <stdlib.h> |
| 34 | 34 |
| 35 #include "base/string16.h" | 35 #include "base/string16.h" |
| 36 #include "googleurl/src/url_common.h" |
| 36 #include "googleurl/src/url_parse.h" | 37 #include "googleurl/src/url_parse.h" |
| 37 | 38 |
| 38 namespace url_canon { | 39 namespace url_canon { |
| 39 | 40 |
| 40 // Canonicalizer output ------------------------------------------------------- | 41 // Canonicalizer output ------------------------------------------------------- |
| 41 | 42 |
| 42 // Base class for the canonicalizer output, this maintains a buffer and | 43 // Base class for the canonicalizer output, this maintains a buffer and |
| 43 // supports simple resizing and append operations on it. | 44 // supports simple resizing and append operations on it. |
| 44 // | 45 // |
| 45 // It is VERY IMPORTANT that no virtual function calls be made on the common | 46 // It is VERY IMPORTANT that no virtual function calls be made on the common |
| (...skipping 195 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 241 // This should be called before parsing if whitespace removal is desired (which | 242 // This should be called before parsing if whitespace removal is desired (which |
| 242 // it normally is when you are canonicalizing). | 243 // it normally is when you are canonicalizing). |
| 243 // | 244 // |
| 244 // If no whitespace is removed, this function will not use the buffer and will | 245 // If no whitespace is removed, this function will not use the buffer and will |
| 245 // return a pointer to the input, to avoid the extra copy. If modification is | 246 // return a pointer to the input, to avoid the extra copy. If modification is |
| 246 // required, the given |buffer| will be used and the returned pointer will | 247 // required, the given |buffer| will be used and the returned pointer will |
| 247 // point to the beginning of the buffer. | 248 // point to the beginning of the buffer. |
| 248 // | 249 // |
| 249 // Therefore, callers should not use the buffer, since it may actually be empty, | 250 // Therefore, callers should not use the buffer, since it may actually be empty, |
| 250 // use the computed pointer and |*output_len| instead. | 251 // use the computed pointer and |*output_len| instead. |
| 251 const char* RemoveURLWhitespace(const char* input, int input_len, | 252 GURL_API const char* RemoveURLWhitespace(const char* input, int input_len, |
| 252 CanonOutputT<char>* buffer, | 253 CanonOutputT<char>* buffer, |
| 253 int* output_len); | 254 int* output_len); |
| 254 const char16* RemoveURLWhitespace(const char16* input, int input_len, | 255 GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len, |
| 255 CanonOutputT<char16>* buffer, | 256 CanonOutputT<char16>* buffer, |
| 256 int* output_len); | 257 int* output_len); |
| 257 | 258 |
| 258 // IDN ------------------------------------------------------------------------ | 259 // IDN ------------------------------------------------------------------------ |
| 259 | 260 |
| 260 // Converts the Unicode input representing a hostname to ASCII using IDN rules. | 261 // Converts the Unicode input representing a hostname to ASCII using IDN rules. |
| 261 // The output must fall in the ASCII range, but will be encoded in UTF-16. | 262 // The output must fall in the ASCII range, but will be encoded in UTF-16. |
| 262 // | 263 // |
| 263 // On success, the output will be filled with the ASCII host name and it will | 264 // On success, the output will be filled with the ASCII host name and it will |
| 264 // return true. Unlike most other canonicalization functions, this assumes that | 265 // return true. Unlike most other canonicalization functions, this assumes that |
| 265 // the output is empty. The beginning of the host will be at offset 0, and | 266 // the output is empty. The beginning of the host will be at offset 0, and |
| 266 // the length of the output will be set to the length of the new host name. | 267 // the length of the output will be set to the length of the new host name. |
| 267 // | 268 // |
| 268 // On error, returns false. The output in this case is undefined. | 269 // On error, returns false. The output in this case is undefined. |
| 269 bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); | 270 GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); |
| 270 | 271 |
| 271 // Piece-by-piece canonicalizers ---------------------------------------------- | 272 // Piece-by-piece canonicalizers ---------------------------------------------- |
| 272 // | 273 // |
| 273 // These individual canonicalizers append the canonicalized versions of the | 274 // These individual canonicalizers append the canonicalized versions of the |
| 274 // corresponding URL component to the given std::string. The spec and the | 275 // corresponding URL component to the given std::string. The spec and the |
| 275 // previously-identified range of that component are the input. The range of | 276 // previously-identified range of that component are the input. The range of |
| 276 // the canonicalized component will be written to the output component. | 277 // the canonicalized component will be written to the output component. |
| 277 // | 278 // |
| 278 // These functions all append to the output so they can be chained. Make sure | 279 // These functions all append to the output so they can be chained. Make sure |
| 279 // the output is empty when you start. | 280 // the output is empty when you start. |
| 280 // | 281 // |
| 281 // These functions return boolean values indicating success. On failure, they | 282 // These functions return boolean values indicating success. On failure, they |
| 282 // will attempt to write something reasonable to the output so that, if | 283 // will attempt to write something reasonable to the output so that, if |
| 283 // displayed to the user, they will recognise it as something that's messed up. | 284 // displayed to the user, they will recognise it as something that's messed up. |
| 284 // Nothing more should ever be done with these invalid URLs, however. | 285 // Nothing more should ever be done with these invalid URLs, however. |
| 285 | 286 |
| 286 // Scheme: Appends the scheme and colon to the URL. The output component will | 287 // Scheme: Appends the scheme and colon to the URL. The output component will |
| 287 // indicate the range of characters up to but not including the colon. | 288 // indicate the range of characters up to but not including the colon. |
| 288 // | 289 // |
| 289 // Canonical URLs always have a scheme. If the scheme is not present in the | 290 // Canonical URLs always have a scheme. If the scheme is not present in the |
| 290 // input, this will just write the colon to indicate an empty scheme. Does not | 291 // input, this will just write the colon to indicate an empty scheme. Does not |
| 291 // append slashes which will be needed before any authority components for most | 292 // append slashes which will be needed before any authority components for most |
| 292 // URLs. | 293 // URLs. |
| 293 // | 294 // |
| 294 // The 8-bit version requires UTF-8 encoding. | 295 // The 8-bit version requires UTF-8 encoding. |
| 295 bool CanonicalizeScheme(const char* spec, | 296 GURL_API bool CanonicalizeScheme(const char* spec, |
| 296 const url_parse::Component& scheme, | 297 const url_parse::Component& scheme, |
| 297 CanonOutput* output, | 298 CanonOutput* output, |
| 298 url_parse::Component* out_scheme); | 299 url_parse::Component* out_scheme); |
| 299 bool CanonicalizeScheme(const char16* spec, | 300 GURL_API bool CanonicalizeScheme(const char16* spec, |
| 300 const url_parse::Component& scheme, | 301 const url_parse::Component& scheme, |
| 301 CanonOutput* output, | 302 CanonOutput* output, |
| 302 url_parse::Component* out_scheme); | 303 url_parse::Component* out_scheme); |
| 303 | 304 |
| 304 // User info: username/password. If present, this will add the delimiters so | 305 // User info: username/password. If present, this will add the delimiters so |
| 305 // the output will be "<username>:<password>@" or "<username>@". Empty | 306 // the output will be "<username>:<password>@" or "<username>@". Empty |
| 306 // username/password pairs, or empty passwords, will get converted to | 307 // username/password pairs, or empty passwords, will get converted to |
| 307 // nonexistent in the canonical version. | 308 // nonexistent in the canonical version. |
| 308 // | 309 // |
| 309 // The components for the username and password refer to ranges in the | 310 // The components for the username and password refer to ranges in the |
| 310 // respective source strings. Usually, these will be the same string, which | 311 // respective source strings. Usually, these will be the same string, which |
| 311 // is legal as long as the two components don't overlap. | 312 // is legal as long as the two components don't overlap. |
| 312 // | 313 // |
| 313 // The 8-bit version requires UTF-8 encoding. | 314 // The 8-bit version requires UTF-8 encoding. |
| 314 bool CanonicalizeUserInfo(const char* username_source, | 315 GURL_API bool CanonicalizeUserInfo(const char* username_source, |
| 315 const url_parse::Component& username, | 316 const url_parse::Component& username, |
| 316 const char* password_source, | 317 const char* password_source, |
| 317 const url_parse::Component& password, | 318 const url_parse::Component& password, |
| 318 CanonOutput* output, | 319 CanonOutput* output, |
| 319 url_parse::Component* out_username, | 320 url_parse::Component* out_username, |
| 320 url_parse::Component* out_password); | 321 url_parse::Component* out_password); |
| 321 bool CanonicalizeUserInfo(const char16* username_source, | 322 GURL_API bool CanonicalizeUserInfo(const char16* username_source, |
| 322 const url_parse::Component& username, | 323 const url_parse::Component& username, |
| 323 const char16* password_source, | 324 const char16* password_source, |
| 324 const url_parse::Component& password, | 325 const url_parse::Component& password, |
| 325 CanonOutput* output, | 326 CanonOutput* output, |
| 326 url_parse::Component* out_username, | 327 url_parse::Component* out_username, |
| 327 url_parse::Component* out_password); | 328 url_parse::Component* out_password); |
| 328 | 329 |
| 329 | 330 |
| 330 // This structure holds detailed state exported from the IP/Host canonicalizers. | 331 // This structure holds detailed state exported from the IP/Host canonicalizers. |
| 331 // Additional fields may be added as callers require them. | 332 // Additional fields may be added as callers require them. |
| 332 struct CanonHostInfo { | 333 struct CanonHostInfo { |
| 333 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} | 334 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} |
| 334 | 335 |
| 335 // Convenience function to test if family is an IP address. | 336 // Convenience function to test if family is an IP address. |
| 336 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } | 337 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } |
| 337 | 338 |
| (...skipping 21 matching lines...) Expand all Loading... |
| 359 // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6. | 360 // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6. |
| 360 // CanonicalizeHostVerbose() always sets it. | 361 // CanonicalizeHostVerbose() always sets it. |
| 361 url_parse::Component out_host; | 362 url_parse::Component out_host; |
| 362 }; | 363 }; |
| 363 | 364 |
| 364 | 365 |
| 365 // Host. | 366 // Host. |
| 366 // | 367 // |
| 367 // The 8-bit version requires UTF-8 encoding. Use this version when you only | 368 // The 8-bit version requires UTF-8 encoding. Use this version when you only |
| 368 // need to know whether canonicalization succeeded. | 369 // need to know whether canonicalization succeeded. |
| 369 bool CanonicalizeHost(const char* spec, | 370 GURL_API bool CanonicalizeHost(const char* spec, |
| 370 const url_parse::Component& host, | 371 const url_parse::Component& host, |
| 371 CanonOutput* output, | 372 CanonOutput* output, |
| 372 url_parse::Component* out_host); | 373 url_parse::Component* out_host); |
| 373 bool CanonicalizeHost(const char16* spec, | 374 GURL_API bool CanonicalizeHost(const char16* spec, |
| 374 const url_parse::Component& host, | 375 const url_parse::Component& host, |
| 375 CanonOutput* output, | 376 CanonOutput* output, |
| 376 url_parse::Component* out_host); | 377 url_parse::Component* out_host); |
| 377 | 378 |
| 378 // Extended version of CanonicalizeHost, which returns additional information. | 379 // Extended version of CanonicalizeHost, which returns additional information. |
| 379 // Use this when you need to know whether the hostname was an IP address. | 380 // Use this when you need to know whether the hostname was an IP address. |
| 380 // A successful return is indicated by host_info->family != BROKEN. See the | 381 // A successful return is indicated by host_info->family != BROKEN. See the |
| 381 // definition of CanonHostInfo above for details. | 382 // definition of CanonHostInfo above for details. |
| 382 void CanonicalizeHostVerbose(const char* spec, | 383 GURL_API void CanonicalizeHostVerbose(const char* spec, |
| 383 const url_parse::Component& host, | 384 const url_parse::Component& host, |
| 384 CanonOutput* output, | 385 CanonOutput* output, |
| 385 CanonHostInfo* host_info); | 386 CanonHostInfo* host_info); |
| 386 void CanonicalizeHostVerbose(const char16* spec, | 387 GURL_API void CanonicalizeHostVerbose(const char16* spec, |
| 387 const url_parse::Component& host, | 388 const url_parse::Component& host, |
| 388 CanonOutput* output, | 389 CanonOutput* output, |
| 389 CanonHostInfo* host_info); | 390 CanonHostInfo* host_info); |
| 390 | 391 |
| 391 | 392 |
| 392 // IP addresses. | 393 // IP addresses. |
| 393 // | 394 // |
| 394 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is | 395 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is |
| 395 // an IP address, it will canonicalize it as such, appending it to |output|. | 396 // an IP address, it will canonicalize it as such, appending it to |output|. |
| 396 // Additional status information is returned via the |*host_info| parameter. | 397 // Additional status information is returned via the |*host_info| parameter. |
| 397 // See the definition of CanonHostInfo above for details. | 398 // See the definition of CanonHostInfo above for details. |
| 398 // | 399 // |
| 399 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that | 400 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that |
| 400 // the input is unescaped and name-prepped, etc. It should not normally be | 401 // the input is unescaped and name-prepped, etc. It should not normally be |
| 401 // necessary or wise to call this directly. | 402 // necessary or wise to call this directly. |
| 402 void CanonicalizeIPAddress(const char* spec, | 403 GURL_API void CanonicalizeIPAddress(const char* spec, |
| 403 const url_parse::Component& host, | 404 const url_parse::Component& host, |
| 404 CanonOutput* output, | 405 CanonOutput* output, |
| 405 CanonHostInfo* host_info); | 406 CanonHostInfo* host_info); |
| 406 void CanonicalizeIPAddress(const char16* spec, | 407 GURL_API void CanonicalizeIPAddress(const char16* spec, |
| 407 const url_parse::Component& host, | 408 const url_parse::Component& host, |
| 408 CanonOutput* output, | 409 CanonOutput* output, |
| 409 CanonHostInfo* host_info); | 410 CanonHostInfo* host_info); |
| 410 | 411 |
| 411 // Port: this function will add the colon for the port if a port is present. | 412 // Port: this function will add the colon for the port if a port is present. |
| 412 // The caller can pass url_parse::PORT_UNSPECIFIED as the | 413 // The caller can pass url_parse::PORT_UNSPECIFIED as the |
| 413 // default_port_for_scheme argument if there is no default port. | 414 // default_port_for_scheme argument if there is no default port. |
| 414 // | 415 // |
| 415 // The 8-bit version requires UTF-8 encoding. | 416 // The 8-bit version requires UTF-8 encoding. |
| 416 bool CanonicalizePort(const char* spec, | 417 GURL_API bool CanonicalizePort(const char* spec, |
| 417 const url_parse::Component& port, | 418 const url_parse::Component& port, |
| 418 int default_port_for_scheme, | 419 int default_port_for_scheme, |
| 419 CanonOutput* output, | 420 CanonOutput* output, |
| 420 url_parse::Component* out_port); | 421 url_parse::Component* out_port); |
| 421 bool CanonicalizePort(const char16* spec, | 422 GURL_API bool CanonicalizePort(const char16* spec, |
| 422 const url_parse::Component& port, | 423 const url_parse::Component& port, |
| 423 int default_port_for_scheme, | 424 int default_port_for_scheme, |
| 424 CanonOutput* output, | 425 CanonOutput* output, |
| 425 url_parse::Component* out_port); | 426 url_parse::Component* out_port); |
| 426 | 427 |
| 427 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED | 428 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED |
| 428 // if the scheme is unknown. | 429 // if the scheme is unknown. |
| 429 int DefaultPortForScheme(const char* scheme, int scheme_len); | 430 GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len); |
| 430 | 431 |
| 431 // Path. If the input does not begin in a slash (including if the input is | 432 // Path. If the input does not begin in a slash (including if the input is |
| 432 // empty), we'll prepend a slash to the path to make it canonical. | 433 // empty), we'll prepend a slash to the path to make it canonical. |
| 433 // | 434 // |
| 434 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity | 435 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity |
| 435 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid | 436 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid |
| 436 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't | 437 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't |
| 437 // an issue. Somebody giving us an 8-bit path is responsible for generating | 438 // an issue. Somebody giving us an 8-bit path is responsible for generating |
| 438 // the path that the server expects (we'll escape high-bit characters), so | 439 // the path that the server expects (we'll escape high-bit characters), so |
| 439 // if something is invalid, it's their problem. | 440 // if something is invalid, it's their problem. |
| 440 bool CanonicalizePath(const char* spec, | 441 GURL_API bool CanonicalizePath(const char* spec, |
| 441 const url_parse::Component& path, | 442 const url_parse::Component& path, |
| 442 CanonOutput* output, | 443 CanonOutput* output, |
| 443 url_parse::Component* out_path); | 444 url_parse::Component* out_path); |
| 444 bool CanonicalizePath(const char16* spec, | 445 GURL_API bool CanonicalizePath(const char16* spec, |
| 445 const url_parse::Component& path, | 446 const url_parse::Component& path, |
| 446 CanonOutput* output, | 447 CanonOutput* output, |
| 447 url_parse::Component* out_path); | 448 url_parse::Component* out_path); |
| 448 | 449 |
| 449 // Canonicalizes the input as a file path. This is like CanonicalizePath except | 450 // Canonicalizes the input as a file path. This is like CanonicalizePath except |
| 450 // that it also handles Windows drive specs. For example, the path can begin | 451 // that it also handles Windows drive specs. For example, the path can begin |
| 451 // with "c|\" and it will get properly canonicalized to "C:/". | 452 // with "c|\" and it will get properly canonicalized to "C:/". |
| 452 // The string will be appended to |*output| and |*out_path| will be updated. | 453 // The string will be appended to |*output| and |*out_path| will be updated. |
| 453 // | 454 // |
| 454 // The 8-bit version requires UTF-8 encoding. | 455 // The 8-bit version requires UTF-8 encoding. |
| 455 bool FileCanonicalizePath(const char* spec, | 456 GURL_API bool FileCanonicalizePath(const char* spec, |
| 456 const url_parse::Component& path, | 457 const url_parse::Component& path, |
| 457 CanonOutput* output, | 458 CanonOutput* output, |
| 458 url_parse::Component* out_path); | 459 url_parse::Component* out_path); |
| 459 bool FileCanonicalizePath(const char16* spec, | 460 GURL_API bool FileCanonicalizePath(const char16* spec, |
| 460 const url_parse::Component& path, | 461 const url_parse::Component& path, |
| 461 CanonOutput* output, | 462 CanonOutput* output, |
| 462 url_parse::Component* out_path); | 463 url_parse::Component* out_path); |
| 463 | 464 |
| 464 // Query: Prepends the ? if needed. | 465 // Query: Prepends the ? if needed. |
| 465 // | 466 // |
| 466 // The 8-bit version requires the input to be UTF-8 encoding. Incorrectly | 467 // The 8-bit version requires the input to be UTF-8 encoding. Incorrectly |
| 467 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode | 468 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode |
| 468 // "invalid character." This function can not fail, we always just try to do | 469 // "invalid character." This function can not fail, we always just try to do |
| 469 // our best for crazy input here since web pages can set it themselves. | 470 // our best for crazy input here since web pages can set it themselves. |
| 470 // | 471 // |
| 471 // This will convert the given input into the output encoding that the given | 472 // This will convert the given input into the output encoding that the given |
| 472 // character set converter object provides. The converter will only be called | 473 // character set converter object provides. The converter will only be called |
| 473 // if necessary, for ASCII input, no conversions are necessary. | 474 // if necessary, for ASCII input, no conversions are necessary. |
| 474 // | 475 // |
| 475 // The converter can be NULL. In this case, the output encoding will be UTF-8. | 476 // The converter can be NULL. In this case, the output encoding will be UTF-8. |
| 476 void CanonicalizeQuery(const char* spec, | 477 GURL_API void CanonicalizeQuery(const char* spec, |
| 477 const url_parse::Component& query, | 478 const url_parse::Component& query, |
| 478 CharsetConverter* converter, | 479 CharsetConverter* converter, |
| 479 CanonOutput* output, | 480 CanonOutput* output, |
| 480 url_parse::Component* out_query); | 481 url_parse::Component* out_query); |
| 481 void CanonicalizeQuery(const char16* spec, | 482 GURL_API void CanonicalizeQuery(const char16* spec, |
| 482 const url_parse::Component& query, | 483 const url_parse::Component& query, |
| 483 CharsetConverter* converter, | 484 CharsetConverter* converter, |
| 484 CanonOutput* output, | 485 CanonOutput* output, |
| 485 url_parse::Component* out_query); | 486 url_parse::Component* out_query); |
| 486 | 487 |
| 487 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only | 488 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only |
| 488 // canonicalizer that does not produce ASCII output). The output is | 489 // canonicalizer that does not produce ASCII output). The output is |
| 489 // guaranteed to be valid UTF-8. | 490 // guaranteed to be valid UTF-8. |
| 490 // | 491 // |
| 491 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use | 492 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use |
| 492 // the "Unicode replacement character" for the confusing bits and copy the rest. | 493 // the "Unicode replacement character" for the confusing bits and copy the rest. |
| 493 void CanonicalizeRef(const char* spec, | 494 GURL_API void CanonicalizeRef(const char* spec, |
| 494 const url_parse::Component& path, | 495 const url_parse::Component& path, |
| 495 CanonOutput* output, | 496 CanonOutput* output, |
| 496 url_parse::Component* out_path); | 497 url_parse::Component* out_path); |
| 497 void CanonicalizeRef(const char16* spec, | 498 GURL_API void CanonicalizeRef(const char16* spec, |
| 498 const url_parse::Component& path, | 499 const url_parse::Component& path, |
| 499 CanonOutput* output, | 500 CanonOutput* output, |
| 500 url_parse::Component* out_path); | 501 url_parse::Component* out_path); |
| 501 | 502 |
| 502 // Full canonicalizer --------------------------------------------------------- | 503 // Full canonicalizer --------------------------------------------------------- |
| 503 // | 504 // |
| 504 // These functions replace any string contents, rather than append as above. | 505 // These functions replace any string contents, rather than append as above. |
| 505 // See the above piece-by-piece functions for information specific to | 506 // See the above piece-by-piece functions for information specific to |
| 506 // canonicalizing individual components. | 507 // canonicalizing individual components. |
| 507 // | 508 // |
| 508 // The output will be ASCII except the reference fragment, which may be UTF-8. | 509 // The output will be ASCII except the reference fragment, which may be UTF-8. |
| 509 // | 510 // |
| 510 // The 8-bit versions require UTF-8 encoding. | 511 // The 8-bit versions require UTF-8 encoding. |
| 511 | 512 |
| 512 // Use for standard URLs with authorities and paths. | 513 // Use for standard URLs with authorities and paths. |
| 513 bool CanonicalizeStandardURL(const char* spec, | 514 GURL_API bool CanonicalizeStandardURL(const char* spec, |
| 514 int spec_len, | 515 int spec_len, |
| 515 const url_parse::Parsed& parsed, | 516 const url_parse::Parsed& parsed, |
| 516 CharsetConverter* query_converter, | 517 CharsetConverter* query_converter, |
| 517 CanonOutput* output, | 518 CanonOutput* output, |
| 518 url_parse::Parsed* new_parsed); | 519 url_parse::Parsed* new_parsed); |
| 519 bool CanonicalizeStandardURL(const char16* spec, | 520 GURL_API bool CanonicalizeStandardURL(const char16* spec, |
| 520 int spec_len, | 521 int spec_len, |
| 521 const url_parse::Parsed& parsed, | 522 const url_parse::Parsed& parsed, |
| 522 CharsetConverter* query_converter, | 523 CharsetConverter* query_converter, |
| 523 CanonOutput* output, | 524 CanonOutput* output, |
| 524 url_parse::Parsed* new_parsed); | 525 url_parse::Parsed* new_parsed); |
| 525 | 526 |
| 526 // Use for file URLs. | 527 // Use for file URLs. |
| 527 bool CanonicalizeFileURL(const char* spec, | 528 GURL_API bool CanonicalizeFileURL(const char* spec, |
| 528 int spec_len, | 529 int spec_len, |
| 529 const url_parse::Parsed& parsed, | 530 const url_parse::Parsed& parsed, |
| 530 CharsetConverter* query_converter, | 531 CharsetConverter* query_converter, |
| 531 CanonOutput* output, | 532 CanonOutput* output, |
| 532 url_parse::Parsed* new_parsed); | 533 url_parse::Parsed* new_parsed); |
| 533 bool CanonicalizeFileURL(const char16* spec, | 534 GURL_API bool CanonicalizeFileURL(const char16* spec, |
| 534 int spec_len, | 535 int spec_len, |
| 535 const url_parse::Parsed& parsed, | 536 const url_parse::Parsed& parsed, |
| 536 CharsetConverter* query_converter, | 537 CharsetConverter* query_converter, |
| 537 CanonOutput* output, | 538 CanonOutput* output, |
| 538 url_parse::Parsed* new_parsed); | 539 url_parse::Parsed* new_parsed); |
| 539 | 540 |
| 540 // Use for path URLs such as javascript. This does not modify the path in any | 541 // Use for path URLs such as javascript. This does not modify the path in any |
| 541 // way, for example, by escaping it. | 542 // way, for example, by escaping it. |
| 542 bool CanonicalizePathURL(const char* spec, | 543 GURL_API bool CanonicalizePathURL(const char* spec, |
| 543 int spec_len, | 544 int spec_len, |
| 544 const url_parse::Parsed& parsed, | 545 const url_parse::Parsed& parsed, |
| 545 CanonOutput* output, | 546 CanonOutput* output, |
| 546 url_parse::Parsed* new_parsed); | 547 url_parse::Parsed* new_parsed); |
| 547 bool CanonicalizePathURL(const char16* spec, | 548 GURL_API bool CanonicalizePathURL(const char16* spec, |
| 548 int spec_len, | 549 int spec_len, |
| 549 const url_parse::Parsed& parsed, | 550 const url_parse::Parsed& parsed, |
| 550 CanonOutput* output, | 551 CanonOutput* output, |
| 551 url_parse::Parsed* new_parsed); | 552 url_parse::Parsed* new_parsed); |
| 552 | 553 |
| 553 // Use for mailto URLs. This "canonicalizes" the url into a path and query | 554 // Use for mailto URLs. This "canonicalizes" the url into a path and query |
| 554 // component. It does not attempt to merge "to" fields. It uses UTF-8 for | 555 // component. It does not attempt to merge "to" fields. It uses UTF-8 for |
| 555 // the query encoding if there is a query. This is because a mailto URL is | 556 // the query encoding if there is a query. This is because a mailto URL is |
| 556 // really intended for an external mail program, and the encoding of a page, | 557 // really intended for an external mail program, and the encoding of a page, |
| 557 // etc. which would influence a query encoding normally are irrelevant. | 558 // etc. which would influence a query encoding normally are irrelevant. |
| 558 bool CanonicalizeMailtoURL(const char* spec, | 559 GURL_API bool CanonicalizeMailtoURL(const char* spec, |
| 559 int spec_len, | 560 int spec_len, |
| 560 const url_parse::Parsed& parsed, | 561 const url_parse::Parsed& parsed, |
| 561 CanonOutput* output, | 562 CanonOutput* output, |
| 562 url_parse::Parsed* new_parsed); | 563 url_parse::Parsed* new_parsed); |
| 563 bool CanonicalizeMailtoURL(const char16* spec, | 564 GURL_API bool CanonicalizeMailtoURL(const char16* spec, |
| 564 int spec_len, | 565 int spec_len, |
| 565 const url_parse::Parsed& parsed, | 566 const url_parse::Parsed& parsed, |
| 566 CanonOutput* output, | 567 CanonOutput* output, |
| 567 url_parse::Parsed* new_parsed); | 568 url_parse::Parsed* new_parsed); |
| 568 | 569 |
| 569 // Part replacer -------------------------------------------------------------- | 570 // Part replacer -------------------------------------------------------------- |
| 570 | 571 |
| 571 // Internal structure used for storing separate strings for each component. | 572 // Internal structure used for storing separate strings for each component. |
| 572 // The basic canonicalization functions use this structure internally so that | 573 // The basic canonicalization functions use this structure internally so that |
| 573 // component replacement (different strings for different components) can be | 574 // component replacement (different strings for different components) can be |
| 574 // treated on the same code path as regular canonicalization (the same string | 575 // treated on the same code path as regular canonicalization (the same string |
| 575 // for each component). | 576 // for each component). |
| 576 // | 577 // |
| 577 // A url_parse::Parsed structure usually goes along with this. Those | 578 // A url_parse::Parsed structure usually goes along with this. Those |
| 578 // components identify offsets within these strings, so that they can all be | 579 // components identify offsets within these strings, so that they can all be |
| 579 // in the same string, or spread arbitrarily across different ones. | 580 // in the same string, or spread arbitrarily across different ones. |
| 580 // | 581 // |
| 581 // This structure does not own any data. It is the caller's responsibility to | 582 // This structure does not own any data. It is the caller's responsibility to |
| 582 // ensure that the data the pointers point to stays in scope and is not | 583 // ensure that the data the pointers point to stays in scope and is not |
| 583 // modified. | 584 // modified. |
| 584 template<typename CHAR> | 585 template<typename CHAR> |
| 585 struct URLComponentSource { | 586 struct URLComponentSource { |
| 586 // Constructor normally used by callers wishing to replace components. This | 587 // Constructor normally used by callers wishing to replace components. This |
| 587 // will make them all NULL, which is no replacement. The caller would then | 588 // will make them all NULL, which is no replacement. The caller would then |
| 588 // override the compoents they want to replace. | 589 // override the components they want to replace. |
| 589 URLComponentSource() | 590 URLComponentSource() |
| 590 : scheme(NULL), | 591 : scheme(NULL), |
| 591 username(NULL), | 592 username(NULL), |
| 592 password(NULL), | 593 password(NULL), |
| 593 host(NULL), | 594 host(NULL), |
| 594 port(NULL), | 595 port(NULL), |
| 595 path(NULL), | 596 path(NULL), |
| 596 query(NULL), | 597 query(NULL), |
| 597 ref(NULL) { | 598 ref(NULL) { |
| 598 } | 599 } |
| (...skipping 143 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 742 // Replace component | (replacement string) (replacement component) | 743 // Replace component | (replacement string) (replacement component) |
| 743 // Delete component | (non-NULL) (invalid component: (0,-1)) | 744 // Delete component | (non-NULL) (invalid component: (0,-1)) |
| 744 // | 745 // |
| 745 // We use a pointer to the empty string for the source when the component | 746 // We use a pointer to the empty string for the source when the component |
| 746 // should be deleted. | 747 // should be deleted. |
| 747 URLComponentSource<CHAR> sources_; | 748 URLComponentSource<CHAR> sources_; |
| 748 url_parse::Parsed components_; | 749 url_parse::Parsed components_; |
| 749 }; | 750 }; |
| 750 | 751 |
| 751 // The base must be an 8-bit canonical URL. | 752 // The base must be an 8-bit canonical URL. |
| 752 bool ReplaceStandardURL(const char* base, | 753 GURL_API bool ReplaceStandardURL(const char* base, |
| 753 const url_parse::Parsed& base_parsed, | 754 const url_parse::Parsed& base_parsed, |
| 754 const Replacements<char>& replacements, | 755 const Replacements<char>& replacements, |
| 755 CharsetConverter* query_converter, | 756 CharsetConverter* query_converter, |
| 756 CanonOutput* output, | 757 CanonOutput* output, |
| 757 url_parse::Parsed* new_parsed); | 758 url_parse::Parsed* new_parsed); |
| 758 bool ReplaceStandardURL(const char* base, | 759 GURL_API bool ReplaceStandardURL(const char* base, |
| 759 const url_parse::Parsed& base_parsed, | 760 const url_parse::Parsed& base_parsed, |
| 760 const Replacements<char16>& replacements, | 761 const Replacements<char16>& replacements, |
| 761 CharsetConverter* query_converter, | 762 CharsetConverter* query_converter, |
| 762 CanonOutput* output, | 763 CanonOutput* output, |
| 763 url_parse::Parsed* new_parsed); | 764 url_parse::Parsed* new_parsed); |
| 764 | 765 |
| 765 // Replacing some parts of a file URL is not permitted. Everything except | 766 // Replacing some parts of a file URL is not permitted. Everything except |
| 766 // the host, path, query, and ref will be ignored. | 767 // the host, path, query, and ref will be ignored. |
| 767 bool ReplaceFileURL(const char* base, | 768 GURL_API bool ReplaceFileURL(const char* base, |
| 768 const url_parse::Parsed& base_parsed, | 769 const url_parse::Parsed& base_parsed, |
| 769 const Replacements<char>& replacements, | 770 const Replacements<char>& replacements, |
| 770 CharsetConverter* query_converter, | 771 CharsetConverter* query_converter, |
| 771 CanonOutput* output, | 772 CanonOutput* output, |
| 772 url_parse::Parsed* new_parsed); | 773 url_parse::Parsed* new_parsed); |
| 773 bool ReplaceFileURL(const char* base, | 774 GURL_API bool ReplaceFileURL(const char* base, |
| 774 const url_parse::Parsed& base_parsed, | 775 const url_parse::Parsed& base_parsed, |
| 775 const Replacements<char16>& replacements, | 776 const Replacements<char16>& replacements, |
| 776 CharsetConverter* query_converter, | 777 CharsetConverter* query_converter, |
| 777 CanonOutput* output, | 778 CanonOutput* output, |
| 778 url_parse::Parsed* new_parsed); | 779 url_parse::Parsed* new_parsed); |
| 779 | 780 |
| 780 // Path URLs can only have the scheme and path replaced. All other components | 781 // Path URLs can only have the scheme and path replaced. All other components |
| 781 // will be ignored. | 782 // will be ignored. |
| 782 bool ReplacePathURL(const char* base, | 783 GURL_API bool ReplacePathURL(const char* base, |
| 783 const url_parse::Parsed& base_parsed, | 784 const url_parse::Parsed& base_parsed, |
| 784 const Replacements<char>& replacements, | 785 const Replacements<char>& replacements, |
| 785 CanonOutput* output, | 786 CanonOutput* output, |
| 786 url_parse::Parsed* new_parsed); | 787 url_parse::Parsed* new_parsed); |
| 787 bool ReplacePathURL(const char* base, | 788 GURL_API bool ReplacePathURL(const char* base, |
| 788 const url_parse::Parsed& base_parsed, | 789 const url_parse::Parsed& base_parsed, |
| 789 const Replacements<char16>& replacements, | 790 const Replacements<char16>& replacements, |
| 790 CanonOutput* output, | 791 CanonOutput* output, |
| 791 url_parse::Parsed* new_parsed); | 792 url_parse::Parsed* new_parsed); |
| 792 | 793 |
| 793 // Mailto URLs can only have the scheme, path, and query replaced. | 794 // Mailto URLs can only have the scheme, path, and query replaced. |
| 794 // All other components will be ignored. | 795 // All other components will be ignored. |
| 795 bool ReplaceMailtoURL(const char* base, | 796 GURL_API bool ReplaceMailtoURL(const char* base, |
| 796 const url_parse::Parsed& base_parsed, | 797 const url_parse::Parsed& base_parsed, |
| 797 const Replacements<char>& replacements, | 798 const Replacements<char>& replacements, |
| 798 CanonOutput* output, | 799 CanonOutput* output, |
| 799 url_parse::Parsed* new_parsed); | 800 url_parse::Parsed* new_parsed); |
| 800 bool ReplaceMailtoURL(const char* base, | 801 GURL_API bool ReplaceMailtoURL(const char* base, |
| 801 const url_parse::Parsed& base_parsed, | 802 const url_parse::Parsed& base_parsed, |
| 802 const Replacements<char16>& replacements, | 803 const Replacements<char16>& replacements, |
| 803 CanonOutput* output, | 804 CanonOutput* output, |
| 804 url_parse::Parsed* new_parsed); | 805 url_parse::Parsed* new_parsed); |
| 805 | 806 |
| 806 // Relative URL --------------------------------------------------------------- | 807 // Relative URL --------------------------------------------------------------- |
| 807 | 808 |
| 808 // Given an input URL or URL fragment |fragment|, determines if it is a | 809 // Given an input URL or URL fragment |fragment|, determines if it is a |
| 809 // relative or absolute URL and places the result into |*is_relative|. If it is | 810 // relative or absolute URL and places the result into |*is_relative|. If it is |
| 810 // relative, the relevant portion of the URL will be placed into | 811 // relative, the relevant portion of the URL will be placed into |
| 811 // |*relative_component| (there may have been trimmed whitespace, for example). | 812 // |*relative_component| (there may have been trimmed whitespace, for example). |
| 812 // This value is passed to ResolveRelativeURL. If the input is not relative, | 813 // This value is passed to ResolveRelativeURL. If the input is not relative, |
| 813 // this value is UNDEFINED (it may be changed by the function). | 814 // this value is UNDEFINED (it may be changed by the function). |
| 814 // | 815 // |
| 815 // Returns true on success (we successfully determined the URL is relative or | 816 // Returns true on success (we successfully determined the URL is relative or |
| 816 // not). Failure means that the combination of URLs doesn't make any sense. | 817 // not). Failure means that the combination of URLs doesn't make any sense. |
| 817 // | 818 // |
| 818 // The base URL should always be canonical, therefore is ASCII. | 819 // The base URL should always be canonical, therefore is ASCII. |
| 819 bool IsRelativeURL(const char* base, | 820 GURL_API bool IsRelativeURL(const char* base, |
| 820 const url_parse::Parsed& base_parsed, | 821 const url_parse::Parsed& base_parsed, |
| 821 const char* fragment, | 822 const char* fragment, |
| 822 int fragment_len, | 823 int fragment_len, |
| 823 bool is_base_hierarchical, | 824 bool is_base_hierarchical, |
| 824 bool* is_relative, | 825 bool* is_relative, |
| 825 url_parse::Component* relative_component); | 826 url_parse::Component* relative_component); |
| 826 bool IsRelativeURL(const char* base, | 827 GURL_API bool IsRelativeURL(const char* base, |
| 827 const url_parse::Parsed& base_parsed, | 828 const url_parse::Parsed& base_parsed, |
| 828 const char16* fragment, | 829 const char16* fragment, |
| 829 int fragment_len, | 830 int fragment_len, |
| 830 bool is_base_hierarchical, | 831 bool is_base_hierarchical, |
| 831 bool* is_relative, | 832 bool* is_relative, |
| 832 url_parse::Component* relative_component); | 833 url_parse::Component* relative_component); |
| 833 | 834 |
| 834 // Given a canonical parsed source URL, a URL fragment known to be relative, | 835 // Given a canonical parsed source URL, a URL fragment known to be relative, |
| 835 // and the identified relevant portion of the relative URL (computed by | 836 // and the identified relevant portion of the relative URL (computed by |
| 836 // IsRelativeURL), this produces a new parsed canonical URL in |output| and | 837 // IsRelativeURL), this produces a new parsed canonical URL in |output| and |
| 837 // |out_parsed|. | 838 // |out_parsed|. |
| 838 // | 839 // |
| 839 // It also requires a flag indicating whether the base URL is a file: URL | 840 // It also requires a flag indicating whether the base URL is a file: URL |
| 840 // which triggers additional logic. | 841 // which triggers additional logic. |
| 841 // | 842 // |
| 842 // The base URL should be canonical and have a host (may be empty for file | 843 // The base URL should be canonical and have a host (may be empty for file |
| 843 // URLs) and a path. If it doesn't have these, we can't resolve relative | 844 // URLs) and a path. If it doesn't have these, we can't resolve relative |
| 844 // URLs off of it and will return the base as the output with an error flag. | 845 // URLs off of it and will return the base as the output with an error flag. |
| 845 // Because it is canonical it should also be ASCII. | 846 // Because it is canonical it should also be ASCII. |
| 846 // | 847 // |
| 847 // The query charset converter follows the same rules as CanonicalizeQuery. | 848 // The query charset converter follows the same rules as CanonicalizeQuery. |
| 848 // | 849 // |
| 849 // Returns true on success. On failure, the output will be "something | 850 // Returns true on success. On failure, the output will be "something |
| 850 // reasonable" that will be consistent and valid, just probably not what | 851 // reasonable" that will be consistent and valid, just probably not what |
| 851 // was intended by the web page author or caller. | 852 // was intended by the web page author or caller. |
| 852 bool ResolveRelativeURL(const char* base_url, | 853 GURL_API bool ResolveRelativeURL(const char* base_url, |
| 853 const url_parse::Parsed& base_parsed, | 854 const url_parse::Parsed& base_parsed, |
| 854 bool base_is_file, | 855 bool base_is_file, |
| 855 const char* relative_url, | 856 const char* relative_url, |
| 856 const url_parse::Component& relative_component, | 857 const url_parse::Component& relative_component, |
| 857 CharsetConverter* query_converter, | 858 CharsetConverter* query_converter, |
| 858 CanonOutput* output, | 859 CanonOutput* output, |
| 859 url_parse::Parsed* out_parsed); | 860 url_parse::Parsed* out_parsed); |
| 860 bool ResolveRelativeURL(const char* base_url, | 861 GURL_API bool ResolveRelativeURL(const char* base_url, |
| 861 const url_parse::Parsed& base_parsed, | 862 const url_parse::Parsed& base_parsed, |
| 862 bool base_is_file, | 863 bool base_is_file, |
| 863 const char16* relative_url, | 864 const char16* relative_url, |
| 864 const url_parse::Component& relative_component, | 865 const url_parse::Component& relative_component, |
| 865 CharsetConverter* query_converter, | 866 CharsetConverter* query_converter, |
| 866 CanonOutput* output, | 867 CanonOutput* output, |
| 867 url_parse::Parsed* out_parsed); | 868 url_parse::Parsed* out_parsed); |
| 868 | 869 |
| 869 } // namespace url_canon | 870 } // namespace url_canon |
| 870 | 871 |
| 871 #endif // GOOGLEURL_SRC_URL_CANON_H__ | 872 #endif // GOOGLEURL_SRC_URL_CANON_H__ |
| OLD | NEW |