| OLD | NEW |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #ifndef URL_URL_CANON_H_ | 5 #ifndef URL_URL_CANON_H_ |
| 6 #define URL_URL_CANON_H_ | 6 #define URL_URL_CANON_H_ |
| 7 | 7 |
| 8 #include <stdlib.h> | 8 #include <stdlib.h> |
| 9 #include <string.h> | 9 #include <string.h> |
| 10 | 10 |
| 11 #include "base/string16.h" | 11 #include "base/string16.h" |
| 12 #include "url/url_export.h" | |
| 13 #include "url/url_parse.h" | 12 #include "url/url_parse.h" |
| 14 | 13 |
| 15 namespace url_canon { | 14 namespace url_canon { |
| 16 | 15 |
| 17 // Canonicalizer output ------------------------------------------------------- | 16 // Canonicalizer output ------------------------------------------------------- |
| 18 | 17 |
| 19 // Base class for the canonicalizer output, this maintains a buffer and | 18 // Base class for the canonicalizer output, this maintains a buffer and |
| 20 // supports simple resizing and append operations on it. | 19 // supports simple resizing and append operations on it. |
| 21 // | 20 // |
| 22 // It is VERY IMPORTANT that no virtual function calls be made on the common | 21 // It is VERY IMPORTANT that no virtual function calls be made on the common |
| (...skipping 157 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 180 class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {}; | 179 class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {}; |
| 181 | 180 |
| 182 // Character set converter ---------------------------------------------------- | 181 // Character set converter ---------------------------------------------------- |
| 183 // | 182 // |
| 184 // Converts query strings into a custom encoding. The embedder can supply an | 183 // Converts query strings into a custom encoding. The embedder can supply an |
| 185 // implementation of this class to interface with their own character set | 184 // implementation of this class to interface with their own character set |
| 186 // conversion libraries. | 185 // conversion libraries. |
| 187 // | 186 // |
| 188 // Embedders will want to see the unit test for the ICU version. | 187 // Embedders will want to see the unit test for the ICU version. |
| 189 | 188 |
| 190 class URL_EXPORT CharsetConverter { | 189 class CharsetConverter { |
| 191 public: | 190 public: |
| 192 CharsetConverter() {} | 191 CharsetConverter() {} |
| 193 virtual ~CharsetConverter() {} | 192 virtual ~CharsetConverter() {} |
| 194 | 193 |
| 195 // Converts the given input string from UTF-16 to whatever output format the | 194 // Converts the given input string from UTF-16 to whatever output format the |
| 196 // converter supports. This is used only for the query encoding conversion, | 195 // converter supports. This is used only for the query encoding conversion, |
| 197 // which does not fail. Instead, the converter should insert "invalid | 196 // which does not fail. Instead, the converter should insert "invalid |
| 198 // character" characters in the output for invalid sequences, and do the | 197 // character" characters in the output for invalid sequences, and do the |
| 199 // best it can. | 198 // best it can. |
| 200 // | 199 // |
| (...skipping 17 matching lines...) Expand all Loading... |
| 218 // This should be called before parsing if whitespace removal is desired (which | 217 // This should be called before parsing if whitespace removal is desired (which |
| 219 // it normally is when you are canonicalizing). | 218 // it normally is when you are canonicalizing). |
| 220 // | 219 // |
| 221 // If no whitespace is removed, this function will not use the buffer and will | 220 // If no whitespace is removed, this function will not use the buffer and will |
| 222 // return a pointer to the input, to avoid the extra copy. If modification is | 221 // return a pointer to the input, to avoid the extra copy. If modification is |
| 223 // required, the given |buffer| will be used and the returned pointer will | 222 // required, the given |buffer| will be used and the returned pointer will |
| 224 // point to the beginning of the buffer. | 223 // point to the beginning of the buffer. |
| 225 // | 224 // |
| 226 // Therefore, callers should not use the buffer, since it may actually be empty; | 225 // Therefore, callers should not use the buffer, since it may actually be empty; |
| 227 // use the computed pointer and |*output_len| instead. | 226 // use the computed pointer and |*output_len| instead. |
| 228 URL_EXPORT const char* RemoveURLWhitespace(const char* input, int input_len, | 227 const char* RemoveURLWhitespace(const char* input, int input_len, |
| 229 CanonOutputT<char>* buffer, | 228 CanonOutputT<char>* buffer, |
| 230 int* output_len); | 229 int* output_len); |
| 231 URL_EXPORT const char16* RemoveURLWhitespace(const char16* input, int input_len, | 230 const char16* RemoveURLWhitespace(const char16* input, int input_len, |
| 232 CanonOutputT<char16>* buffer, | 231 CanonOutputT<char16>* buffer, |
| 233 int* output_len); | 232 int* output_len); |
| 234 | 233 |
| 235 // IDN ------------------------------------------------------------------------ | 234 // IDN ------------------------------------------------------------------------ |
| 236 | 235 |
| 237 // Converts the Unicode input representing a hostname to ASCII using IDN rules. | 236 // Converts the Unicode input representing a hostname to ASCII using IDN rules. |
| 238 // The output must fall in the ASCII range, but will be encoded in UTF-16. | 237 // The output must fall in the ASCII range, but will be encoded in UTF-16. |
| 239 // | 238 // |
| 240 // On success, the output will be filled with the ASCII host name and it will | 239 // On success, the output will be filled with the ASCII host name and it will |
| 241 // return true. Unlike most other canonicalization functions, this assumes that | 240 // return true. Unlike most other canonicalization functions, this assumes that |
| 242 // the output is empty. The beginning of the host will be at offset 0, and | 241 // the output is empty. The beginning of the host will be at offset 0, and |
| 243 // the length of the output will be set to the length of the new host name. | 242 // the length of the output will be set to the length of the new host name. |
| 244 // | 243 // |
| 245 // On error, returns false. The output in this case is undefined. | 244 // On error, returns false. The output in this case is undefined. |
| 246 URL_EXPORT bool IDNToASCII(const char16* src, | 245 bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output); |
| 247 int src_len, | |
| 248 CanonOutputW* output); | |
| 249 | 246 |
| 250 // Piece-by-piece canonicalizers ---------------------------------------------- | 247 // Piece-by-piece canonicalizers ---------------------------------------------- |
| 251 // | 248 // |
| 252 // These individual canonicalizers append the canonicalized versions of the | 249 // These individual canonicalizers append the canonicalized versions of the |
| 253 // corresponding URL component to the given std::string. The spec and the | 250 // corresponding URL component to the given std::string. The spec and the |
| 254 // previously-identified range of that component are the input. The range of | 251 // previously-identified range of that component are the input. The range of |
| 255 // the canonicalized component will be written to the output component. | 252 // the canonicalized component will be written to the output component. |
| 256 // | 253 // |
| 257 // These functions all append to the output so they can be chained. Make sure | 254 // These functions all append to the output so they can be chained. Make sure |
| 258 // the output is empty when you start. | 255 // the output is empty when you start. |
| 259 // | 256 // |
| 260 // These functions return boolean values indicating success. On failure, they | 257 // These functions return boolean values indicating success. On failure, they |
| 261 // will attempt to write something reasonable to the output so that, if | 258 // will attempt to write something reasonable to the output so that, if |
| 262 // displayed to the user, they will recognise it as something that's messed up. | 259 // displayed to the user, they will recognise it as something that's messed up. |
| 263 // Nothing more should ever be done with these invalid URLs, however. | 260 // Nothing more should ever be done with these invalid URLs, however. |
| 264 | 261 |
| 265 // Scheme: Appends the scheme and colon to the URL. The output component will | 262 // Scheme: Appends the scheme and colon to the URL. The output component will |
| 266 // indicate the range of characters up to but not including the colon. | 263 // indicate the range of characters up to but not including the colon. |
| 267 // | 264 // |
| 268 // Canonical URLs always have a scheme. If the scheme is not present in the | 265 // Canonical URLs always have a scheme. If the scheme is not present in the |
| 269 // input, this will just write the colon to indicate an empty scheme. Does not | 266 // input, this will just write the colon to indicate an empty scheme. Does not |
| 270 // append slashes which will be needed before any authority components for most | 267 // append slashes which will be needed before any authority components for most |
| 271 // URLs. | 268 // URLs. |
| 272 // | 269 // |
| 273 // The 8-bit version requires UTF-8 encoding. | 270 // The 8-bit version requires UTF-8 encoding. |
| 274 URL_EXPORT bool CanonicalizeScheme(const char* spec, | 271 bool CanonicalizeScheme(const char* spec, |
| 275 const url_parse::Component& scheme, | 272 const url_parse::Component& scheme, |
| 276 CanonOutput* output, | 273 CanonOutput* output, |
| 277 url_parse::Component* out_scheme); | 274 url_parse::Component* out_scheme); |
| 278 URL_EXPORT bool CanonicalizeScheme(const char16* spec, | 275 bool CanonicalizeScheme(const char16* spec, |
| 279 const url_parse::Component& scheme, | 276 const url_parse::Component& scheme, |
| 280 CanonOutput* output, | 277 CanonOutput* output, |
| 281 url_parse::Component* out_scheme); | 278 url_parse::Component* out_scheme); |
| 282 | 279 |
| 283 // User info: username/password. If present, this will add the delimiters so | 280 // User info: username/password. If present, this will add the delimiters so |
| 284 // the output will be "<username>:<password>@" or "<username>@". Empty | 281 // the output will be "<username>:<password>@" or "<username>@". Empty |
| 285 // username/password pairs, or empty passwords, will get converted to | 282 // username/password pairs, or empty passwords, will get converted to |
| 286 // nonexistent in the canonical version. | 283 // nonexistent in the canonical version. |
| 287 // | 284 // |
| 288 // The components for the username and password refer to ranges in the | 285 // The components for the username and password refer to ranges in the |
| 289 // respective source strings. Usually, these will be the same string, which | 286 // respective source strings. Usually, these will be the same string, which |
| 290 // is legal as long as the two components don't overlap. | 287 // is legal as long as the two components don't overlap. |
| 291 // | 288 // |
| 292 // The 8-bit version requires UTF-8 encoding. | 289 // The 8-bit version requires UTF-8 encoding. |
| 293 URL_EXPORT bool CanonicalizeUserInfo(const char* username_source, | 290 bool CanonicalizeUserInfo(const char* username_source, |
| 294 const url_parse::Component& username, | 291 const url_parse::Component& username, |
| 295 const char* password_source, | 292 const char* password_source, |
| 296 const url_parse::Component& password, | 293 const url_parse::Component& password, |
| 297 CanonOutput* output, | 294 CanonOutput* output, |
| 298 url_parse::Component* out_username, | 295 url_parse::Component* out_username, |
| 299 url_parse::Component* out_password); | 296 url_parse::Component* out_password); |
| 300 URL_EXPORT bool CanonicalizeUserInfo(const char16* username_source, | 297 bool CanonicalizeUserInfo(const char16* username_source, |
| 301 const url_parse::Component& username, | 298 const url_parse::Component& username, |
| 302 const char16* password_source, | 299 const char16* password_source, |
| 303 const url_parse::Component& password, | 300 const url_parse::Component& password, |
| 304 CanonOutput* output, | 301 CanonOutput* output, |
| 305 url_parse::Component* out_username, | 302 url_parse::Component* out_username, |
| 306 url_parse::Component* out_password); | 303 url_parse::Component* out_password); |
| 307 | 304 |
| 308 | 305 |
| 309 // This structure holds detailed state exported from the IP/Host canonicalizers. | 306 // This structure holds detailed state exported from the IP/Host canonicalizers. |
| 310 // Additional fields may be added as callers require them. | 307 // Additional fields may be added as callers require them. |
| 311 struct CanonHostInfo { | 308 struct CanonHostInfo { |
| 312 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} | 309 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} |
| 313 | 310 |
| 314 // Convenience function to test if family is an IP address. | 311 // Convenience function to test if family is an IP address. |
| 315 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } | 312 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } |
| 316 | 313 |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 349 int AddressLength() const { | 346 int AddressLength() const { |
| 350 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); | 347 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); |
| 351 } | 348 } |
| 352 }; | 349 }; |
| 353 | 350 |
| 354 | 351 |
| 355 // Host. | 352 // Host. |
| 356 // | 353 // |
| 357 // The 8-bit version requires UTF-8 encoding. Use this version when you only | 354 // The 8-bit version requires UTF-8 encoding. Use this version when you only |
| 358 // need to know whether canonicalization succeeded. | 355 // need to know whether canonicalization succeeded. |
| 359 URL_EXPORT bool CanonicalizeHost(const char* spec, | 356 bool CanonicalizeHost(const char* spec, |
| 360 const url_parse::Component& host, | 357 const url_parse::Component& host, |
| 361 CanonOutput* output, | 358 CanonOutput* output, |
| 362 url_parse::Component* out_host); | 359 url_parse::Component* out_host); |
| 363 URL_EXPORT bool CanonicalizeHost(const char16* spec, | 360 bool CanonicalizeHost(const char16* spec, |
| 364 const url_parse::Component& host, | 361 const url_parse::Component& host, |
| 365 CanonOutput* output, | 362 CanonOutput* output, |
| 366 url_parse::Component* out_host); | 363 url_parse::Component* out_host); |
| 367 | 364 |
| 368 // Extended version of CanonicalizeHost, which returns additional information. | 365 // Extended version of CanonicalizeHost, which returns additional information. |
| 369 // Use this when you need to know whether the hostname was an IP address. | 366 // Use this when you need to know whether the hostname was an IP address. |
| 370 // A successful return is indicated by host_info->family != BROKEN. See the | 367 // A successful return is indicated by host_info->family != BROKEN. See the |
| 371 // definition of CanonHostInfo above for details. | 368 // definition of CanonHostInfo above for details. |
| 372 URL_EXPORT void CanonicalizeHostVerbose(const char* spec, | 369 void CanonicalizeHostVerbose(const char* spec, |
| 373 const url_parse::Component& host, | 370 const url_parse::Component& host, |
| 374 CanonOutput* output, | 371 CanonOutput* output, |
| 375 CanonHostInfo* host_info); | 372 CanonHostInfo* host_info); |
| 376 URL_EXPORT void CanonicalizeHostVerbose(const char16* spec, | 373 void CanonicalizeHostVerbose(const char16* spec, |
| 377 const url_parse::Component& host, | 374 const url_parse::Component& host, |
| 378 CanonOutput* output, | 375 CanonOutput* output, |
| 379 CanonHostInfo* host_info); | 376 CanonHostInfo* host_info); |
| 380 | 377 |
| 381 | 378 |
| 382 // IP addresses. | 379 // IP addresses. |
| 383 // | 380 // |
| 384 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is | 381 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is |
| 385 // an IP address, it will canonicalize it as such, appending it to |output|. | 382 // an IP address, it will canonicalize it as such, appending it to |output|. |
| 386 // Additional status information is returned via the |*host_info| parameter. | 383 // Additional status information is returned via the |*host_info| parameter. |
| 387 // See the definition of CanonHostInfo above for details. | 384 // See the definition of CanonHostInfo above for details. |
| 388 // | 385 // |
| 389 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that | 386 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that |
| 390 // the input is unescaped and name-prepped, etc. It should not normally be | 387 // the input is unescaped and name-prepped, etc. It should not normally be |
| 391 // necessary or wise to call this directly. | 388 // necessary or wise to call this directly. |
| 392 URL_EXPORT void CanonicalizeIPAddress(const char* spec, | 389 void CanonicalizeIPAddress(const char* spec, |
| 393 const url_parse::Component& host, | 390 const url_parse::Component& host, |
| 394 CanonOutput* output, | 391 CanonOutput* output, |
| 395 CanonHostInfo* host_info); | 392 CanonHostInfo* host_info); |
| 396 URL_EXPORT void CanonicalizeIPAddress(const char16* spec, | 393 void CanonicalizeIPAddress(const char16* spec, |
| 397 const url_parse::Component& host, | 394 const url_parse::Component& host, |
| 398 CanonOutput* output, | 395 CanonOutput* output, |
| 399 CanonHostInfo* host_info); | 396 CanonHostInfo* host_info); |
| 400 | 397 |
| 401 // Port: this function will add the colon for the port if a port is present. | 398 // Port: this function will add the colon for the port if a port is present. |
| 402 // The caller can pass url_parse::PORT_UNSPECIFIED as the | 399 // The caller can pass url_parse::PORT_UNSPECIFIED as the |
| 403 // default_port_for_scheme argument if there is no default port. | 400 // default_port_for_scheme argument if there is no default port. |
| 404 // | 401 // |
| 405 // The 8-bit version requires UTF-8 encoding. | 402 // The 8-bit version requires UTF-8 encoding. |
| 406 URL_EXPORT bool CanonicalizePort(const char* spec, | 403 bool CanonicalizePort(const char* spec, |
| 407 const url_parse::Component& port, | 404 const url_parse::Component& port, |
| 408 int default_port_for_scheme, | 405 int default_port_for_scheme, |
| 409 CanonOutput* output, | 406 CanonOutput* output, |
| 410 url_parse::Component* out_port); | 407 url_parse::Component* out_port); |
| 411 URL_EXPORT bool CanonicalizePort(const char16* spec, | 408 bool CanonicalizePort(const char16* spec, |
| 412 const url_parse::Component& port, | 409 const url_parse::Component& port, |
| 413 int default_port_for_scheme, | 410 int default_port_for_scheme, |
| 414 CanonOutput* output, | 411 CanonOutput* output, |
| 415 url_parse::Component* out_port); | 412 url_parse::Component* out_port); |
| 416 | 413 |
| 417 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED | 414 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED |
| 418 // if the scheme is unknown. | 415 // if the scheme is unknown. |
| 419 URL_EXPORT int DefaultPortForScheme(const char* scheme, int scheme_len); | 416 int DefaultPortForScheme(const char* scheme, int scheme_len); |
| 420 | 417 |
| 421 // Path. If the input does not begin in a slash (including if the input is | 418 // Path. If the input does not begin in a slash (including if the input is |
| 422 // empty), we'll prepend a slash to the path to make it canonical. | 419 // empty), we'll prepend a slash to the path to make it canonical. |
| 423 // | 420 // |
| 424 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity | 421 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity |
| 425 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid | 422 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid |
| 426 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't | 423 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't |
| 427 // an issue. Somebody giving us an 8-bit path is responsible for generating | 424 // an issue. Somebody giving us an 8-bit path is responsible for generating |
| 428 // the path that the server expects (we'll escape high-bit characters), so | 425 // the path that the server expects (we'll escape high-bit characters), so |
| 429 // if something is invalid, it's their problem. | 426 // if something is invalid, it's their problem. |
| 430 URL_EXPORT bool CanonicalizePath(const char* spec, | 427 bool CanonicalizePath(const char* spec, |
| 431 const url_parse::Component& path, | 428 const url_parse::Component& path, |
| 432 CanonOutput* output, | 429 CanonOutput* output, |
| 433 url_parse::Component* out_path); | 430 url_parse::Component* out_path); |
| 434 URL_EXPORT bool CanonicalizePath(const char16* spec, | 431 bool CanonicalizePath(const char16* spec, |
| 435 const url_parse::Component& path, | 432 const url_parse::Component& path, |
| 436 CanonOutput* output, | 433 CanonOutput* output, |
| 437 url_parse::Component* out_path); | 434 url_parse::Component* out_path); |
| 438 | 435 |
| 439 // Canonicalizes the input as a file path. This is like CanonicalizePath except | 436 // Canonicalizes the input as a file path. This is like CanonicalizePath except |
| 440 // that it also handles Windows drive specs. For example, the path can begin | 437 // that it also handles Windows drive specs. For example, the path can begin |
| 441 // with "c|\" and it will get properly canonicalized to "C:/". | 438 // with "c|\" and it will get properly canonicalized to "C:/". |
| 442 // The string will be appended to |*output| and |*out_path| will be updated. | 439 // The string will be appended to |*output| and |*out_path| will be updated. |
| 443 // | 440 // |
| 444 // The 8-bit version requires UTF-8 encoding. | 441 // The 8-bit version requires UTF-8 encoding. |
| 445 URL_EXPORT bool FileCanonicalizePath(const char* spec, | 442 bool FileCanonicalizePath(const char* spec, |
| 446 const url_parse::Component& path, | 443 const url_parse::Component& path, |
| 447 CanonOutput* output, | 444 CanonOutput* output, |
| 448 url_parse::Component* out_path); | 445 url_parse::Component* out_path); |
| 449 URL_EXPORT bool FileCanonicalizePath(const char16* spec, | 446 bool FileCanonicalizePath(const char16* spec, |
| 450 const url_parse::Component& path, | 447 const url_parse::Component& path, |
| 451 CanonOutput* output, | 448 CanonOutput* output, |
| 452 url_parse::Component* out_path); | 449 url_parse::Component* out_path); |
| 453 | 450 |
| 454 // Query: Prepends the ? if needed. | 451 // Query: Prepends the ? if needed. |
| 455 // | 452 // |
| 456 // The 8-bit version requires the input to be UTF-8 encoded. Incorrectly | 453 // The 8-bit version requires the input to be UTF-8 encoded. Incorrectly |
| 457 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode | 454 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode |
| 458 // "invalid character." This function can not fail, we always just try to do | 455 // "invalid character." This function can not fail, we always just try to do |
| 459 // our best for crazy input here since web pages can set it themselves. | 456 // our best for crazy input here since web pages can set it themselves. |
| 460 // | 457 // |
| 461 // This will convert the given input into the output encoding that the given | 458 // This will convert the given input into the output encoding that the given |
| 462 // character set converter object provides. The converter will only be called | 459 // character set converter object provides. The converter will only be called |
| 463 // if necessary; for ASCII input, no conversions are necessary. | 460 // if necessary; for ASCII input, no conversions are necessary. |
| 464 // | 461 // |
| 465 // The converter can be NULL. In this case, the output encoding will be UTF-8. | 462 // The converter can be NULL. In this case, the output encoding will be UTF-8. |
| 466 URL_EXPORT void CanonicalizeQuery(const char* spec, | 463 void CanonicalizeQuery(const char* spec, |
| 467 const url_parse::Component& query, | 464 const url_parse::Component& query, |
| 468 CharsetConverter* converter, | 465 CharsetConverter* converter, |
| 469 CanonOutput* output, | 466 CanonOutput* output, |
| 470 url_parse::Component* out_query); | 467 url_parse::Component* out_query); |
| 471 URL_EXPORT void CanonicalizeQuery(const char16* spec, | 468 void CanonicalizeQuery(const char16* spec, |
| 472 const url_parse::Component& query, | 469 const url_parse::Component& query, |
| 473 CharsetConverter* converter, | 470 CharsetConverter* converter, |
| 474 CanonOutput* output, | 471 CanonOutput* output, |
| 475 url_parse::Component* out_query); | 472 url_parse::Component* out_query); |
| 476 | 473 |
| 477 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only | 474 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only |
| 478 // canonicalizer that does not produce ASCII output). The output is | 475 // canonicalizer that does not produce ASCII output). The output is |
| 479 // guaranteed to be valid UTF-8. | 476 // guaranteed to be valid UTF-8. |
| 480 // | 477 // |
| 481 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use | 478 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use |
| 482 // the "Unicode replacement character" for the confusing bits and copy the rest. | 479 // the "Unicode replacement character" for the confusing bits and copy the rest. |
| 483 URL_EXPORT void CanonicalizeRef(const char* spec, | 480 void CanonicalizeRef(const char* spec, |
| 484 const url_parse::Component& path, | 481 const url_parse::Component& path, |
| 485 CanonOutput* output, | 482 CanonOutput* output, |
| 486 url_parse::Component* out_path); | 483 url_parse::Component* out_path); |
| 487 URL_EXPORT void CanonicalizeRef(const char16* spec, | 484 void CanonicalizeRef(const char16* spec, |
| 488 const url_parse::Component& path, | 485 const url_parse::Component& path, |
| 489 CanonOutput* output, | 486 CanonOutput* output, |
| 490 url_parse::Component* out_path); | 487 url_parse::Component* out_path); |
| 491 | 488 |
| 492 // Full canonicalizer --------------------------------------------------------- | 489 // Full canonicalizer --------------------------------------------------------- |
| 493 // | 490 // |
| 494 // These functions replace any string contents, rather than append as above. | 491 // These functions replace any string contents, rather than append as above. |
| 495 // See the above piece-by-piece functions for information specific to | 492 // See the above piece-by-piece functions for information specific to |
| 496 // canonicalizing individual components. | 493 // canonicalizing individual components. |
| 497 // | 494 // |
| 498 // The output will be ASCII except the reference fragment, which may be UTF-8. | 495 // The output will be ASCII except the reference fragment, which may be UTF-8. |
| 499 // | 496 // |
| 500 // The 8-bit versions require UTF-8 encoding. | 497 // The 8-bit versions require UTF-8 encoding. |
| 501 | 498 |
| 502 // Use for standard URLs with authorities and paths. | 499 // Use for standard URLs with authorities and paths. |
| 503 URL_EXPORT bool CanonicalizeStandardURL(const char* spec, | 500 bool CanonicalizeStandardURL(const char* spec, |
| 504 int spec_len, | 501 int spec_len, |
| 505 const url_parse::Parsed& parsed, | 502 const url_parse::Parsed& parsed, |
| 506 CharsetConverter* query_converter, | 503 CharsetConverter* query_converter, |
| 507 CanonOutput* output, | 504 CanonOutput* output, |
| 508 url_parse::Parsed* new_parsed); | 505 url_parse::Parsed* new_parsed); |
| 509 URL_EXPORT bool CanonicalizeStandardURL(const char16* spec, | 506 bool CanonicalizeStandardURL(const char16* spec, |
| 510 int spec_len, | 507 int spec_len, |
| 511 const url_parse::Parsed& parsed, | 508 const url_parse::Parsed& parsed, |
| 512 CharsetConverter* query_converter, | 509 CharsetConverter* query_converter, |
| 513 CanonOutput* output, | 510 CanonOutput* output, |
| 514 url_parse::Parsed* new_parsed); | 511 url_parse::Parsed* new_parsed); |
| 515 | 512 |
| 516 // Use for file URLs. | 513 // Use for file URLs. |
| 517 URL_EXPORT bool CanonicalizeFileURL(const char* spec, | 514 bool CanonicalizeFileURL(const char* spec, |
| 518 int spec_len, | 515 int spec_len, |
| 519 const url_parse::Parsed& parsed, | 516 const url_parse::Parsed& parsed, |
| 520 CharsetConverter* query_converter, | 517 CharsetConverter* query_converter, |
| 521 CanonOutput* output, | 518 CanonOutput* output, |
| 522 url_parse::Parsed* new_parsed); | 519 url_parse::Parsed* new_parsed); |
| 523 URL_EXPORT bool CanonicalizeFileURL(const char16* spec, | 520 bool CanonicalizeFileURL(const char16* spec, |
| 524 int spec_len, | 521 int spec_len, |
| 525 const url_parse::Parsed& parsed, | 522 const url_parse::Parsed& parsed, |
| 526 CharsetConverter* query_converter, | 523 CharsetConverter* query_converter, |
| 527 CanonOutput* output, | 524 CanonOutput* output, |
| 528 url_parse::Parsed* new_parsed); | 525 url_parse::Parsed* new_parsed); |
| 529 | 526 |
| 530 // Use for filesystem URLs. | 527 // Use for filesystem URLs. |
| 531 URL_EXPORT bool CanonicalizeFileSystemURL(const char* spec, | 528 bool CanonicalizeFileSystemURL(const char* spec, |
| 532 int spec_len, | 529 int spec_len, |
| 533 const url_parse::Parsed& parsed, | 530 const url_parse::Parsed& parsed, |
| 534 CharsetConverter* query_converter, | 531 CharsetConverter* query_converter, |
| 535 CanonOutput* output, | 532 CanonOutput* output, |
| 536 url_parse::Parsed* new_parsed); | 533 url_parse::Parsed* new_parsed); |
| 537 URL_EXPORT bool CanonicalizeFileSystemURL(const char16* spec, | 534 bool CanonicalizeFileSystemURL(const char16* spec, |
| 538 int spec_len, | 535 int spec_len, |
| 539 const url_parse::Parsed& parsed, | 536 const url_parse::Parsed& parsed, |
| 540 CharsetConverter* query_converter, | 537 CharsetConverter* query_converter, |
| 541 CanonOutput* output, | 538 CanonOutput* output, |
| 542 url_parse::Parsed* new_parsed); | 539 url_parse::Parsed* new_parsed); |
| 543 | 540 |
| 544 // Use for path URLs such as javascript. This does not modify the path in any | 541 // Use for path URLs such as javascript. This does not modify the path in any |
| 545 // way, for example, by escaping it. | 542 // way, for example, by escaping it. |
| 546 URL_EXPORT bool CanonicalizePathURL(const char* spec, | 543 bool CanonicalizePathURL(const char* spec, |
| 547 int spec_len, | 544 int spec_len, |
| 548 const url_parse::Parsed& parsed, | 545 const url_parse::Parsed& parsed, |
| 549 CanonOutput* output, | 546 CanonOutput* output, |
| 550 url_parse::Parsed* new_parsed); | 547 url_parse::Parsed* new_parsed); |
| 551 URL_EXPORT bool CanonicalizePathURL(const char16* spec, | 548 bool CanonicalizePathURL(const char16* spec, |
| 552 int spec_len, | 549 int spec_len, |
| 553 const url_parse::Parsed& parsed, | 550 const url_parse::Parsed& parsed, |
| 554 CanonOutput* output, | 551 CanonOutput* output, |
| 555 url_parse::Parsed* new_parsed); | 552 url_parse::Parsed* new_parsed); |
| 556 | 553 |
| 557 // Use for mailto URLs. This "canonicalizes" the url into a path and query | 554 // Use for mailto URLs. This "canonicalizes" the url into a path and query |
| 558 // component. It does not attempt to merge "to" fields. It uses UTF-8 for | 555 // component. It does not attempt to merge "to" fields. It uses UTF-8 for |
| 559 // the query encoding if there is a query. This is because a mailto URL is | 556 // the query encoding if there is a query. This is because a mailto URL is |
| 560 // really intended for an external mail program, and the encoding of a page, | 557 // really intended for an external mail program, and the encoding of a page, |
| 561 // etc. which would influence a query encoding normally are irrelevant. | 558 // etc. which would influence a query encoding normally are irrelevant. |
| 562 URL_EXPORT bool CanonicalizeMailtoURL(const char* spec, | 559 bool CanonicalizeMailtoURL(const char* spec, |
| 563 int spec_len, | 560 int spec_len, |
| 564 const url_parse::Parsed& parsed, | 561 const url_parse::Parsed& parsed, |
| 565 CanonOutput* output, | 562 CanonOutput* output, |
| 566 url_parse::Parsed* new_parsed); | 563 url_parse::Parsed* new_parsed); |
| 567 URL_EXPORT bool CanonicalizeMailtoURL(const char16* spec, | 564 bool CanonicalizeMailtoURL(const char16* spec, |
| 568 int spec_len, | 565 int spec_len, |
| 569 const url_parse::Parsed& parsed, | 566 const url_parse::Parsed& parsed, |
| 570 CanonOutput* output, | 567 CanonOutput* output, |
| 571 url_parse::Parsed* new_parsed); | 568 url_parse::Parsed* new_parsed); |
| 572 | 569 |
| 573 // Part replacer -------------------------------------------------------------- | 570 // Part replacer -------------------------------------------------------------- |
| 574 | 571 |
| 575 // Internal structure used for storing separate strings for each component. | 572 // Internal structure used for storing separate strings for each component. |
| 576 // The basic canonicalization functions use this structure internally so that | 573 // The basic canonicalization functions use this structure internally so that |
| 577 // component replacement (different strings for different components) can be | 574 // component replacement (different strings for different components) can be |
| 578 // treated on the same code path as regular canonicalization (the same string | 575 // treated on the same code path as regular canonicalization (the same string |
| 579 // for each component). | 576 // for each component). |
| 580 // | 577 // |
| 581 // A url_parse::Parsed structure usually goes along with this. Those | 578 // A url_parse::Parsed structure usually goes along with this. Those |
| (...skipping 164 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 746 // Replace component | (replacement string) (replacement component) | 743 // Replace component | (replacement string) (replacement component) |
| 747 // Delete component | (non-NULL) (invalid component: (0,-1)) | 744 // Delete component | (non-NULL) (invalid component: (0,-1)) |
| 748 // | 745 // |
| 749 // We use a pointer to the empty string for the source when the component | 746 // We use a pointer to the empty string for the source when the component |
| 750 // should be deleted. | 747 // should be deleted. |
| 751 URLComponentSource<CHAR> sources_; | 748 URLComponentSource<CHAR> sources_; |
| 752 url_parse::Parsed components_; | 749 url_parse::Parsed components_; |
| 753 }; | 750 }; |
| 754 | 751 |
| 755 // The base must be an 8-bit canonical URL. | 752 // The base must be an 8-bit canonical URL. |
| 756 URL_EXPORT bool ReplaceStandardURL(const char* base, | 753 bool ReplaceStandardURL(const char* base, |
| 757 const url_parse::Parsed& base_parsed, | 754 const url_parse::Parsed& base_parsed, |
| 758 const Replacements<char>& replacements, | 755 const Replacements<char>& replacements, |
| 759 CharsetConverter* query_converter, | 756 CharsetConverter* query_converter, |
| 760 CanonOutput* output, | 757 CanonOutput* output, |
| 761 url_parse::Parsed* new_parsed); | 758 url_parse::Parsed* new_parsed); |
| 762 URL_EXPORT bool ReplaceStandardURL(const char* base, | 759 bool ReplaceStandardURL(const char* base, |
| 763 const url_parse::Parsed& base_parsed, | 760 const url_parse::Parsed& base_parsed, |
| 764 const Replacements<char16>& replacements, | 761 const Replacements<char16>& replacements, |
| 765 CharsetConverter* query_converter, | 762 CharsetConverter* query_converter, |
| 766 CanonOutput* output, | 763 CanonOutput* output, |
| 767 url_parse::Parsed* new_parsed); | 764 url_parse::Parsed* new_parsed); |
| 768 | 765 |
| 769 // Filesystem URLs can only have the path, query, or ref replaced. | 766 // Filesystem URLs can only have the path, query, or ref replaced. |
| 770 // All other components will be ignored. | 767 // All other components will be ignored. |
| 771 URL_EXPORT bool ReplaceFileSystemURL(const char* base, | 768 bool ReplaceFileSystemURL(const char* base, |
| 772 const url_parse::Parsed& base_parsed, | 769 const url_parse::Parsed& base_parsed, |
| 773 const Replacements<char>& replacements, | 770 const Replacements<char>& replacements, |
| 774 CharsetConverter* query_converter, | 771 CharsetConverter* query_converter, |
| 775 CanonOutput* output, | 772 CanonOutput* output, |
| 776 url_parse::Parsed* new_parsed); | 773 url_parse::Parsed* new_parsed); |
| 777 URL_EXPORT bool ReplaceFileSystemURL(const char* base, | 774 bool ReplaceFileSystemURL(const char* base, |
| 778 const url_parse::Parsed& base_parsed, | 775 const url_parse::Parsed& base_parsed, |
| 779 const Replacements<char16>& replacements, | 776 const Replacements<char16>& replacements, |
| 780 CharsetConverter* query_converter, | 777 CharsetConverter* query_converter, |
| 781 CanonOutput* output, | 778 CanonOutput* output, |
| 782 url_parse::Parsed* new_parsed); | 779 url_parse::Parsed* new_parsed); |
| 783 | 780 |
| 784 // Replacing some parts of a file URL is not permitted. Everything except | 781 // Replacing some parts of a file URL is not permitted. Everything except |
| 785 // the host, path, query, and ref will be ignored. | 782 // the host, path, query, and ref will be ignored. |
| 786 URL_EXPORT bool ReplaceFileURL(const char* base, | 783 bool ReplaceFileURL(const char* base, |
| 787 const url_parse::Parsed& base_parsed, | 784 const url_parse::Parsed& base_parsed, |
| 788 const Replacements<char>& replacements, | 785 const Replacements<char>& replacements, |
| 789 CharsetConverter* query_converter, | 786 CharsetConverter* query_converter, |
| 790 CanonOutput* output, | 787 CanonOutput* output, |
| 791 url_parse::Parsed* new_parsed); | 788 url_parse::Parsed* new_parsed); |
| 792 URL_EXPORT bool ReplaceFileURL(const char* base, | 789 bool ReplaceFileURL(const char* base, |
| 793 const url_parse::Parsed& base_parsed, | 790 const url_parse::Parsed& base_parsed, |
| 794 const Replacements<char16>& replacements, | 791 const Replacements<char16>& replacements, |
| 795 CharsetConverter* query_converter, | 792 CharsetConverter* query_converter, |
| 796 CanonOutput* output, | 793 CanonOutput* output, |
| 797 url_parse::Parsed* new_parsed); | 794 url_parse::Parsed* new_parsed); |
| 798 | 795 |
| 799 // Path URLs can only have the scheme and path replaced. All other components | 796 // Path URLs can only have the scheme and path replaced. All other components |
| 800 // will be ignored. | 797 // will be ignored. |
| 801 URL_EXPORT bool ReplacePathURL(const char* base, | 798 bool ReplacePathURL(const char* base, |
| 802 const url_parse::Parsed& base_parsed, | 799 const url_parse::Parsed& base_parsed, |
| 803 const Replacements<char>& replacements, | 800 const Replacements<char>& replacements, |
| 804 CanonOutput* output, | 801 CanonOutput* output, |
| 805 url_parse::Parsed* new_parsed); | 802 url_parse::Parsed* new_parsed); |
| 806 URL_EXPORT bool ReplacePathURL(const char* base, | 803 bool ReplacePathURL(const char* base, |
| 807 const url_parse::Parsed& base_parsed, | 804 const url_parse::Parsed& base_parsed, |
| 808 const Replacements<char16>& replacements, | 805 const Replacements<char16>& replacements, |
| 809 CanonOutput* output, | 806 CanonOutput* output, |
| 810 url_parse::Parsed* new_parsed); | 807 url_parse::Parsed* new_parsed); |
| 811 | 808 |
| 812 // Mailto URLs can only have the scheme, path, and query replaced. | 809 // Mailto URLs can only have the scheme, path, and query replaced. |
| 813 // All other components will be ignored. | 810 // All other components will be ignored. |
| 814 URL_EXPORT bool ReplaceMailtoURL(const char* base, | 811 bool ReplaceMailtoURL(const char* base, |
| 815 const url_parse::Parsed& base_parsed, | 812 const url_parse::Parsed& base_parsed, |
| 816 const Replacements<char>& replacements, | 813 const Replacements<char>& replacements, |
| 817 CanonOutput* output, | 814 CanonOutput* output, |
| 818 url_parse::Parsed* new_parsed); | 815 url_parse::Parsed* new_parsed); |
| 819 URL_EXPORT bool ReplaceMailtoURL(const char* base, | 816 bool ReplaceMailtoURL(const char* base, |
| 820 const url_parse::Parsed& base_parsed, | 817 const url_parse::Parsed& base_parsed, |
| 821 const Replacements<char16>& replacements, | 818 const Replacements<char16>& replacements, |
| 822 CanonOutput* output, | 819 CanonOutput* output, |
| 823 url_parse::Parsed* new_parsed); | 820 url_parse::Parsed* new_parsed); |
| 824 | 821 |
| 825 // Relative URL --------------------------------------------------------------- | 822 // Relative URL --------------------------------------------------------------- |
| 826 | 823 |
| 827 // Given an input URL or URL fragment |fragment|, determines if it is a | 824 // Given an input URL or URL fragment |fragment|, determines if it is a |
| 828 // relative or absolute URL and places the result into |*is_relative|. If it is | 825 // relative or absolute URL and places the result into |*is_relative|. If it is |
| 829 // relative, the relevant portion of the URL will be placed into | 826 // relative, the relevant portion of the URL will be placed into |
| 830 // |*relative_component| (there may have been trimmed whitespace, for example). | 827 // |*relative_component| (there may have been trimmed whitespace, for example). |
| 831 // This value is passed to ResolveRelativeURL. If the input is not relative, | 828 // This value is passed to ResolveRelativeURL. If the input is not relative, |
| 832 // this value is UNDEFINED (it may be changed by the function). | 829 // this value is UNDEFINED (it may be changed by the function). |
| 833 // | 830 // |
| 834 // Returns true on success (we successfully determined the URL is relative or | 831 // Returns true on success (we successfully determined the URL is relative or |
| 835 // not). Failure means that the combination of URLs doesn't make any sense. | 832 // not). Failure means that the combination of URLs doesn't make any sense. |
| 836 // | 833 // |
| 837 // The base URL should always be canonical, therefore is ASCII. | 834 // The base URL should always be canonical, therefore is ASCII. |
| 838 URL_EXPORT bool IsRelativeURL(const char* base, | 835 bool IsRelativeURL(const char* base, |
| 839 const url_parse::Parsed& base_parsed, | 836 const url_parse::Parsed& base_parsed, |
| 840 const char* fragment, | 837 const char* fragment, |
| 841 int fragment_len, | 838 int fragment_len, |
| 842 bool is_base_hierarchical, | 839 bool is_base_hierarchical, |
| 843 bool* is_relative, | 840 bool* is_relative, |
| 844 url_parse::Component* relative_component); | 841 url_parse::Component* relative_component); |
| 845 URL_EXPORT bool IsRelativeURL(const char* base, | 842 bool IsRelativeURL(const char* base, |
| 846 const url_parse::Parsed& base_parsed, | 843 const url_parse::Parsed& base_parsed, |
| 847 const char16* fragment, | 844 const char16* fragment, |
| 848 int fragment_len, | 845 int fragment_len, |
| 849 bool is_base_hierarchical, | 846 bool is_base_hierarchical, |
| 850 bool* is_relative, | 847 bool* is_relative, |
| 851 url_parse::Component* relative_component); | 848 url_parse::Component* relative_component); |
| 852 | 849 |
| 853 // Given a canonical parsed source URL, a URL fragment known to be relative, | 850 // Given a canonical parsed source URL, a URL fragment known to be relative, |
| 854 // and the identified relevant portion of the relative URL (computed by | 851 // and the identified relevant portion of the relative URL (computed by |
| 855 // IsRelativeURL), this produces a new parsed canonical URL in |output| and | 852 // IsRelativeURL), this produces a new parsed canonical URL in |output| and |
| 856 // |out_parsed|. | 853 // |out_parsed|. |
| 857 // | 854 // |
| 858 // It also requires a flag indicating whether the base URL is a file: URL | 855 // It also requires a flag indicating whether the base URL is a file: URL |
| 859 // which triggers additional logic. | 856 // which triggers additional logic. |
| 860 // | 857 // |
| 861 // The base URL should be canonical and have a host (may be empty for file | 858 // The base URL should be canonical and have a host (may be empty for file |
| 862 // URLs) and a path. If it doesn't have these, we can't resolve relative | 859 // URLs) and a path. If it doesn't have these, we can't resolve relative |
| 863 // URLs off of it and will return the base as the output with an error flag. | 860 // URLs off of it and will return the base as the output with an error flag. |
| 864 // Because it is canonical, it should also be ASCII. | 861 // Because it is canonical, it should also be ASCII. |
| 865 // | 862 // |
| 866 // The query charset converter follows the same rules as CanonicalizeQuery. | 863 // The query charset converter follows the same rules as CanonicalizeQuery. |
| 867 // | 864 // |
| 868 // Returns true on success. On failure, the output will be "something | 865 // Returns true on success. On failure, the output will be "something |
| 869 // reasonable" that will be consistent and valid, just probably not what | 866 // reasonable" that will be consistent and valid, just probably not what |
| 870 // was intended by the web page author or caller. | 867 // was intended by the web page author or caller. |
| 871 URL_EXPORT bool ResolveRelativeURL( | 868 bool ResolveRelativeURL(const char* base_url, |
| 872 const char* base_url, | 869 const url_parse::Parsed& base_parsed, |
| 873 const url_parse::Parsed& base_parsed, | 870 bool base_is_file, |
| 874 bool base_is_file, | 871 const char* relative_url, |
| 875 const char* relative_url, | 872 const url_parse::Component& relative_component, |
| 876 const url_parse::Component& relative_component, | 873 CharsetConverter* query_converter, |
| 877 CharsetConverter* query_converter, | 874 CanonOutput* output, |
| 878 CanonOutput* output, | 875 url_parse::Parsed* out_parsed); |
| 879 url_parse::Parsed* out_parsed); | 876 bool ResolveRelativeURL(const char* base_url, |
| 880 URL_EXPORT bool ResolveRelativeURL( | 877 const url_parse::Parsed& base_parsed, |
| 881 const char* base_url, | 878 bool base_is_file, |
| 882 const url_parse::Parsed& base_parsed, | 879 const char16* relative_url, |
| 883 bool base_is_file, | 880 const url_parse::Component& relative_component, |
| 884 const char16* relative_url, | 881 CharsetConverter* query_converter, |
| 885 const url_parse::Component& relative_component, | 882 CanonOutput* output, |
| 886 CharsetConverter* query_converter, | 883 url_parse::Parsed* out_parsed); |
| 887 CanonOutput* output, | |
| 888 url_parse::Parsed* out_parsed); | |
| 889 | 884 |
| 890 } // namespace url_canon | 885 } // namespace url_canon |
| 891 | 886 |
| 892 #endif // URL_URL_CANON_H_ | 887 #endif // URL_URL_CANON_H_ |
| OLD | NEW |