OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #ifndef URL_URL_CANON_H_ | 5 #ifndef URL_URL_CANON_H_ |
6 #define URL_URL_CANON_H_ | 6 #define URL_URL_CANON_H_ |
7 | 7 |
8 #include <stdlib.h> | 8 #include <stdlib.h> |
9 #include <string.h> | 9 #include <string.h> |
10 | 10 |
11 #include "base/strings/string16.h" | 11 #include "base/strings/string16.h" |
| 12 #include "url/third_party/mozilla/url_parse.h" |
12 #include "url/url_export.h" | 13 #include "url/url_export.h" |
13 #include "url/url_parse.h" | |
14 | 14 |
15 namespace url { | 15 namespace url { |
16 | 16 |
17 // Canonicalizer output ------------------------------------------------------- | 17 // Canonicalizer output ------------------------------------------------------- |
18 | 18 |
19 // Base class for the canonicalizer output, this maintains a buffer and | 19 // Base class for the canonicalizer output, this maintains a buffer and |
20 // supports simple resizing and append operations on it. | 20 // supports simple resizing and append operations on it. |
21 // | 21 // |
22 // It is VERY IMPORTANT that no virtual function calls be made on the common | 22 // It is VERY IMPORTANT that no virtual function calls be made on the common |
23 // code path. We only have two virtual function calls, the destructor and a | 23 // code path. We only have two virtual function calls, the destructor and a |
(...skipping 254 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
278 CanonOutput* output, | 278 CanonOutput* output, |
279 Component* out_scheme); | 279 Component* out_scheme); |
280 URL_EXPORT bool CanonicalizeScheme(const base::char16* spec, | 280 URL_EXPORT bool CanonicalizeScheme(const base::char16* spec, |
281 const Component& scheme, | 281 const Component& scheme, |
282 CanonOutput* output, | 282 CanonOutput* output, |
283 Component* out_scheme); | 283 Component* out_scheme); |
284 | 284 |
285 // User info: username/password. If present, this will add the delimiters so | 285 // User info: username/password. If present, this will add the delimiters so |
286 // the output will be "<username>:<password>@" or "<username>@". Empty | 286 // the output will be "<username>:<password>@" or "<username>@". Empty |
287 // username/password pairs, or empty passwords, will get converted to | 287 // username/password pairs, or empty passwords, will get converted to |
288 // nonexistant in the canonical version. | 288 // nonexistent in the canonical version. |
289 // | 289 // |
290 // The components for the username and password refer to ranges in the | 290 // The components for the username and password refer to ranges in the |
291 // respective source strings. Usually, these will be the same string, which | 291 // respective source strings. Usually, these will be the same string, which |
292 // is legal as long as the two components don't overlap. | 292 // is legal as long as the two components don't overlap. |
293 // | 293 // |
294 // The 8-bit version requires UTF-8 encoding. | 294 // The 8-bit version requires UTF-8 encoding. |
295 URL_EXPORT bool CanonicalizeUserInfo(const char* username_source, | 295 URL_EXPORT bool CanonicalizeUserInfo(const char* username_source, |
296 const Component& username, | 296 const Component& username, |
297 const char* password_source, | 297 const char* password_source, |
298 const Component& password, | 298 const Component& password, |
(...skipping 11 matching lines...) Expand all Loading... |
310 // This structure holds detailed state exported from the IP/Host canonicalizers. | 310 // This structure holds detailed state exported from the IP/Host canonicalizers. |
311 // Additional fields may be added as callers require them. | 311 // Additional fields may be added as callers require them. |
312 struct CanonHostInfo { | 312 struct CanonHostInfo { |
313 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} | 313 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {} |
314 | 314 |
315 // Convenience function to test if family is an IP address. | 315 // Convenience function to test if family is an IP address. |
316 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } | 316 bool IsIPAddress() const { return family == IPV4 || family == IPV6; } |
317 | 317 |
318 // This field summarizes how the input was classified by the canonicalizer. | 318 // This field summarizes how the input was classified by the canonicalizer. |
319 enum Family { | 319 enum Family { |
320 NEUTRAL, // - Doesn't resemble an IP address. As far as the IP | 320 NEUTRAL, // - Doesn't resemble an IP address. As far as the IP |
321 // canonicalizer is concerned, it should be treated as a | 321 // canonicalizer is concerned, it should be treated as a |
322 // hostname. | 322 // hostname. |
323 BROKEN, // - Almost an IP, but was not canonicalized. This could be an | 323 BROKEN, // - Almost an IP, but was not canonicalized. This could be an |
324 // IPv4 address where truncation occurred, or something | 324 // IPv4 address where truncation occurred, or something |
325 // containing the special characters :[] which did not parse | 325 // containing the special characters :[] which did not parse |
326 // as an IPv6 address. Never attempt to connect to this | 326 // as an IPv6 address. Never attempt to connect to this |
327 // address, because it might actually succeed! | 327 // address, because it might actually succeed! |
328 IPV4, // - Successfully canonicalized as an IPv4 address. | 328 IPV4, // - Successfully canonicalized as an IPv4 address. |
329 IPV6, // - Successfully canonicalized as an IPv6 address. | 329 IPV6, // - Successfully canonicalized as an IPv6 address. |
330 }; | 330 }; |
331 Family family; | 331 Family family; |
332 | 332 |
333 // If |family| is IPV4, then this is the number of nonempty dot-separated | 333 // If |family| is IPV4, then this is the number of nonempty dot-separated |
334 // components in the input text, from 1 to 4. If |family| is not IPV4, | 334 // components in the input text, from 1 to 4. If |family| is not IPV4, |
335 // this value is undefined. | 335 // this value is undefined. |
336 int num_ipv4_components; | 336 int num_ipv4_components; |
337 | 337 |
338 // Location of host within the canonicalized output. | 338 // Location of host within the canonicalized output. |
339 // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6. | 339 // CanonicalizeIPAddress() only sets this field if |family| is IPV4 or IPV6. |
340 // CanonicalizeHostVerbose() always sets it. | 340 // CanonicalizeHostVerbose() always sets it. |
341 Component out_host; | 341 Component out_host; |
342 | 342 |
343 // |address| contains the parsed IP Address (if any) in its first | 343 // |address| contains the parsed IP Address (if any) in its first |
344 // AddressLength() bytes, in network order. If IsIPAddress() is false | 344 // AddressLength() bytes, in network order. If IsIPAddress() is false |
345 // AddressLength() will return zero and the content of |address| is undefined. | 345 // AddressLength() will return zero and the content of |address| is undefined. |
346 unsigned char address[16]; | 346 unsigned char address[16]; |
347 | 347 |
348 // Convenience function to calculate the length of an IP address corresponding | 348 // Convenience function to calculate the length of an IP address corresponding |
349 // to the current IP version in |family|, if any. For use with |address|. | 349 // to the current IP version in |family|, if any. For use with |address|. |
350 int AddressLength() const { | 350 int AddressLength() const { |
351 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); | 351 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0); |
352 } | 352 } |
353 }; | 353 }; |
354 | 354 |
355 | 355 |
356 // Host. | 356 // Host. |
357 // | 357 // |
358 // The 8-bit version requires UTF-8 encoding. Use this version when you only | 358 // The 8-bit version requires UTF-8 encoding. Use this version when you only |
359 // need to know whether canonicalization succeeded. | 359 // need to know whether canonicalization succeeded. |
360 URL_EXPORT bool CanonicalizeHost(const char* spec, | 360 URL_EXPORT bool CanonicalizeHost(const char* spec, |
361 const Component& host, | 361 const Component& host, |
362 CanonOutput* output, | 362 CanonOutput* output, |
363 Component* out_host); | 363 Component* out_host); |
364 URL_EXPORT bool CanonicalizeHost(const base::char16* spec, | 364 URL_EXPORT bool CanonicalizeHost(const base::char16* spec, |
365 const Component& host, | 365 const Component& host, |
366 CanonOutput* output, | 366 CanonOutput* output, |
367 Component* out_host); | 367 Component* out_host); |
368 | 368 |
369 // Extended version of CanonicalizeHost, which returns additional information. | 369 // Extended version of CanonicalizeHost, which returns additional information. |
370 // Use this when you need to know whether the hostname was an IP address. | 370 // Use this when you need to know whether the hostname was an IP address. |
371 // A successful return is indicated by host_info->family != BROKEN. See the | 371 // A successful return is indicated by host_info->family != BROKEN. See the |
372 // definition of CanonHostInfo above for details. | 372 // definition of CanonHostInfo above for details. |
373 URL_EXPORT void CanonicalizeHostVerbose(const char* spec, | 373 URL_EXPORT void CanonicalizeHostVerbose(const char* spec, |
374 const Component& host, | 374 const Component& host, |
375 CanonOutput* output, | 375 CanonOutput* output, |
376 CanonHostInfo* host_info); | 376 CanonHostInfo* host_info); |
377 URL_EXPORT void CanonicalizeHostVerbose(const base::char16* spec, | 377 URL_EXPORT void CanonicalizeHostVerbose(const base::char16* spec, |
378 const Component& host, | 378 const Component& host, |
379 CanonOutput* output, | 379 CanonOutput* output, |
380 CanonHostInfo* host_info); | 380 CanonHostInfo* host_info); |
381 | 381 |
(...skipping 165 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
547 int spec_len, | 547 int spec_len, |
548 const Parsed& parsed, | 548 const Parsed& parsed, |
549 CanonOutput* output, | 549 CanonOutput* output, |
550 Parsed* new_parsed); | 550 Parsed* new_parsed); |
551 URL_EXPORT bool CanonicalizePathURL(const base::char16* spec, | 551 URL_EXPORT bool CanonicalizePathURL(const base::char16* spec, |
552 int spec_len, | 552 int spec_len, |
553 const Parsed& parsed, | 553 const Parsed& parsed, |
554 CanonOutput* output, | 554 CanonOutput* output, |
555 Parsed* new_parsed); | 555 Parsed* new_parsed); |
556 | 556 |
557 // Use for mailto URLs. This "canonicalizes" the url into a path and query | 557 // Use for mailto URLs. This "canonicalizes" the URL into a path and query |
558 // component. It does not attempt to merge "to" fields. It uses UTF-8 for | 558 // component. It does not attempt to merge "to" fields. It uses UTF-8 for |
559 // the query encoding if there is a query. This is because a mailto URL is | 559 // the query encoding if there is a query. This is because a mailto URL is |
560 // really intended for an external mail program, and the encoding of a page, | 560 // really intended for an external mail program, and the encoding of a page, |
561 // etc. which would influence a query encoding normally are irrelevant. | 561 // etc. which would influence a query encoding normally are irrelevant. |
562 URL_EXPORT bool CanonicalizeMailtoURL(const char* spec, | 562 URL_EXPORT bool CanonicalizeMailtoURL(const char* spec, |
563 int spec_len, | 563 int spec_len, |
564 const Parsed& parsed, | 564 const Parsed& parsed, |
565 CanonOutput* output, | 565 CanonOutput* output, |
566 Parsed* new_parsed); | 566 Parsed* new_parsed); |
567 URL_EXPORT bool CanonicalizeMailtoURL(const base::char16* spec, | 567 URL_EXPORT bool CanonicalizeMailtoURL(const base::char16* spec, |
568 int spec_len, | 568 int spec_len, |
569 const Parsed& parsed, | 569 const Parsed& parsed, |
570 CanonOutput* output, | 570 CanonOutput* output, |
571 Parsed* new_parsed); | 571 Parsed* new_parsed); |
572 | 572 |
573 // Part replacer -------------------------------------------------------------- | 573 // Part replacer -------------------------------------------------------------- |
574 | 574 |
575 // Internal structure used for storing separate strings for each component. | 575 // Internal structure used for storing separate strings for each component. |
576 // The basic canonicalization functions use this structure internally so that | 576 // The basic canonicalization functions use this structure internally so that |
577 // component replacement (different strings for different components) can be | 577 // component replacement (different strings for different components) can be |
578 // treated on the same code path as regular canonicalization (the same string | 578 // treated on the same code path as regular canonicalization (the same string |
579 // for each component). | 579 // for each component). |
580 // | 580 // |
581 // A Parsed structure usually goes along with this. Those | 581 // A Parsed structure usually goes along with this. Those components identify |
582 // components identify offsets within these strings, so that they can all be | 582 // offsets within these strings, so that they can all be in the same string, |
583 // in the same string, or spread arbitrarily across different ones. | 583 // or spread arbitrarily across different ones. |
584 // | 584 // |
585 // This structures does not own any data. It is the caller's responsibility to | 585 // This structure does not own any data. It is the caller's responsibility to |
586 // ensure that the data the pointers point to stays in scope and is not | 586 // ensure that the data the pointers point to stays in scope and is not |
587 // modified. | 587 // modified. |
588 template<typename CHAR> | 588 template<typename CHAR> |
589 struct URLComponentSource { | 589 struct URLComponentSource { |
590 // Constructor normally used by callers wishing to replace components. This | 590 // Constructor normally used by callers wishing to replace components. This |
591 // will make them all NULL, which is no replacement. The caller would then | 591 // will make them all NULL, which is no replacement. The caller would then |
592 // override the components they want to replace. | 592 // override the components they want to replace. |
593 URLComponentSource() | 593 URLComponentSource() |
(...skipping 124 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
718 void SetRef(const CHAR* s, const Component& comp) { | 718 void SetRef(const CHAR* s, const Component& comp) { |
719 sources_.ref = s; | 719 sources_.ref = s; |
720 components_.ref = comp; | 720 components_.ref = comp; |
721 } | 721 } |
722 void ClearRef() { | 722 void ClearRef() { |
723 sources_.ref = Placeholder(); | 723 sources_.ref = Placeholder(); |
724 components_.ref = Component(); | 724 components_.ref = Component(); |
725 } | 725 } |
726 bool IsRefOverridden() const { return sources_.ref != NULL; } | 726 bool IsRefOverridden() const { return sources_.ref != NULL; } |
727 | 727 |
728 // Getters for the itnernal data. See the variables below for how the | 728 // Getters for the internal data. See the variables below for how the |
729 // information is encoded. | 729 // information is encoded. |
730 const URLComponentSource<CHAR>& sources() const { return sources_; } | 730 const URLComponentSource<CHAR>& sources() const { return sources_; } |
731 const Parsed& components() const { return components_; } | 731 const Parsed& components() const { return components_; } |
732 | 732 |
733 private: | 733 private: |
734 // Returns a pointer to a static empty string that is used as a placeholder | 734 // Returns a pointer to a static empty string that is used as a placeholder |
735 // to indicate a component should be deleted (see below). | 735 // to indicate a component should be deleted (see below). |
736 const CHAR* Placeholder() { | 736 const CHAR* Placeholder() { |
737 static const CHAR empty_cstr = 0; | 737 static const CHAR empty_cstr = 0; |
738 return &empty_cstr; | 738 return &empty_cstr; |
(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
856 // and the identified relevant portion of the relative URL (computed by | 856 // and the identified relevant portion of the relative URL (computed by |
857 // IsRelativeURL), this produces a new parsed canonical URL in |output| and | 857 // IsRelativeURL), this produces a new parsed canonical URL in |output| and |
858 // |out_parsed|. | 858 // |out_parsed|. |
859 // | 859 // |
860 // It also requires a flag indicating whether the base URL is a file: URL | 860 // It also requires a flag indicating whether the base URL is a file: URL |
861 // which triggers additional logic. | 861 // which triggers additional logic. |
862 // | 862 // |
863 // The base URL should be canonical and have a host (may be empty for file | 863 // The base URL should be canonical and have a host (may be empty for file |
864 // URLs) and a path. If it doesn't have these, we can't resolve relative | 864 // URLs) and a path. If it doesn't have these, we can't resolve relative |
865 // URLs off of it and will return the base as the output with an error flag. | 865 // URLs off of it and will return the base as the output with an error flag. |
866 // Becausee it is canonical is should also be ASCII. | 866 // Because it is canonical it should also be ASCII. |
867 // | 867 // |
868 // The query charset converter follows the same rules as CanonicalizeQuery. | 868 // The query charset converter follows the same rules as CanonicalizeQuery. |
869 // | 869 // |
870 // Returns true on success. On failure, the output will be "something | 870 // Returns true on success. On failure, the output will be "something |
871 // reasonable" that will be consistent and valid, just probably not what | 871 // reasonable" that will be consistent and valid, just probably not what |
872 // was intended by the web page author or caller. | 872 // was intended by the web page author or caller. |
873 URL_EXPORT bool ResolveRelativeURL(const char* base_url, | 873 URL_EXPORT bool ResolveRelativeURL(const char* base_url, |
874 const Parsed& base_parsed, | 874 const Parsed& base_parsed, |
875 bool base_is_file, | 875 bool base_is_file, |
876 const char* relative_url, | 876 const char* relative_url, |
877 const Component& relative_component, | 877 const Component& relative_component, |
878 CharsetConverter* query_converter, | 878 CharsetConverter* query_converter, |
879 CanonOutput* output, | 879 CanonOutput* output, |
880 Parsed* out_parsed); | 880 Parsed* out_parsed); |
881 URL_EXPORT bool ResolveRelativeURL(const char* base_url, | 881 URL_EXPORT bool ResolveRelativeURL(const char* base_url, |
882 const Parsed& base_parsed, | 882 const Parsed& base_parsed, |
883 bool base_is_file, | 883 bool base_is_file, |
884 const base::char16* relative_url, | 884 const base::char16* relative_url, |
885 const Component& relative_component, | 885 const Component& relative_component, |
886 CharsetConverter* query_converter, | 886 CharsetConverter* query_converter, |
887 CanonOutput* output, | 887 CanonOutput* output, |
888 Parsed* out_parsed); | 888 Parsed* out_parsed); |
889 | 889 |
890 } // namespace url | 890 } // namespace url |
891 | 891 |
892 #endif // URL_URL_CANON_H_ | 892 #endif // URL_URL_CANON_H_ |
OLD | NEW |