OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // url_formatter contains routines for formatting URLs in a way that can be |
| 6 // safely and securely displayed to users. For example, it is responsible |
| 7 // for determining when to convert an IDN A-Label (e.g. "xn--[something]") |
| 8 // into the IDN U-Label. |
| 9 // |
| 10 // Note that this formatting is only intended for display purposes; it would |
| 11 // be insecure and insufficient to make comparisons solely on formatted URLs |
| 12 // (that is, it should not be used for normalizing URLs for comparison for |
| 13 // security decisions). |
| 14 |
| 15 #ifndef COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_ |
| 16 #define COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_ |
| 17 |
| 18 #include <stdint.h> |
| 19 |
| 20 #include <string> |
| 21 #include <vector> |
| 22 |
| 23 #include "base/strings/string16.h" |
| 24 #include "base/strings/utf_offset_string_conversions.h" |
| 25 #include "net/base/escape.h" |
| 26 |
| 27 class GURL; |
| 28 |
| 29 namespace url { |
| 30 struct Parsed; |
| 31 } // url |
| 32 |
| 33 namespace url_formatter { |
| 34 |
| 35 // Used by FormatUrl to specify handling of certain parts of the url. |
| 36 typedef uint32_t FormatUrlType; |
| 37 typedef uint32_t FormatUrlTypes; |
| 38 |
| 39 // Nothing is omitted. |
| 40 extern const FormatUrlType kFormatUrlOmitNothing; |
| 41 |
| 42 // If set, any username and password are removed. |
| 43 extern const FormatUrlType kFormatUrlOmitUsernamePassword; |
| 44 |
| 45 // If the scheme is 'http://', it's removed. |
| 46 extern const FormatUrlType kFormatUrlOmitHTTP; |
| 47 |
| 48 // Omits the path if it is just a slash and there is no query or ref. This is |
| 49 // meaningful for non-file "standard" URLs. |
| 50 extern const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname; |
| 51 |
| 52 // Convenience for omitting all unnecessary types. |
| 53 extern const FormatUrlType kFormatUrlOmitAll; |
| 54 |
| 55 // Creates a string representation of |url|. The IDN host name may be in Unicode |
| 56 // if |languages| accepts the Unicode representation. |format_type| is a bitmask |
| 57 // of FormatUrlTypes, see it for details. |unescape_rules| defines how to clean |
| 58 // the URL for human readability. You will generally want |UnescapeRule::SPACES| |
| 59 // for display to the user if you can handle spaces, or |UnescapeRule::NORMAL| |
| 60 // if not. If the path part and the query part seem to be encoded in %-encoded |
| 61 // UTF-8, decodes %-encoding and UTF-8. |
| 62 // |
| 63 // The last three parameters may be NULL. |
| 64 // |
| 65 // |new_parsed| will be set to the parsing parameters of the resultant URL. |
| 66 // |
| 67 // |prefix_end| will be the length before the hostname of the resultant URL. |
| 68 // |
| 69 // |offset[s]_for_adjustment| specifies one or more offsets into the original |
| 70 // URL, representing insertion or selection points between characters: if the |
| 71 // input is "http://foo.com/", offset 0 is before the entire URL, offset 7 is |
| 72 // between the scheme and the host, and offset 15 is after the end of the URL. |
| 73 // Valid input offsets range from 0 to the length of the input URL string. On |
| 74 // exit, each offset will have been modified to reflect any changes made to the |
| 75 // output string. For example, if |url| is "http://a:b@c.com/", |
| 76 // kFormatUrlOmitUsernamePassword is set, and an offset is 12 (between 'c' |
| 77 // and '.'), then on return the output string will be "http://c.com/" and the |
| 78 // offset will be 8. If an offset cannot be successfully adjusted (e.g. because |
| 79 // it points into the middle of a component that was entirely removed or into |
| 80 // the middle of an encoding sequence), it will be set to base::string16::npos. |
| 81 // For consistency, if an input offset points between the scheme and the |
| 82 // username/password, and both are removed, on output this offset will be 0 |
| 83 // rather than npos; this means that offsets at the starts and ends of removed |
| 84 // components are always transformed the same way regardless of what other |
| 85 // components are adjacent. |
| 86 base::string16 FormatUrl(const GURL& url, |
| 87 const std::string& languages, |
| 88 FormatUrlTypes format_types, |
| 89 net::UnescapeRule::Type unescape_rules, |
| 90 url::Parsed* new_parsed, |
| 91 size_t* prefix_end, |
| 92 size_t* offset_for_adjustment); |
| 93 |
| 94 base::string16 FormatUrlWithOffsets( |
| 95 const GURL& url, |
| 96 const std::string& languages, |
| 97 FormatUrlTypes format_types, |
| 98 net::UnescapeRule::Type unescape_rules, |
| 99 url::Parsed* new_parsed, |
| 100 size_t* prefix_end, |
| 101 std::vector<size_t>* offsets_for_adjustment); |
| 102 |
| 103 // This function is like those above except it takes |adjustments| rather |
| 104 // than |offset[s]_for_adjustment|. |adjustments| will be set to reflect all |
| 105 // the transformations that happened to |url| to convert it into the returned |
| 106 // value. |
| 107 base::string16 FormatUrlWithAdjustments( |
| 108 const GURL& url, |
| 109 const std::string& languages, |
| 110 FormatUrlTypes format_types, |
| 111 net::UnescapeRule::Type unescape_rules, |
| 112 url::Parsed* new_parsed, |
| 113 size_t* prefix_end, |
| 114 base::OffsetAdjuster::Adjustments* adjustments); |
| 115 |
| 116 // Convenience overload: FormatUrl() with format_types = kFormatUrlOmitAll, |
| 117 // unescape_rules = SPACES and nullptr for the three out-params. These are the |
| 118 // typical flags for "URLs to display to the user". Be cautious about using |
| 119 // this for URLs which will be parsed or sent to other applications. |
| 120 inline base::string16 FormatUrl(const GURL& url, const std::string& languages) { |
| 121 return FormatUrl(url, languages, kFormatUrlOmitAll, net::UnescapeRule::SPACES, |
| 122 nullptr, nullptr, nullptr); |
| 123 } |
| 124 |
| 125 // Returns whether FormatUrl() would strip a trailing slash from |url|, given a |
| 126 // format flag including kFormatUrlOmitTrailingSlashOnBareHostname. |
| 127 bool CanStripTrailingSlash(const GURL& url); |
| 128 |
| 129 // Formats the host in |url| and appends it to |output|. The host formatter |
| 130 // takes the same accept languages component as ElideURL(). |
| 131 void AppendFormattedHost(const GURL& url, |
| 132 const std::string& languages, |
| 133 base::string16* output); |
| 134 |
| 135 // Converts the given host name to unicode characters. This can be called for |
| 136 // any host name, if the input is not IDN or is invalid in some way, we'll just |
| 137 // return the ASCII source so it is still usable. |
| 138 // |
| 139 // The input should be the canonicalized ASCII host name from GURL. This |
| 140 // function does NOT accept UTF-8! |
| 141 // |
| 142 // |languages| is a comma separated list of ISO 639 language codes. It |
| 143 // is used to determine whether a hostname is 'comprehensible' to a user |
| 144 // who understands languages listed. |host| will be converted to a |
| 145 // human-readable form (Unicode) ONLY when each component of |host| is |
| 146 // regarded as 'comprehensible'. Script-mixing is not allowed except that |
| 147 // Latin letters in the ASCII range can be mixed with a limited set of |
| 148 // script-language pairs (currently Han, Kana and Hangul for zh, ja and ko). |
| 149 // When |languages| is empty, even that mixing is not allowed. |
| 150 base::string16 IDNToUnicode(const std::string& host, |
| 151 const std::string& languages); |
| 152 |
| 153 } // url_formatter |
| 154 |
| 155 #endif // COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_ |
OLD | NEW |