OLD | NEW |
| (Empty) |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 // url_formatter contains routines for formatting URLs in a way that can be | |
6 // safely and securely displayed to users. For example, it is responsible | |
7 // for determining when to convert an IDN A-Label (e.g. "xn--[something]") | |
8 // into the IDN U-Label. | |
9 // | |
10 // Note that this formatting is only intended for display purposes; it would | |
11 // be insecure and insufficient to make comparisons solely on formatted URLs | |
12 // (that is, it should not be used for normalizing URLs for comparison for | |
13 // security decisions). | |
14 | |
15 #ifndef COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_ | |
16 #define COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_ | |
17 | |
18 #include <stdint.h> | |
19 | |
20 #include <string> | |
21 #include <vector> | |
22 | |
23 #include "base/strings/string16.h" | |
24 #include "base/strings/utf_offset_string_conversions.h" | |
25 #include "net/base/escape.h" | |
26 | |
27 class GURL; | |
28 | |
29 namespace url { | |
30 struct Parsed; | |
31 } // namespace url | |
32 | |
33 namespace url_formatter { | |
34 | |
35 // Used by FormatUrl to specify handling of certain parts of the url. | |
36 typedef uint32_t FormatUrlType; | |
37 typedef uint32_t FormatUrlTypes; | |
38 | |
39 // Nothing is omitted. | |
40 extern const FormatUrlType kFormatUrlOmitNothing; | |
41 | |
42 // If set, any username and password are removed. | |
43 extern const FormatUrlType kFormatUrlOmitUsernamePassword; | |
44 | |
45 // If the scheme is 'http://', it's removed. | |
46 extern const FormatUrlType kFormatUrlOmitHTTP; | |
47 | |
48 // Omits the path if it is just a slash and there is no query or ref. This is | |
49 // meaningful for non-file "standard" URLs. | |
50 extern const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname; | |
51 | |
52 // Convenience for omitting all unnecessary types. | |
53 extern const FormatUrlType kFormatUrlOmitAll; | |
54 | |
55 // Creates a string representation of |url|. The IDN host name may be in Unicode | |
56 // if |languages| accepts the Unicode representation. |format_types| is a | |
57 // bitmask of the kFormatUrl* FormatUrlType flags above. |unescape_rules| | |
58 // defines how to clean the URL for human readability. You will generally want | |
59 // |UnescapeRule::SPACES| for display to the user if you can handle spaces, or | |
60 // |UnescapeRule::NORMAL| if not. If the path part and the query part seem to be | |
61 // encoded in %-encoded UTF-8, decodes %-encoding and UTF-8. | |
62 // | |
63 // The last three parameters may be NULL. | |
64 // | |
65 // |new_parsed| will be set to the parsing parameters of the resultant URL. | |
66 // | |
67 // |prefix_end| will be the length before the hostname of the resultant URL. | |
68 // | |
69 // |offset[s]_for_adjustment| specifies one or more offsets into the original | |
70 // URL, representing insertion or selection points between characters: if the | |
71 // input is "http://foo.com/", offset 0 is before the entire URL, offset 7 is | |
72 // between the scheme and the host, and offset 15 is after the end of the URL. | |
73 // Valid input offsets range from 0 to the length of the input URL string. On | |
74 // exit, each offset will have been modified to reflect any changes made to the | |
75 // output string. For example, if |url| is "http://a:b@c.com/", |format_types| | |
76 // includes kFormatUrlOmitUsernamePassword, and an offset is 12 (pointing | |
77 // between 'c' and '.'), then on return the output string will be | |
78 // "http://c.com/" and the offset will be 8. If an offset cannot be successfully | |
79 // adjusted (e.g. because it points into the middle of a component that was | |
80 // entirely removed or into the middle of an encoding sequence), it will be set | |
81 // to base::string16::npos. For consistency, if an input offset points between | |
82 // the scheme and the username/password, and both are removed, on output this | |
83 // offset will be 0 rather than npos; this means that offsets at the starts and | |
84 // ends of removed components are always transformed the same way regardless of | |
85 // what other components are adjacent. | |
86 base::string16 FormatUrl(const GURL& url, | |
87 const std::string& languages, | |
88 FormatUrlTypes format_types, | |
89 net::UnescapeRule::Type unescape_rules, | |
90 url::Parsed* new_parsed, | |
91 size_t* prefix_end, | |
92 size_t* offset_for_adjustment); | |
93 | |
94 base::string16 FormatUrlWithOffsets( | |
95 const GURL& url, | |
96 const std::string& languages, | |
97 FormatUrlTypes format_types, | |
98 net::UnescapeRule::Type unescape_rules, | |
99 url::Parsed* new_parsed, | |
100 size_t* prefix_end, | |
101 std::vector<size_t>* offsets_for_adjustment); | |
102 | |
103 // This function is like those above except it takes |adjustments| rather | |
104 // than |offset[s]_for_adjustment|. |adjustments| will be set to reflect all | |
105 // the transformations that happened to |url| to convert it into the returned | |
106 // value. | |
107 base::string16 FormatUrlWithAdjustments( | |
108 const GURL& url, | |
109 const std::string& languages, | |
110 FormatUrlTypes format_types, | |
111 net::UnescapeRule::Type unescape_rules, | |
112 url::Parsed* new_parsed, | |
113 size_t* prefix_end, | |
114 base::OffsetAdjuster::Adjustments* adjustments); | |
115 | |
116 // Convenience overload: FormatUrl() with format_types = kFormatUrlOmitAll, | |
117 // unescape_rules = net::UnescapeRule::SPACES, and nullptr for the optional | |
118 // out-params. These are the typical flags for "URLs to display to the user". | |
119 // Use with caution for URLs that will be parsed or sent to other applications. | |
120 inline base::string16 FormatUrl(const GURL& url, const std::string& languages) { | |
121 return FormatUrl(url, languages, kFormatUrlOmitAll, net::UnescapeRule::SPACES, | |
122 nullptr, nullptr, nullptr); | |
123 } | |
124 | |
125 // Returns whether FormatUrl() would strip a trailing slash from |url|, given a | |
126 // format flag including kFormatUrlOmitTrailingSlashOnBareHostname. | |
127 bool CanStripTrailingSlash(const GURL& url); | |
128 | |
129 // Formats the host in |url| and appends it to |output|. The host formatter | |
130 // takes the same accept languages component as ElideURL(). | |
131 void AppendFormattedHost(const GURL& url, | |
132 const std::string& languages, | |
133 base::string16* output); | |
134 | |
135 // Converts the given host name to unicode characters. This can be called for | |
136 // any host name; if the input is not IDN or is invalid in some way, we'll just | |
137 // return the ASCII source so it is still usable. | |
138 // | |
139 // The input should be the canonicalized ASCII host name from GURL. This | |
140 // function does NOT accept UTF-8! | |
141 // | |
142 // |languages| is a comma separated list of ISO 639 language codes. It | |
143 // is used to determine whether a hostname is 'comprehensible' to a user | |
144 // who understands languages listed. |host| will be converted to a | |
145 // human-readable form (Unicode) ONLY when each component of |host| is | |
146 // regarded as 'comprehensible'. Script-mixing is not allowed except that | |
147 // Latin letters in the ASCII range can be mixed with a limited set of | |
148 // script-language pairs (currently Han, Kana and Hangul for zh, ja and ko). | |
149 // When |languages| is empty, even that mixing is not allowed. | |
150 base::string16 IDNToUnicode(const std::string& host, | |
151 const std::string& languages); | |
152 | |
153 } // namespace url_formatter | |
154 | |
155 #endif // COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_ | |
OLD | NEW |