OLD | NEW |
| (Empty) |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include <algorithm> | |
6 #include <windows.h> | |
7 | |
8 #include "chrome/browser/url_fixer_upper.h" | |
9 | |
10 #include "base/file_util.h" | |
11 #include "base/logging.h" | |
12 #include "base/string_util.h" | |
13 #include "chrome/common/gfx/text_elider.h" | |
14 #include "googleurl/src/gurl.h" | |
15 #include "googleurl/src/url_canon.h" | |
16 #include "googleurl/src/url_file.h" | |
17 #include "googleurl/src/url_parse.h" | |
18 #include "googleurl/src/url_util.h" | |
19 #include "net/base/escape.h" | |
20 #include "net/base/net_util.h" | |
21 #include "net/base/registry_controlled_domain.h" | |
22 | |
23 using namespace std; | |
24 | |
25 // does some basic fixes for input that we want to test for file-ness | |
26 static void PrepareStringForFileOps(const wstring& text, wstring* output) { | |
27 TrimWhitespace(text, TRIM_ALL, output); | |
28 replace(output->begin(), output->end(), '/', '\\'); | |
29 } | |
30 | |
31 // Tries to create a full path from |text|. If the result is valid and the | |
32 // file exists, returns true and sets |full_path| to the result. Otherwise, | |
33 // returns false and leaves |full_path| unchanged. | |
34 static bool ValidPathForFile(const wstring& text, wstring* full_path) { | |
35 wchar_t file_path[MAX_PATH]; | |
36 if (!_wfullpath(file_path, text.c_str(), MAX_PATH)) | |
37 return false; | |
38 | |
39 if (!file_util::PathExists(file_path)) | |
40 return false; | |
41 | |
42 full_path->assign(file_path); | |
43 return true; | |
44 } | |
45 | |
46 // Tries to create a file: URL from |text| if it looks like a filename, even if | |
47 // it doesn't resolve as a valid path or to an existing file. Returns true | |
48 // with a (possibly invalid) file: URL in |fixed_up_url| for input beginning | |
49 // with a drive specifier or "\\". Returns false in other cases (including | |
50 // file: URLs: these don't look like filenames), leaving fixed_up_url | |
51 // unchanged. | |
52 static wstring FixupPath(const wstring& text) { | |
53 DCHECK(text.length() >= 2); | |
54 | |
55 wstring filename; | |
56 PrepareStringForFileOps(text, &filename); | |
57 | |
58 if (filename[1] == '|') | |
59 filename[1] = ':'; | |
60 | |
61 // Here, we know the input looks like a file. | |
62 GURL file_url = net::FilePathToFileURL(filename); | |
63 if (file_url.is_valid()) | |
64 return gfx::ElideUrl(file_url, ChromeFont(), 0, std::wstring()); | |
65 | |
66 // Invalid file URL, just return the input. | |
67 return text; | |
68 } | |
69 | |
70 // Checks |domain| to see if a valid TLD is already present. If not, appends | |
71 // |desired_tld| to the domain, and prepends "www." unless it's already present. | |
72 // Then modifies |fixed_up_url| to reflect the changes. | |
73 static void AddDesiredTLD(const wstring& desired_tld, | |
74 wstring* domain) { | |
75 if (desired_tld.empty() || domain->empty()) | |
76 return; | |
77 | |
78 // Check the TLD. If the return value is positive, we already have a TLD, so | |
79 // abort; if the return value is wstring::npos, there's no valid host (e.g. if | |
80 // the user pasted in garbage for which HistoryURLProvider is trying to | |
81 // suggest an exact match), so adding a TLD makes no sense. The only useful | |
82 // case is where the return value is 0 (there's a valid host with no known | |
83 // TLD). We disallow unknown registries here so users can input "mail.yahoo" | |
84 // and hit ctrl-enter to get "www.mail.yahoo.com". | |
85 const size_t registry_length = | |
86 net::RegistryControlledDomainService::GetRegistryLength(*domain, false); | |
87 if (registry_length != 0) | |
88 return; | |
89 | |
90 // Add the suffix at the end of the domain. | |
91 const size_t domain_length(domain->length()); | |
92 DCHECK(domain_length > 0); | |
93 DCHECK(desired_tld[0] != '.'); | |
94 if ((*domain)[domain_length - 1] != '.') | |
95 domain->push_back('.'); | |
96 domain->append(desired_tld); | |
97 | |
98 // Now, if the domain begins with "www.", stop. | |
99 const wstring prefix(L"www."); | |
100 if (domain->compare(0, prefix.length(), prefix) != 0) { | |
101 // Otherwise, add www. to the beginning of the URL. | |
102 domain->insert(0, prefix); | |
103 } | |
104 } | |
105 | |
106 static inline void FixupUsername(const wstring& text, | |
107 const url_parse::Component& part, | |
108 wstring* url) { | |
109 if (!part.is_valid()) | |
110 return; | |
111 | |
112 // We don't fix up the username at the moment. | |
113 url->append(text, part.begin, part.len); | |
114 // Do not append the trailing '@' because we might need to include the user's | |
115 // password. FixupURL itself will append the '@' for us. | |
116 } | |
117 | |
118 static inline void FixupPassword(const wstring& text, | |
119 const url_parse::Component& part, | |
120 wstring* url) { | |
121 if (!part.is_valid()) | |
122 return; | |
123 | |
124 // We don't fix up the password at the moment. | |
125 url->append(L":"); | |
126 url->append(text, part.begin, part.len); | |
127 } | |
128 | |
129 static void FixupHost(const wstring& text, | |
130 const url_parse::Component& part, | |
131 bool has_scheme, | |
132 const wstring& desired_tld, | |
133 wstring* url) { | |
134 if (!part.is_valid()) | |
135 return; | |
136 | |
137 // Make domain valid. | |
138 // Strip all leading dots and all but one trailing dot, unless the user only | |
139 // typed dots, in which case their input is totally invalid and we should just | |
140 // leave it unchanged. | |
141 wstring domain(text, part.begin, part.len); | |
142 const size_t first_nondot(domain.find_first_not_of('.')); | |
143 if (first_nondot != wstring::npos) { | |
144 domain.erase(0, first_nondot); | |
145 size_t last_nondot(domain.find_last_not_of('.')); | |
146 DCHECK(last_nondot != wstring::npos); | |
147 last_nondot += 2; // Point at second period in ending string | |
148 if (last_nondot < domain.length()) | |
149 domain.erase(last_nondot); | |
150 } | |
151 | |
152 // Add any user-specified TLD, if applicable. | |
153 AddDesiredTLD(desired_tld, &domain); | |
154 | |
155 url->append(domain); | |
156 } | |
157 | |
158 // Looks for a port number, including initial colon, at port_start. If | |
159 // something invalid (which cannot be fixed up) is found, like ":foo" or | |
160 // ":7:7", returns false. Otherwise, removes any extra colons | |
161 // ("::1337" -> ":1337", ":/" -> "/") and returns true. | |
162 static void FixupPort(const wstring& text, | |
163 const url_parse::Component& part, | |
164 wstring* url) { | |
165 if (!part.is_valid()) | |
166 return; | |
167 | |
168 // Look for non-digit in port and strip if found. | |
169 wstring port(text, part.begin, part.len); | |
170 for (wstring::iterator i = port.begin(); i != port.end(); ) { | |
171 if (IsAsciiDigit(*i)) | |
172 ++i; | |
173 else | |
174 i = port.erase(i); | |
175 } | |
176 | |
177 if (port.empty()) | |
178 return; // Nothing to append. | |
179 | |
180 url->append(L":"); | |
181 url->append(port); | |
182 } | |
183 | |
184 static inline void FixupPath(const wstring& text, | |
185 const url_parse::Component& part, | |
186 wstring* url) { | |
187 if (!part.is_valid() || part.len == 0) { | |
188 // We should always have a path. | |
189 url->append(L"/"); | |
190 return; | |
191 } | |
192 | |
193 // Append the path as is. | |
194 url->append(text, part.begin, part.len); | |
195 } | |
196 | |
197 static inline void FixupQuery(const wstring& text, | |
198 const url_parse::Component& part, | |
199 wstring* url) { | |
200 if (!part.is_valid()) | |
201 return; | |
202 | |
203 // We don't fix up the query at the moment. | |
204 url->append(L"?"); | |
205 url->append(text, part.begin, part.len); | |
206 } | |
207 | |
208 static inline void FixupRef(const wstring& text, | |
209 const url_parse::Component& part, | |
210 wstring* url) { | |
211 if (!part.is_valid()) | |
212 return; | |
213 | |
214 // We don't fix up the ref at the moment. | |
215 url->append(L"#"); | |
216 url->append(text, part.begin, part.len); | |
217 } | |
218 | |
219 static void OffsetComponent(int offset, url_parse::Component* part) { | |
220 DCHECK(part); | |
221 | |
222 if (part->is_valid()) { | |
223 // Offset the location of this component. | |
224 part->begin += offset; | |
225 | |
226 // This part might not have existed in the original text. | |
227 if (part->begin < 0) | |
228 part->reset(); | |
229 } | |
230 } | |
231 | |
232 static bool HasPort(const std::wstring& original_text, | |
233 const url_parse::Component& scheme_component, | |
234 const std::wstring& scheme) { | |
235 // Find the range between the ":" and the "/". | |
236 size_t port_start = scheme_component.end() + 1; | |
237 size_t port_end = port_start; | |
238 while ((port_end < original_text.length()) && | |
239 !url_parse::IsAuthorityTerminator(original_text[port_end])) | |
240 ++port_end; | |
241 if (port_end == port_start) | |
242 return false; | |
243 | |
244 // Scan the range to see if it is entirely digits. | |
245 for (size_t i = port_start; i < port_end; ++i) { | |
246 if (!IsAsciiDigit(original_text[i])) | |
247 return false; | |
248 } | |
249 | |
250 return true; | |
251 } | |
252 | |
253 wstring URLFixerUpper::SegmentURL(const wstring& text, | |
254 url_parse::Parsed* parts) { | |
255 // Initialize the result. | |
256 *parts = url_parse::Parsed(); | |
257 | |
258 wstring trimmed; | |
259 TrimWhitespace(text, TRIM_ALL, &trimmed); | |
260 if (trimmed.empty()) | |
261 return wstring(); // Nothing to segment. | |
262 | |
263 int trimmed_length = static_cast<int>(trimmed.length()); | |
264 if (url_parse::DoesBeginWindowsDriveSpec(trimmed.data(), 0, trimmed_length) | |
265 || url_parse::DoesBeginUNCPath(trimmed.data(), 0, trimmed_length, false)) | |
266 return L"file"; | |
267 | |
268 // Otherwise, we need to look at things carefully. | |
269 wstring scheme; | |
270 if (url_parse::ExtractScheme(text.data(), | |
271 static_cast<int>(text.length()), | |
272 &parts->scheme)) { | |
273 // We were able to extract a scheme. Remember what we have, but we may | |
274 // decide to change our minds later. | |
275 scheme.assign(text.substr(parts->scheme.begin, parts->scheme.len)); | |
276 | |
277 if (parts->scheme.is_valid() && | |
278 // Valid schemes are ASCII-only. | |
279 (!IsStringASCII(scheme) || | |
280 // We need to fix up the segmentation for "www.example.com:/". For this | |
281 // case, we guess that schemes with a "." are not actually schemes. | |
282 (scheme.find(L".") != wstring::npos) || | |
283 // We need to fix up the segmentation for "www:123/". For this case, we | |
284 // will add an HTTP scheme later and make the URL parser happy. | |
285 // TODO(pkasting): Maybe we should try to use GURL's parser for this? | |
286 HasPort(text, parts->scheme, scheme))) | |
287 parts->scheme.reset(); | |
288 } | |
289 | |
290 // When we couldn't find a scheme in the input, we need to pick one. Normally | |
291 // we choose http, but if the URL starts with "ftp.", we match other browsers | |
292 // and choose ftp. | |
293 if (!parts->scheme.is_valid()) | |
294 scheme.assign(StartsWith(text, L"ftp.", false) ? L"ftp" : L"http"); | |
295 | |
296 // Cannonicalize the scheme. | |
297 StringToLowerASCII(&scheme); | |
298 | |
299 // Not segmenting file schemes or nonstandard schemes. | |
300 if ((scheme == L"file") || | |
301 !url_util::IsStandard(scheme.c_str(), static_cast<int>(scheme.length()), | |
302 url_parse::Component(0, static_cast<int>(scheme.length())))) | |
303 return scheme; | |
304 | |
305 if (parts->scheme.is_valid()) { | |
306 // Have the GURL parser do the heavy lifting for us. | |
307 url_parse::ParseStandardURL(text.data(), static_cast<int>(text.length()), | |
308 parts); | |
309 return scheme; | |
310 } | |
311 | |
312 // We need to add a scheme in order for ParseStandardURL to be happy. | |
313 // Find the first non-whitespace character. | |
314 wstring::const_iterator first_nonwhite = text.begin(); | |
315 while ((first_nonwhite != text.end()) && IsWhitespace(*first_nonwhite)) | |
316 ++first_nonwhite; | |
317 | |
318 // Construct the text to parse by inserting the scheme. | |
319 wstring inserted_text(scheme); | |
320 inserted_text.append(L"://"); | |
321 wstring text_to_parse(text.begin(), first_nonwhite); | |
322 text_to_parse.append(inserted_text); | |
323 text_to_parse.append(first_nonwhite, text.end()); | |
324 | |
325 // Have the GURL parser do the heavy lifting for us. | |
326 url_parse::ParseStandardURL(text_to_parse.data(), | |
327 static_cast<int>(text_to_parse.length()), | |
328 parts); | |
329 | |
330 // Offset the results of the parse to match the original text. | |
331 const int offset = -static_cast<int>(inserted_text.length()); | |
332 OffsetComponent(offset, &parts->scheme); | |
333 OffsetComponent(offset, &parts->username); | |
334 OffsetComponent(offset, &parts->password); | |
335 OffsetComponent(offset, &parts->host); | |
336 OffsetComponent(offset, &parts->port); | |
337 OffsetComponent(offset, &parts->path); | |
338 OffsetComponent(offset, &parts->query); | |
339 OffsetComponent(offset, &parts->ref); | |
340 | |
341 return scheme; | |
342 } | |
343 | |
344 std::wstring URLFixerUpper::FixupURL(const wstring& text, | |
345 const wstring& desired_tld) { | |
346 wstring trimmed; | |
347 TrimWhitespace(text, TRIM_ALL, &trimmed); | |
348 if (trimmed.empty()) | |
349 return wstring(); // Nothing here. | |
350 | |
351 // Segment the URL. | |
352 url_parse::Parsed parts; | |
353 wstring scheme(SegmentURL(trimmed, &parts)); | |
354 | |
355 // We handle the file scheme separately. | |
356 if (scheme == L"file") | |
357 return (parts.scheme.is_valid() ? text : FixupPath(text)); | |
358 | |
359 // For some schemes whose layouts we understand, we rebuild it. | |
360 if (url_util::IsStandard(scheme.c_str(), static_cast<int>(scheme.length()), | |
361 url_parse::Component(0, static_cast<int>(scheme.length())))) { | |
362 wstring url(scheme); | |
363 url.append(L"://"); | |
364 | |
365 // We need to check whether the |username| is valid because it is our | |
366 // responsibility to append the '@' to delineate the user information from | |
367 // the host portion of the URL. | |
368 if (parts.username.is_valid()) { | |
369 FixupUsername(trimmed, parts.username, &url); | |
370 FixupPassword(trimmed, parts.password, &url); | |
371 url.append(L"@"); | |
372 } | |
373 | |
374 FixupHost(trimmed, parts.host, parts.scheme.is_valid(), desired_tld, &url); | |
375 FixupPort(trimmed, parts.port, &url); | |
376 FixupPath(trimmed, parts.path, &url); | |
377 FixupQuery(trimmed, parts.query, &url); | |
378 FixupRef(trimmed, parts.ref, &url); | |
379 | |
380 return url; | |
381 } | |
382 | |
383 // In the worst-case, we insert a scheme if the URL lacks one. | |
384 if (!parts.scheme.is_valid()) { | |
385 wstring fixed_scheme(scheme); | |
386 fixed_scheme.append(L"://"); | |
387 trimmed.insert(0, fixed_scheme); | |
388 } | |
389 | |
390 return trimmed; | |
391 } | |
392 | |
393 // The rules are different here than for regular fixup, since we need to handle | |
394 // input like "hello.html" and know to look in the current directory. Regular | |
395 // fixup will look for cues that it is actually a file path before trying to | |
396 // figure out what file it is. If our logic doesn't work, we will fall back on | |
397 // regular fixup. | |
398 wstring URLFixerUpper::FixupRelativeFile(const wstring& base_dir, | |
399 const wstring& text) { | |
400 wchar_t old_cur_directory[MAX_PATH]; | |
401 if (!base_dir.empty()) { | |
402 // save the old current directory before we move to the new one | |
403 // TODO: in the future, we may want to handle paths longer than MAX_PATH | |
404 GetCurrentDirectory(MAX_PATH, old_cur_directory); | |
405 SetCurrentDirectory(base_dir.c_str()); | |
406 } | |
407 | |
408 // allow funny input with extra whitespace and the wrong kind of slashes | |
409 wstring trimmed; | |
410 PrepareStringForFileOps(text, &trimmed); | |
411 | |
412 bool is_file = true; | |
413 wstring full_path; | |
414 if (!ValidPathForFile(trimmed, &full_path)) { | |
415 // Not a path as entered, try unescaping it in case the user has | |
416 // escaped things. We need to go through 8-bit since the escaped values | |
417 // only represent 8-bit values. | |
418 std::wstring unescaped = UTF8ToWide(UnescapeURLComponent( | |
419 WideToUTF8(trimmed), | |
420 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS)); | |
421 if (!ValidPathForFile(unescaped, &full_path)) | |
422 is_file = false; | |
423 } | |
424 | |
425 // Put back the current directory if we saved it. | |
426 if (!base_dir.empty()) | |
427 SetCurrentDirectory(old_cur_directory); | |
428 | |
429 if (is_file) { | |
430 GURL file_url = net::FilePathToFileURL(full_path); | |
431 if (file_url.is_valid()) | |
432 return gfx::ElideUrl(file_url, ChromeFont(), 0, std::wstring()); | |
433 // Invalid files fall through to regular processing. | |
434 } | |
435 | |
436 // Fall back on regular fixup for this input. | |
437 return FixupURL(text, L""); | |
438 } | |
439 | |
OLD | NEW |