chrome/browser/autocomplete/autocomplete.cc - Issue 292003: Parse input with explicit schemes better. Before, if the user typed "http://...

Side by Side Diff: chrome/browser/autocomplete/autocomplete.cc

Issue 292003: Parse input with explicit schemes better. Before, if the user typed "http://... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "chrome/browser/autocomplete/autocomplete.h"	5 #include "chrome/browser/autocomplete/autocomplete.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8	8

9 #include "app/l10n_util.h"	9 #include "app/l10n_util.h"

10 #include "base/basictypes.h"	10 #include "base/basictypes.h"

(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
107 // to have a scheme.	107 // to have a scheme.

108 const std::wstring parsed_scheme(URLFixerUpper::SegmentURL(text, parts));	108 const std::wstring parsed_scheme(URLFixerUpper::SegmentURL(text, parts));

109 if (scheme)	109 if (scheme)

110 *scheme = parsed_scheme;	110 *scheme = parsed_scheme;

111	111

112 if (parsed_scheme == L"file") {	112 if (parsed_scheme == L"file") {

113 // A user might or might not type a scheme when entering a file URL.	113 // A user might or might not type a scheme when entering a file URL.

114 return URL;	114 return URL;

115 }	115 }

116	116

117 // If the user typed a scheme, determine our available actions based on that.	117 // If the user typed a scheme, and it's HTTP or HTTPS, we know how to parse it

118 if (parts->scheme.is_valid()) {	118 // well enough that we can fall through to the heuristics below. If it's

	119 // something else, we can just determine our action based on what we do with

	120 // any input of this scheme. In theory we could do better with some schemes

	121 // (e.g. "ftp" or "view-source") but I'll wait to spend the effort on that

	122 // until I run into some cases that really need it.

	123 if (parts->scheme.is_nonempty() &&

	124 (parsed_scheme != L"http") && (parsed_scheme != L"https")) {

119 // See if we know how to handle the URL internally.	125 // See if we know how to handle the URL internally.

120 if (URLRequest::IsHandledProtocol(WideToASCII(parsed_scheme)))	126 if (URLRequest::IsHandledProtocol(WideToASCII(parsed_scheme)))

121 return URL;	127 return URL;

122	128

123 // There are also some schemes that we convert to other things before they	129 // There are also some schemes that we convert to other things before they

124 // reach the renderer or else the renderer handles internally without	130 // reach the renderer or else the renderer handles internally without

125 // reaching the URLRequest logic. We thus won't catch these above, but we	131 // reaching the URLRequest logic. We thus won't catch these above, but we

126 // should still claim to handle them.	132 // should still claim to handle them.

127 if (LowerCaseEqualsASCII(parsed_scheme, chrome::kViewSourceScheme) ||	133 if (LowerCaseEqualsASCII(parsed_scheme, chrome::kViewSourceScheme) ||

128 LowerCaseEqualsASCII(parsed_scheme, chrome::kJavaScriptScheme) ||	134 LowerCaseEqualsASCII(parsed_scheme, chrome::kJavaScriptScheme) ||

(...skipping 17 matching lines...) Expand all Loading...
146 default:	152 default:

147 // We don't know about this scheme. It's likely to be a search operator	153 // We don't know about this scheme. It's likely to be a search operator

148 // like "site:" or "link:". We classify it as UNKNOWN so the user has	154 // like "site:" or "link:". We classify it as UNKNOWN so the user has

149 // the option of treating it as a URL if we're wrong.	155 // the option of treating it as a URL if we're wrong.

150 // Note that SegmentURL() is smart so we aren't tricked by "c:\foo" or	156 // Note that SegmentURL() is smart so we aren't tricked by "c:\foo" or

151 // "www.example.com:81" in this case.	157 // "www.example.com:81" in this case.

152 return UNKNOWN;	158 return UNKNOWN;

153 }	159 }

154 }	160 }

155	161

156 // The user didn't type a scheme. Assume that this is either an HTTP URL or	162 // Either the user didn't type a scheme, in which case we need to distinguish

157 // not a URL at all; try to determine which.	163 // between an HTTP URL and a query, or the scheme is HTTP or HTTPS, in which

	164 // case we should reject invalid formulations.

158	165

159 // It's not clear that we can reach here with an empty "host" (maybe on some	166 // If we have an empty host it can't be a URL.

160 // kinds of garbage input?), but if we did, it couldn't be a URL.

161 if (!parts->host.is_nonempty())	167 if (!parts->host.is_nonempty())

162 return QUERY;	168 return QUERY;

163 // (We use the registry length later below but ask for it here so we can check	169

164 // the host's validity at this point.)	170 // Likewise, the RCDS can reject certain obviously-invalid hosts. (We also

	171 // use the registry length later below.)

165 const std::wstring host(text.substr(parts->host.begin, parts->host.len));	172 const std::wstring host(text.substr(parts->host.begin, parts->host.len));

166 const size_t registry_length =	173 const size_t registry_length =

167 net::RegistryControlledDomainService::GetRegistryLength(host, false);	174 net::RegistryControlledDomainService::GetRegistryLength(host, false);

168 if (registry_length == std::wstring::npos)	175 if (registry_length == std::wstring::npos)

169 return QUERY; // Could be a broken IP address, etc.	176 return QUERY; // Could be a broken IP address, etc.

170	177

171 // See if the hostname is valid per RFC 1738. While IE and GURL allow	178 // See if the hostname is valid per RFC 1738. While IE and GURL allow

172 // hostnames to contain many other characters (perhaps for weird intranet	179 // hostnames to contain many other characters (perhaps for weird intranet

173 // machines), it's extremely unlikely that a user would be trying to type	180 // machines), it's extremely unlikely that a user would be trying to type

174 // those in for anything other than a search query.	181 // those in for anything other than a search query.

175 url_canon::CanonHostInfo host_info;	182 url_canon::CanonHostInfo host_info;

176 const std::string canonicalized_host(net::CanonicalizeHost(host, &host_info));	183 const std::string canonicalized_host(net::CanonicalizeHost(host, &host_info));

177 if ((host_info.family == url_canon::CanonHostInfo::NEUTRAL) &&	184 if ((host_info.family == url_canon::CanonHostInfo::NEUTRAL) &&

178 !net::IsCanonicalizedHostRFC1738Compliant(canonicalized_host))	185 !net::IsCanonicalizedHostRFC1738Compliant(canonicalized_host))

179 return QUERY;	186 return QUERY;

180	187

181 // Presence of a port means this is likely a URL, if the port is really a port	188 // Presence of a port means this is likely a URL, if the port is really a port

182 // number. If it's just garbage after a colon, this is a query.	189 // number. If it's just garbage after a colon, this is a query.

183 if (parts->port.is_nonempty()) {	190 if (parts->port.is_nonempty()) {

184 int port;	191 int port;

185 return (StringToInt(WideToUTF16(	192 return (StringToInt(WideToUTF16(

186 text.substr(parts->port.begin, parts->port.len)), &port) &&	193 text.substr(parts->port.begin, parts->port.len)), &port) &&

187 (port >= 0) && (port <= 65535)) ? URL : QUERY;	194 (port >= 0) && (port <= 65535)) ? URL : QUERY;

188 }	195 }

189	196

190 // Presence of a password means this is likely a URL. We don't treat	197 // Presence of a username could either indicate a URL or an email address

191 // usernames (without passwords) as indicating a URL, because this could be an	198 // ("user@mail.com"). E-mail addresses are likely queries so we only open

192 // email address like "user@mail.com" which is more likely a search than an	199 // this as a URL if the user explicitly typed a scheme.

193 // HTTP auth login attempt.	200 if (parts->username.is_nonempty() && parts->scheme.is_nonempty())

	201 return URL;

	202

	203 // Presence of a password means this is likely a URL. Note that unless the

	204 // user has typed an explicit "http://" or similar, we'll probably think that

	205 // the username is some unknown scheme, and bail out in the scheme-handling

	206 // code above.

194 if (parts->password.is_nonempty())	207 if (parts->password.is_nonempty())

195 return URL;	208 return URL;

196	209

197 // See if the host is an IP address.	210 // See if the host is an IP address.

198 if (host_info.family == url_canon::CanonHostInfo::IPV4) {	211 if (host_info.family == url_canon::CanonHostInfo::IPV4) {

199 // If the user originally typed a host that looks like an IP address (a	212 // If the user originally typed a host that looks like an IP address (a

200 // dotted quad), they probably want to open it. If the original input was	213 // dotted quad), they probably want to open it. If the original input was

201 // something else (like a single number), they probably wanted to search for	214 // something else (like a single number), they probably wanted to search for

202 // it. This is true even if the URL appears to have a path: "1.2/45" is	215 // it, unless they explicitly typed a scheme. This is true even if the URL

203 // more likely a search (for the answer to a math problem) than a URL.	216 // appears to have a path: "1.2/45" is more likely a search (for the answer

204 if (host_info.num_ipv4_components == 4)	217 // to a math problem) than a URL.

	218 if ((host_info.num_ipv4_components == 4) || parts->scheme.is_nonempty())

205 return URL;	219 return URL;

206 return desired_tld.empty() ? UNKNOWN : REQUESTED_URL;	220 return desired_tld.empty() ? UNKNOWN : REQUESTED_URL;

207 }	221 }

208 if (host_info.family == url_canon::CanonHostInfo::IPV6)	222 if (host_info.family == url_canon::CanonHostInfo::IPV6)

209 return URL;	223 return URL;

210	224

211 // The host doesn't look like a number, so see if the user's given us a path.	225 // The host doesn't look like a number, so see if the user's given us a path.

212 if (parts->path.is_nonempty()) {	226 if (parts->path.is_nonempty()) {

213 // Most inputs with paths are URLs, even ones without known registries (e.g.	227 // Most inputs with paths are URLs, even ones without known registries (e.g.

214 // intranet URLs). However, if there's no known registry, and the path has	228 // intranet URLs). However, if the user didn't type a scheme, there's no

215 // a space, this is more likely a query with a slash in the first term (e.g.	229 // known registry, and the path has a space, this is more likely a query

216 // "ps/2 games") than a URL. We can still open URLs with spaces in the path	230 // with a slash in the first term (e.g. "ps/2 games") than a URL. We can

217 // by escaping the space, and we will still inline autocomplete them if	231 // still open URLs with spaces in the path by escaping the space, and we

218 // users have typed them in the past, but we default to searching since	232 // will still inline autocomplete them if users have typed them in the past,

219 // that's the common case.	233 // but we default to searching since that's the common case.

220 return ((registry_length == 0) &&	234 return (!parts->scheme.is_nonempty() && (registry_length == 0) &&

221 (text.substr(parts->path.begin, parts->path.len).find(' ') !=	235 (text.substr(parts->path.begin, parts->path.len).find(' ') !=

222 std::wstring::npos)) ? UNKNOWN : URL;	236 std::wstring::npos)) ? UNKNOWN : URL;

223 }	237 }

224	238

225 // If we reach here with a username, our input looks like "user@host"; this is	239 // If we reach here with a username, our input looks like "user@host"; this is

226 // the case mentioned above, where we think this is more likely an email	240 // the case mentioned above, where we think this is more likely an email

227 // address than an HTTP auth attempt, so search for it.	241 // address than an HTTP auth attempt, so search for it.

228 if (parts->username.is_nonempty())	242 if (parts->username.is_nonempty())

229 return UNKNOWN;	243 return UNKNOWN;

230	244

231 // We have a bare host string. See if it has a known TLD. If so, it's	245 // We have a bare host string. See if it has a known TLD or the user typed a

232 // probably a URL.	246 // scheme. If so, it's probably a URL.

233 if (registry_length != 0)	247 if (parts->scheme.is_nonempty() || (registry_length != 0))

234 return URL;	248 return URL;

235	249

236 // No TLD that we know about. This could be:	250 // No TLD that we know about. This could be:

237 // * A string that the user wishes to add a desired_tld to to get a URL. If	251 // * A string that the user wishes to add a desired_tld to to get a URL. If

238 // we reach this point, we know there's no known TLD on the string, so the	252 // we reach this point, we know there's no known TLD on the string, so the

239 // fixup code will be willing to add one; thus this is a URL.	253 // fixup code will be willing to add one; thus this is a URL.

240 // * A single word "foo"; possibly an intranet site, but more likely a search.	254 // * A single word "foo"; possibly an intranet site, but more likely a search.

241 // This is ideally an UNKNOWN, and we can let the Alternate Nav URL code	255 // This is ideally an UNKNOWN, and we can let the Alternate Nav URL code

242 // catch our mistakes.	256 // catch our mistakes.

243 // * A URL with a valid TLD we don't know about yet. If e.g. a registrar adds	257 // * A URL with a valid TLD we don't know about yet. If e.g. a registrar adds

(...skipping 683 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
927 void AutocompleteController::CheckIfDone() {	941 void AutocompleteController::CheckIfDone() {

928 for (ACProviders::const_iterator i(providers_.begin()); i != providers_.end();	942 for (ACProviders::const_iterator i(providers_.begin()); i != providers_.end();

929 ++i) {	943 ++i) {

930 if (!(*i)->done()) {	944 if (!(*i)->done()) {

931 done_ = false;	945 done_ = false;

932 return;	946 return;

933 }	947 }

934 }	948 }

935 done_ = true;	949 done_ = true;

936 }	950 }

OLD	NEW

« no previous file with comments | « no previous file | chrome/browser/autocomplete/autocomplete_unittest.cc » ('j') | no next file with comments »