src/url_util.cc - Issue 564011: Remove the rule that "://" means a standard URL. This fixes a number of bugs...

Side by Side Diff: src/url_util.cc

Issue 564011: Remove the rule that "://" means a standard URL. This fixes a number of bugs... (Closed) Base URL: http://google-url.googlecode.com/svn/trunk/

Patch Set: '' Created 10 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2007, Google Inc.	1 // Copyright 2007, Google Inc.

2 // All rights reserved.	2 // All rights reserved.

3 //	3 //

4 // Redistribution and use in source and binary forms, with or without	4 // Redistribution and use in source and binary forms, with or without

5 // modification, are permitted provided that the following conditions are	5 // modification, are permitted provided that the following conditions are

6 // met:	6 // met:

7 //	7 //

8 // * Redistributions of source code must retain the above copyright	8 // * Redistributions of source code must retain the above copyright

9 // notice, this list of conditions and the following disclaimer.	9 // notice, this list of conditions and the following disclaimer.

10 // * Redistributions in binary form must reproduce the above	10 // * Redistributions in binary form must reproduce the above

(...skipping 40 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
51 for (Iter it = a_begin; it != a_end; ++it, ++b) {	51 for (Iter it = a_begin; it != a_end; ++it, ++b) {

52 if (!*b || ToLowerASCII(*it) != *b)	52 if (!*b || ToLowerASCII(*it) != *b)

53 return false;	53 return false;

54 }	54 }

55 return *b == 0;	55 return *b == 0;

56 }	56 }

57	57

58 const char kFileScheme[] = "file"; // Used in a number of places.	58 const char kFileScheme[] = "file"; // Used in a number of places.

59 const char kMailtoScheme[] = "mailto";	59 const char kMailtoScheme[] = "mailto";

60	60

61 const int kNumStandardURLSchemes = 5;	61 const int kNumStandardURLSchemes = 7;

62 const char* kStandardURLSchemes[kNumStandardURLSchemes] = {	62 const char* kStandardURLSchemes[kNumStandardURLSchemes] = {

63 "http",	63 "http",

64 "https",	64 "https",

65 kFileScheme, // Yes, file urls can have a hostname!	65 kFileScheme, // Yes, file urls can have a hostname!

66 "ftp",	66 "ftp",

67 "gopher",	67 "gopher",

	68 "ws", // WebSocket.

	69 "wss", // WebSocket secure.

68 };	70 };

69	71

70 // List of the currently installed standard schemes. This list is lazily	72 // List of the currently installed standard schemes. This list is lazily

71 // initialized by InitStandardSchemes and is leaked on shutdown to prevent	73 // initialized by InitStandardSchemes and is leaked on shutdown to prevent

72 // any destructors from being called that will slow us down or cause problems.	74 // any destructors from being called that will slow us down or cause problems.

73 std::vector<const char*>* standard_schemes = NULL;	75 std::vector<const char*>* standard_schemes = NULL;

74	76

75 // Ensures that the standard_schemes list is initialized, does nothing if it	77 // Ensures that the standard_schemes list is initialized, does nothing if it

76 // already has values.	78 // already has values.

77 void InitStandardSchemes() {	79 void InitStandardSchemes() {

(...skipping 11 matching lines...) Expand all Loading...
89 const url_parse::Component& component,	91 const url_parse::Component& component,

90 const char* compare_to) {	92 const char* compare_to) {

91 if (!component.is_nonempty())	93 if (!component.is_nonempty())

92 return compare_to[0] == 0; // When component is empty, match empty scheme.	94 return compare_to[0] == 0; // When component is empty, match empty scheme.

93 return LowerCaseEqualsASCII(&spec[component.begin],	95 return LowerCaseEqualsASCII(&spec[component.begin],

94 &spec[component.end()],	96 &spec[component.end()],

95 compare_to);	97 compare_to);

96 }	98 }

97	99

98 // Returns true if the given scheme identified by |scheme| within |spec| is one	100 // Returns true if the given scheme identified by |scheme| within |spec| is one

99 // of the registered "standard" schemes. Note that this does not check for	101 // of the registered "standard" schemes.

100 // "://", use IsStandard for that.

101 template<typename CHAR>	102 template<typename CHAR>

102 bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {	103 bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {

103 if (!scheme.is_nonempty())	104 if (!scheme.is_nonempty())

104 return false; // Empty or invalid schemes are non-standard.	105 return false; // Empty or invalid schemes are non-standard.

105	106

106 InitStandardSchemes();	107 InitStandardSchemes();

107 for (size_t i = 0; i < standard_schemes->size(); i++) {	108 for (size_t i = 0; i < standard_schemes->size(); i++) {

108 if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],	109 if (LowerCaseEqualsASCII(&spec[scheme.begin], &spec[scheme.end()],

109 standard_schemes->at(i)))	110 standard_schemes->at(i)))

110 return true;	111 return true;

111 }	112 }

112 return false;	113 return false;

113 }	114 }

114	115

115 // Returns true if the stuff following the scheme in the given spec indicates

116 // a "standard" URL. The presence of "://" after the scheme indicates that

117 // there is a hostname, etc. which we call a standard URL.

118 template<typename CHAR>

119 bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len,

120 const url_parse::Component& scheme) {

121 int after_scheme = scheme.end();

122 if (spec_len < after_scheme + 3)

123 return false;

124 return spec[after_scheme] == ':' &&

125 spec[after_scheme + 1] == '/' &&

126 spec[after_scheme + 2] == '/';

127 }

128

129 template<typename CHAR>

130 bool DoIsStandard(const CHAR* spec, int spec_len,

131 const url_parse::Component& scheme) {

132 return HasStandardSchemeSeparator(spec, spec_len, scheme) ||

133 IsStandardScheme(spec, scheme);

134 }

135

136 template<typename CHAR>	116 template<typename CHAR>

137 bool DoFindAndCompareScheme(const CHAR* str,	117 bool DoFindAndCompareScheme(const CHAR* str,

138 int str_len,	118 int str_len,

139 const char* compare,	119 const char* compare,

140 url_parse::Component* found_scheme) {	120 url_parse::Component* found_scheme) {

141 url_parse::Component our_scheme;	121 url_parse::Component our_scheme;

142 if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) {	122 if (!url_parse::ExtractScheme(str, str_len, &our_scheme)) {

143 // No scheme.	123 // No scheme.

144 if (found_scheme)	124 if (found_scheme)

145 *found_scheme = url_parse::Component();	125 *found_scheme = url_parse::Component();

(...skipping 31 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
177 if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) ||	157 if (url_parse::DoesBeginUNCPath(spec, 0, spec_len, false) ||

178 url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {	158 url_parse::DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {

179 url_parse::ParseFileURL(spec, spec_len, &parsed_input);	159 url_parse::ParseFileURL(spec, spec_len, &parsed_input);

180 return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,	160 return url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,

181 charset_converter,	161 charset_converter,

182 output, output_parsed);	162 output, output_parsed);

183 }	163 }

184 #endif	164 #endif

185	165

186 url_parse::Component scheme;	166 url_parse::Component scheme;

187 if(!url_parse::ExtractScheme(spec, spec_len, &scheme))	167 if (!url_parse::ExtractScheme(spec, spec_len, &scheme))

188 return false;	168 return false;

189	169

190 // This is the parsed version of the input URL, we have to canonicalize it	170 // This is the parsed version of the input URL, we have to canonicalize it

191 // before storing it in our object.	171 // before storing it in our object.

192 bool success;	172 bool success;

193 if (CompareSchemeComponent(spec, scheme, kFileScheme)) {	173 if (CompareSchemeComponent(spec, scheme, kFileScheme)) {

194 // File URLs are special.	174 // File URLs are special.

195 url_parse::ParseFileURL(spec, spec_len, &parsed_input);	175 url_parse::ParseFileURL(spec, spec_len, &parsed_input);

196 success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,	176 success = url_canon::CanonicalizeFileURL(spec, spec_len, parsed_input,

197 charset_converter,	177 charset_converter,

198 output, output_parsed);	178 output, output_parsed);

199	179

200 } else if (IsStandard(spec, spec_len, scheme)) {	180 } else if (DoIsStandard(spec, scheme)) {

201 // All "normal" URLs.	181 // All "normal" URLs.

202 url_parse::ParseStandardURL(spec, spec_len, &parsed_input);	182 url_parse::ParseStandardURL(spec, spec_len, &parsed_input);

203 success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,	183 success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,

204 charset_converter,	184 charset_converter,

205 output, output_parsed);	185 output, output_parsed);

206	186

207 } else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) {	187 } else if (CompareSchemeComponent(spec, scheme, kMailtoScheme)) {

208 // Mailto are treated like a standard url with only a scheme, path, query	188 // Mailto are treated like a standard url with only a scheme, path, query

209 url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);	189 url_parse::ParseMailtoURL(spec, spec_len, &parsed_input);

210 success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,	190 success = url_canon::CanonicalizeMailtoURL(spec, spec_len, parsed_input,

(...skipping 21 matching lines...) Expand all Loading...
232 // copying to the new buffer.	212 // copying to the new buffer.

233 url_canon::RawCanonOutputT<CHAR> whitespace_buffer;	213 url_canon::RawCanonOutputT<CHAR> whitespace_buffer;

234 int relative_length;	214 int relative_length;

235 const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,	215 const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,

236 &whitespace_buffer,	216 &whitespace_buffer,

237 &relative_length);	217 &relative_length);

238	218

239 // See if our base URL should be treated as "standard".	219 // See if our base URL should be treated as "standard".

240 bool standard_base_scheme =	220 bool standard_base_scheme =

241 base_parsed.scheme.is_nonempty() &&	221 base_parsed.scheme.is_nonempty() &&

242 IsStandard(base_spec, base_spec_len, base_parsed.scheme);	222 DoIsStandard(base_spec, base_parsed.scheme);

243	223

244 bool is_relative;	224 bool is_relative;

245 url_parse::Component relative_component;	225 url_parse::Component relative_component;

246 if (!url_canon::IsRelativeURL(base_spec, base_parsed,	226 if (!url_canon::IsRelativeURL(base_spec, base_parsed,

247 relative, relative_length,	227 relative, relative_length,

248 standard_base_scheme,	228 standard_base_scheme,

249 &is_relative,	229 &is_relative,

250 &relative_component)) {	230 &relative_component)) {

251 // Error resolving.	231 // Error resolving.

252 return false;	232 return false;

(...skipping 15 matching lines...) Expand all Loading...
268 }	248 }

269	249

270 template<typename CHAR>	250 template<typename CHAR>

271 bool DoReplaceComponents(const char* spec,	251 bool DoReplaceComponents(const char* spec,

272 int spec_len,	252 int spec_len,

273 const url_parse::Parsed& parsed,	253 const url_parse::Parsed& parsed,

274 const url_canon::Replacements<CHAR>& replacements,	254 const url_canon::Replacements<CHAR>& replacements,

275 url_canon::CharsetConverter* charset_converter,	255 url_canon::CharsetConverter* charset_converter,

276 url_canon::CanonOutput* output,	256 url_canon::CanonOutput* output,

277 url_parse::Parsed* out_parsed) {	257 url_parse::Parsed* out_parsed) {

278 // Note that we dispatch to the parser according to the scheme type of	258 // If the scheme is overridden, just do a simple string substitution and

279 // the OUTPUT URL. Normally, this is the same as our scheme, but if the	259 // reparse the whole thing. There are lots of edge cases that we really don't

280 // scheme is being overridden, we need to test that.	260 // want to deal with. Like what happens if I replace "http://e:8080/foo"

	261 // with a file. Does it become "file:///E:/8080/foo" where the port number

	262 // becomes part of the path? Parsing that string as a file URL says "yes"

	263 // but almost no sane rule for dealing with the components individually would

	264 // come up with that.

	265 //

	266 // Why allow these crazy cases at all? Programmatically, there is almost no

	267 // case for replacing the scheme. The most common case for hitting this is

	268 // in JS when building up a URL using the location object. In this case, the

	269 // JS code expects the string substitution behavior:

	270 // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3

	271 if (replacements.IsSchemeOverridden()) {

	272 // Canonicalize the new scheme so it is 8-bit and can be concatenated with

	273 // the existing spec.

	274 url_canon::RawCanonOutput<128> scheme_replaced;

	275 url_parse::Component scheme_replaced_parsed;

	276 url_canon::CanonicalizeScheme(

	277 replacements.sources().scheme,

	278 replacements.components().scheme,

	279 &scheme_replaced, &scheme_replaced_parsed);

281	280

282 if (// Either the scheme is not replaced and the old one is a file,	281 // We can assume that the input is canonicalized, which means it always has

283 (!replacements.IsSchemeOverridden() &&	282 // a colon after the scheme (or where the scheme would be).

284 CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) ||	283 int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1

285 // ...or it is being replaced and the new one is a file.	284 : 1;

286 (replacements.IsSchemeOverridden() &&	285 if (spec_len - spec_after_colon > 0) {

287 CompareSchemeComponent(replacements.sources().scheme,	286 scheme_replaced.Append(&spec[spec_after_colon],

288 replacements.components().scheme,	287 spec_len - spec_after_colon);

289 kFileScheme))) {	288 }

	289

	290 // We now need to completely re-parse the resulting string since its meaning

	291 // may have changed with the different scheme.

	292 url_canon::RawCanonOutput<128> recanonicalized;

	293 url_parse::Parsed recanonicalized_parsed;

	294 DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),

	295 charset_converter,

	296 &recanonicalized, &recanonicalized_parsed);

	297

	298 // Recurse using the version with the scheme already replaced. This will now

	299 // use the replacement rules for the new scheme.

	300 //

	301 // Warning: this code assumes that ReplaceComponents will re-check all

	302 // components for validity. This is because we can't fail if DoCanonicalize

	303 // failed above since theoretically the thing making it fail could be

	304 // getting replaced here. If ReplaceComponents didn't re-check everything,

	305 // we wouldn't know if something not getting replaced is a problem.

	306 // If the scheme-specific replacers are made more intelligent so they don't

	307 // re-check everything, we should instead recanonicalize the whole thing

	308 // after this call to check validity (this assumes replacing the scheme is

	309 // much much less common than other types of replacements, like clearing the

	310 // ref).

	311 url_canon::Replacements<CHAR> replacements_no_scheme = replacements;

	312 replacements_no_scheme.SetScheme(NULL, url_parse::Component());

	313 return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),

	314 recanonicalized_parsed, replacements_no_scheme,

	315 charset_converter, output, out_parsed);

	316 }

	317

	318 // If we get here, then we know the scheme doesn't need to be replaced, so can

	319 // just key off the scheme in the spec to know how to do the replacements.

	320 if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {

290 return url_canon::ReplaceFileURL(spec, parsed, replacements,	321 return url_canon::ReplaceFileURL(spec, parsed, replacements,

291 charset_converter, output, out_parsed);	322 charset_converter, output, out_parsed);

292 }	323 }

293	324 if (DoIsStandard(spec, parsed.scheme)) {

294 if (// Either the scheme is not replaced and the old one is standard,

295 (!replacements.IsSchemeOverridden() &&

296 IsStandard(spec, spec_len, parsed.scheme)) ||

297 // ...or it is being replaced and the new one is standard.

298 (replacements.IsSchemeOverridden() &&

299 IsStandardScheme(replacements.sources().scheme,

300 replacements.components().scheme))) {

301 // Standard URL with all parts.

302 return url_canon::ReplaceStandardURL(spec, parsed, replacements,	325 return url_canon::ReplaceStandardURL(spec, parsed, replacements,

303 charset_converter, output, out_parsed);	326 charset_converter, output, out_parsed);

304 }	327 }

305	328 if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {

306 if (// Either the scheme is not replaced and the old one is mailto,

307 (!replacements.IsSchemeOverridden() &&

308 CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) ||

309 // ...or it is being replaced and the new one is a mailto.

310 (replacements.IsSchemeOverridden() &&

311 CompareSchemeComponent(replacements.sources().scheme,

312 replacements.components().scheme,

313 kMailtoScheme))) {

314 return url_canon::ReplaceMailtoURL(spec, parsed, replacements,	329 return url_canon::ReplaceMailtoURL(spec, parsed, replacements,

315 output, out_parsed);	330 output, out_parsed);

316 }	331 }

317	332

	333 // Default is a path URL.

318 return url_canon::ReplacePathURL(spec, parsed, replacements,	334 return url_canon::ReplacePathURL(spec, parsed, replacements,

319 output, out_parsed);	335 output, out_parsed);

320 }	336 }

321	337

322 } // namespace	338 } // namespace

323	339

324 void AddStandardScheme(const char* new_scheme) {	340 void AddStandardScheme(const char* new_scheme) {

325 size_t scheme_len = strlen(new_scheme);	341 size_t scheme_len = strlen(new_scheme);

326 if (scheme_len == 0)	342 if (scheme_len == 0)

327 return;	343 return;

328	344

329 // Duplicate the scheme into a new buffer and add it to the list of standard	345 // Duplicate the scheme into a new buffer and add it to the list of standard

330 // schemes. This pointer will be leaked on shutdown.	346 // schemes. This pointer will be leaked on shutdown.

331 char* dup_scheme = new char[scheme_len + 1];	347 char* dup_scheme = new char[scheme_len + 1];

332 memcpy(dup_scheme, new_scheme, scheme_len + 1);	348 memcpy(dup_scheme, new_scheme, scheme_len + 1);

333	349

334 InitStandardSchemes();	350 InitStandardSchemes();

335 standard_schemes->push_back(dup_scheme);	351 standard_schemes->push_back(dup_scheme);

336 }	352 }

337	353

338 bool IsStandard(const char* spec, int spec_len,	354 bool IsStandard(const char* spec, const url_parse::Component& scheme) {

339 const url_parse::Component& scheme) {	355 return DoIsStandard(spec, scheme);

340 return DoIsStandard(spec, spec_len, scheme);

341 }	356 }

342	357

343 bool IsStandard(const char16* spec, int spec_len,	358 bool IsStandard(const char16* spec, const url_parse::Component& scheme) {

344 const url_parse::Component& scheme) {	359 return DoIsStandard(spec, scheme);

345 return DoIsStandard(spec, spec_len, scheme);

346 }	360 }

347	361

348 bool FindAndCompareScheme(const char* str,	362 bool FindAndCompareScheme(const char* str,

349 int str_len,	363 int str_len,

350 const char* compare,	364 const char* compare,

351 url_parse::Component* found_scheme) {	365 url_parse::Component* found_scheme) {

352 return DoFindAndCompareScheme(str, str_len, compare, found_scheme);	366 return DoFindAndCompareScheme(str, str_len, compare, found_scheme);

353 }	367 }

354	368

355 bool FindAndCompareScheme(const char16* str,	369 bool FindAndCompareScheme(const char16* str,

(...skipping 88 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
444 return a_begin == a_end && b_begin == b_end;	458 return a_begin == a_end && b_begin == b_end;

445 }	459 }

446	460

447 bool LowerCaseEqualsASCII(const char16* a_begin,	461 bool LowerCaseEqualsASCII(const char16* a_begin,

448 const char16* a_end,	462 const char16* a_end,

449 const char* b) {	463 const char* b) {

450 return DoLowerCaseEqualsASCII(a_begin, a_end, b);	464 return DoLowerCaseEqualsASCII(a_begin, a_end, b);

451 }	465 }

452	466

453 } // namespace url_util	467 } // namespace url_util

OLD	NEW

« no previous file with comments | « src/url_util.h ('k') | src/url_util_unittest.cc » ('j') | no next file with comments »