src/url_util.cc - Issue 564011: Remove the rule that "://" means a standard URL. This fixes a number of bugs...

Unified Diff: src/url_util.cc

Issue 564011: Remove the rule that "://" means a standard URL. This fixes a number of bugs... (Closed) Base URL: http://google-url.googlecode.com/svn/trunk/

Patch Set: '' Created 10 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/url_util.cc

===================================================================

--- src/url_util.cc (revision 122)

+++ src/url_util.cc (working copy)

@@ -58,13 +58,15 @@

const char kFileScheme[] = "file"; // Used in a number of places.

const char kMailtoScheme[] = "mailto";

-const int kNumStandardURLSchemes = 5;

+const int kNumStandardURLSchemes = 7;

const char* kStandardURLSchemes[kNumStandardURLSchemes] = {

"http",

"https",

kFileScheme, // Yes, file urls can have a hostname!

"ftp",

"gopher",

+ "ws", // WebSocket.

+ "wss", // WebSocket secure.

};

// List of the currently installed standard schemes. This list is lazily

@@ -96,10 +98,9 @@

}

// Returns true if the given scheme identified by |scheme| within |spec| is one

-// of the registered "standard" schemes. Note that this does not check for

-// "://", use IsStandard for that.

+// of the registered "standard" schemes.

template<typename CHAR>

-bool IsStandardScheme(const CHAR* spec, const url_parse::Component& scheme) {

+bool DoIsStandard(const CHAR* spec, const url_parse::Component& scheme) {

if (!scheme.is_nonempty())

return false; // Empty or invalid schemes are non-standard.

@@ -112,28 +113,7 @@

return false;

}

-// Returns true if the stuff following the scheme in the given spec indicates

-// a "standard" URL. The presence of "://" after the scheme indicates that

-// there is a hostname, etc. which we call a standard URL.

template<typename CHAR>

-bool HasStandardSchemeSeparator(const CHAR* spec, int spec_len,

- const url_parse::Component& scheme) {

- int after_scheme = scheme.end();

- if (spec_len < after_scheme + 3)

- return false;

- return spec[after_scheme] == ':' &&

- spec[after_scheme + 1] == '/' &&

- spec[after_scheme + 2] == '/';

-template<typename CHAR>

-bool DoIsStandard(const CHAR* spec, int spec_len,

- const url_parse::Component& scheme) {

- return HasStandardSchemeSeparator(spec, spec_len, scheme) ||

- IsStandardScheme(spec, scheme);

-template<typename CHAR>

bool DoFindAndCompareScheme(const CHAR* str,

int str_len,

const char* compare,

@@ -184,7 +164,7 @@

#endif

url_parse::Component scheme;

- if(!url_parse::ExtractScheme(spec, spec_len, &scheme))

+ if (!url_parse::ExtractScheme(spec, spec_len, &scheme))

return false;

// This is the parsed version of the input URL, we have to canonicalize it

@@ -197,7 +177,7 @@

charset_converter,

output, output_parsed);

- } else if (IsStandard(spec, spec_len, scheme)) {

+ } else if (DoIsStandard(spec, scheme)) {

// All "normal" URLs.

url_parse::ParseStandardURL(spec, spec_len, &parsed_input);

success = url_canon::CanonicalizeStandardURL(spec, spec_len, parsed_input,

@@ -239,7 +219,7 @@

// See if our base URL should be treated as "standard".

bool standard_base_scheme =

base_parsed.scheme.is_nonempty() &&

- IsStandard(base_spec, base_spec_len, base_parsed.scheme);

+ DoIsStandard(base_spec, base_parsed.scheme);

bool is_relative;

url_parse::Component relative_component;

@@ -275,46 +255,82 @@

url_canon::CharsetConverter* charset_converter,

url_canon::CanonOutput* output,

url_parse::Parsed* out_parsed) {

- // Note that we dispatch to the parser according the the scheme type of

- // the OUTPUT URL. Normally, this is the same as our scheme, but if the

- // scheme is being overridden, we need to test that.

+ // If the scheme is overridden, just do a simple string substitution and

+ // reparse the whole thing. There are lots of edge cases that we really don't

+ // want to deal with. Like what happens if I replace "http://e:8080/foo"

+ // with a file. Does it become "file:///E:/8080/foo" where the port number

+ // becomes part of the path? Parsing that string as a file URL says "yes"

+ // but almost no sane rule for dealing with the components individually would

+ // come up with that.

+ //

+ // Why allow these crazy cases at all? Programatically, there is almost no

+ // case for replacing the scheme. The most common case for hitting this is

+ // in JS when building up a URL using the location object. In this case, the

+ // JS code expects the string substitution behavior:

+ // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3

+ if (replacements.IsSchemeOverridden()) {

+ // Canonicalize the new scheme so it is 8-bit and can be concatenated with

+ // the existing spec.

+ url_canon::RawCanonOutput<128> scheme_replaced;

+ url_parse::Component scheme_replaced_parsed;

+ url_canon::CanonicalizeScheme(

+ replacements.sources().scheme,

+ replacements.components().scheme,

+ &scheme_replaced, &scheme_replaced_parsed);

- if (// Either the scheme is not replaced and the old one is a file,

- (!replacements.IsSchemeOverridden() &&

- CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) ||

- // ...or it is being replaced and the new one is a file.

- (replacements.IsSchemeOverridden() &&

- CompareSchemeComponent(replacements.sources().scheme,

- replacements.components().scheme,

- kFileScheme))) {

+ // We can assume that the input is canonicalized, which means it always has

+ // a colon after the scheme (or where the scheme would be).

+ int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1

+ : 1;

+ if (spec_len - spec_after_colon > 0) {

+ scheme_replaced.Append(&spec[spec_after_colon],

+ spec_len - spec_after_colon);

+ }

+ // We now need to completely re-parse the resulting string since its meaning

+ // may have changed with the different scheme.

+ url_canon::RawCanonOutput<128> recanonicalized;

+ url_parse::Parsed recanonicalized_parsed;

+ DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(),

+ charset_converter,

+ &recanonicalized, &recanonicalized_parsed);

+ // Recurse using the version with the scheme already replaced. This will now

+ // use the replacement rules for the new scheme.

+ //

+ // Warning: this code assumes that ReplaceComponents will re-check all

+ // components for validity. This is because we can't fail if DoCanonicalize

+ // failed above since theoretically the thing making it fail could be

+ // getting replaced here. If ReplaceComponents didn't re-check everything,

+ // we wouldn't know if something *not* getting replaced is a problem.

+ // If the scheme-specific replacers are made more intelligent so they don't

+ // re-check everything, we should instead recanonicalize the whole thing

+ // after this call to check validity (this assumes replacing the scheme is

+ // much much less common than other types of replacements, like clearing the

+ // ref).

+ url_canon::Replacements<CHAR> replacements_no_scheme = replacements;

+ replacements_no_scheme.SetScheme(NULL, url_parse::Component());

+ return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),

+ recanonicalized_parsed, replacements_no_scheme,

+ charset_converter, output, out_parsed);

+ }

+ // If we get here, then we know the scheme doesn't need to be replaced, so can

+ // just key off the scheme in the spec to know how to do the replacements.

+ if (CompareSchemeComponent(spec, parsed.scheme, kFileScheme)) {

return url_canon::ReplaceFileURL(spec, parsed, replacements,

charset_converter, output, out_parsed);

}

- if (// Either the scheme is not replaced and the old one is standard,

- (!replacements.IsSchemeOverridden() &&

- IsStandard(spec, spec_len, parsed.scheme)) ||

- // ...or it is being replaced and the new one is standard.

- (replacements.IsSchemeOverridden() &&

- IsStandardScheme(replacements.sources().scheme,

- replacements.components().scheme))) {

- // Standard URL with all parts.

+ if (DoIsStandard(spec, parsed.scheme)) {

return url_canon::ReplaceStandardURL(spec, parsed, replacements,

charset_converter, output, out_parsed);

}

- if (// Either the scheme is not replaced and the old one is mailto,

- (!replacements.IsSchemeOverridden() &&

- CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) ||

- // ...or it is being replaced and the new one is a mailto.

- (replacements.IsSchemeOverridden() &&

- CompareSchemeComponent(replacements.sources().scheme,

- replacements.components().scheme,

- kMailtoScheme))) {

+ if (CompareSchemeComponent(spec, parsed.scheme, kMailtoScheme)) {

return url_canon::ReplaceMailtoURL(spec, parsed, replacements,

output, out_parsed);

}

+ // Default is a path URL.

return url_canon::ReplacePathURL(spec, parsed, replacements,

output, out_parsed);

}

@@ -335,14 +351,12 @@

standard_schemes->push_back(dup_scheme);

}

-bool IsStandard(const char* spec, int spec_len,

- const url_parse::Component& scheme) {

- return DoIsStandard(spec, spec_len, scheme);

+bool IsStandard(const char* spec, const url_parse::Component& scheme) {

+ return DoIsStandard(spec, scheme);

}

-bool IsStandard(const char16* spec, int spec_len,

- const url_parse::Component& scheme) {

- return DoIsStandard(spec, spec_len, scheme);

+bool IsStandard(const char16* spec, const url_parse::Component& scheme) {

+ return DoIsStandard(spec, scheme);

}

bool FindAndCompareScheme(const char* str,

« no previous file with comments | « src/url_util.h ('k') | src/url_util_unittest.cc » ('j') | no next file with comments »