net/base/net_util.cc - Issue 372017: Fix various problems with inline autocomplete and URLs that change length dur...

Unified Diff: net/base/net_util.cc

Issue 372017: Fix various problems with inline autocomplete and URLs that change length dur... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: net/base/net_util.cc

===================================================================

--- net/base/net_util.cc (revision 31214)

+++ net/base/net_util.cc (working copy)

@@ -650,60 +650,51 @@

}

// Converts one component of a host (between dots) to IDN if safe. The result

-// will be APPENDED to the given output string and will be the same as the

-// input if it is not IDN or the IDN is unsafe to display.

-void IDNToUnicodeOneComponent(const char16* comp,

- int comp_len,

+// will be APPENDED to the given output string and will be the same as the input

+// if it is not IDN or the IDN is unsafe to display. Returns whether any

+// conversion was performed.

+bool IDNToUnicodeOneComponent(const char16* comp,

+ size_t comp_len,

const std::wstring& languages,

string16* out) {

- DCHECK(comp_len >= 0);

+ DCHECK(out);

if (comp_len == 0)

- return;

+ return false;

- // Expand the output string to make room for a possibly longer string

- // (we'll expand if it's still not big enough below).

- int extra_space = 64;

- size_t host_begin_in_output = out->size();

+ // Only transform if the input can be an IDN component.

+ static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'};

+ if ((comp_len > arraysize(kIdnPrefix)) &&

+ !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) {

+ // Repeatedly expand the output string until it's big enough. It looks like

+ // ICU will return the required size of the buffer, but that's not

+ // documented, so we'll just grow by 2x. This should be rare and is not on a

+ // critical path.

+ size_t original_length = out->length();

+ for (int extra_space = 64; ; extra_space *= 2) {

+ UErrorCode status = U_ZERO_ERROR;

+ out->resize(out->length() + extra_space);

+ int output_chars = uidna_IDNToUnicode(comp,

+ static_cast<int32_t>(comp_len), &(*out)[original_length], extra_space,

+ UIDNA_DEFAULT, NULL, &status);

+ if (status == U_ZERO_ERROR) {

+ // Converted successfully.

+ out->resize(original_length + output_chars);

+ if (IsIDNComponentSafe(out->data() + original_length, output_chars,

+ languages))

+ return true;

+ }

- // Just copy the input if it can't be an IDN component.

- if (comp_len < 4 ||

- comp[0] != 'x' || comp[1] != 'n' || comp[2] != '-' || comp[3] != '-') {

- out->resize(host_begin_in_output + comp_len);

- for (int i = 0; i < comp_len; i++)

- (*out)[host_begin_in_output + i] = comp[i];

- return;

- }

- while (true) {

- UErrorCode status = U_ZERO_ERROR;

- out->resize(out->size() + extra_space);

- int output_chars =

- uidna_IDNToUnicode(comp, comp_len, &(*out)[host_begin_in_output],

- extra_space, UIDNA_DEFAULT, NULL, &status);

- if (status == U_ZERO_ERROR) {

- // Converted successfully.

- out->resize(host_begin_in_output + output_chars);

- if (!IsIDNComponentSafe(&out->data()[host_begin_in_output],

- output_chars,

- languages))

- break; // The error handling below will undo the IDN.

- return;

+ if (status != U_BUFFER_OVERFLOW_ERROR)

+ break;

}

- if (status != U_BUFFER_OVERFLOW_ERROR)

- break;

- // Need to loop again with a bigger buffer. It looks like ICU will

- // return the required size of the buffer, but that's not documented,

- // so we'll just grow by 2x. This should be rare and is not on a

- // critical path.

- extra_space *= 2;

+ // Failed, revert back to original string.

+ out->resize(original_length);

}

- // We get here on error, in which case we replace anything that was added

- // with the literal input.

- out->resize(host_begin_in_output + comp_len);

- for (int i = 0; i < comp_len; i++)

- (*out)[host_begin_in_output + i] = comp[i];

+ // We get here with no IDN or on error, in which case we just append the

+ // literal input.

+ out->append(comp, comp_len);

+ return false;

}

// Helper for FormatUrl().

@@ -712,19 +703,23 @@

bool omit_username_password,

UnescapeRule::Type unescape_rules,

url_parse::Parsed* new_parsed,

- size_t* prefix_end) {

+ size_t* prefix_end,

+ size_t* offset_for_adjustment) {

DCHECK(new_parsed);

const wchar_t* const kWideViewSource = L"view-source:";

const size_t kViewSourceLengthPlus1 = 12;

GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1));

+ size_t temp_offset = (*offset_for_adjustment == std::wstring::npos) ?

+ std::wstring::npos : (*offset_for_adjustment - kViewSourceLengthPlus1);

+ size_t* temp_offset_ptr = (*offset_for_adjustment < kViewSourceLengthPlus1) ?

+ NULL : &temp_offset;

std::wstring result = net::FormatUrl(real_url, languages,

- omit_username_password, unescape_rules, new_parsed, prefix_end);

+ omit_username_password, unescape_rules, new_parsed, prefix_end,

+ temp_offset_ptr);

result.insert(0, kWideViewSource);

// Adjust position values.

- if (prefix_end)

- *prefix_end += kViewSourceLengthPlus1;

if (new_parsed->scheme.is_nonempty()) {

// Assume "view-source:real-scheme" as a scheme.

new_parsed->scheme.len += kViewSourceLengthPlus1;

@@ -746,6 +741,12 @@

new_parsed->query.begin += kViewSourceLengthPlus1;

if (new_parsed->ref.is_nonempty())

new_parsed->ref.begin += kViewSourceLengthPlus1;

+ if (prefix_end)

+ *prefix_end += kViewSourceLengthPlus1;

+ if (temp_offset_ptr) {

+ *offset_for_adjustment = (temp_offset == std::wstring::npos) ?

+ std::wstring::npos : (temp_offset + kViewSourceLengthPlus1);

+ }

return result;

}

@@ -769,12 +770,20 @@

// and the resulting range will be filled into |out_component|. |unescape_rules|

-// defines how to clean the URL for human readability.

+// defines how to clean the URL for human readability. |offset_for_adjustment|

+// is an offset into |output| which will be adjusted based on how it maps to the

+// component being converted; if it is less than output->length(), it will be

+// untouched, and if it is at least output->length() + in_component.len it

+// will be shortened by the difference in lengths between the input and output

+// components. Otherwise it points into the component being converted, and is

+// adjusted to point to the same logical place in |output|.

+// |offset_for_adjustment| may not be NULL.

static void AppendFormattedComponent(const std::string& spec,

const url_parse::Component& in_component,

UnescapeRule::Type unescape_rules,

std::wstring* output,

- url_parse::Component* out_component);

+ url_parse::Component* out_component,

+ size_t* offset_for_adjustment);

GURL FilePathToFileURL(const FilePath& path) {

// Produce a URL like "file:///C:/foo" for a regular file, or

@@ -849,58 +858,56 @@

// We may want to skip this step in the case of file URLs to allow unicode

// UNC hostnames regardless of encodings.

-void IDNToUnicode(const char* host,

- int host_len,

- const std::wstring& languages,

- std::wstring* out) {

+std::wstring IDNToUnicode(const char* host,

+ size_t host_len,

+ const std::wstring& languages,

+ size_t* offset_for_adjustment) {

// Convert the ASCII input to a wide string for ICU.

string16 input16;

input16.reserve(host_len);

- for (int i = 0; i < host_len; i++)

- input16.push_back(host[i]);

+ std::copy(host, host + host_len, std::back_inserter(input16));

string16 out16;

- // The output string is appended to, so convert what's already there if

- // needed.

-#if defined(WCHAR_T_IS_UTF32)

- WideToUTF16(out->data(), out->length(), &out16);

- out->clear(); // for equivalence with the swap below

-#elif defined(WCHAR_T_IS_UTF16)

- out->swap(out16);

-#endif

+ size_t output_offset = offset_for_adjustment ?

+ *offset_for_adjustment : std::wstring::npos;

// Do each component of the host separately, since we enforce script matching

// on a per-component basis.

- size_t cur_begin = 0; // Beginning of the current component (inclusive).

- while (cur_begin < input16.size()) {

- // Find the next dot or the end of the string.

- size_t next_dot = input16.find_first_of('.', cur_begin);

- if (next_dot == std::wstring::npos)

- next_dot = input16.size(); // For getting the last component.

+ for (size_t component_start = 0, component_end;

+ component_start < input16.length();

+ component_start = component_end + 1) {

+ // Find the end of the component.

+ component_end = input16.find('.', component_start);

+ if (component_end == string16::npos)

+ component_end = input16.length(); // For getting the last component.

+ size_t component_length = component_end - component_start;

- if (next_dot > cur_begin) {

+ size_t output_component_start = out16.length();

+ bool converted_idn = false;

+ if (component_end > component_start) {

// Add the substring that we just found.

- IDNToUnicodeOneComponent(&input16[cur_begin],

- static_cast<int>(next_dot - cur_begin),

- languages,

- &out16);

+ converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start,

+ component_length, languages, &out16);

}

+ size_t output_component_length = out16.length() - output_component_start;

- // Need to add the dot we just found (if we found one). This needs to be

- // done before we break out below in case the URL ends in a dot.

- if (next_dot < input16.size())

+ if ((output_offset != std::wstring::npos) &&

+ (*offset_for_adjustment > component_start)) {

+ if ((*offset_for_adjustment < component_end) && converted_idn)

+ output_offset = std::wstring::npos;

+ else

+ output_offset += output_component_length - component_length;

+ }

+ // Need to add the dot we just found (if we found one).

+ if (component_end < input16.length())

out16.push_back('.');

- else

- break; // No more components left.

- cur_begin = next_dot + 1;

}

-#if defined(WCHAR_T_IS_UTF32)

- UTF16ToWide(out16.data(), out16.length(), out);

-#elif defined(WCHAR_T_IS_UTF16)

- out->swap(out16);

-#endif

+ if (offset_for_adjustment)

+ *offset_for_adjustment = output_offset;

+ return UTF16ToWideAndAdjustOffset(out16, offset_for_adjustment);

}

std::string CanonicalizeHost(const std::string& host,

@@ -1262,31 +1269,48 @@

std::wstring* username,

std::wstring* password) {

UnescapeRule::Type flags = UnescapeRule::SPACES;

- *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags);

- *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags);

+ *username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags, NULL);

+ *password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags, NULL);

}

void AppendFormattedHost(const GURL& url,

const std::wstring& languages,

std::wstring* output,

- url_parse::Parsed* new_parsed) {

+ url_parse::Parsed* new_parsed,

+ size_t* offset_for_adjustment) {

+ DCHECK(output);

const url_parse::Component& host =

url.parsed_for_possibly_invalid_spec().host;

if (host.is_nonempty()) {

// Handle possible IDN in the host name.

+ int new_host_begin = static_cast<int>(output->length());

if (new_parsed)

- new_parsed->host.begin = static_cast<int>(output->length());

+ new_parsed->host.begin = new_host_begin;

+ size_t offset_past_current_output =

+ (!offset_for_adjustment ||

+ (*offset_for_adjustment == std::wstring::npos) ||

+ (*offset_for_adjustment < output->length())) ?

+ std::wstring::npos : (*offset_for_adjustment - output->length());

+ size_t* offset_into_host =

+ (offset_past_current_output >= static_cast<size_t>(host.len)) ?

+ NULL : &offset_past_current_output;

const std::string& spec = url.possibly_invalid_spec();

DCHECK(host.begin >= 0 &&

((spec.length() == 0 && host.begin == 0) ||

host.begin < static_cast<int>(spec.length())));

- net::IDNToUnicode(&spec[host.begin], host.len, languages, output);

+ output->append(net::IDNToUnicode(&spec[host.begin],

+ static_cast<size_t>(host.len), languages, offset_into_host));

- if (new_parsed) {

- new_parsed->host.len =

- static_cast<int>(output->length()) - new_parsed->host.begin;

+ int new_host_len = static_cast<int>(output->length()) - new_host_begin;

+ if (new_parsed)

+ new_parsed->host.len = new_host_len;

+ if (offset_into_host) {

+ *offset_for_adjustment = (*offset_into_host == std::wstring::npos) ?

+ std::wstring::npos : (new_host_begin + *offset_into_host);

+ } else if (offset_past_current_output != std::wstring::npos) {

+ *offset_for_adjustment += new_host_len - host.len;

}

} else if (new_parsed) {

new_parsed->host.reset();

@@ -1298,19 +1322,36 @@

const url_parse::Component& in_component,

UnescapeRule::Type unescape_rules,

std::wstring* output,

- url_parse::Component* out_component) {

+ url_parse::Component* out_component,

+ size_t* offset_for_adjustment) {

+ DCHECK(output);

+ DCHECK(offset_for_adjustment);

if (in_component.is_nonempty()) {

out_component->begin = static_cast<int>(output->length());

+ size_t offset_past_current_output =

+ ((*offset_for_adjustment == std::wstring::npos) ||

+ (*offset_for_adjustment < output->length())) ?

+ std::wstring::npos : (*offset_for_adjustment - output->length());

+ size_t* offset_into_component =

+ (offset_past_current_output >= static_cast<size_t>(in_component.len)) ?

+ NULL : &offset_past_current_output;

if (unescape_rules == UnescapeRule::NONE) {

- output->append(UTF8ToWide(spec.substr(

- in_component.begin, in_component.len)));

+ output->append(UTF8ToWideAndAdjustOffset(

+ spec.substr(in_component.begin, in_component.len),

+ offset_into_component));

} else {

output->append(UnescapeAndDecodeUTF8URLComponent(

- spec.substr(in_component.begin, in_component.len),

- unescape_rules));

+ spec.substr(in_component.begin, in_component.len), unescape_rules,

+ offset_into_component));

}

out_component->len =

static_cast<int>(output->length()) - out_component->begin;

+ if (offset_into_component) {

+ *offset_for_adjustment = (*offset_into_component == std::wstring::npos) ?

+ std::wstring::npos : (out_component->begin + *offset_into_component);

+ } else if (offset_past_current_output != std::wstring::npos) {

+ *offset_for_adjustment += out_component->len - in_component.len;

+ }

} else {

out_component->reset();

}

@@ -1321,10 +1362,14 @@

bool omit_username_password,

UnescapeRule::Type unescape_rules,

url_parse::Parsed* new_parsed,

- size_t* prefix_end) {

+ size_t* prefix_end,

+ size_t* offset_for_adjustment) {

url_parse::Parsed parsed_temp;

if (!new_parsed)

new_parsed = &parsed_temp;

+ size_t offset_temp = std::wstring::npos;

+ if (!offset_for_adjustment)

+ offset_for_adjustment = &offset_temp;

std::wstring url_string;

@@ -1332,6 +1377,7 @@

if (url.is_empty()) {

if (prefix_end)

*prefix_end = 0;

+ *offset_for_adjustment = std::wstring::npos;

return url_string;

}

@@ -1343,19 +1389,22 @@

if (url.SchemeIs(kViewSource) &&

!StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {

return FormatViewSourceUrl(url, languages, omit_username_password,

- unescape_rules, new_parsed, prefix_end);

+ unescape_rules, new_parsed, prefix_end, offset_for_adjustment);

}

// We handle both valid and invalid URLs (this will give us the spec

// regardless of validity).

const std::string& spec = url.possibly_invalid_spec();

const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec();

+ if (*offset_for_adjustment >= spec.length())

+ *offset_for_adjustment = std::wstring::npos;

// Copy everything before the username (the scheme and the separators.)

// These are ASCII.

- int pre_end = parsed.CountCharactersBefore(url_parse::Parsed::USERNAME, true);

- for (int i = 0; i < pre_end; ++i)

- url_string.push_back(spec[i]);

+ std::copy(spec.begin(),

+ spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME,

+ true),

+ std::back_inserter(url_string));

new_parsed->scheme = parsed.scheme;

if (omit_username_password) {

@@ -1364,16 +1413,41 @@

// e.g. "http://google.com:search@evil.ru/"

new_parsed->username.reset();

new_parsed->password.reset();

+ if ((*offset_for_adjustment != std::wstring::npos) &&

+ (parsed.username.is_nonempty() || parsed.password.is_nonempty())) {

+ if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {

+ // The seeming off-by-one and off-by-two in these first two lines are to

+ // account for the ':' after the username and '@' after the password.

+ if (*offset_for_adjustment >

+ static_cast<size_t>(parsed.password.end())) {

+ *offset_for_adjustment -=

+ (parsed.username.len + parsed.password.len + 2);

+ } else if (*offset_for_adjustment >

+ static_cast<size_t>(parsed.username.begin)) {

+ *offset_for_adjustment = std::wstring::npos;

+ }

+ } else {

+ const url_parse::Component* nonempty_component =

+ parsed.username.is_nonempty() ? &parsed.username : &parsed.password;

+ // The seeming off-by-one in these first two lines is to account for the

+ // '@' after the username/password.

+ if (*offset_for_adjustment >

+ static_cast<size_t>(nonempty_component->end())) {

+ *offset_for_adjustment -= (nonempty_component->len + 1);

+ } else if (*offset_for_adjustment >

+ static_cast<size_t>(nonempty_component->begin)) {

+ *offset_for_adjustment = std::wstring::npos;

+ }

} else {

- AppendFormattedComponent(

- spec, parsed.username, unescape_rules,

- &url_string, &new_parsed->username);

+ AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string,

+ &new_parsed->username, offset_for_adjustment);

if (parsed.password.is_valid()) {

url_string.push_back(':');

}

- AppendFormattedComponent(

- spec, parsed.password, unescape_rules,

- &url_string, &new_parsed->password);

+ AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string,

+ &new_parsed->password, offset_for_adjustment);

if (parsed.username.is_valid() || parsed.password.is_valid()) {

url_string.push_back('@');

}

@@ -1381,39 +1455,56 @@

if (prefix_end)

*prefix_end = static_cast<size_t>(url_string.length());

- AppendFormattedHost(url, languages, &url_string, new_parsed);

+ AppendFormattedHost(url, languages, &url_string, new_parsed,

+ offset_for_adjustment);

// Port.

if (parsed.port.is_nonempty()) {

url_string.push_back(':');

- int begin = url_string.length();

- for (int i = parsed.port.begin; i < parsed.port.end(); ++i)

- url_string.push_back(spec[i]);

- new_parsed->port.begin = begin;

- new_parsed->port.len = url_string.length() - begin;

+ new_parsed->port.begin = url_string.length();

+ std::copy(spec.begin() + parsed.port.begin,

+ spec.begin() + parsed.port.end(), std::back_inserter(url_string));

+ new_parsed->port.len = url_string.length() - new_parsed->port.begin;

} else {

new_parsed->port.reset();

}

// Path and query both get the same general unescape & convert treatment.

- AppendFormattedComponent(

- spec, parsed.path, unescape_rules, &url_string,

- &new_parsed->path);

+ AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string,

+ &new_parsed->path, offset_for_adjustment);

if (parsed.query.is_valid())

url_string.push_back('?');

- AppendFormattedComponent(

- spec, parsed.query, unescape_rules, &url_string,

- &new_parsed->query);

+ AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string,

+ &new_parsed->query, offset_for_adjustment);

// Reference is stored in valid, unescaped UTF-8, so we can just convert.

if (parsed.ref.is_valid()) {

url_string.push_back('#');

- int begin = url_string.length();

- if (parsed.ref.len > 0)

- url_string.append(UTF8ToWide(std::string(&spec[parsed.ref.begin],

- parsed.ref.len)));

- new_parsed->ref.begin = begin;

- new_parsed->ref.len = url_string.length() - begin;

+ new_parsed->ref.begin = url_string.length();

+ size_t offset_past_current_output =

+ ((*offset_for_adjustment == std::wstring::npos) ||

+ (*offset_for_adjustment < url_string.length())) ?

+ std::wstring::npos : (*offset_for_adjustment - url_string.length());

+ size_t* offset_into_ref =

+ (offset_past_current_output >= static_cast<size_t>(parsed.ref.len)) ?

+ NULL : &offset_past_current_output;

+ if (parsed.ref.len > 0) {

+ url_string.append(UTF8ToWideAndAdjustOffset(spec.substr(parsed.ref.begin,

+ parsed.ref.len),

+ offset_into_ref));

+ }

+ new_parsed->ref.len = url_string.length() - new_parsed->ref.begin;

+ if (offset_into_ref) {

+ *offset_for_adjustment = (*offset_into_ref == std::wstring::npos) ?

+ std::wstring::npos : (new_parsed->ref.begin + *offset_into_ref);

+ } else if (offset_past_current_output != std::wstring::npos) {

+ // We clamped the offset near the beginning of this function to ensure it

+ // was within the input URL. If we reach here, the input was something

+ // invalid and non-parseable such that the offset was past any component

+ // we could figure out. In this case it won't be represented in the

+ // output string, so reset it.

+ *offset_for_adjustment = std::wstring::npos;

+ }

}

return url_string;

« no previous file with comments | « net/base/net_util.h ('k') | net/base/net_util_unittest.cc » ('j') | no next file with comments »