Index: src/url_canon_ip.cc |
=================================================================== |
--- src/url_canon_ip.cc (revision 106) |
+++ src/url_canon_ip.cc (working copy) |
@@ -101,17 +101,21 @@ |
return true; |
} |
-// Converts an IPv4 component to a 32-bit number, returning true on success. |
-// False means that the number is invalid and that the input can not be an |
-// IP address. The number will be truncated to 32 bits. |
+// Converts an IPv4 component to a 32-bit number, while checking for overflow. |
// |
+// Possible return values: |
+// - IPV4 - The number was valid, and did not overflow. |
+// - BROKEN - The input was numeric, but too large for a 32-bit field. |
+// - NEUTRAL - Input was not numeric. |
+// |
// The input is assumed to be ASCII. FindIPv4Components should have stripped |
// out any input that is greater than 7 bits. The components are assumed |
// to be non-empty. |
template<typename CHAR> |
-bool IPv4ComponentToNumber(const CHAR* spec, |
- const url_parse::Component& component, |
- uint32_t* number) { |
+CanonHostInfo::Family IPv4ComponentToNumber( |
+ const CHAR* spec, |
+ const url_parse::Component& component, |
+ uint32_t* number) { |
// Figure out the base |
SharedCharTypes base; |
int base_prefix_len = 0; // Size of the prefix for this base. |
@@ -131,33 +135,46 @@ |
base = CHAR_DEC; |
} |
- // Reject any components that are too long. This is generous, Windows |
- // allows at most 16 characters for the entire host name, and 12 per |
- // component, while Mac and Linux will take up to 10 per component. |
+ // Extend the prefix to consume all leading zeros. |
+ while (base_prefix_len < component.len && |
+ spec[component.begin + base_prefix_len] == '0') |
+ base_prefix_len++; |
+ |
+ // Put the component, minus any base prefix, into a NULL-terminated buffer so |
+ // we can call the standard library. Because leading zeros have already been |
+ // discarded, filling the entire buffer is guaranteed to trigger the 32-bit |
+ // overflow check. |
const int kMaxComponentLen = 16; |
- if (component.len - base_prefix_len > kMaxComponentLen) |
- return false; |
- |
- // Put the component, minus any base prefix, to a NULL-terminated buffer so |
- // we can call the standard library. We know the input is 7-bit, so convert |
- // to narrow (if this is the wide version of the template) by casting. |
- char buf[kMaxComponentLen + 1]; |
+ char buf[kMaxComponentLen + 1]; // digits + '\0' |
int dest_i = 0; |
- for (int i = base_prefix_len; i < component.len; i++, dest_i++) { |
- char input = static_cast<char>(spec[component.begin + i]); |
+ for (int i = component.begin + base_prefix_len; i < component.end(); i++) { |
+ // We know the input is 7-bit, so convert to narrow (if this is the wide |
+ // version of the template) by casting. |
+ char input = static_cast<char>(spec[i]); |
// Validate that this character is OK for the given base. |
if (!IsCharOfType(input, base)) |
- return false; |
- buf[dest_i] = input; |
+ return CanonHostInfo::NEUTRAL; |
+ |
+ // Fill the buffer, if there's space remaining. This check allows us to |
+ // verify that all characters are numeric, even those that don't fit. |
+ if (dest_i < kMaxComponentLen) |
+ buf[dest_i++] = input; |
} |
- buf[dest_i] = 0; |
+ buf[dest_i] = '\0'; |
+ |
// Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal |
- // number can overflow a 64-bit number in <= 16 characters). Then cast to |
- // truncate down to a 32-bit number. This may be further truncated later. |
- *number = static_cast<uint32_t>(_strtoui64(buf, NULL, BaseForType(base))); |
- return true; |
+ // number can overflow a 64-bit number in <= 16 characters). |
+ uint64_t num = _strtoui64(buf, NULL, BaseForType(base)); |
+ |
+ // Check for 32-bit overflow. |
+ if (num > UINT32_MAX) |
+ return CanonHostInfo::BROKEN; |
+ |
+ // No overflow. Success! |
+ *number = static_cast<uint32_t>(num); |
+ return CanonHostInfo::IPV4; |
} |
// Writes the given address (with each character representing one dotted |
@@ -180,16 +197,26 @@ |
out_host->len = output->length() - out_host->begin; |
} |
-// Converts an IPv4 address to a 32-bit number (network byte order), returning |
-// true on success. False means that the input is not a valid IPv4 address. |
+// Converts an IPv4 address to a 32-bit number (network byte order). |
+// |
+// Possible return values: |
+// IPV4 - IPv4 address was successfully parsed. |
+// BROKEN - Input was formatted like an IPv4 address, but overflow occurred |
+// during parsing. |
+// NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address. |
+// It might be an IPv6 address, or a hostname. |
+// |
+// On success, |num_ipv4_components| will be populated with the number of |
+// components in the IPv4 address. |
template<typename CHAR> |
-bool IPv4AddressToNumber(const CHAR* spec, |
- const url_parse::Component& host, |
- unsigned char address[4]) { |
+CanonHostInfo::Family IPv4AddressToNumber(const CHAR* spec, |
+ const url_parse::Component& host, |
+ unsigned char address[4], |
+ int* num_ipv4_components) { |
// The identified components. Not all may exist. |
url_parse::Component components[4]; |
if (!FindIPv4Components(spec, host, components)) |
- return false; |
+ return CanonHostInfo::NEUTRAL; |
// Convert existing components to digits. Values up to |
// |existing_components| will be valid. |
@@ -198,43 +225,67 @@ |
for (int i = 0; i < 4; i++) { |
if (components[i].len <= 0) |
continue; |
- if (!IPv4ComponentToNumber(spec, components[i], |
- &component_values[existing_components])) |
- return false; |
+ CanonHostInfo::Family family = IPv4ComponentToNumber( |
+ spec, components[i], &component_values[existing_components]); |
+ |
+ // Stop if we hit an invalid non-empty component. |
+ if (family != CanonHostInfo::IPV4) |
+ return family; |
+ |
existing_components++; |
} |
// Use that sequence of numbers to fill out the 4-component IP address. |
- // ...first fill all but the last component by truncating to one byte. |
- for (int i = 0; i < existing_components - 1; i++) |
+ // First, process all components but the last, while making sure each fits |
+ // within an 8-bit field. |
+ for (int i = 0; i < existing_components - 1; i++) { |
+ if (component_values[i] > UINT8_MAX) |
+ return CanonHostInfo::BROKEN; |
address[i] = static_cast<unsigned char>(component_values[i]); |
+ } |
- // ...then fill out the rest of the bytes by filling them with the last |
- // component. |
+ // Next, consume the last component to fill in the remaining bytes. |
uint32_t last_value = component_values[existing_components - 1]; |
- if (existing_components == 1) |
- address[0] = (last_value & 0xFF000000) >> 24; |
- if (existing_components <= 2) |
- address[1] = (last_value & 0x00FF0000) >> 16; |
- if (existing_components <= 3) |
- address[2] = (last_value & 0x0000FF00) >> 8; |
- address[3] = last_value & 0xFF; |
+ for (int i = 3; i >= existing_components - 1; i--) { |
+ address[i] = static_cast<unsigned char>(last_value); |
+ last_value >>= 8; |
+ } |
- return true; |
+ // If the last component has residual bits, report overflow. |
+ if (last_value != 0) |
+ return CanonHostInfo::BROKEN; |
+ |
+ // Tell the caller how many components we saw. |
+ *num_ipv4_components = existing_components; |
+ |
+ // Success! |
+ return CanonHostInfo::IPV4; |
} |
+// Return true if we've made a final IPV4/BROKEN decision, false if the result |
+// is NEUTRAL, and we could use a second opinion. |
template<typename CHAR, typename UCHAR> |
bool DoCanonicalizeIPv4Address(const CHAR* spec, |
const url_parse::Component& host, |
CanonOutput* output, |
- url_parse::Component* out_host) { |
+ CanonHostInfo* host_info) { |
unsigned char address[4]; |
- if (!IPv4AddressToNumber<CHAR>(spec, host, address)) |
- return false; |
+ host_info->family = IPv4AddressToNumber<CHAR>( |
+ spec, host, address, &host_info->num_ipv4_components); |
- AppendIPv4Address(address, output, out_host); |
- return true; |
+ switch (host_info->family) { |
+ case CanonHostInfo::IPV4: |
+ // Definitely an IPv4 address. |
+ AppendIPv4Address(address, output, &host_info->out_host); |
+ return true; |
+ case CanonHostInfo::BROKEN: |
+ // Definitely broken. |
+ return true; |
+ default: |
+ // Could be IPv6 or a hostname. |
+ return false; |
+ } |
} |
// Helper class that describes the main components of an IPv6 input string. |
@@ -506,9 +557,12 @@ |
return false; |
// Append the 32-bit number to |address|. |
- if (!IPv4AddressToNumber(spec, |
- ipv6_parsed.ipv4_component, |
- &address[cur_index_in_address])) |
+ int ignored_num_ipv4_components; |
+ if (CanonHostInfo::IPV4 != |
+ IPv4AddressToNumber(spec, |
+ ipv6_parsed.ipv4_component, |
+ &address[cur_index_in_address], |
+ &ignored_num_ipv4_components)) |
return false; |
} |
@@ -549,17 +603,34 @@ |
*contraction_range = max_range; |
} |
+// Return true if we've made a final IPV6/BROKEN decision, false if the result |
+// is NEUTRAL, and we could use a second opinion. |
template<typename CHAR, typename UCHAR> |
bool DoCanonicalizeIPv6Address(const CHAR* spec, |
const url_parse::Component& host, |
CanonOutput* output, |
- url_parse::Component* out_host) { |
+ CanonHostInfo* host_info) { |
// Turn the IP address into a 128 bit number. |
unsigned char address[16]; |
- if (!IPv6AddressToNumber<CHAR, UCHAR>(spec, host, address)) |
+ if (!IPv6AddressToNumber<CHAR, UCHAR>(spec, host, address)) { |
+ // If it's not an IPv6 address, scan for characters that should *only* |
+ // exist in an IPv6 address. |
+ for (int i = host.begin; i < host.end(); i++) { |
+ switch (spec[i]) { |
+ case '[': |
+ case ']': |
+ case ':': |
+ host_info->family = CanonHostInfo::BROKEN; |
+ return true; |
+ } |
+ } |
+ |
+ // No invalid characters. Could still be IPv4 or a hostname. |
+ host_info->family = CanonHostInfo::NEUTRAL; |
return false; |
+ } |
- out_host->begin = output->length(); |
+ host_info->out_host.begin = output->length(); |
output->push_back('['); |
// We will now output the address according to the rules in: |
@@ -595,8 +666,9 @@ |
} |
output->push_back(']'); |
- out_host->len = output->length() - out_host->begin; |
+ host_info->out_host.len = output->length() - host_info->out_host.begin; |
+ host_info->family = CanonHostInfo::IPV6; |
return true; |
} |
@@ -614,26 +686,28 @@ |
return DoFindIPv4Components<char16, char16>(spec, host, components); |
} |
-bool CanonicalizeIPAddress(const char* spec, |
+void CanonicalizeIPAddress(const char* spec, |
const url_parse::Component& host, |
CanonOutput* output, |
- url_parse::Component* out_host) { |
- return |
- DoCanonicalizeIPv4Address<char, unsigned char>( |
- spec, host, output, out_host) || |
- DoCanonicalizeIPv6Address<char, unsigned char>( |
- spec, host, output, out_host); |
+ CanonHostInfo* host_info) { |
+ if (DoCanonicalizeIPv4Address<char, unsigned char>( |
+ spec, host, output, host_info)) |
+ return; |
+ if (DoCanonicalizeIPv6Address<char, unsigned char>( |
+ spec, host, output, host_info)) |
+ return; |
} |
-bool CanonicalizeIPAddress(const char16* spec, |
+void CanonicalizeIPAddress(const char16* spec, |
const url_parse::Component& host, |
CanonOutput* output, |
- url_parse::Component* out_host) { |
- return |
- DoCanonicalizeIPv4Address<char16, char16>( |
- spec, host, output, out_host) || |
- DoCanonicalizeIPv6Address<char16, char16>( |
- spec, host, output, out_host); |
+ CanonHostInfo* host_info) { |
+ if (DoCanonicalizeIPv4Address<char16, char16>( |
+ spec, host, output, host_info)) |
+ return; |
+ if (DoCanonicalizeIPv6Address<char16, char16>( |
+ spec, host, output, host_info)) |
+ return; |
} |
} // namespace url_canon |