Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1099)

Side by Side Diff: src/url_canon_ip.cc

Issue 114050: url_canon: New CanonicalizeHostVerbose() function. (Closed) Base URL: http://google-url.googlecode.com/svn/trunk/
Patch Set: Address brettw's comments Created 11 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/url_canon_host.cc ('k') | src/url_canon_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2007, Google Inc. 1 // Copyright 2007, Google Inc.
2 // All rights reserved. 2 // All rights reserved.
3 // 3 //
4 // Redistribution and use in source and binary forms, with or without 4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are 5 // modification, are permitted provided that the following conditions are
6 // met: 6 // met:
7 // 7 //
8 // * Redistributions of source code must retain the above copyright 8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer. 9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above 10 // * Redistributions in binary form must reproduce the above
(...skipping 83 matching lines...) Expand 10 before | Expand all | Expand 10 after
94 return false; 94 return false;
95 } 95 }
96 } 96 }
97 97
98 // Fill in any unused components. 98 // Fill in any unused components.
99 while (cur_component < 4) 99 while (cur_component < 4)
100 components[cur_component++] = url_parse::Component(); 100 components[cur_component++] = url_parse::Component();
101 return true; 101 return true;
102 } 102 }
103 103
104 // Converts an IPv4 component to a 32-bit number, returning true on success. 104 // Converts an IPv4 component to a 32-bit number, while checking for overflow.
105 // False means that the number is invalid and that the input can not be an 105 //
106 // IP address. The number will be truncated to 32 bits. 106 // Possible return values:
107 // - IPV4 - The number was valid, and did not overflow.
108 // - BROKEN - The input was numeric, but too large for a 32-bit field.
109 // - NEUTRAL - Input was not numeric.
107 // 110 //
108 // The input is assumed to be ASCII. FindIPv4Components should have stripped 111 // The input is assumed to be ASCII. FindIPv4Components should have stripped
109 // out any input that is greater than 7 bits. The components are assumed 112 // out any input that is greater than 7 bits. The components are assumed
110 // to be non-empty. 113 // to be non-empty.
111 template<typename CHAR> 114 template<typename CHAR>
112 bool IPv4ComponentToNumber(const CHAR* spec, 115 CanonHostInfo::Family IPv4ComponentToNumber(
113 const url_parse::Component& component, 116 const CHAR* spec,
114 uint32_t* number) { 117 const url_parse::Component& component,
118 uint32_t* number) {
115 // Figure out the base 119 // Figure out the base
116 SharedCharTypes base; 120 SharedCharTypes base;
117 int base_prefix_len = 0; // Size of the prefix for this base. 121 int base_prefix_len = 0; // Size of the prefix for this base.
118 if (spec[component.begin] == '0') { 122 if (spec[component.begin] == '0') {
119 // Either hex or dec, or a standalone zero. 123 // Either hex or dec, or a standalone zero.
120 if (component.len == 1) { 124 if (component.len == 1) {
121 base = CHAR_DEC; 125 base = CHAR_DEC;
122 } else if (spec[component.begin + 1] == 'X' || 126 } else if (spec[component.begin + 1] == 'X' ||
123 spec[component.begin + 1] == 'x') { 127 spec[component.begin + 1] == 'x') {
124 base = CHAR_HEX; 128 base = CHAR_HEX;
125 base_prefix_len = 2; 129 base_prefix_len = 2;
126 } else { 130 } else {
127 base = CHAR_OCT; 131 base = CHAR_OCT;
128 base_prefix_len = 1; 132 base_prefix_len = 1;
129 } 133 }
130 } else { 134 } else {
131 base = CHAR_DEC; 135 base = CHAR_DEC;
132 } 136 }
133 137
134 // Reject any components that are too long. This is generous, Windows 138 // Extend the prefix to consume all leading zeros.
135 // allows at most 16 characters for the entire host name, and 12 per 139 while (base_prefix_len < component.len &&
136 // component, while Mac and Linux will take up to 10 per component. 140 spec[component.begin + base_prefix_len] == '0')
141 base_prefix_len++;
142
143 // Put the component, minus any base prefix, into a NULL-terminated buffer so
144 // we can call the standard library. Because leading zeros have already been
145 // discarded, filling the entire buffer is guaranteed to trigger the 32-bit
146 // overflow check.
137 const int kMaxComponentLen = 16; 147 const int kMaxComponentLen = 16;
138 if (component.len - base_prefix_len > kMaxComponentLen) 148 char buf[kMaxComponentLen + 1]; // digits + '\0'
139 return false;
140
141 // Put the component, minus any base prefix, to a NULL-terminated buffer so
142 // we can call the standard library. We know the input is 7-bit, so convert
143 // to narrow (if this is the wide version of the template) by casting.
144 char buf[kMaxComponentLen + 1];
145 int dest_i = 0; 149 int dest_i = 0;
146 for (int i = base_prefix_len; i < component.len; i++, dest_i++) { 150 for (int i = component.begin + base_prefix_len; i < component.end(); i++) {
147 char input = static_cast<char>(spec[component.begin + i]); 151 // We know the input is 7-bit, so convert to narrow (if this is the wide
152 // version of the template) by casting.
153 char input = static_cast<char>(spec[i]);
148 154
149 // Validate that this character is OK for the given base. 155 // Validate that this character is OK for the given base.
150 if (!IsCharOfType(input, base)) 156 if (!IsCharOfType(input, base))
151 return false; 157 return CanonHostInfo::NEUTRAL;
152 buf[dest_i] = input; 158
159 // Fill the buffer, if there's space remaining. This check allows us to
160 // verify that all characters are numeric, even those that don't fit.
161 if (dest_i < kMaxComponentLen)
162 buf[dest_i++] = input;
153 } 163 }
154 buf[dest_i] = 0; 164
165 buf[dest_i] = '\0';
155 166
156 // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal 167 // Use the 64-bit strtoi so we get a big number (no hex, decimal, or octal
157 // number can overflow a 64-bit number in <= 16 characters). Then cast to 168 // number can overflow a 64-bit number in <= 16 characters).
158 // truncate down to a 32-bit number. This may be further truncated later. 169 uint64_t num = _strtoui64(buf, NULL, BaseForType(base));
159 *number = static_cast<uint32_t>(_strtoui64(buf, NULL, BaseForType(base))); 170
160 return true; 171 // Check for 32-bit overflow.
172 if (num > UINT32_MAX)
173 return CanonHostInfo::BROKEN;
174
175 // No overflow. Success!
176 *number = static_cast<uint32_t>(num);
177 return CanonHostInfo::IPV4;
161 } 178 }
162 179
163 // Writes the given address (with each character representing one dotted 180 // Writes the given address (with each character representing one dotted
164 // part of an IPv4 address) to the output, and updating |*out_host| to 181 // part of an IPv4 address) to the output, and updating |*out_host| to
165 // identify the added portion. 182 // identify the added portion.
166 void AppendIPv4Address(const unsigned char address[4], 183 void AppendIPv4Address(const unsigned char address[4],
167 CanonOutput* output, 184 CanonOutput* output,
168 url_parse::Component* out_host) { 185 url_parse::Component* out_host) {
169 out_host->begin = output->length(); 186 out_host->begin = output->length();
170 for (int i = 0; i < 4; i++) { 187 for (int i = 0; i < 4; i++) {
171 char str[16]; 188 char str[16];
172 _itoa_s(address[i], str, 10); 189 _itoa_s(address[i], str, 10);
173 190
174 for (int ch = 0; str[ch] != 0; ch++) 191 for (int ch = 0; str[ch] != 0; ch++)
175 output->push_back(str[ch]); 192 output->push_back(str[ch]);
176 193
177 if (i != 3) 194 if (i != 3)
178 output->push_back('.'); 195 output->push_back('.');
179 } 196 }
180 out_host->len = output->length() - out_host->begin; 197 out_host->len = output->length() - out_host->begin;
181 } 198 }
182 199
183 // Converts an IPv4 address to a 32-bit number (network byte order), returning 200 // Converts an IPv4 address to a 32-bit number (network byte order).
184 // true on success. False means that the input is not a valid IPv4 address. 201 //
202 // Possible return values:
203 // IPV4 - IPv4 address was successfully parsed.
204 // BROKEN - Input was formatted like an IPv4 address, but overflow occurred
205 // during parsing.
206 // NEUTRAL - Input couldn't possibly be interpreted as an IPv4 address.
207 // It might be an IPv6 address, or a hostname.
208 //
209 // On success, |num_ipv4_components| will be populated with the number of
210 // components in the IPv4 address.
185 template<typename CHAR> 211 template<typename CHAR>
186 bool IPv4AddressToNumber(const CHAR* spec, 212 CanonHostInfo::Family IPv4AddressToNumber(const CHAR* spec,
187 const url_parse::Component& host, 213 const url_parse::Component& host,
188 unsigned char address[4]) { 214 unsigned char address[4],
215 int* num_ipv4_components) {
189 // The identified components. Not all may exist. 216 // The identified components. Not all may exist.
190 url_parse::Component components[4]; 217 url_parse::Component components[4];
191 if (!FindIPv4Components(spec, host, components)) 218 if (!FindIPv4Components(spec, host, components))
192 return false; 219 return CanonHostInfo::NEUTRAL;
193 220
194 // Convert existing components to digits. Values up to 221 // Convert existing components to digits. Values up to
195 // |existing_components| will be valid. 222 // |existing_components| will be valid.
196 uint32_t component_values[4]; 223 uint32_t component_values[4];
197 int existing_components = 0; 224 int existing_components = 0;
198 for (int i = 0; i < 4; i++) { 225 for (int i = 0; i < 4; i++) {
199 if (components[i].len <= 0) 226 if (components[i].len <= 0)
200 continue; 227 continue;
201 if (!IPv4ComponentToNumber(spec, components[i], 228 CanonHostInfo::Family family = IPv4ComponentToNumber(
202 &component_values[existing_components])) 229 spec, components[i], &component_values[existing_components]);
203 return false; 230
231 // Stop if we hit an invalid non-empty component.
232 if (family != CanonHostInfo::IPV4)
233 return family;
234
204 existing_components++; 235 existing_components++;
205 } 236 }
206 237
207 // Use that sequence of numbers to fill out the 4-component IP address. 238 // Use that sequence of numbers to fill out the 4-component IP address.
208 239
209 // ...first fill all but the last component by truncating to one byte. 240 // First, process all components but the last, while making sure each fits
210 for (int i = 0; i < existing_components - 1; i++) 241 // within an 8-bit field.
242 for (int i = 0; i < existing_components - 1; i++) {
243 if (component_values[i] > UINT8_MAX)
244 return CanonHostInfo::BROKEN;
211 address[i] = static_cast<unsigned char>(component_values[i]); 245 address[i] = static_cast<unsigned char>(component_values[i]);
246 }
212 247
213 // ...then fill out the rest of the bytes by filling them with the last 248 // Next, consume the last component to fill in the remaining bytes.
214 // component.
215 uint32_t last_value = component_values[existing_components - 1]; 249 uint32_t last_value = component_values[existing_components - 1];
216 if (existing_components == 1) 250 for (int i = 3; i >= existing_components - 1; i--) {
217 address[0] = (last_value & 0xFF000000) >> 24; 251 address[i] = static_cast<unsigned char>(last_value);
218 if (existing_components <= 2) 252 last_value >>= 8;
219 address[1] = (last_value & 0x00FF0000) >> 16; 253 }
220 if (existing_components <= 3)
221 address[2] = (last_value & 0x0000FF00) >> 8;
222 address[3] = last_value & 0xFF;
223 254
224 return true; 255 // If the last component has residual bits, report overflow.
256 if (last_value != 0)
257 return CanonHostInfo::BROKEN;
258
259 // Tell the caller how many components we saw.
260 *num_ipv4_components = existing_components;
261
262 // Success!
263 return CanonHostInfo::IPV4;
225 } 264 }
226 265
266 // Return true if we've made a final IPV4/BROKEN decision, false if the result
267 // is NEUTRAL, and we could use a second opinion.
227 template<typename CHAR, typename UCHAR> 268 template<typename CHAR, typename UCHAR>
228 bool DoCanonicalizeIPv4Address(const CHAR* spec, 269 bool DoCanonicalizeIPv4Address(const CHAR* spec,
229 const url_parse::Component& host, 270 const url_parse::Component& host,
230 CanonOutput* output, 271 CanonOutput* output,
231 url_parse::Component* out_host) { 272 CanonHostInfo* host_info) {
232 unsigned char address[4]; 273 unsigned char address[4];
233 if (!IPv4AddressToNumber<CHAR>(spec, host, address)) 274 host_info->family = IPv4AddressToNumber<CHAR>(
234 return false; 275 spec, host, address, &host_info->num_ipv4_components);
235 276
236 AppendIPv4Address(address, output, out_host); 277 switch (host_info->family) {
237 return true; 278 case CanonHostInfo::IPV4:
279 // Definitely an IPv4 address.
280 AppendIPv4Address(address, output, &host_info->out_host);
281 return true;
282 case CanonHostInfo::BROKEN:
283 // Definitely broken.
284 return true;
285 default:
286 // Could be IPv6 or a hostname.
287 return false;
288 }
238 } 289 }
239 290
240 // Helper class that describes the main components of an IPv6 input string. 291 // Helper class that describes the main components of an IPv6 input string.
241 // See the following examples to understand how it breaks up an input string: 292 // See the following examples to understand how it breaks up an input string:
242 // 293 //
243 // [Example 1]: input = "[::aa:bb]" 294 // [Example 1]: input = "[::aa:bb]"
244 // ==> num_hex_components = 2 295 // ==> num_hex_components = 2
245 // ==> hex_components[0] = Component(3,2) "aa" 296 // ==> hex_components[0] = Component(3,2) "aa"
246 // ==> hex_components[1] = Component(6,2) "bb" 297 // ==> hex_components[1] = Component(6,2) "bb"
247 // ==> index_of_contraction = 0 298 // ==> index_of_contraction = 0
(...skipping 251 matching lines...) Expand 10 before | Expand all | Expand 10 after
499 // "mapped" ==> 0:0:0:0:0:0000:<IPv4-literal> 550 // "mapped" ==> 0:0:0:0:0:0000:<IPv4-literal>
500 for (int j = 0; j < 10; ++j) { 551 for (int j = 0; j < 10; ++j) {
501 if (address[j] != 0) 552 if (address[j] != 0)
502 return false; 553 return false;
503 } 554 }
504 if (!((address[10] == 0 && address[11] == 0) || 555 if (!((address[10] == 0 && address[11] == 0) ||
505 (address[10] == 0xFF && address[11] == 0xFF))) 556 (address[10] == 0xFF && address[11] == 0xFF)))
506 return false; 557 return false;
507 558
508 // Append the 32-bit number to |address|. 559 // Append the 32-bit number to |address|.
509 if (!IPv4AddressToNumber(spec, 560 int ignored_num_ipv4_components;
510 ipv6_parsed.ipv4_component, 561 if (CanonHostInfo::IPV4 !=
511 &address[cur_index_in_address])) 562 IPv4AddressToNumber(spec,
563 ipv6_parsed.ipv4_component,
564 &address[cur_index_in_address],
565 &ignored_num_ipv4_components))
512 return false; 566 return false;
513 } 567 }
514 568
515 return true; 569 return true;
516 } 570 }
517 571
518 // Searches for the longest sequence of zeros in |address|, and writes the 572 // Searches for the longest sequence of zeros in |address|, and writes the
519 // range into |contraction_range|. The run of zeros must be at least 16 bits, 573 // range into |contraction_range|. The run of zeros must be at least 16 bits,
520 // and if there is a tie the first is chosen. 574 // and if there is a tie the first is chosen.
521 void ChooseIPv6ContractionRange(const unsigned char address[16], 575 void ChooseIPv6ContractionRange(const unsigned char address[16],
(...skipping 20 matching lines...) Expand all
542 // it is a candidate for the contraction. 596 // it is a candidate for the contraction.
543 if (cur_range.len > 2 && cur_range.len > max_range.len) { 597 if (cur_range.len > 2 && cur_range.len > max_range.len) {
544 max_range = cur_range; 598 max_range = cur_range;
545 } 599 }
546 cur_range.reset(); 600 cur_range.reset();
547 } 601 }
548 } 602 }
549 *contraction_range = max_range; 603 *contraction_range = max_range;
550 } 604 }
551 605
606 // Return true if we've made a final IPV6/BROKEN decision, false if the result
607 // is NEUTRAL, and we could use a second opinion.
552 template<typename CHAR, typename UCHAR> 608 template<typename CHAR, typename UCHAR>
553 bool DoCanonicalizeIPv6Address(const CHAR* spec, 609 bool DoCanonicalizeIPv6Address(const CHAR* spec,
554 const url_parse::Component& host, 610 const url_parse::Component& host,
555 CanonOutput* output, 611 CanonOutput* output,
556 url_parse::Component* out_host) { 612 CanonHostInfo* host_info) {
557 // Turn the IP address into a 128 bit number. 613 // Turn the IP address into a 128 bit number.
558 unsigned char address[16]; 614 unsigned char address[16];
559 if (!IPv6AddressToNumber<CHAR, UCHAR>(spec, host, address)) 615 if (!IPv6AddressToNumber<CHAR, UCHAR>(spec, host, address)) {
616 // If it's not an IPv6 address, scan for characters that should *only*
617 // exist in an IPv6 address.
618 for (int i = host.begin; i < host.end(); i++) {
619 switch (spec[i]) {
620 case '[':
621 case ']':
622 case ':':
623 host_info->family = CanonHostInfo::BROKEN;
624 return true;
625 }
626 }
627
628 // No invalid characters. Could still be IPv4 or a hostname.
629 host_info->family = CanonHostInfo::NEUTRAL;
560 return false; 630 return false;
631 }
561 632
562 out_host->begin = output->length(); 633 host_info->out_host.begin = output->length();
563 output->push_back('['); 634 output->push_back('[');
564 635
565 // We will now output the address according to the rules in: 636 // We will now output the address according to the rules in:
566 // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4 637 // http://tools.ietf.org/html/draft-kawamura-ipv6-text-representation-01#section-4
567 638
568 // Start by finding where to place the "::" contraction (if any). 639 // Start by finding where to place the "::" contraction (if any).
569 url_parse::Component contraction_range; 640 url_parse::Component contraction_range;
570 ChooseIPv6ContractionRange(address, &contraction_range); 641 ChooseIPv6ContractionRange(address, &contraction_range);
571 642
572 for (int i = 0; i < 16;) { 643 for (int i = 0; i < 16;) {
(...skipping 15 matching lines...) Expand all
588 for (int ch = 0; str[ch] != 0; ++ch) 659 for (int ch = 0; str[ch] != 0; ++ch)
589 output->push_back(str[ch]); 660 output->push_back(str[ch]);
590 661
591 // Put a colon after each number, except the last. 662 // Put a colon after each number, except the last.
592 if (i < 16) 663 if (i < 16)
593 output->push_back(':'); 664 output->push_back(':');
594 } 665 }
595 } 666 }
596 667
597 output->push_back(']'); 668 output->push_back(']');
598 out_host->len = output->length() - out_host->begin; 669 host_info->out_host.len = output->length() - host_info->out_host.begin;
599 670
671 host_info->family = CanonHostInfo::IPV6;
600 return true; 672 return true;
601 } 673 }
602 674
603 } // namespace 675 } // namespace
604 676
605 bool FindIPv4Components(const char* spec, 677 bool FindIPv4Components(const char* spec,
606 const url_parse::Component& host, 678 const url_parse::Component& host,
607 url_parse::Component components[4]) { 679 url_parse::Component components[4]) {
608 return DoFindIPv4Components<char, unsigned char>(spec, host, components); 680 return DoFindIPv4Components<char, unsigned char>(spec, host, components);
609 } 681 }
610 682
611 bool FindIPv4Components(const char16* spec, 683 bool FindIPv4Components(const char16* spec,
612 const url_parse::Component& host, 684 const url_parse::Component& host,
613 url_parse::Component components[4]) { 685 url_parse::Component components[4]) {
614 return DoFindIPv4Components<char16, char16>(spec, host, components); 686 return DoFindIPv4Components<char16, char16>(spec, host, components);
615 } 687 }
616 688
617 bool CanonicalizeIPAddress(const char* spec, 689 void CanonicalizeIPAddress(const char* spec,
618 const url_parse::Component& host, 690 const url_parse::Component& host,
619 CanonOutput* output, 691 CanonOutput* output,
620 url_parse::Component* out_host) { 692 CanonHostInfo* host_info) {
621 return 693 if (DoCanonicalizeIPv4Address<char, unsigned char>(
622 DoCanonicalizeIPv4Address<char, unsigned char>( 694 spec, host, output, host_info))
623 spec, host, output, out_host) || 695 return;
624 DoCanonicalizeIPv6Address<char, unsigned char>( 696 if (DoCanonicalizeIPv6Address<char, unsigned char>(
625 spec, host, output, out_host); 697 spec, host, output, host_info))
698 return;
626 } 699 }
627 700
628 bool CanonicalizeIPAddress(const char16* spec, 701 void CanonicalizeIPAddress(const char16* spec,
629 const url_parse::Component& host, 702 const url_parse::Component& host,
630 CanonOutput* output, 703 CanonOutput* output,
631 url_parse::Component* out_host) { 704 CanonHostInfo* host_info) {
632 return 705 if (DoCanonicalizeIPv4Address<char16, char16>(
633 DoCanonicalizeIPv4Address<char16, char16>( 706 spec, host, output, host_info))
634 spec, host, output, out_host) || 707 return;
635 DoCanonicalizeIPv6Address<char16, char16>( 708 if (DoCanonicalizeIPv6Address<char16, char16>(
636 spec, host, output, out_host); 709 spec, host, output, host_info))
710 return;
637 } 711 }
638 712
639 } // namespace url_canon 713 } // namespace url_canon
OLDNEW
« no previous file with comments | « src/url_canon_host.cc ('k') | src/url_canon_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698