Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(119)

Side by Side Diff: src/url_canon_host.cc

Issue 114050: url_canon: New CanonicalizeHostVerbose() function. (Closed) Base URL: http://google-url.googlecode.com/svn/trunk/
Patch Set: Address brettw's comments Created 11 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « src/url_canon.h ('k') | src/url_canon_ip.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2007, Google Inc. 1 // Copyright 2007, Google Inc.
2 // All rights reserved. 2 // All rights reserved.
3 // 3 //
4 // Redistribution and use in source and binary forms, with or without 4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are 5 // modification, are permitted provided that the following conditions are
6 // met: 6 // met:
7 // 7 //
8 // * Redistributions of source code must retain the above copyright 8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer. 9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above 10 // * Redistributions in binary form must reproduce the above
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
75 // based on how many times you run the canonicalizer. We prefer to always report 75 // based on how many times you run the canonicalizer. We prefer to always report
76 // the same validity, so reject this. 76 // the same validity, so reject this.
77 const unsigned char kEsc = 0xff; 77 const unsigned char kEsc = 0xff;
78 const unsigned char kHostCharLookup[0x80] = { 78 const unsigned char kHostCharLookup[0x80] = {
79 // 00-1f: all are invalid 79 // 00-1f: all are invalid
80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
82 // ' ' ! " # $ % & ' ( ) * + , - . / 82 // ' ' ! " # $ % & ' ( ) * + , - . /
83 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0, 83 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,
84 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 84 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
85 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 ,kEsc,kEsc,kEsc, 0 , 85 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 ,
86 // @ A B C D E F G H I J K L M N O 86 // @ A B C D E F G H I J K L M N O
87 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 87 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
88 // P Q R S T U V W X Y Z [ \ ] ^ _ 88 // P Q R S T U V W X Y Z [ \ ] ^ _
89 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_', 89 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',
90 // ` a b c d e f g h i j k l m n o 90 // ` a b c d e f g h i j k l m n o
91 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 91 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
92 // p q r s t u v w x y z { | } ~ 92 // p q r s t u v w x y z { | } ~
93 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 }; 93 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };
94 94
95 const int kTempHostBufferLen = 1024; 95 const int kTempHostBufferLen = 1024;
(...skipping 10 matching lines...) Expand all
106 *has_non_ascii = false; 106 *has_non_ascii = false;
107 *has_escaped = false; 107 *has_escaped = false;
108 for (int i = host.begin; i < end; i++) { 108 for (int i = host.begin; i < end; i++) {
109 if (static_cast<UCHAR>(spec[i]) >= 0x80) 109 if (static_cast<UCHAR>(spec[i]) >= 0x80)
110 *has_non_ascii = true; 110 *has_non_ascii = true;
111 else if (spec[i] == '%') 111 else if (spec[i] == '%')
112 *has_escaped = true; 112 *has_escaped = true;
113 } 113 }
114 } 114 }
115 115
116 // Considers the current contents of the output and sees if it looks like an
117 // IP address. This is called because we canonicalize to the output assuming
118 // that it's not an IP address, and now need to fix it if we produced one.
119 //
120 // The generated hostname is identified by |host|. The output will be fixed
121 // with a canonical IP address if the host looks like one. Otherwise, there
122 // will be no change.
123 void InterpretIPAddress(const url_parse::Component& host,
124 CanonOutput* output) {
125 // Canonicalize the IP address in the output to this temporary buffer.
126 // IP addresses are small, so this should not cause an allocation.
127 RawCanonOutput<64> canon_ip;
128 url_parse::Component out_host; // Unused.
129 if (CanonicalizeIPAddress(output->data(), host, &canon_ip, &out_host)) {
130 // Looks like an IP address, overwrite the existing host with the newly
131 // canonicalized IP address.
132 output->set_length(host.begin);
133 output->Append(canon_ip.data(), canon_ip.length());
134 }
135 }
136
137 // Canonicalizes a host name that is entirely 8-bit characters (even though 116 // Canonicalizes a host name that is entirely 8-bit characters (even though
138 // the type holding them may be 16 bits). Escaped characters will be unescaped. 117 // the type holding them may be 16 bits). Escaped characters will be unescaped.
139 // Non-7-bit characters (for example, UTF-8) will be passed unchanged. 118 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
140 // 119 //
141 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in 120 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
142 // the output. 121 // the output.
143 // 122 //
144 // This function is used in two situations: 123 // This function is used in two situations:
145 // 124 //
146 // * When the caller knows there is no non-ASCII or percent escaped 125 // * When the caller knows there is no non-ASCII or percent escaped
147 // characters. This is what DoHost does. The result will be a completely 126 // characters. This is what DoHost does. The result will be a completely
148 // canonicalized host since we know nothing weird can happen (escaped 127 // canonicalized host since we know nothing weird can happen (escaped
149 // characters could be unescaped to non-7-bit, so they have to be treated 128 // characters could be unescaped to non-7-bit, so they have to be treated
150 // with suspicion at this point). It does not use the |has_non_ascii| flag. 129 // with suspicion at this point). It does not use the |has_non_ascii| flag.
151 // 130 //
152 // * When the caller has an 8-bit string that may need unescaping. 131 // * When the caller has an 8-bit string that may need unescaping.
153 // DoComplexHost calls us in this situation to do unescaping and validation. 132 // DoComplexHost calls us in this situation to do unescaping and validation.
154 // After this, it may do other IDN operations depending on the value of the 133 // After this, it may do other IDN operations depending on the value of the
155 // |*has_non_ascii| flag. 134 // |*has_non_ascii| flag.
156 // 135 //
157 // The return value indicates if the output is a potentially valid host name. 136 // The return value indicates if the output is a potentially valid host name.
158 template<typename CHAR> 137 template<typename CHAR>
159 bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output, 138 bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output,
160 bool* has_non_ascii) { 139 bool* has_non_ascii) {
161 *has_non_ascii = false; 140 *has_non_ascii = false;
162 141
163 // First check if the host name is an IP address.
164 url_parse::Component out_ip; // Unused: we compute the size ourselves later.
165 if (CanonicalizeIPAddress(host, url_parse::Component(0, host_len),
166 output, &out_ip))
167 return true;
168
169 bool success = true; 142 bool success = true;
170 for (int i = 0; i < host_len; i++) { 143 for (int i = 0; i < host_len; i++) {
171 unsigned char source = static_cast<unsigned char>(host[i]); 144 unsigned char source = static_cast<unsigned char>(host[i]);
172 if (source == '%') { 145 if (source == '%') {
173 // Handle unescaping. This will replace |source| with the unescaped char. 146 // Handle unescaping. This will replace |source| with the unescaped char.
174 if (!DecodeEscaped(host, &i, host_len, &source)) { 147 if (!DecodeEscaped(host, &i, host_len, &source)) {
175 // Invalid escaped character. There is nothing that can make this 148 // Invalid escaped character. There is nothing that can make this
176 // host valid. We append an escaped percent so the URL looks reasonable 149 // host valid. We append an escaped percent so the URL looks reasonable
177 // and mark as failed. 150 // and mark as failed.
178 AppendEscapedChar('%', output); 151 AppendEscapedChar('%', output);
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
248 // unescaped input requires IDN. 221 // unescaped input requires IDN.
249 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) { 222 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
250 // Error with some escape sequence. We'll call the current output 223 // Error with some escape sequence. We'll call the current output
251 // complete. DoSimpleHost will have written some "reasonable" output. 224 // complete. DoSimpleHost will have written some "reasonable" output.
252 return false; 225 return false;
253 } 226 }
254 227
255 // Unescaping may have left us with ASCII input, in which case the 228 // Unescaping may have left us with ASCII input, in which case the
256 // unescaped version we wrote to output is complete. 229 // unescaped version we wrote to output is complete.
257 if (!has_non_ascii) { 230 if (!has_non_ascii) {
258 // Need to be sure to check for IP addresses in the newly unescaped
259 // output. This will fix the output if necessary.
260 InterpretIPAddress(url_parse::MakeRange(begin_length, output->length()),
261 output);
262 return true; 231 return true;
263 } 232 }
264 233
265 // Save the pointer into the data that was just converted (it may be appended to 234 // Save the pointer into the data that was just converted (it may be appended to
266 // other data in the output buffer). 235 // other data in the output buffer).
267 utf8_source = &output->data()[begin_length]; 236 utf8_source = &output->data()[begin_length];
268 utf8_source_len = output->length() - begin_length; 237 utf8_source_len = output->length() - begin_length;
269 } else { 238 } else {
270 // We don't need to unescape, use input for IDNization later. (We know the 239 // We don't need to unescape, use input for IDNization later. (We know the
271 // input has non-ASCII, or the simple version would have been called 240 // input has non-ASCII, or the simple version would have been called
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
321 } 290 }
322 291
323 // No unescaping necessary, we can safely pass the input to ICU. This 292 // No unescaping necessary, we can safely pass the input to ICU. This
324 // function will only get called if we either have escaped or non-ascii 293 // function will only get called if we either have escaped or non-ascii
325 // input, so it's safe to just use ICU now. Even if the input is ASCII, 294 // input, so it's safe to just use ICU now. Even if the input is ASCII,
326 // this function will do the right thing (just slower than we could). 295 // this function will do the right thing (just slower than we could).
327 return DoIDNHost(host, host_len, output); 296 return DoIDNHost(host, host_len, output);
328 } 297 }
329 298
330 template<typename CHAR, typename UCHAR> 299 template<typename CHAR, typename UCHAR>
331 bool DoHost(const CHAR* spec, 300 void DoHost(const CHAR* spec,
332 const url_parse::Component& host, 301 const url_parse::Component& host,
333 CanonOutput* output, 302 CanonOutput* output,
334 url_parse::Component* out_host) { 303 CanonHostInfo* host_info) {
335 bool success = true;
336 if (host.len <= 0) { 304 if (host.len <= 0) {
337 // Empty hosts don't need anything. 305 // Empty hosts don't need anything.
338 *out_host = url_parse::Component(); 306 host_info->family = CanonHostInfo::NEUTRAL;
339 return true; 307 host_info->out_host = url_parse::Component();
308 return;
340 } 309 }
341 310
342 bool has_non_ascii, has_escaped; 311 bool has_non_ascii, has_escaped;
343 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped); 312 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
344 313
345 out_host->begin = output->length(); 314 // Keep track of output's initial length, so we can rewind later.
315 const int output_begin = output->length();
346 316
317 bool success;
347 if (!has_non_ascii && !has_escaped) { 318 if (!has_non_ascii && !has_escaped) {
348 success &= DoSimpleHost(&spec[host.begin], host.len, 319 success = DoSimpleHost(&spec[host.begin], host.len,
349 output, &has_non_ascii); 320 output, &has_non_ascii);
350 DCHECK(!has_non_ascii); 321 DCHECK(!has_non_ascii);
351 } else { 322 } else {
352 success &= DoComplexHost(&spec[host.begin], host.len, 323 success = DoComplexHost(&spec[host.begin], host.len,
353 has_non_ascii, has_escaped, output); 324 has_non_ascii, has_escaped, output);
354 // We could have had escaped numerals that should now be canonicalized as
355 // an IP address. This should be exceedingly rare, it's probably mostly
356 // used by scammers.
357 } 325 }
358 326
359 out_host->len = output->length() - out_host->begin; 327 if (!success) {
360 return success; 328 // Canonicalization failed. Set BROKEN to notify the caller.
329 host_info->family = CanonHostInfo::BROKEN;
330 } else {
331 // After all the other canonicalization, check if we ended up with an IP
332 // address. IP addresses are small, so writing into this temporary buffer
333 // should not cause an allocation.
334 RawCanonOutput<64> canon_ip;
335 CanonicalizeIPAddress(output->data(),
336 url_parse::MakeRange(output_begin, output->length()),
337 &canon_ip, host_info);
338
339 // If we got an IPv4/IPv6 address, copy the canonical form back to the
340 // real buffer. Otherwise, it's a hostname or broken IP, in which case
341 // we just leave it in place.
342 if (host_info->IsIPAddress()) {
343 output->set_length(output_begin);
344 output->Append(canon_ip.data(), canon_ip.length());
345 }
346 }
347
348 host_info->out_host = url_parse::MakeRange(output_begin, output->length());
361 } 349 }
362 350
363 } // namespace 351 } // namespace
364 352
365 bool CanonicalizeHost(const char* spec, 353 bool CanonicalizeHost(const char* spec,
366 const url_parse::Component& host, 354 const url_parse::Component& host,
367 CanonOutput* output, 355 CanonOutput* output,
368 url_parse::Component* out_host) { 356 url_parse::Component* out_host) {
369 return DoHost<char, unsigned char>(spec, host, output, out_host); 357 CanonHostInfo host_info;
358 DoHost<char, unsigned char>(spec, host, output, &host_info);
359 *out_host = host_info.out_host;
360 return (host_info.family != CanonHostInfo::BROKEN);
370 } 361 }
371 362
372 bool CanonicalizeHost(const char16* spec, 363 bool CanonicalizeHost(const char16* spec,
373 const url_parse::Component& host, 364 const url_parse::Component& host,
374 CanonOutput* output, 365 CanonOutput* output,
375 url_parse::Component* out_host) { 366 url_parse::Component* out_host) {
376 return DoHost<char16, char16>(spec, host, output, out_host); 367 CanonHostInfo host_info;
368 DoHost<char16, char16>(spec, host, output, &host_info);
369 *out_host = host_info.out_host;
370 return (host_info.family != CanonHostInfo::BROKEN);
371 }
372
373 void CanonicalizeHostVerbose(const char* spec,
374 const url_parse::Component& host,
375 CanonOutput* output,
376 CanonHostInfo *host_info) {
377 DoHost<char, unsigned char>(spec, host, output, host_info);
378 }
379
380 void CanonicalizeHostVerbose(const char16* spec,
381 const url_parse::Component& host,
382 CanonOutput* output,
383 CanonHostInfo *host_info) {
384 DoHost<char16, char16>(spec, host, output, host_info);
377 } 385 }
378 386
379 } // namespace url_canon 387 } // namespace url_canon
OLDNEW
« no previous file with comments | « src/url_canon.h ('k') | src/url_canon_ip.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698