OLD | NEW |
1 // Copyright 2007, Google Inc. | 1 // Copyright 2007, Google Inc. |
2 // All rights reserved. | 2 // All rights reserved. |
3 // | 3 // |
4 // Redistribution and use in source and binary forms, with or without | 4 // Redistribution and use in source and binary forms, with or without |
5 // modification, are permitted provided that the following conditions are | 5 // modification, are permitted provided that the following conditions are |
6 // met: | 6 // met: |
7 // | 7 // |
8 // * Redistributions of source code must retain the above copyright | 8 // * Redistributions of source code must retain the above copyright |
9 // notice, this list of conditions and the following disclaimer. | 9 // notice, this list of conditions and the following disclaimer. |
10 // * Redistributions in binary form must reproduce the above | 10 // * Redistributions in binary form must reproduce the above |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
75 // based on how many times you run the canonicalizer. We prefer to always report | 75 // based on how many times you run the canonicalizer. We prefer to always report |
76 // the same validity, so reject this. | 76 // the same validity, so reject this. |
77 const unsigned char kEsc = 0xff; | 77 const unsigned char kEsc = 0xff; |
78 const unsigned char kHostCharLookup[0x80] = { | 78 const unsigned char kHostCharLookup[0x80] = { |
79 // 00-1f: all are invalid | 79 // 00-1f: all are invalid |
80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, | 80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, |
81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, | 81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, |
82 // ' ' ! " # $ % & ' ( ) * + , - .
/ | 82 // ' ' ! " # $ % & ' ( ) * + , - .
/ |
83 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',
0, | 83 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',
0, |
84 // 0 1 2 3 4 5 6 7 8 9 : ; < = >
? | 84 // 0 1 2 3 4 5 6 7 8 9 : ; < = >
? |
85 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 ,kEsc,kEsc,kEsc,
0 , | 85 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc,
0 , |
86 // @ A B C D E F G H I J K L M N
O | 86 // @ A B C D E F G H I J K L M N
O |
87 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', '
o', | 87 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', '
o', |
88 // P Q R S T U V W X Y Z [ \ ] ^
_ | 88 // P Q R S T U V W X Y Z [ \ ] ^
_ |
89 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '
_', | 89 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '
_', |
90 // ` a b c d e f g h i j k l m n
o | 90 // ` a b c d e f g h i j k l m n
o |
91 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', '
o', | 91 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', '
o', |
92 // p q r s t u v w x y z { | } ~ | 92 // p q r s t u v w x y z { | } ~ |
93 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 ,
0 }; | 93 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 ,
0 }; |
94 | 94 |
95 const int kTempHostBufferLen = 1024; | 95 const int kTempHostBufferLen = 1024; |
(...skipping 10 matching lines...) Expand all Loading... |
106 *has_non_ascii = false; | 106 *has_non_ascii = false; |
107 *has_escaped = false; | 107 *has_escaped = false; |
108 for (int i = host.begin; i < end; i++) { | 108 for (int i = host.begin; i < end; i++) { |
109 if (static_cast<UCHAR>(spec[i]) >= 0x80) | 109 if (static_cast<UCHAR>(spec[i]) >= 0x80) |
110 *has_non_ascii = true; | 110 *has_non_ascii = true; |
111 else if (spec[i] == '%') | 111 else if (spec[i] == '%') |
112 *has_escaped = true; | 112 *has_escaped = true; |
113 } | 113 } |
114 } | 114 } |
115 | 115 |
116 // Considers the current contents of the output and sees if it looks like an | |
117 // IP address. This is called because we canonicalize to the output assuming | |
118 // that it's not an IP address, and now need to fix it if we produced one. | |
119 // | |
120 // The generated hostname is identified by |host|. The output will be fixed | |
121 // with a canonical IP address if the host looks like one. Otherwise, there | |
122 // will be no change. | |
123 void InterpretIPAddress(const url_parse::Component& host, | |
124 CanonOutput* output) { | |
125 // Canonicalize the IP address in the output to this temporary buffer. | |
126 // IP addresses are small, so this should not cause an allocation. | |
127 RawCanonOutput<64> canon_ip; | |
128 url_parse::Component out_host; // Unused. | |
129 if (CanonicalizeIPAddress(output->data(), host, &canon_ip, &out_host)) { | |
130 // Looks like an IP address, overwrite the existing host with the newly | |
131 // canonicalized IP address. | |
132 output->set_length(host.begin); | |
133 output->Append(canon_ip.data(), canon_ip.length()); | |
134 } | |
135 } | |
136 | |
137 // Canonicalizes a host name that is entirely 8-bit characters (even though | 116 // Canonicalizes a host name that is entirely 8-bit characters (even though |
138 // the type holding them may be 16 bits). Escaped characters will be unescaped. | 117 // the type holding them may be 16 bits). Escaped characters will be unescaped. |
139 // Non-7-bit characters (for example, UTF-8) will be passed unchanged. | 118 // Non-7-bit characters (for example, UTF-8) will be passed unchanged. |
140 // | 119 // |
141 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in | 120 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in |
142 // the output. | 121 // the output. |
143 // | 122 // |
144 // This function is used in two situations: | 123 // This function is used in two situations: |
145 // | 124 // |
146 // * When the caller knows there are no non-ASCII or percent-escaped | 125 // * When the caller knows there are no non-ASCII or percent-escaped |
147 // characters. This is what DoHost does. The result will be a completely | 126 // characters. This is what DoHost does. The result will be a completely |
148 // canonicalized host since we know nothing weird can happen (escaped | 127 // canonicalized host since we know nothing weird can happen (escaped |
149 // characters could be unescaped to non-7-bit, so they have to be treated | 128 // characters could be unescaped to non-7-bit, so they have to be treated |
150 // with suspicion at this point). It does not use the |has_non_ascii| flag. | 129 // with suspicion at this point). It does not use the |has_non_ascii| flag. |
151 // | 130 // |
152 // * When the caller has an 8-bit string that may need unescaping. | 131 // * When the caller has an 8-bit string that may need unescaping. |
153 // DoComplexHost calls us in this situation to do unescaping and validation. | 132 // DoComplexHost calls us in this situation to do unescaping and validation. |
154 // After this, it may do other IDN operations depending on the value of the | 133 // After this, it may do other IDN operations depending on the value of the |
155 // |*has_non_ascii| flag. | 134 // |*has_non_ascii| flag. |
156 // | 135 // |
157 // The return value indicates if the output is a potentially valid host name. | 136 // The return value indicates if the output is a potentially valid host name. |
158 template<typename CHAR> | 137 template<typename CHAR> |
159 bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output, | 138 bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output, |
160 bool* has_non_ascii) { | 139 bool* has_non_ascii) { |
161 *has_non_ascii = false; | 140 *has_non_ascii = false; |
162 | 141 |
163 // First check if the host name is an IP address. | |
164 url_parse::Component out_ip; // Unused: we compute the size ourselves later. | |
165 if (CanonicalizeIPAddress(host, url_parse::Component(0, host_len), | |
166 output, &out_ip)) | |
167 return true; | |
168 | |
169 bool success = true; | 142 bool success = true; |
170 for (int i = 0; i < host_len; i++) { | 143 for (int i = 0; i < host_len; i++) { |
171 unsigned char source = static_cast<unsigned char>(host[i]); | 144 unsigned char source = static_cast<unsigned char>(host[i]); |
172 if (source == '%') { | 145 if (source == '%') { |
173 // Handle unescaping. This will replace |source| with the unescaped char. | 146 // Handle unescaping. This will replace |source| with the unescaped char. |
174 if (!DecodeEscaped(host, &i, host_len, &source)) { | 147 if (!DecodeEscaped(host, &i, host_len, &source)) { |
175 // Invalid escaped character. There is nothing that can make this | 148 // Invalid escaped character. There is nothing that can make this |
176 // host valid. We append an escaped percent so the URL looks reasonable | 149 // host valid. We append an escaped percent so the URL looks reasonable |
177 // and mark as failed. | 150 // and mark as failed. |
178 AppendEscapedChar('%', output); | 151 AppendEscapedChar('%', output); |
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
248 // unescaped input requires IDN. | 221 // unescaped input requires IDN. |
249 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) { | 222 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) { |
250 // Error with some escape sequence. We'll call the current output | 223 // Error with some escape sequence. We'll call the current output |
251 // complete. DoSimpleHost will have written some "reasonable" output. | 224 // complete. DoSimpleHost will have written some "reasonable" output. |
252 return false; | 225 return false; |
253 } | 226 } |
254 | 227 |
255 // Unescaping may have left us with ASCII input, in which case the | 228 // Unescaping may have left us with ASCII input, in which case the |
256 // unescaped version we wrote to output is complete. | 229 // unescaped version we wrote to output is complete. |
257 if (!has_non_ascii) { | 230 if (!has_non_ascii) { |
258 // Need to be sure to check for IP addresses in the newly unescaped | |
259 // output. This will fix the output if necessary. | |
260 InterpretIPAddress(url_parse::MakeRange(begin_length, output->length()), | |
261 output); | |
262 return true; | 231 return true; |
263 } | 232 } |
264 | 233 |
265 // Save the pointer into the data that was just converted (it may be appended to | 234 // Save the pointer into the data that was just converted (it may be appended to |
266 // other data in the output buffer). | 235 // other data in the output buffer). |
267 utf8_source = &output->data()[begin_length]; | 236 utf8_source = &output->data()[begin_length]; |
268 utf8_source_len = output->length() - begin_length; | 237 utf8_source_len = output->length() - begin_length; |
269 } else { | 238 } else { |
270 // We don't need to unescape, use input for IDNization later. (We know the | 239 // We don't need to unescape, use input for IDNization later. (We know the |
271 // input has non-ASCII, or the simple version would have been called | 240 // input has non-ASCII, or the simple version would have been called |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
321 } | 290 } |
322 | 291 |
323 // No unescaping necessary, we can safely pass the input to ICU. This | 292 // No unescaping necessary, we can safely pass the input to ICU. This |
324 // function will only get called if we either have escaped or non-ascii | 293 // function will only get called if we either have escaped or non-ascii |
325 // input, so it's safe to just use ICU now. Even if the input is ASCII, | 294 // input, so it's safe to just use ICU now. Even if the input is ASCII, |
326 // this function will do the right thing (just slower than we could). | 295 // this function will do the right thing (just slower than we could). |
327 return DoIDNHost(host, host_len, output); | 296 return DoIDNHost(host, host_len, output); |
328 } | 297 } |
329 | 298 |
330 template<typename CHAR, typename UCHAR> | 299 template<typename CHAR, typename UCHAR> |
331 bool DoHost(const CHAR* spec, | 300 void DoHost(const CHAR* spec, |
332 const url_parse::Component& host, | 301 const url_parse::Component& host, |
333 CanonOutput* output, | 302 CanonOutput* output, |
334 url_parse::Component* out_host) { | 303 CanonHostInfo* host_info) { |
335 bool success = true; | |
336 if (host.len <= 0) { | 304 if (host.len <= 0) { |
337 // Empty hosts don't need anything. | 305 // Empty hosts don't need anything. |
338 *out_host = url_parse::Component(); | 306 host_info->family = CanonHostInfo::NEUTRAL; |
339 return true; | 307 host_info->out_host = url_parse::Component(); |
| 308 return; |
340 } | 309 } |
341 | 310 |
342 bool has_non_ascii, has_escaped; | 311 bool has_non_ascii, has_escaped; |
343 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped); | 312 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped); |
344 | 313 |
345 out_host->begin = output->length(); | 314 // Keep track of output's initial length, so we can rewind later. |
| 315 const int output_begin = output->length(); |
346 | 316 |
| 317 bool success; |
347 if (!has_non_ascii && !has_escaped) { | 318 if (!has_non_ascii && !has_escaped) { |
348 success &= DoSimpleHost(&spec[host.begin], host.len, | 319 success = DoSimpleHost(&spec[host.begin], host.len, |
349 output, &has_non_ascii); | 320 output, &has_non_ascii); |
350 DCHECK(!has_non_ascii); | 321 DCHECK(!has_non_ascii); |
351 } else { | 322 } else { |
352 success &= DoComplexHost(&spec[host.begin], host.len, | 323 success = DoComplexHost(&spec[host.begin], host.len, |
353 has_non_ascii, has_escaped, output); | 324 has_non_ascii, has_escaped, output); |
354 // We could have had escaped numerals that should now be canonicalized as | |
355 // an IP address. This should be exceedingly rare, it's probably mostly | |
356 // used by scammers. | |
357 } | 325 } |
358 | 326 |
359 out_host->len = output->length() - out_host->begin; | 327 if (!success) { |
360 return success; | 328 // Canonicalization failed. Set BROKEN to notify the caller. |
| 329 host_info->family = CanonHostInfo::BROKEN; |
| 330 } else { |
| 331 // After all the other canonicalization, check if we ended up with an IP |
| 332 // address. IP addresses are small, so writing into this temporary buffer |
| 333 // should not cause an allocation. |
| 334 RawCanonOutput<64> canon_ip; |
| 335 CanonicalizeIPAddress(output->data(), |
| 336 url_parse::MakeRange(output_begin, output->length()), |
| 337 &canon_ip, host_info); |
| 338 |
| 339 // If we got an IPv4/IPv6 address, copy the canonical form back to the |
| 340 // real buffer. Otherwise, it's a hostname or broken IP, in which case |
| 341 // we just leave it in place. |
| 342 if (host_info->IsIPAddress()) { |
| 343 output->set_length(output_begin); |
| 344 output->Append(canon_ip.data(), canon_ip.length()); |
| 345 } |
| 346 } |
| 347 |
| 348 host_info->out_host = url_parse::MakeRange(output_begin, output->length()); |
361 } | 349 } |
362 | 350 |
363 } // namespace | 351 } // namespace |
364 | 352 |
365 bool CanonicalizeHost(const char* spec, | 353 bool CanonicalizeHost(const char* spec, |
366 const url_parse::Component& host, | 354 const url_parse::Component& host, |
367 CanonOutput* output, | 355 CanonOutput* output, |
368 url_parse::Component* out_host) { | 356 url_parse::Component* out_host) { |
369 return DoHost<char, unsigned char>(spec, host, output, out_host); | 357 CanonHostInfo host_info; |
| 358 DoHost<char, unsigned char>(spec, host, output, &host_info); |
| 359 *out_host = host_info.out_host; |
| 360 return (host_info.family != CanonHostInfo::BROKEN); |
370 } | 361 } |
371 | 362 |
372 bool CanonicalizeHost(const char16* spec, | 363 bool CanonicalizeHost(const char16* spec, |
373 const url_parse::Component& host, | 364 const url_parse::Component& host, |
374 CanonOutput* output, | 365 CanonOutput* output, |
375 url_parse::Component* out_host) { | 366 url_parse::Component* out_host) { |
376 return DoHost<char16, char16>(spec, host, output, out_host); | 367 CanonHostInfo host_info; |
| 368 DoHost<char16, char16>(spec, host, output, &host_info); |
| 369 *out_host = host_info.out_host; |
| 370 return (host_info.family != CanonHostInfo::BROKEN); |
| 371 } |
| 372 |
| 373 void CanonicalizeHostVerbose(const char* spec, |
| 374 const url_parse::Component& host, |
| 375 CanonOutput* output, |
| 376 CanonHostInfo *host_info) { |
| 377 DoHost<char, unsigned char>(spec, host, output, host_info); |
| 378 } |
| 379 |
| 380 void CanonicalizeHostVerbose(const char16* spec, |
| 381 const url_parse::Component& host, |
| 382 CanonOutput* output, |
| 383 CanonHostInfo *host_info) { |
| 384 DoHost<char16, char16>(spec, host, output, host_info); |
377 } | 385 } |
378 | 386 |
379 } // namespace url_canon | 387 } // namespace url_canon |
OLD | NEW |