OLD | NEW |
(Empty) | |
| 1 // Copyright 2007, Google Inc. |
| 2 // All rights reserved. |
| 3 // |
| 4 // Redistribution and use in source and binary forms, with or without |
| 5 // modification, are permitted provided that the following conditions are |
| 6 // met: |
| 7 // |
| 8 // * Redistributions of source code must retain the above copyright |
| 9 // notice, this list of conditions and the following disclaimer. |
| 10 // * Redistributions in binary form must reproduce the above |
| 11 // copyright notice, this list of conditions and the following disclaimer |
| 12 // in the documentation and/or other materials provided with the |
| 13 // distribution. |
| 14 // * Neither the name of Google Inc. nor the names of its |
| 15 // contributors may be used to endorse or promote products derived from |
| 16 // this software without specific prior written permission. |
| 17 // |
| 18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 |
| 30 // Canonicalizers for random bits that aren't big enough for their own files. |
| 31 |
#include <stdio.h>
#include <string.h>

#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_canon_internal.h"
| 36 |
| 37 namespace url_canon { |
| 38 |
| 39 namespace { |
| 40 |
// Reports whether |ch| is one of the characters (carriage return, line feed
// or horizontal tab) that are dropped when they appear inside a URL.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\r':
    case '\n':
    case '\t':
      return true;
    default:
      return false;
  }
}
| 46 |
| 47 // Backend for RemoveURLWhitespace (see declaration in url_canon.h). |
| 48 // It sucks that we have to do this, since this takes about 13% of the total URL |
| 49 // canonicalization time. |
| 50 template<typename CHAR> |
| 51 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len, |
| 52 CanonOutputT<CHAR>* buffer, |
| 53 int* output_len) { |
| 54 // Fast verification that there's nothing that needs removal. This is the 99% |
| 55 // case, so we want it to be fast and don't care about impacting the speed |
| 56 // when we do find whitespace. |
| 57 int found_whitespace = false; |
| 58 for (int i = 0; i < input_len; i++) { |
| 59 if (!IsRemovableURLWhitespace(input[i])) |
| 60 continue; |
| 61 found_whitespace = true; |
| 62 break; |
| 63 } |
| 64 |
| 65 if (!found_whitespace) { |
| 66 // Didn't find any whitespace, we don't need to do anything. We can just |
| 67 // return the input as the output. |
| 68 *output_len = input_len; |
| 69 return input; |
| 70 } |
| 71 |
| 72 // Remove the whitespace into the new buffer and return it. |
| 73 for (int i = 0; i < input_len; i++) { |
| 74 if (!IsRemovableURLWhitespace(input[i])) |
| 75 buffer->push_back(input[i]); |
| 76 } |
| 77 *output_len = buffer->length(); |
| 78 return buffer->data(); |
| 79 } |
| 80 |
// Maps each 7-bit input character to its canonical form inside a scheme
// (letters are lower-cased; digits, '+', '-' and '.' map to themselves). An
// entry of 0 means the character is not allowed in a scheme. The table is
// indexed directly by character code, so callers must check that the value is
// below 0x80 before looking it up.
const char kSchemeCanonical[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0,   0,   0,   0,   0,   0,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     0,  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0,   0,   0,   0,
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     0,  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0,   0,   0,   0 };
| 100 |
// Reports whether |c| may begin a scheme, i.e. whether it is an ASCII letter.
// This could be folded into the lookup table by flagging each valid character
// with the high bit, but it is only needed once per URL and keeping it
// separate leaves the table easier to read.
inline bool IsSchemeFirstChar(unsigned char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
| 107 |
| 108 template<typename CHAR, typename UCHAR> |
| 109 bool DoScheme(const CHAR* spec, |
| 110 const url_parse::Component& scheme, |
| 111 CanonOutput* output, |
| 112 url_parse::Component* out_scheme) { |
| 113 if (scheme.len <= 0) { |
| 114 // Scheme is unspecified or empty, convert to empty by appending a colon. |
| 115 *out_scheme = url_parse::Component(output->length(), 0); |
| 116 output->push_back(':'); |
| 117 return true; |
| 118 } |
| 119 |
| 120 // The output scheme starts from the current position. |
| 121 out_scheme->begin = output->length(); |
| 122 |
| 123 // Danger: it's important that this code does not strip any characters: it |
| 124 // only emits the canonical version (be it valid or escaped) of each of |
| 125 // the input characters. Stripping would put it out of sync with |
| 126 // url_util::FindAndCompareScheme, which could cause some security checks on |
| 127 // schemes to be incorrect. |
| 128 bool success = true; |
| 129 int end = scheme.end(); |
| 130 for (int i = scheme.begin; i < end; i++) { |
| 131 UCHAR ch = static_cast<UCHAR>(spec[i]); |
| 132 char replacement = 0; |
| 133 if (ch < 0x80) { |
| 134 if (i == scheme.begin) { |
| 135 // Need to do a special check for the first letter of the scheme. |
| 136 if (IsSchemeFirstChar(static_cast<unsigned char>(ch))) |
| 137 replacement = kSchemeCanonical[ch]; |
| 138 } else { |
| 139 replacement = kSchemeCanonical[ch]; |
| 140 } |
| 141 } |
| 142 |
| 143 if (replacement) { |
| 144 output->push_back(replacement); |
| 145 } else if (ch == '%') { |
| 146 // Canonicalizing the scheme multiple times should lead to the same |
| 147 // result. Since invalid characters will be escaped, we need to preserve |
| 148 // the percent to avoid multiple escaping. The scheme will be invalid. |
| 149 success = false; |
| 150 output->push_back('%'); |
| 151 } else { |
| 152 // Invalid character, store it but mark this scheme as invalid. |
| 153 success = false; |
| 154 |
| 155 // This will escape the output and also handle encoding issues. |
| 156 // Ignore the return value since we already failed. |
| 157 AppendUTF8EscapedChar(spec, &i, end, output); |
| 158 } |
| 159 } |
| 160 |
| 161 // The output scheme ends with the the current position, before appending |
| 162 // the colon. |
| 163 out_scheme->len = output->length() - out_scheme->begin; |
| 164 output->push_back(':'); |
| 165 return success; |
| 166 } |
| 167 |
| 168 // The username and password components reference ranges in the corresponding |
| 169 // *_spec strings. Typically, these specs will be the same (we're |
| 170 // canonicalizing a single source string), but may be different when |
| 171 // replacing components. |
| 172 template<typename CHAR, typename UCHAR> |
| 173 bool DoUserInfo(const CHAR* username_spec, |
| 174 const url_parse::Component& username, |
| 175 const CHAR* password_spec, |
| 176 const url_parse::Component& password, |
| 177 CanonOutput* output, |
| 178 url_parse::Component* out_username, |
| 179 url_parse::Component* out_password) { |
| 180 if (username.len <= 0 && password.len <= 0) { |
| 181 // Common case: no user info. We strip empty username/passwords. |
| 182 *out_username = url_parse::Component(); |
| 183 *out_password = url_parse::Component(); |
| 184 return true; |
| 185 } |
| 186 |
| 187 // Write the username. |
| 188 out_username->begin = output->length(); |
| 189 if (username.len > 0) { |
| 190 // This will escape characters not valid for the username. |
| 191 AppendStringOfType(&username_spec[username.begin], username.len, |
| 192 CHAR_USERINFO, output); |
| 193 } |
| 194 out_username->len = output->length() - out_username->begin; |
| 195 |
| 196 // When there is a password, we need the separator. Note that we strip |
| 197 // empty but specified passwords. |
| 198 if (password.len > 0) { |
| 199 output->push_back(':'); |
| 200 out_password->begin = output->length(); |
| 201 AppendStringOfType(&password_spec[password.begin], password.len, |
| 202 CHAR_USERINFO, output); |
| 203 out_password->len = output->length() - out_password->begin; |
| 204 } else { |
| 205 *out_password = url_parse::Component(); |
| 206 } |
| 207 |
| 208 output->push_back('@'); |
| 209 return true; |
| 210 } |
| 211 |
// Helper for converting a port integer to its decimal string form.
//
// Writes |port| as a NUL-terminated base-10 string into |output|, which has
// room for |output_len| characters including the terminator.
//
// _itoa_s exists only in the Microsoft C runtime, so every other toolchain
// uses snprintf, which likewise never writes more than |output_len| bytes.
inline void WritePortInt(char* output, int output_len, int port) {
#if defined(_MSC_VER)
  _itoa_s(port, output, output_len, 10);
#else
  snprintf(output, static_cast<size_t>(output_len), "%d", port);
#endif
}
| 216 |
| 217 // This function will prepend the colon if there will be a port. |
| 218 template<typename CHAR, typename UCHAR> |
| 219 bool DoPort(const CHAR* spec, |
| 220 const url_parse::Component& port, |
| 221 int default_port_for_scheme, |
| 222 CanonOutput* output, |
| 223 url_parse::Component* out_port) { |
| 224 int port_num = url_parse::ParsePort(spec, port); |
| 225 if (port_num == url_parse::PORT_UNSPECIFIED || |
| 226 port_num == default_port_for_scheme) { |
| 227 *out_port = url_parse::Component(); |
| 228 return true; // Leave port empty. |
| 229 } |
| 230 |
| 231 if (port_num == url_parse::PORT_INVALID) { |
| 232 // Invalid port: We'll copy the text from the input so the user can see |
| 233 // what the error was, and mark the URL as invalid by returning false. |
| 234 output->push_back(':'); |
| 235 out_port->begin = output->length(); |
| 236 AppendInvalidNarrowString(spec, port.begin, port.end(), output); |
| 237 out_port->len = output->length() - out_port->begin; |
| 238 return false; |
| 239 } |
| 240 |
| 241 // Convert port number back to an integer. Max port value is 5 digits, and |
| 242 // the Parsed::ExtractPort will have made sure the integer is in range. |
| 243 const int buf_size = 6; |
| 244 char buf[buf_size]; |
| 245 WritePortInt(buf, buf_size, port_num); |
| 246 |
| 247 // Append the port number to the output, preceeded by a colon. |
| 248 output->push_back(':'); |
| 249 out_port->begin = output->length(); |
| 250 for (int i = 0; i < buf_size && buf[i]; i++) |
| 251 output->push_back(buf[i]); |
| 252 |
| 253 out_port->len = output->length() - out_port->begin; |
| 254 return true; |
| 255 } |
| 256 |
| 257 template<typename CHAR, typename UCHAR> |
| 258 void DoCanonicalizeRef(const CHAR* spec, |
| 259 const url_parse::Component& ref, |
| 260 CanonOutput* output, |
| 261 url_parse::Component* out_ref) { |
| 262 if (ref.len < 0) { |
| 263 // Common case of no ref. |
| 264 *out_ref = url_parse::Component(); |
| 265 return; |
| 266 } |
| 267 |
| 268 // Append the ref separator. Note that we need to do this even when the ref |
| 269 // is empty but present. |
| 270 output->push_back('#'); |
| 271 out_ref->begin = output->length(); |
| 272 |
| 273 // Now iterate through all the characters, converting to UTF-8 and validating. |
| 274 int end = ref.end(); |
| 275 for (int i = ref.begin; i < end; i++) { |
| 276 if (spec[i] == 0) { |
| 277 // IE just strips NULLs, so we do too. |
| 278 continue; |
| 279 } else if (static_cast<UCHAR>(spec[i]) < 0x20) { |
| 280 // Unline IE seems to, we escape control characters. This will probably |
| 281 // make the reference fragment unusable on a web page, but people |
| 282 // shouldn't be using control characters in their anchor names. |
| 283 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output); |
| 284 } else if (static_cast<UCHAR>(spec[i]) < 0x80) { |
| 285 // Normal ASCII characters are just appended. |
| 286 output->push_back(static_cast<char>(spec[i])); |
| 287 } else { |
| 288 // Non-ASCII characters are appended unescaped, but only when they are |
| 289 // valid. Invalid Unicode characters are replaced with the "invalid |
| 290 // character" as IE seems to (ReadUTFChar puts the unicode replacement |
| 291 // character in the output on failure for us). |
| 292 unsigned code_point; |
| 293 ReadUTFChar(spec, &i, end, &code_point); |
| 294 AppendUTF8Value(code_point, output); |
| 295 } |
| 296 } |
| 297 |
| 298 out_ref->len = output->length() - out_ref->begin; |
| 299 } |
| 300 |
| 301 } // namespace |
| 302 |
| 303 const char* RemoveURLWhitespace(const char* input, int input_len, |
| 304 CanonOutputT<char>* buffer, |
| 305 int* output_len) { |
| 306 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); |
| 307 } |
| 308 |
| 309 const char16* RemoveURLWhitespace(const char16* input, int input_len, |
| 310 CanonOutputT<char16>* buffer, |
| 311 int* output_len) { |
| 312 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); |
| 313 } |
| 314 |
| 315 char CanonicalSchemeChar(char16 ch) { |
| 316 if (ch >= 0x80) |
| 317 return 0; // Non-ASCII is not supported by schemes. |
| 318 return kSchemeCanonical[ch]; |
| 319 } |
| 320 |
| 321 bool CanonicalizeScheme(const char* spec, |
| 322 const url_parse::Component& scheme, |
| 323 CanonOutput* output, |
| 324 url_parse::Component* out_scheme) { |
| 325 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme); |
| 326 } |
| 327 |
| 328 bool CanonicalizeScheme(const char16* spec, |
| 329 const url_parse::Component& scheme, |
| 330 CanonOutput* output, |
| 331 url_parse::Component* out_scheme) { |
| 332 return DoScheme<char16, char16>(spec, scheme, output, out_scheme); |
| 333 } |
| 334 |
| 335 bool CanonicalizeUserInfo(const char* username_source, |
| 336 const url_parse::Component& username, |
| 337 const char* password_source, |
| 338 const url_parse::Component& password, |
| 339 CanonOutput* output, |
| 340 url_parse::Component* out_username, |
| 341 url_parse::Component* out_password) { |
| 342 return DoUserInfo<char, unsigned char>( |
| 343 username_source, username, password_source, password, |
| 344 output, out_username, out_password); |
| 345 } |
| 346 |
| 347 bool CanonicalizeUserInfo(const char16* username_source, |
| 348 const url_parse::Component& username, |
| 349 const char16* password_source, |
| 350 const url_parse::Component& password, |
| 351 CanonOutput* output, |
| 352 url_parse::Component* out_username, |
| 353 url_parse::Component* out_password) { |
| 354 return DoUserInfo<char16, char16>( |
| 355 username_source, username, password_source, password, |
| 356 output, out_username, out_password); |
| 357 } |
| 358 |
| 359 bool CanonicalizePort(const char* spec, |
| 360 const url_parse::Component& port, |
| 361 int default_port_for_scheme, |
| 362 CanonOutput* output, |
| 363 url_parse::Component* out_port) { |
| 364 return DoPort<char, unsigned char>(spec, port, |
| 365 default_port_for_scheme, |
| 366 output, out_port); |
| 367 } |
| 368 |
| 369 bool CanonicalizePort(const char16* spec, |
| 370 const url_parse::Component& port, |
| 371 int default_port_for_scheme, |
| 372 CanonOutput* output, |
| 373 url_parse::Component* out_port) { |
| 374 return DoPort<char16, char16>(spec, port, default_port_for_scheme, |
| 375 output, out_port); |
| 376 } |
| 377 |
| 378 void CanonicalizeRef(const char* spec, |
| 379 const url_parse::Component& ref, |
| 380 CanonOutput* output, |
| 381 url_parse::Component* out_ref) { |
| 382 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref); |
| 383 } |
| 384 |
| 385 void CanonicalizeRef(const char16* spec, |
| 386 const url_parse::Component& ref, |
| 387 CanonOutput* output, |
| 388 url_parse::Component* out_ref) { |
| 389 DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref); |
| 390 } |
| 391 |
| 392 } // namespace url_canon |
OLD | NEW |