| OLD | NEW |
| 1 // Copyright 2007, Google Inc. | 1 // Copyright 2007, Google Inc. |
| 2 // All rights reserved. | 2 // All rights reserved. |
| 3 // | 3 // |
| 4 // Redistribution and use in source and binary forms, with or without | 4 // Redistribution and use in source and binary forms, with or without |
| 5 // modification, are permitted provided that the following conditions are | 5 // modification, are permitted provided that the following conditions are |
| 6 // met: | 6 // met: |
| 7 // | 7 // |
| 8 // * Redistributions of source code must retain the above copyright | 8 // * Redistributions of source code must retain the above copyright |
| 9 // notice, this list of conditions and the following disclaimer. | 9 // notice, this list of conditions and the following disclaimer. |
| 10 // * Redistributions in binary form must reproduce the above | 10 // * Redistributions in binary form must reproduce the above |
| (...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 127 // canonicalized host since we know nothing weird can happen (escaped | 127 // canonicalized host since we know nothing weird can happen (escaped |
| 128 // characters could be unescaped to non-7-bit, so they have to be treated | 128 // characters could be unescaped to non-7-bit, so they have to be treated |
| 129 // with suspicion at this point). It does not use the |has_non_ascii| flag. | 129 // with suspicion at this point). It does not use the |has_non_ascii| flag. |
| 130 // | 130 // |
| 131 // * When the caller has an 8-bit string that may need unescaping. | 131 // * When the caller has an 8-bit string that may need unescaping. |
| 132 // DoComplexHost calls us in this situation to do unescaping and validation. | 132 // DoComplexHost calls us in this situation to do unescaping and validation. |
| 133 // After this, it may do other IDN operations depending on the value of the | 133 // After this, it may do other IDN operations depending on the value of the |
| 134 // |*has_non_ascii| flag. | 134 // |*has_non_ascii| flag. |
| 135 // | 135 // |
| 136 // The return value indicates if the output is a potentially valid host name. | 136 // The return value indicates if the output is a potentially valid host name. |
| 137 template<typename CHAR> | 137 template<typename INCHAR, typename OUTCHAR> |
| 138 bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output, | 138 bool DoSimpleHost(const INCHAR* host, |
| 139 int host_len, |
| 140 CanonOutputT<OUTCHAR>* output, |
| 139 bool* has_non_ascii) { | 141 bool* has_non_ascii) { |
| 140 *has_non_ascii = false; | 142 *has_non_ascii = false; |
| 141 | 143 |
| 142 bool success = true; | 144 bool success = true; |
| 143 for (int i = 0; i < host_len; i++) { | 145 for (int i = 0; i < host_len; ++i) { |
| 144 unsigned char source = static_cast<unsigned char>(host[i]); | 146 unsigned int source = host[i]; |
| 145 if (source == '%') { | 147 if (source == '%') { |
| 146 // Handle unescaping. This will replace |source| with the unescaped char. | 148 // Unescape first, if possible. |
| 147 if (!DecodeEscaped(host, &i, host_len, &source)) { | 149 // Source will be used only if decode operation was successful. |
| 150 if (!DecodeEscaped(host, &i, host_len, |
| 151 reinterpret_cast<unsigned char*>(&source))) { |
| 148 // Invalid escaped character. There is nothing that can make this | 152 // Invalid escaped character. There is nothing that can make this |
| 149 // host valid. We append an escaped percent so the URL looks reasonable | 153 // host valid. We append an escaped percent so the URL looks reasonable |
| 150 // and mark as failed. | 154 // and mark as failed. |
| 151 AppendEscapedChar('%', output); | 155 AppendEscapedChar('%', output); |
| 152 success = false; | 156 success = false; |
| 153 continue; | 157 continue; |
| 154 } | 158 } |
| 155 } | 159 } |
| 156 | 160 |
| 157 if (source >= 0x80) { | 161 if (source <= 0x80) { |
| 158 // Handle non-ASCII. | |
| 159 *has_non_ascii = true; | |
| 160 output->push_back(source); | |
| 161 } else { | |
| 162 // We have ASCII input, we can use our lookup table. | 162 // We have ASCII input, we can use our lookup table. |
| 163 unsigned char replacement = kHostCharLookup[source]; | 163 unsigned char replacement = kHostCharLookup[source]; |
| 164 if (!replacement) { | 164 if (!replacement) { |
| 165 // Invalid character, add it as percent-escaped and mark as failed. | 165 // Invalid character, add it as percent-escaped and mark as failed. |
| 166 AppendEscapedChar(source, output); | 166 AppendEscapedChar(source, output); |
| 167 success = false; | 167 success = false; |
| 168 } else if (replacement == kEsc) { | 168 } else if (replacement == kEsc) { |
| 169 // This character is valid but should be escaped. | 169 // This character is valid but should be escaped. |
| 170 AppendEscapedChar(source, output); | 170 AppendEscapedChar(source, output); |
| 171 } else { | 171 } else { |
| 172 // Common case, the given character is valid in a hostname, the lookup | 172 // Common case, the given character is valid in a hostname, the lookup |
| 173 // table tells us the canonical representation of that character (lower | 173 // table tells us the canonical representation of that character (lower |
| 174 // cased). | 174 // cased). |
| 175 output->push_back(replacement); | 175 output->push_back(replacement); |
| 176 } | 176 } |
| 177 } else { |
| 178 // It's a non-ascii char. Just push it to the output. |
| 179 // In case where we have char16 input, and char output it's safe to |
| 180 // cast char16->char only if input string was converted to ASCII. |
| 181 output->push_back(static_cast<OUTCHAR>(source)); |
| 182 *has_non_ascii = true; |
| 177 } | 183 } |
| 178 } | 184 } |
| 185 |
| 179 return success; | 186 return success; |
| 180 } | 187 } |
| 181 | 188 |
| 182 // Canonicalizes a host that requires IDN conversion. Returns true on success. | 189 // Canonicalizes a host that requires IDN conversion. Returns true on success |
| 183 bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) { | 190 bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) { |
| 191 // We need to escape URL before doing IDN conversion, since punicode strings |
| 192 // cannot be escaped after they are created. |
| 193 RawCanonOutputW<kTempHostBufferLen> url_escaped_host; |
| 194 bool has_non_ascii; |
| 195 DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii); |
| 196 |
| 184 StackBufferW wide_output; | 197 StackBufferW wide_output; |
| 185 if (!IDNToASCII(src, src_len, &wide_output)) { | 198 if (!IDNToASCII(url_escaped_host.data(), |
| 199 url_escaped_host.length(), |
| 200 &wide_output)) { |
| 186 // Some error, give up. This will write some reasonable looking | 201 // Some error, give up. This will write some reasonable looking |
| 187 // representation of the string to the output. | 202 // representation of the string to the output. |
| 188 AppendInvalidNarrowString(src, 0, src_len, output); | 203 AppendInvalidNarrowString(src, 0, src_len, output); |
| 189 return false; | 204 return false; |
| 190 } | 205 } |
| 191 | 206 |
| 192 // Now we check the ASCII output like a normal host. It will also handle | 207 // Now we check the ASCII output like a normal host. It will also handle |
| 193 // unescaping. Although we unescaped everything before this function call, if | 208 // unescaping. Although we unescaped everything before this function call, if |
| 194 // somebody does %00 as fullwidth, ICU will convert this to ASCII. | 209 // somebody does %00 as fullwidth, ICU will convert this to ASCII. |
| 195 bool has_non_ascii; | 210 bool success = DoSimpleHost(wide_output.data(), |
| 196 bool success = DoSimpleHost<char16>(wide_output.data(), | 211 wide_output.length(), |
| 197 wide_output.length(), | 212 output, &has_non_ascii); |
| 198 output, &has_non_ascii); | |
| 199 DCHECK(!has_non_ascii); | 213 DCHECK(!has_non_ascii); |
| 200 return success; | 214 return success; |
| 201 } | 215 } |
| 202 | 216 |
| 203 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to | 217 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to |
| 204 // UTF-16. The has_escaped flag should be set if the input string requires | 218 // UTF-16. The has_escaped flag should be set if the input string requires |
| 205 // unescaping. | 219 // unescaping. |
| 206 bool DoComplexHost(const char* host, int host_len, | 220 bool DoComplexHost(const char* host, int host_len, |
| 207 bool has_non_ascii, bool has_escaped, CanonOutput* output) { | 221 bool has_non_ascii, bool has_escaped, CanonOutput* output) { |
| 208 // Save the current position in the output. We may write stuff and rewind it | 222 // Save the current position in the output. We may write stuff and rewind it |
| (...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 378 } | 392 } |
| 379 | 393 |
| 380 void CanonicalizeHostVerbose(const char16* spec, | 394 void CanonicalizeHostVerbose(const char16* spec, |
| 381 const url_parse::Component& host, | 395 const url_parse::Component& host, |
| 382 CanonOutput* output, | 396 CanonOutput* output, |
| 383 CanonHostInfo *host_info) { | 397 CanonHostInfo *host_info) { |
| 384 DoHost<char16, char16>(spec, host, output, host_info); | 398 DoHost<char16, char16>(spec, host, output, host_info); |
| 385 } | 399 } |
| 386 | 400 |
| 387 } // namespace url_canon | 401 } // namespace url_canon |
| OLD | NEW |