| OLD | NEW |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "base/logging.h" | 5 #include "base/logging.h" |
| 6 #include "url/url_canon.h" | 6 #include "url/url_canon.h" |
| 7 #include "url/url_canon_internal.h" | 7 #include "url/url_canon_internal.h" |
| 8 | 8 |
| 9 namespace url { | 9 namespace url { |
| 10 | 10 |
| 11 namespace { | 11 namespace { |
| 12 | 12 |
| 13 // For reference, here's what IE supports: | 13 // For reference, here's what IE6 supported: |
| 14 // Key: 0 (disallowed: failure if present in the input) | 14 // Key: 0 (disallowed: failure if present in the input) |
| 15 // + (allowed either escaped or unescaped, and unmodified) | 15 // + (allowed either escaped or unescaped, and unmodified) |
| 16 // U (allowed escaped or unescaped but always unescaped if present in | 16 // U (allowed escaped or unescaped but always unescaped if present in |
| 17 // escaped form) | 17 // escaped form) |
| 18 // E (allowed escaped or unescaped but always escaped if present in | 18 // E (allowed escaped or unescaped but always escaped if present in |
| 19 // unescaped form) | 19 // unescaped form) |
| 20 // % (only allowed escaped in the input, will be unmodified). | 20 // % (only allowed escaped in the input, will be unmodified). |
| 21 // Alphanumeric characters are left blank. | 21 // Alphanumeric characters are left blank. |
| 22 // | 22 // |
| 23 // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f | 23 // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f |
| 24 // ----------------------------------------------- | 24 // ----------------------------------------------- |
| 25 // 0 0 E E E E E E E E E E E E E E E | 25 // 0 0 E E E E E E E E E E E E E E E |
| 26 // 1 E E E E E E E E E E E E E E E E | 26 // 1 E E E E E E E E E E E E E E E E |
| 27 // 2 E + E E + E + + + + + + + U U 0 | 27 // 2 E + E E + E + + + + + + + U U 0 |
| 28 // 3 % % E + E 0 <-- Those are : ; < = > ? | 28 // 3 % % E + E 0 <-- Those are : ; < = > ? |
| 29 // 4 % | 29 // 4 % |
| 30 // 5 U 0 U U U <-- Those are [ \ ] ^ _ | 30 // 5 U 0 U U U <-- Those are [ \ ] ^ _ |
| 31 // 6 E <-- That's ` | 31 // 6 E <-- That's ` |
| 32 // 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE) | 32 // 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE) |
| 33 // | 33 // |
| 34 // NOTE: I didn't actually test all the control characters. Some may be | 34 // NOTE: I didn't actually test all the control characters. Some may be |
| 35 // disallowed in the input, but they are all accepted escaped except for 0. | 35 // disallowed in the input, but they are all accepted escaped except for 0. |
| 36 // I also didn't test if characters affecting HTML parsing are allowed | 36 // I also didn't test if characters affecting HTML parsing are allowed |
| 37 // unescaped, e.g. (") or (#), which would indicate the beginning of the path. | 37 // unescaped, e.g. (") or (#), which would indicate the beginning of the path. |
| 38 // Surprisingly, space is accepted in the input and always escaped. | 38 // Surprisingly, space is accepted in the input and always escaped. |
| 39 | 39 |
| 40 // This table lists the canonical version of all characters we allow in the | 40 // This table lists the canonical version of all characters we allow in the |
| 41 // input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar | 41 // input, with 0 indicating it is disallowed. |
| 42 // value to indicate that this character should be escaped. We are a little more | |
| 43 // restrictive than IE, but less restrictive than Firefox. | |
| 44 // | |
| 45 // Note that we disallow the % character. We will allow it when part of an | |
| 46 // escape sequence, of course, but this disallows "%25". Even though IE allows | |
| 47 // it, allowing it would put us in a funny state. If there was an invalid | |
| 48 // escape sequence like "%zz", we'll add "%25zz" to the output and fail. | |
| 49 // Allowing percents means we'll succeed a second time, so validity would change | |
| 50 // based on how many times you run the canonicalizer. We prefer to always report | |
| 51 // the same validity, so reject this. | |
| 52 const unsigned char kEsc = 0xff; | |
| 53 const unsigned char kHostCharLookup[0x80] = { | 42 const unsigned char kHostCharLookup[0x80] = { |
| 54 // 00-1f: all are invalid | 43 // 00-1f: all are invalid |
| 55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, | 44 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, |
| 56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, | 45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, |
| 57 // ' ' ! " # $ % & ' ( ) * + , - .
/ | 46 // ' ' ! " # $ % & ' ( ) * + , - .
/ |
| 58 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',
0, | 47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '-', '.',
0, |
| 59 // 0 1 2 3 4 5 6 7 8 9 : ; < = >
? | 48 // 0 1 2 3 4 5 6 7 8 9 : ; < = >
? |
| 60 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc,
0 , | 49 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0, 0, 0, 0,
0 , |
| 61 // @ A B C D E F G H I J K L M N
O | 50 // @ A B C D E F G H I J K L M N
O |
| 62 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', '
o', | 51 0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', '
o', |
| 63 // P Q R S T U V W X Y Z [ \ ] ^
_ | 52 // P Q R S T U V W X Y Z [ \ ] ^
_ |
| 64 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '
_', | 53 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '
_', |
| 65 // ` a b c d e f g h i j k l m n
o | 54 // ` a b c d e f g h i j k l m n
o |
| 66 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', '
o', | 55 0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', '
o', |
| 67 // p q r s t u v w x y z { | } ~ | 56 // p q r s t u v w x y z { | } ~ |
| 68 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 ,
0 }; | 57 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0,
0 }; |
| 69 | 58 |
| 70 const int kTempHostBufferLen = 1024; | 59 const int kTempHostBufferLen = 1024; |
| 71 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer; | 60 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer; |
| 72 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW; | 61 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW; |
| 73 | 62 |
| 74 // Scans a host name and fills in the output flags according to what we find. | 63 // Scans a host name and fills in the output flags according to what we find. |
| 75 // |has_non_ascii| will be true if there are any non-7-bit characters, and | 64 // |has_non_ascii| will be true if there are any non-7-bit characters, and |
| 76 // |has_escaped| will be true if there is a percent sign. | 65 // |has_escaped| will be true if there is a percent sign. |
| 77 template<typename CHAR, typename UCHAR> | 66 template<typename CHAR, typename UCHAR> |
| 78 void ScanHostname(const CHAR* spec, | 67 void ScanHostname(const CHAR* spec, |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 135 } | 124 } |
| 136 } | 125 } |
| 137 | 126 |
| 138 if (source < 0x80) { | 127 if (source < 0x80) { |
| 139 // We have ASCII input, we can use our lookup table. | 128 // We have ASCII input, we can use our lookup table. |
| 140 unsigned char replacement = kHostCharLookup[source]; | 129 unsigned char replacement = kHostCharLookup[source]; |
| 141 if (!replacement) { | 130 if (!replacement) { |
| 142 // Invalid character, add it as percent-escaped and mark as failed. | 131 // Invalid character, add it as percent-escaped and mark as failed. |
| 143 AppendEscapedChar(source, output); | 132 AppendEscapedChar(source, output); |
| 144 success = false; | 133 success = false; |
| 145 } else if (replacement == kEsc) { | |
| 146 // This character is valid but should be escaped. | |
| 147 AppendEscapedChar(source, output); | |
| 148 } else { | 134 } else { |
| 149 // Common case, the given character is valid in a hostname, the lookup | 135 // Common case, the given character is valid in a hostname, the lookup |
| 150 // table tells us the canonical representation of that character (lower | 136 // table tells us the canonical representation of that character (lower |
| 151 // cased). | 137 // cased). |
| 152 output->push_back(replacement); | 138 output->push_back(replacement); |
| 153 } | 139 } |
| 154 } else { | 140 } else { |
| 155 // It's a non-ascii char. Just push it to the output. | 141 // It's a non-ascii char. Just push it to the output. |
| 156 // In the case where we have char16 input and char output, it's safe to | 142 // In the case where we have char16 input and char output, it's safe to |
| 157 // cast char16->char only if the input string was converted to ASCII. | 143 // cast char16->char only if the input string was converted to ASCII. |
| (...skipping 249 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 407 return DoHostSubstring<char, unsigned char>(spec, host, output); | 393 return DoHostSubstring<char, unsigned char>(spec, host, output); |
| 408 } | 394 } |
| 409 | 395 |
| 410 bool CanonicalizeHostSubstring(const base::char16* spec, | 396 bool CanonicalizeHostSubstring(const base::char16* spec, |
| 411 const Component& host, | 397 const Component& host, |
| 412 CanonOutput* output) { | 398 CanonOutput* output) { |
| 413 return DoHostSubstring<base::char16, base::char16>(spec, host, output); | 399 return DoHostSubstring<base::char16, base::char16>(spec, host, output); |
| 414 } | 400 } |
| 415 | 401 |
| 416 } // namespace url | 402 } // namespace url |
| OLD | NEW |