url/url_canon_host.cc - Issue 2397873002: Reject some previously-escaped chars in hostnames.

Side by Side Diff: url/url_canon_host.cc

Issue 2397873002: Reject some previously-escaped chars in hostnames.

Patch Set: Some tests fixed Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2013 The Chromium Authors. All rights reserved.	1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "base/logging.h"	5 #include "base/logging.h"

6 #include "url/url_canon.h"	6 #include "url/url_canon.h"

7 #include "url/url_canon_internal.h"	7 #include "url/url_canon_internal.h"

8	8

9 namespace url {	9 namespace url {

10	10

11 namespace {	11 namespace {

12	12

13 // For reference, here's what IE supports:	13 // For reference, here's what IE6 supported:

14 // Key: 0 (disallowed: failure if present in the input)	14 // Key: 0 (disallowed: failure if present in the input)

15 // + (allowed either escaped or unescaped, and unmodified)	15 // + (allowed either escaped or unescaped, and unmodified)

16 // U (allowed escaped or unescaped but always unescaped if present in	16 // U (allowed escaped or unescaped but always unescaped if present in

17 // escaped form)	17 // escaped form)

18 // E (allowed escaped or unescaped but always escaped if present in	18 // E (allowed escaped or unescaped but always escaped if present in

19 // unescaped form)	19 // unescaped form)

20 // % (only allowed escaped in the input, will be unmodified).	20 // % (only allowed escaped in the input, will be unmodified).

21 // I left blank alpha numeric characters.	21 // I left blank alpha numeric characters.

22 //	22 //

23 // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f	23 // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f

24 // -----------------------------------------------	24 // -----------------------------------------------

25 // 0 0 E E E E E E E E E E E E E E E	25 // 0 0 E E E E E E E E E E E E E E E

26 // 1 E E E E E E E E E E E E E E E E	26 // 1 E E E E E E E E E E E E E E E E

27 // 2 E + E E + E + + + + + + + U U 0	27 // 2 E + E E + E + + + + + + + U U 0

28 // 3 % % E + E 0 <-- Those are : ; < = > ?	28 // 3 % % E + E 0 <-- Those are : ; < = > ?

29 // 4 %	29 // 4 %

30 // 5 U 0 U U U <-- Those are [ \ ] ^ _	30 // 5 U 0 U U U <-- Those are [ \ ] ^ _

31 // 6 E <-- That's `	31 // 6 E <-- That's `

32 // 7 E E E U E <-- Those are { | } ~ (UN PRINTABLE)	32 // 7 E E E U E <-- Those are { | } ~ (UN PRINTABLE)

33 //	33 //

34 // NOTE: I didn't actually test all the control characters. Some may be	34 // NOTE: I didn't actually test all the control characters. Some may be

35 // disallowed in the input, but they are all accepted escaped except for 0.	35 // disallowed in the input, but they are all accepted escaped except for 0.

36 // I also didn't test if characters affecting HTML parsing are allowed	36 // I also didn't test if characters affecting HTML parsing are allowed

37 // unescaped, e.g. (") or (#), which would indicate the beginning of the path.	37 // unescaped, e.g. (") or (#), which would indicate the beginning of the path.

38 // Surprisingly, space is accepted in the input and always escaped.	38 // Surprisingly, space is accepted in the input and always escaped.

39	39

40 // This table lists the canonical version of all characters we allow in the	40 // This table lists the canonical version of all characters we allow in the

41 // input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar	41 // input, with 0 indicating it is disallowed.

42 // value to indicate that this character should be escaped. We are a little more

43 // restrictive than IE, but less restrictive than Firefox.

44 //

45 // Note that we disallow the % character. We will allow it when part of an

46 // escape sequence, of course, but this disallows "%25". Even though IE allows

47 // it, allowing it would put us in a funny state. If there was an invalid

48 // escape sequence like "%zz", we'll add "%25zz" to the output and fail.

49 // Allowing percents means we'll succeed a second time, so validity would change

50 // based on how many times you run the canonicalizer. We prefer to always report

51 // the same validity, so reject this.

52 const unsigned char kEsc = 0xff;

53 const unsigned char kHostCharLookup[0x80] = {	42 const unsigned char kHostCharLookup[0x80] = {

54 // 00-1f: all are invalid	43 // 00-1f: all are invalid

55 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	44 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

57 // ' ' ! " # $ % & ' ( ) * + , - . /	46 // ' ' ! " # $ % & ' ( ) * + , - . /

58 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,	47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '-', '.', 0,

59 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?	48 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?

60 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 ,	49 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0, 0, 0, 0, 0 ,

61 // @ A B C D E F G H I J K L M N O	50 // @ A B C D E F G H I J K L M N O

62 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',	51 0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',

63 // P Q R S T U V W X Y Z [ \ ] ^ _	52 // P Q R S T U V W X Y Z [ \ ] ^ _

64 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',	53 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',

65 // ` a b c d e f g h i j k l m n o	54 // ` a b c d e f g h i j k l m n o

66 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',	55 0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',

67 // p q r s t u v w x y z { | } ~	56 // p q r s t u v w x y z { | } ~

68 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };	57 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0 };

69	58

70 const int kTempHostBufferLen = 1024;	59 const int kTempHostBufferLen = 1024;

71 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;	60 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;

72 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;	61 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;

73	62

74 // Scans a host name and fills in the output flags according to what we find.	63 // Scans a host name and fills in the output flags according to what we find.

75 // |has_non_ascii| will be true if there are any non-7-bit characters, and	64 // |has_non_ascii| will be true if there are any non-7-bit characters, and

76 // |has_escaped| will be true if there is a percent sign.	65 // |has_escaped| will be true if there is a percent sign.

77 template<typename CHAR, typename UCHAR>	66 template<typename CHAR, typename UCHAR>

78 void ScanHostname(const CHAR* spec,	67 void ScanHostname(const CHAR* spec,

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
135 }	124 }

136 }	125 }

137	126

138 if (source < 0x80) {	127 if (source < 0x80) {

139 // We have ASCII input, we can use our lookup table.	128 // We have ASCII input, we can use our lookup table.

140 unsigned char replacement = kHostCharLookup[source];	129 unsigned char replacement = kHostCharLookup[source];

141 if (!replacement) {	130 if (!replacement) {

142 // Invalid character, add it as percent-escaped and mark as failed.	131 // Invalid character, add it as percent-escaped and mark as failed.

143 AppendEscapedChar(source, output);	132 AppendEscapedChar(source, output);

144 success = false;	133 success = false;

145 } else if (replacement == kEsc) {

146 // This character is valid but should be escaped.

147 AppendEscapedChar(source, output);

148 } else {	134 } else {

149 // Common case, the given character is valid in a hostname, the lookup	135 // Common case, the given character is valid in a hostname, the lookup

150 // table tells us the canonical representation of that character (lower	136 // table tells us the canonical representation of that character (lower

151 // cased).	137 // cased).

152 output->push_back(replacement);	138 output->push_back(replacement);

153 }	139 }

154 } else {	140 } else {

155 // It's a non-ascii char. Just push it to the output.	141 // It's a non-ascii char. Just push it to the output.

156 // In case where we have char16 input, and char output it's safe to	142 // In case where we have char16 input, and char output it's safe to

157 // cast char16->char only if input string was converted to ASCII.	143 // cast char16->char only if input string was converted to ASCII.

(...skipping 249 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
407 return DoHostSubstring<char, unsigned char>(spec, host, output);	393 return DoHostSubstring<char, unsigned char>(spec, host, output);

408 }	394 }

409	395

410 bool CanonicalizeHostSubstring(const base::char16* spec,	396 bool CanonicalizeHostSubstring(const base::char16* spec,

411 const Component& host,	397 const Component& host,

412 CanonOutput* output) {	398 CanonOutput* output) {

413 return DoHostSubstring<base::char16, base::char16>(spec, host, output);	399 return DoHostSubstring<base::char16, base::char16>(spec, host, output);

414 }	400 }

415	401

416 } // namespace url	402 } // namespace url

OLD	NEW

« no previous file with comments | « net/proxy/proxy_config_service_linux_unittest.cc ('k') | url/url_canon_unittest.cc » ('j') | no next file with comments »