src/url_canon_host.cc - Issue 114050: url_canon: New CanonicalizeHostVerbose() function.

Side by Side Diff: src/url_canon_host.cc

Issue 114050: url_canon: New CanonicalizeHostVerbose() function. (Closed) Base URL: http://google-url.googlecode.com/svn/trunk/

Patch Set: Address brettw's comments Created 11 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2007, Google Inc.	1 // Copyright 2007, Google Inc.

2 // All rights reserved.	2 // All rights reserved.

3 //	3 //

4 // Redistribution and use in source and binary forms, with or without	4 // Redistribution and use in source and binary forms, with or without

5 // modification, are permitted provided that the following conditions are	5 // modification, are permitted provided that the following conditions are

6 // met:	6 // met:

7 //	7 //

8 // * Redistributions of source code must retain the above copyright	8 // * Redistributions of source code must retain the above copyright

9 // notice, this list of conditions and the following disclaimer.	9 // notice, this list of conditions and the following disclaimer.

10 // * Redistributions in binary form must reproduce the above	10 // * Redistributions in binary form must reproduce the above

(...skipping 64 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
75 // based on how many times you run the canonicalizer. We prefer to always report	75 // based on how many times you run the canonicalizer. We prefer to always report

76 // the same validity, so reject this.	76 // the same validity, so reject this.

77 const unsigned char kEsc = 0xff;	77 const unsigned char kEsc = 0xff;

78 const unsigned char kHostCharLookup[0x80] = {	78 const unsigned char kHostCharLookup[0x80] = {

79 // 00-1f: all are invalid	79 // 00-1f: all are invalid

80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

82 // ' ' ! " # $ % & ' ( ) * + , - . /	82 // ' ' ! " # $ % & ' ( ) * + , - . /

83 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,	83 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,

84 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?	84 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?

85 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 ,kEsc,kEsc,kEsc, 0 ,	85 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 ,

86 // @ A B C D E F G H I J K L M N O	86 // @ A B C D E F G H I J K L M N O

87 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',	87 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',

88 // P Q R S T U V W X Y Z [ \ ] ^ _	88 // P Q R S T U V W X Y Z [ \ ] ^ _

89 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',	89 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , '_',

90 // ` a b c d e f g h i j k l m n o	90 // ` a b c d e f g h i j k l m n o

91 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',	91 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',

92 // p q r s t u v w x y z { \| } ~	92 // p q r s t u v w x y z { \| } ~

93 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };	93 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };

94	94

95 const int kTempHostBufferLen = 1024;	95 const int kTempHostBufferLen = 1024;

(...skipping 10 matching lines...) Expand all Loading...
106 *has_non_ascii = false;	106 *has_non_ascii = false;

107 *has_escaped = false;	107 *has_escaped = false;

108 for (int i = host.begin; i < end; i++) {	108 for (int i = host.begin; i < end; i++) {

109 if (static_cast<UCHAR>(spec[i]) >= 0x80)	109 if (static_cast<UCHAR>(spec[i]) >= 0x80)

110 *has_non_ascii = true;	110 *has_non_ascii = true;

111 else if (spec[i] == '%')	111 else if (spec[i] == '%')

112 *has_escaped = true;	112 *has_escaped = true;

113 }	113 }

114 }	114 }

115	115

116 // Considers the current contents of the output and sees if it looks like an

117 // IP address. This is called because we canonicalize to the output assuming

118 // that it's not an IP address, and now need to fix it if we produced one.

119 //

120 // The generated hostname is identified by \|host\|. The output will be fixed

121 // with a canonical IP address if the host looks like one. Otherwise, there

122 // will be no change.

123 void InterpretIPAddress(const url_parse::Component& host,

124 CanonOutput* output) {

125 // Canonicalize the IP address in the output to this temporary buffer.

126 // IP addresses are small, so this should not cause an allocation.

127 RawCanonOutput<64> canon_ip;

128 url_parse::Component out_host; // Unused.

129 if (CanonicalizeIPAddress(output->data(), host, &canon_ip, &out_host)) {

130 // Looks like an IP address, overwrite the existing host with the newly

131 // canonicalized IP address.

132 output->set_length(host.begin);

133 output->Append(canon_ip.data(), canon_ip.length());

134 }

135 }

136

137 // Canonicalizes a host name that is entirely 8-bit characters (even though	116 // Canonicalizes a host name that is entirely 8-bit characters (even though

138 // the type holding them may be 16 bits). Escaped characters will be unescaped.	117 // the type holding them may be 16 bits). Escaped characters will be unescaped.

139 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.	118 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.

140 //	119 //

141 // The \|*has_non_ascii\| flag will be true if there are non-7-bit characters in	120 // The \|*has_non_ascii\| flag will be true if there are non-7-bit characters in

142 // the output.	121 // the output.

143 //	122 //

144 // This function is used in two situations:	123 // This function is used in two situations:

145 //	124 //

146 // * When the caller knows there are no non-ASCII or percent-escaped	125 // * When the caller knows there are no non-ASCII or percent-escaped

147 // characters. This is what DoHost does. The result will be a completely	126 // characters. This is what DoHost does. The result will be a completely

148 // canonicalized host since we know nothing weird can happen (escaped	127 // canonicalized host since we know nothing weird can happen (escaped

149 // characters could be unescaped to non-7-bit, so they have to be treated	128 // characters could be unescaped to non-7-bit, so they have to be treated

150 // with suspicion at this point). It does not use the \|has_non_ascii\| flag.	129 // with suspicion at this point). It does not use the \|has_non_ascii\| flag.

151 //	130 //

152 // * When the caller has an 8-bit string that may need unescaping.	131 // * When the caller has an 8-bit string that may need unescaping.

153 // DoComplexHost calls us in this situation to do unescaping and validation.	132 // DoComplexHost calls us in this situation to do unescaping and validation.

154 // After this, it may do other IDN operations depending on the value of the	133 // After this, it may do other IDN operations depending on the value of the

155 // \|*has_non_ascii\| flag.	134 // \|*has_non_ascii\| flag.

156 //	135 //

157 // The return value indicates if the output is a potentially valid host name.	136 // The return value indicates if the output is a potentially valid host name.

158 template<typename CHAR>	137 template<typename CHAR>

159 bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output,	138 bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output,

160 bool* has_non_ascii) {	139 bool* has_non_ascii) {

161 *has_non_ascii = false;	140 *has_non_ascii = false;

162	141

163 // First check if the host name is an IP address.

164 url_parse::Component out_ip; // Unused: we compute the size ourselves later.

165 if (CanonicalizeIPAddress(host, url_parse::Component(0, host_len),

166 output, &out_ip))

167 return true;

168

169 bool success = true;	142 bool success = true;

170 for (int i = 0; i < host_len; i++) {	143 for (int i = 0; i < host_len; i++) {

171 unsigned char source = static_cast<unsigned char>(host[i]);	144 unsigned char source = static_cast<unsigned char>(host[i]);

172 if (source == '%') {	145 if (source == '%') {

173 // Handle unescaping. This will replace \|source\| with the unescaped char.	146 // Handle unescaping. This will replace \|source\| with the unescaped char.

174 if (!DecodeEscaped(host, &i, host_len, &source)) {	147 if (!DecodeEscaped(host, &i, host_len, &source)) {

175 // Invalid escaped character. There is nothing that can make this	148 // Invalid escaped character. There is nothing that can make this

176 // host valid. We append an escaped percent so the URL looks reasonable	149 // host valid. We append an escaped percent so the URL looks reasonable

177 // and mark as failed.	150 // and mark as failed.

178 AppendEscapedChar('%', output);	151 AppendEscapedChar('%', output);

(...skipping 69 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
248 // unescaped input requires IDN.	221 // unescaped input requires IDN.

249 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {	222 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {

250 // Error with some escape sequence. We'll call the current output	223 // Error with some escape sequence. We'll call the current output

251 // complete. DoSimpleHost will have written some "reasonable" output.	224 // complete. DoSimpleHost will have written some "reasonable" output.

252 return false;	225 return false;

253 }	226 }

254	227

255 // Unescaping may have left us with ASCII input, in which case the	228 // Unescaping may have left us with ASCII input, in which case the

256 // unescaped version we wrote to output is complete.	229 // unescaped version we wrote to output is complete.

257 if (!has_non_ascii) {	230 if (!has_non_ascii) {

258 // Need to be sure to check for IP addresses in the newly unescaped

259 // output. This will fix the output if necessary.

260 InterpretIPAddress(url_parse::MakeRange(begin_length, output->length()),

261 output);

262 return true;	231 return true;

263 }	232 }

264	233

265 // Save the pointer into the data that was just converted (it may be appended to	234 // Save the pointer into the data that was just converted (it may be appended to

266 // other data in the output buffer).	235 // other data in the output buffer).

267 utf8_source = &output->data()[begin_length];	236 utf8_source = &output->data()[begin_length];

268 utf8_source_len = output->length() - begin_length;	237 utf8_source_len = output->length() - begin_length;

269 } else {	238 } else {

270 // We don't need to unescape, use input for IDNization later. (We know the	239 // We don't need to unescape, use input for IDNization later. (We know the

271 // input has non-ASCII, or the simple version would have been called	240 // input has non-ASCII, or the simple version would have been called

(...skipping 49 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
321 }	290 }

322	291

323 // No unescaping necessary, we can safely pass the input to ICU. This	292 // No unescaping necessary, we can safely pass the input to ICU. This

324 // function will only get called if we either have escaped or non-ascii	293 // function will only get called if we either have escaped or non-ascii

325 // input, so it's safe to just use ICU now. Even if the input is ASCII,	294 // input, so it's safe to just use ICU now. Even if the input is ASCII,

326 // this function will do the right thing (just slower than we could).	295 // this function will do the right thing (just slower than we could).

327 return DoIDNHost(host, host_len, output);	296 return DoIDNHost(host, host_len, output);

328 }	297 }

329	298

330 template<typename CHAR, typename UCHAR>	299 template<typename CHAR, typename UCHAR>

331 bool DoHost(const CHAR* spec,	300 void DoHost(const CHAR* spec,

332 const url_parse::Component& host,	301 const url_parse::Component& host,

333 CanonOutput* output,	302 CanonOutput* output,

334 url_parse::Component* out_host) {	303 CanonHostInfo* host_info) {

335 bool success = true;

336 if (host.len <= 0) {	304 if (host.len <= 0) {

337 // Empty hosts don't need anything.	305 // Empty hosts don't need anything.

338 *out_host = url_parse::Component();	306 host_info->family = CanonHostInfo::NEUTRAL;

339 return true;	307 host_info->out_host = url_parse::Component();

	308 return;

340 }	309 }

341	310

342 bool has_non_ascii, has_escaped;	311 bool has_non_ascii, has_escaped;

343 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);	312 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);

344	313

345 out_host->begin = output->length();	314 // Keep track of output's initial length, so we can rewind later.

	315 const int output_begin = output->length();

346	316

	317 bool success;

347 if (!has_non_ascii && !has_escaped) {	318 if (!has_non_ascii && !has_escaped) {

348 success &= DoSimpleHost(&spec[host.begin], host.len,	319 success = DoSimpleHost(&spec[host.begin], host.len,

349 output, &has_non_ascii);	320 output, &has_non_ascii);

350 DCHECK(!has_non_ascii);	321 DCHECK(!has_non_ascii);

351 } else {	322 } else {

352 success &= DoComplexHost(&spec[host.begin], host.len,	323 success = DoComplexHost(&spec[host.begin], host.len,

353 has_non_ascii, has_escaped, output);	324 has_non_ascii, has_escaped, output);

354 // We could have had escaped numerals that should now be canonicalized as

355 // an IP address. This should be exceedingly rare, it's probably mostly

356 // used by scammers.

357 }	325 }

358	326

359 out_host->len = output->length() - out_host->begin;	327 if (!success) {

360 return success;	328 // Canonicalization failed. Set BROKEN to notify the caller.

	329 host_info->family = CanonHostInfo::BROKEN;

	330 } else {

	331 // After all the other canonicalization, check if we ended up with an IP

	332 // address. IP addresses are small, so writing into this temporary buffer

	333 // should not cause an allocation.

	334 RawCanonOutput<64> canon_ip;

	335 CanonicalizeIPAddress(output->data(),

	336 url_parse::MakeRange(output_begin, output->length()),

	337 &canon_ip, host_info);

	338

	339 // If we got an IPv4/IPv6 address, copy the canonical form back to the

	340 // real buffer. Otherwise, it's a hostname or broken IP, in which case

	341 // we just leave it in place.

	342 if (host_info->IsIPAddress()) {

	343 output->set_length(output_begin);

	344 output->Append(canon_ip.data(), canon_ip.length());

	345 }

	346 }

	347

	348 host_info->out_host = url_parse::MakeRange(output_begin, output->length());

361 }	349 }

362	350

363 } // namespace	351 } // namespace

364	352

365 bool CanonicalizeHost(const char* spec,	353 bool CanonicalizeHost(const char* spec,

366 const url_parse::Component& host,	354 const url_parse::Component& host,

367 CanonOutput* output,	355 CanonOutput* output,

368 url_parse::Component* out_host) {	356 url_parse::Component* out_host) {

369 return DoHost<char, unsigned char>(spec, host, output, out_host);	357 CanonHostInfo host_info;

	358 DoHost<char, unsigned char>(spec, host, output, &host_info);

	359 *out_host = host_info.out_host;

	360 return (host_info.family != CanonHostInfo::BROKEN);

370 }	361 }

371	362

372 bool CanonicalizeHost(const char16* spec,	363 bool CanonicalizeHost(const char16* spec,

373 const url_parse::Component& host,	364 const url_parse::Component& host,

374 CanonOutput* output,	365 CanonOutput* output,

375 url_parse::Component* out_host) {	366 url_parse::Component* out_host) {

376 return DoHost<char16, char16>(spec, host, output, out_host);	367 CanonHostInfo host_info;

	368 DoHost<char16, char16>(spec, host, output, &host_info);

	369 *out_host = host_info.out_host;

	370 return (host_info.family != CanonHostInfo::BROKEN);

	371 }

	372

	373 void CanonicalizeHostVerbose(const char* spec,

	374 const url_parse::Component& host,

	375 CanonOutput* output,

	376 CanonHostInfo *host_info) {

	377 DoHost<char, unsigned char>(spec, host, output, host_info);

	378 }

	379

	380 void CanonicalizeHostVerbose(const char16* spec,

	381 const url_parse::Component& host,

	382 CanonOutput* output,

	383 CanonHostInfo *host_info) {

	384 DoHost<char16, char16>(spec, host, output, host_info);

377 }	385 }

378	386

379 } // namespace url_canon	387 } // namespace url_canon

OLD	NEW

« no previous file with comments | « src/url_canon.h ('k') | src/url_canon_ip.cc » ('j') | no next file with comments »