url/url_canon_host.cc - Issue 13821004: Move googleurl into the Chrome repo.

Side by Side Diff: url/url_canon_host.cc

Issue 13821004: Move googleurl into the Chrome repo. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright 2007, Google Inc.

	2 // All rights reserved.

	3 //

	4 // Redistribution and use in source and binary forms, with or without

	5 // modification, are permitted provided that the following conditions are

	6 // met:

	7 //

	8 // * Redistributions of source code must retain the above copyright

	9 // notice, this list of conditions and the following disclaimer.

	10 // * Redistributions in binary form must reproduce the above

	11 // copyright notice, this list of conditions and the following disclaimer

	12 // in the documentation and/or other materials provided with the

	13 // distribution.

	14 // * Neither the name of Google Inc. nor the names of its

	15 // contributors may be used to endorse or promote products derived from

	16 // this software without specific prior written permission.

	17 //

	18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	29

	30 #include "base/logging.h"

	31 #include "googleurl/src/url_canon.h"

	32 #include "googleurl/src/url_canon_internal.h"

	33

	34 namespace url_canon {

	35

	36 namespace {

	37

// For reference, here's what IE supports:
// Key: 0 (disallowed: failure if present in the input)
//      + (allowed either escaped or unescaped, and unmodified)
//      U (allowed escaped or unescaped but always unescaped if present in
//         escaped form)
//      E (allowed escaped or unescaped but always escaped if present in
//         unescaped form)
//      % (only allowed escaped in the input, will be unmodified).
//      I left blank alpha numeric characters.
//
//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
//    -----------------------------------------------
// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
// 4   %
// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
// 6   E                                               <-- That's  `
// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UN PRINTABLE)

	58 //

	59 // NOTE: I didn't actually test all the control characters. Some may be

	60 // disallowed in the input, but they are all accepted escaped except for 0.

	61 // I also didn't test if characters affecting HTML parsing are allowed

	62 // unescaped, eg. (") or (#), which would indicate the beginning of the path.

	63 // Surprisingly, space is accepted in the input and always escaped.

	64

	65 // This table lists the canonical version of all characters we allow in the

	66 // input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar

	67 // value to indicate that this character should be escaped. We are a little more

	68 // restrictive than IE, but less restrictive than Firefox.

	69 //

	70 // Note that we disallow the % character. We will allow it when part of an

	71 // escape sequence, of course, but this disallows "%25". Even though IE allows

	72 // it, allowing it would put us in a funny state. If there was an invalid

	73 // escape sequence like "%zz", we'll add "%25zz" to the output and fail.

	74 // Allowing percents means we'll succeed a second time, so validity would change

	75 // based on how many times you run the canonicalizer. We prefer to always report

	76 // the same vailidity, so reject this.

	77 const unsigned char kEsc = 0xff;

	78 const unsigned char kHostCharLookup[0x80] = {

	79 // 00-1f: all are invalid

	80 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	82 // ' ' ! " # $ % & ' ( ) * + , - . /

	83 kEsc,kEsc,kEsc,kEsc,kEsc, 0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.', 0,

	84 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?

	85 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', 0 ,kEsc,kEsc,kEsc, 0 ,

	86 // @ A B C D E F G H I J K L M N O

	87 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', ' o',

	88 // P Q R S T U V W X Y Z [ \ ] ^ _

	89 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[', 0 , ']', 0 , ' _',

	90 // ` a b c d e f g h i j k l m n o

	91 kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', ' o',

	92 // p q r s t u v w x y z { \| } ~

	93 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc, 0 , 0 };

	94

// Fixed capacity, in characters, given to the temporary host buffers declared
// with the typedefs below.
const int kTempHostBufferLen = 1024;
// Temporary output buffers of kTempHostBufferLen characters for 8-bit and
// 16-bit data respectively. NOTE(review): the names suggest these are meant to
// live on the stack; RawCanonOutputT is declared outside this file - confirm.
typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;

	98

	99 // Scans a host name and fills in the output flags according to what we find.

	100 // \|has_non_ascii\| will be true if there are any non-7-bit characters, and

	101 // \|has_escaped\| will be true if there is a percent sign.

	102 template<typename CHAR, typename UCHAR>

	103 void ScanHostname(const CHAR* spec, const url_parse::Component& host,

	104 bool* has_non_ascii, bool* has_escaped) {

	105 int end = host.end();

	106 *has_non_ascii = false;

	107 *has_escaped = false;

	108 for (int i = host.begin; i < end; i++) {

	109 if (static_cast<UCHAR>(spec[i]) >= 0x80)

	110 *has_non_ascii = true;

	111 else if (spec[i] == '%')

	112 *has_escaped = true;

	113 }

	114 }

	115

	116 // Canonicalizes a host name that is entirely 8-bit characters (even though

	117 // the type holding them may be 16 bits. Escaped characters will be unescaped.

	118 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.

	119 //

	120 // The \|*has_non_ascii\| flag will be true if there are non-7-bit characters in

	121 // the output.

	122 //

	123 // This function is used in two situations:

	124 //

	125 // * When the caller knows there is no non-ASCII or percent escaped

	126 // characters. This is what DoHost does. The result will be a completely

	127 // canonicalized host since we know nothing weird can happen (escaped

	128 // characters could be unescaped to non-7-bit, so they have to be treated

	129 // with suspicion at this point). It does not use the \|has_non_ascii\| flag.

	130 //

	131 // * When the caller has an 8-bit string that may need unescaping.

	132 // DoComplexHost calls us this situation to do unescaping and validation.

	133 // After this, it may do other IDN operations depending on the value of the

	134 // \|*has_non_ascii\| flag.

	135 //

	136 // The return value indicates if the output is a potentially valid host name.

	137 template<typename INCHAR, typename OUTCHAR>

	138 bool DoSimpleHost(const INCHAR* host,

	139 int host_len,

	140 CanonOutputT<OUTCHAR>* output,

	141 bool* has_non_ascii) {

	142 *has_non_ascii = false;

	143

	144 bool success = true;

	145 for (int i = 0; i < host_len; ++i) {

	146 unsigned int source = host[i];

	147 if (source == '%') {

	148 // Unescape first, if possible.

	149 // Source will be used only if decode operation was successful.

	150 if (!DecodeEscaped(host, &i, host_len,

	151 reinterpret_cast<unsigned char*>(&source))) {

	152 // Invalid escaped character. There is nothing that can make this

	153 // host valid. We append an escaped percent so the URL looks reasonable

	154 // and mark as failed.

	155 AppendEscapedChar('%', output);

	156 success = false;

	157 continue;

	158 }

	159 }

	160

	161 if (source < 0x80) {

	162 // We have ASCII input, we can use our lookup table.

	163 unsigned char replacement = kHostCharLookup[source];

	164 if (!replacement) {

	165 // Invalid character, add it as percent-escaped and mark as failed.

	166 AppendEscapedChar(source, output);

	167 success = false;

	168 } else if (replacement == kEsc) {

	169 // This character is valid but should be escaped.

	170 AppendEscapedChar(source, output);

	171 } else {

	172 // Common case, the given character is valid in a hostname, the lookup

	173 // table tells us the canonical representation of that character (lower

	174 // cased).

	175 output->push_back(replacement);

	176 }

	177 } else {

	178 // It's a non-ascii char. Just push it to the output.

	179 // In case where we have char16 input, and char output it's safe to

	180 // cast char16->char only if input string was converted to ASCII.

	181 output->push_back(static_cast<OUTCHAR>(source));

	182 *has_non_ascii = true;

	183 }

	184 }

	185

	186 return success;

	187 }

	188

	189 // Canonicalizes a host that requires IDN conversion. Returns true on success

	190 bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {

	191 // We need to escape URL before doing IDN conversion, since punicode strings

	192 // cannot be escaped after they are created.

	193 RawCanonOutputW<kTempHostBufferLen> url_escaped_host;

	194 bool has_non_ascii;

	195 DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);

	196

	197 StackBufferW wide_output;

	198 if (!IDNToASCII(url_escaped_host.data(),

	199 url_escaped_host.length(),

	200 &wide_output)) {

	201 // Some error, give up. This will write some reasonable looking

	202 // representation of the string to the output.

	203 AppendInvalidNarrowString(src, 0, src_len, output);

	204 return false;

	205 }

	206

	207 // Now we check the ASCII output like a normal host. It will also handle

	208 // unescaping. Although we unescaped everything before this function call, if

	209 // somebody does %00 as fullwidth, ICU will convert this to ASCII.

	210 bool success = DoSimpleHost(wide_output.data(),

	211 wide_output.length(),

	212 output, &has_non_ascii);

	213 DCHECK(!has_non_ascii);

	214 return success;

	215 }

	216

	217 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to

	218 // UTF-16. The has_escaped flag should be set if the input string requires

	219 // unescaping.

	220 bool DoComplexHost(const char* host, int host_len,

	221 bool has_non_ascii, bool has_escaped, CanonOutput* output) {

	222 // Save the current position in the output. We may write stuff and rewind it

	223 // below, so we need to know where to rewind to.

	224 int begin_length = output->length();

	225

	226 // Points to the UTF-8 data we want to convert. This will either be the

	227 // input or the unescaped version written to \|*output\| if necessary.

	228 const char* utf8_source;

	229 int utf8_source_len;

	230 if (has_escaped) {

	231 // Unescape before converting to UTF-16 for IDN. We write this into the

	232 // output because it most likely does not require IDNization, and we can

	233 // save another huge stack buffer. It will be replaced below if it requires

	234 // IDN. This will also update our non-ASCII flag so we know whether the

	235 // unescaped input requires IDN.

	236 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {

	237 // Error with some escape sequence. We'll call the current output

	238 // complete. DoSimpleHost will have written some "reasonable" output.

	239 return false;

	240 }

	241

	242 // Unescaping may have left us with ASCII input, in which case the

	243 // unescaped version we wrote to output is complete.

	244 if (!has_non_ascii) {

	245 return true;

	246 }

	247

	248 // Save the pointer into the data was just converted (it may be appended to

	249 // other data in the output buffer).

	250 utf8_source = &output->data()[begin_length];

	251 utf8_source_len = output->length() - begin_length;

	252 } else {

	253 // We don't need to unescape, use input for IDNization later. (We know the

	254 // input has non-ASCII, or the simple version would have been called

	255 // instead of us.)

	256 utf8_source = host;

	257 utf8_source_len = host_len;

	258 }

	259

	260 // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.

	261 // Above, we may have used the output to write the unescaped values to, so

	262 // we have to rewind it to where we started after we convert it to UTF-16.

	263 StackBufferW utf16;

	264 if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {

	265 // In this error case, the input may or may not be the output.

	266 StackBuffer utf8;

	267 for (int i = 0; i < utf8_source_len; i++)

	268 utf8.push_back(utf8_source[i]);

	269 output->set_length(begin_length);

	270 AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);

	271 return false;

	272 }

	273 output->set_length(begin_length);

	274

	275 // This will call DoSimpleHost which will do normal ASCII canonicalization

	276 // and also check for IP addresses in the outpt.

	277 return DoIDNHost(utf16.data(), utf16.length(), output);

	278 }

	279

	280 // UTF-16 convert host to its ASCII version. The set up is already ready for

	281 // the backend, so we just pass through. The has_escaped flag should be set if

	282 // the input string requires unescaping.

	283 bool DoComplexHost(const char16* host, int host_len,

	284 bool has_non_ascii, bool has_escaped, CanonOutput* output) {

	285 if (has_escaped) {

	286 // Yikes, we have escaped characters with wide input. The escaped

	287 // characters should be interpreted as UTF-8. To solve this problem,

	288 // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.

	289 //

	290 // We don't bother to optimize the conversion in the ASCII case (which

	291 // could just be a copy) and use the UTF-8 path, because it should be

	292 // very rare that host names have escaped characters, and it is relatively

	293 // fast to do the conversion anyway.

	294 StackBuffer utf8;

	295 if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {

	296 AppendInvalidNarrowString(host, 0, host_len, output);

	297 return false;

	298 }

	299

	300 // Once we convert to UTF-8, we can use the 8-bit version of the complex

	301 // host handling code above.

	302 return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,

	303 has_escaped, output);

	304 }

	305

	306 // No unescaping necessary, we can safely pass the input to ICU. This

	307 // function will only get called if we either have escaped or non-ascii

	308 // input, so it's safe to just use ICU now. Even if the input is ASCII,

	309 // this function will do the right thing (just slower than we could).

	310 return DoIDNHost(host, host_len, output);

	311 }

	312

	313 template<typename CHAR, typename UCHAR>

	314 void DoHost(const CHAR* spec,

	315 const url_parse::Component& host,

	316 CanonOutput* output,

	317 CanonHostInfo* host_info) {

	318 if (host.len <= 0) {

	319 // Empty hosts don't need anything.

	320 host_info->family = CanonHostInfo::NEUTRAL;

	321 host_info->out_host = url_parse::Component();

	322 return;

	323 }

	324

	325 bool has_non_ascii, has_escaped;

	326 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);

	327

	328 // Keep track of output's initial length, so we can rewind later.

	329 const int output_begin = output->length();

	330

	331 bool success;

	332 if (!has_non_ascii && !has_escaped) {

	333 success = DoSimpleHost(&spec[host.begin], host.len,

	334 output, &has_non_ascii);

	335 DCHECK(!has_non_ascii);

	336 } else {

	337 success = DoComplexHost(&spec[host.begin], host.len,

	338 has_non_ascii, has_escaped, output);

	339 }

	340

	341 if (!success) {

	342 // Canonicalization failed. Set BROKEN to notify the caller.

	343 host_info->family = CanonHostInfo::BROKEN;

	344 } else {

	345 // After all the other canonicalization, check if we ended up with an IP

	346 // address. IP addresses are small, so writing into this temporary buffer

	347 // should not cause an allocation.

	348 RawCanonOutput<64> canon_ip;

	349 CanonicalizeIPAddress(output->data(),

	350 url_parse::MakeRange(output_begin, output->length()),

	351 &canon_ip, host_info);

	352

	353 // If we got an IPv4/IPv6 address, copy the canonical form back to the

	354 // real buffer. Otherwise, it's a hostname or broken IP, in which case

	355 // we just leave it in place.

	356 if (host_info->IsIPAddress()) {

	357 output->set_length(output_begin);

	358 output->Append(canon_ip.data(), canon_ip.length());

	359 }

	360 }

	361

	362 host_info->out_host = url_parse::MakeRange(output_begin, output->length());

	363 }

	364

	365 } // namespace

	366

	367 bool CanonicalizeHost(const char* spec,

	368 const url_parse::Component& host,

	369 CanonOutput* output,

	370 url_parse::Component* out_host) {

	371 CanonHostInfo host_info;

	372 DoHost<char, unsigned char>(spec, host, output, &host_info);

	373 *out_host = host_info.out_host;

	374 return (host_info.family != CanonHostInfo::BROKEN);

	375 }

	376

	377 bool CanonicalizeHost(const char16* spec,

	378 const url_parse::Component& host,

	379 CanonOutput* output,

	380 url_parse::Component* out_host) {

	381 CanonHostInfo host_info;

	382 DoHost<char16, char16>(spec, host, output, &host_info);

	383 *out_host = host_info.out_host;

	384 return (host_info.family != CanonHostInfo::BROKEN);

	385 }

	386

// Canonicalizes the |host| component of the 8-bit |spec| into |output|.
// Unlike CanonicalizeHost, the complete result (host family and output range)
// is handed back through |host_info| rather than reduced to a bool; a failed
// canonicalization is reported as family BROKEN.
void CanonicalizeHostVerbose(const char* spec,
                             const url_parse::Component& host,
                             CanonOutput* output,
                             CanonHostInfo *host_info) {
  // Forward to the shared template; unsigned char is the type used to test
  // for non-7-bit characters.
  DoHost<char, unsigned char>(spec, host, output, host_info);
}

	393

// Canonicalizes the |host| component of the 16-bit |spec| into |output|.
// Unlike CanonicalizeHost, the complete result (host family and output range)
// is handed back through |host_info| rather than reduced to a bool; a failed
// canonicalization is reported as family BROKEN.
void CanonicalizeHostVerbose(const char16* spec,
                             const url_parse::Component& host,
                             CanonOutput* output,
                             CanonHostInfo *host_info) {
  // Forward to the shared template; char16 itself is the type used to test
  // for non-7-bit characters.
  DoHost<char16, char16>(spec, host, output, host_info);
}

	400

	401 } // namespace url_canon

OLD	NEW

« no previous file with comments | « url/url_canon_fileurl.cc ('k') | url/url_canon_icu.h » ('j') | no next file with comments »