url/url_canon_etc.cc - Issue 13821004: Move googleurl into the Chrome repo.

Side by Side Diff: url/url_canon_etc.cc

Issue 13821004: Move googleurl into the Chrome repo. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright 2007, Google Inc.

	2 // All rights reserved.

	3 //

	4 // Redistribution and use in source and binary forms, with or without

	5 // modification, are permitted provided that the following conditions are

	6 // met:

	7 //

	8 // * Redistributions of source code must retain the above copyright

	9 // notice, this list of conditions and the following disclaimer.

	10 // * Redistributions in binary form must reproduce the above

	11 // copyright notice, this list of conditions and the following disclaimer

	12 // in the documentation and/or other materials provided with the

	13 // distribution.

	14 // * Neither the name of Google Inc. nor the names of its

	15 // contributors may be used to endorse or promote products derived from

	16 // this software without specific prior written permission.

	17 //

	18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	29

	30 // Canonicalizers for random bits that aren't big enough for their own files.

	31

	32 #include <string.h>

	33

	34 #include "googleurl/src/url_canon.h"

	35 #include "googleurl/src/url_canon_internal.h"

	36

	37 namespace url_canon {

	38

	39 namespace {

	40

	// Returns true if |ch| is one of the characters that should be removed from
	// the middle of a URL: carriage return, line feed, or horizontal tab.
	inline bool IsRemovableURLWhitespace(int ch) {
	  return ch == '\r' || ch == '\n' || ch == '\t';
	}

	46

	47 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).

	48 // It sucks that we have to do this, since this takes about 13% of the total URL

	49 // canonicalization time.

	50 template<typename CHAR>

	51 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,

	52 CanonOutputT<CHAR>* buffer,

	53 int* output_len) {

	54 // Fast verification that there's nothing that needs removal. This is the 99%

	55 // case, so we want it to be fast and don't care about impacting the speed

	56 // when we do find whitespace.

	57 int found_whitespace = false;

	58 for (int i = 0; i < input_len; i++) {

	59 if (!IsRemovableURLWhitespace(input[i]))

	60 continue;

	61 found_whitespace = true;

	62 break;

	63 }

	64

	65 if (!found_whitespace) {

	66 // Didn't find any whitespace, we don't need to do anything. We can just

	67 // return the input as the output.

	68 *output_len = input_len;

	69 return input;

	70 }

	71

	72 // Remove the whitespace into the new buffer and return it.

	73 for (int i = 0; i < input_len; i++) {

	74 if (!IsRemovableURLWhitespace(input[i]))

	75 buffer->push_back(input[i]);

	76 }

	77 *output_len = buffer->length();

	78 return buffer->data();

	79 }

	80

	// Contains the canonical version of each possible input letter in the scheme
	// (basically, lower-cased). The corresponding entry will be 0 if the letter
	// is not allowed in a scheme. Indexed by 7-bit ASCII code (0x00-0x7f).
	const char kSchemeCanonical[0x80] = {
	// 00-1f: all are invalid
	     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
	     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
	//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
	     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
	//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
	    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0,   0,   0,   0,   0,   0,
	//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
	     0,  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
	//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
	    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0,   0,   0,   0,
	//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
	     0,  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
	//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
	    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0,   0,   0,   0 };

	100

	// Returns true if |c| is an ASCII letter (a-z or A-Z), the only characters
	// accepted as the first character of a scheme.
	//
	// This could be a table lookup as well by setting the high bit for each
	// valid character, but it's only called once per URL, and it makes the lookup
	// table easier to read not having extra stuff in it.
	inline bool IsSchemeFirstChar(unsigned char c) {
	  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
	}

	107

	108 template<typename CHAR, typename UCHAR>

	109 bool DoScheme(const CHAR* spec,

	110 const url_parse::Component& scheme,

	111 CanonOutput* output,

	112 url_parse::Component* out_scheme) {

	113 if (scheme.len <= 0) {

	114 // Scheme is unspecified or empty, convert to empty by appending a colon.

	115 *out_scheme = url_parse::Component(output->length(), 0);

	116 output->push_back(':');

	117 return true;

	118 }

	119

	120 // The output scheme starts from the current position.

	121 out_scheme->begin = output->length();

	122

	123 // Danger: it's important that this code does not strip any characters: it

	124 // only emits the canonical version (be it valid or escaped) of each of

	125 // the input characters. Stripping would put it out of sync with

	126 // url_util::FindAndCompareScheme, which could cause some security checks on

	127 // schemes to be incorrect.

	128 bool success = true;

	129 int end = scheme.end();

	130 for (int i = scheme.begin; i < end; i++) {

	131 UCHAR ch = static_cast<UCHAR>(spec[i]);

	132 char replacement = 0;

	133 if (ch < 0x80) {

	134 if (i == scheme.begin) {

	135 // Need to do a special check for the first letter of the scheme.

	136 if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))

	137 replacement = kSchemeCanonical[ch];

	138 } else {

	139 replacement = kSchemeCanonical[ch];

	140 }

	141 }

	142

	143 if (replacement) {

	144 output->push_back(replacement);

	145 } else if (ch == '%') {

	146 // Canonicalizing the scheme multiple times should lead to the same

	147 // result. Since invalid characters will be escaped, we need to preserve

	148 // the percent to avoid multiple escaping. The scheme will be invalid.

	149 success = false;

	150 output->push_back('%');

	151 } else {

	152 // Invalid character, store it but mark this scheme as invalid.

	153 success = false;

	154

	155 // This will escape the output and also handle encoding issues.

	156 // Ignore the return value since we already failed.

	157 AppendUTF8EscapedChar(spec, &i, end, output);

	158 }

	159 }

	160

	161 // The output scheme ends with the the current position, before appending

	162 // the colon.

	163 out_scheme->len = output->length() - out_scheme->begin;

	164 output->push_back(':');

	165 return success;

	166 }

	167

	168 // The username and password components reference ranges in the corresponding

	169 // *_spec strings. Typically, these specs will be the same (we're

	170 // canonicalizing a single source string), but may be different when

	171 // replacing components.

	172 template<typename CHAR, typename UCHAR>

	173 bool DoUserInfo(const CHAR* username_spec,

	174 const url_parse::Component& username,

	175 const CHAR* password_spec,

	176 const url_parse::Component& password,

	177 CanonOutput* output,

	178 url_parse::Component* out_username,

	179 url_parse::Component* out_password) {

	180 if (username.len <= 0 && password.len <= 0) {

	181 // Common case: no user info. We strip empty username/passwords.

	182 *out_username = url_parse::Component();

	183 *out_password = url_parse::Component();

	184 return true;

	185 }

	186

	187 // Write the username.

	188 out_username->begin = output->length();

	189 if (username.len > 0) {

	190 // This will escape characters not valid for the username.

	191 AppendStringOfType(&username_spec[username.begin], username.len,

	192 CHAR_USERINFO, output);

	193 }

	194 out_username->len = output->length() - out_username->begin;

	195

	196 // When there is a password, we need the separator. Note that we strip

	197 // empty but specified passwords.

	198 if (password.len > 0) {

	199 output->push_back(':');

	200 out_password->begin = output->length();

	201 AppendStringOfType(&password_spec[password.begin], password.len,

	202 CHAR_USERINFO, output);

	203 out_password->len = output->length() - out_password->begin;

	204 } else {

	205 *out_password = url_parse::Component();

	206 }

	207

	208 output->push_back('@');

	209 return true;

	210 }

	211

	212 // Helper functions for converting port integers to strings.

	213 inline void WritePortInt(char* output, int output_len, int port) {

	214 _itoa_s(port, output, output_len, 10);

	215 }

	216

	217 // This function will prepend the colon if there will be a port.

	218 template<typename CHAR, typename UCHAR>

	219 bool DoPort(const CHAR* spec,

	220 const url_parse::Component& port,

	221 int default_port_for_scheme,

	222 CanonOutput* output,

	223 url_parse::Component* out_port) {

	224 int port_num = url_parse::ParsePort(spec, port);

	225 if (port_num == url_parse::PORT_UNSPECIFIED \|\|

	226 port_num == default_port_for_scheme) {

	227 *out_port = url_parse::Component();

	228 return true; // Leave port empty.

	229 }

	230

	231 if (port_num == url_parse::PORT_INVALID) {

	232 // Invalid port: We'll copy the text from the input so the user can see

	233 // what the error was, and mark the URL as invalid by returning false.

	234 output->push_back(':');

	235 out_port->begin = output->length();

	236 AppendInvalidNarrowString(spec, port.begin, port.end(), output);

	237 out_port->len = output->length() - out_port->begin;

	238 return false;

	239 }

	240

	241 // Convert port number back to an integer. Max port value is 5 digits, and

	242 // the Parsed::ExtractPort will have made sure the integer is in range.

	243 const int buf_size = 6;

	244 char buf[buf_size];

	245 WritePortInt(buf, buf_size, port_num);

	246

	247 // Append the port number to the output, preceeded by a colon.

	248 output->push_back(':');

	249 out_port->begin = output->length();

	250 for (int i = 0; i < buf_size && buf[i]; i++)

	251 output->push_back(buf[i]);

	252

	253 out_port->len = output->length() - out_port->begin;

	254 return true;

	255 }

	256

	257 template<typename CHAR, typename UCHAR>

	258 void DoCanonicalizeRef(const CHAR* spec,

	259 const url_parse::Component& ref,

	260 CanonOutput* output,

	261 url_parse::Component* out_ref) {

	262 if (ref.len < 0) {

	263 // Common case of no ref.

	264 *out_ref = url_parse::Component();

	265 return;

	266 }

	267

	268 // Append the ref separator. Note that we need to do this even when the ref

	269 // is empty but present.

	270 output->push_back('#');

	271 out_ref->begin = output->length();

	272

	273 // Now iterate through all the characters, converting to UTF-8 and validating.

	274 int end = ref.end();

	275 for (int i = ref.begin; i < end; i++) {

	276 if (spec[i] == 0) {

	277 // IE just strips NULLs, so we do too.

	278 continue;

	279 } else if (static_cast<UCHAR>(spec[i]) < 0x20) {

	280 // Unline IE seems to, we escape control characters. This will probably

	281 // make the reference fragment unusable on a web page, but people

	282 // shouldn't be using control characters in their anchor names.

	283 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);

	284 } else if (static_cast<UCHAR>(spec[i]) < 0x80) {

	285 // Normal ASCII characters are just appended.

	286 output->push_back(static_cast<char>(spec[i]));

	287 } else {

	288 // Non-ASCII characters are appended unescaped, but only when they are

	289 // valid. Invalid Unicode characters are replaced with the "invalid

	290 // character" as IE seems to (ReadUTFChar puts the unicode replacement

	291 // character in the output on failure for us).

	292 unsigned code_point;

	293 ReadUTFChar(spec, &i, end, &code_point);

	294 AppendUTF8Value(code_point, output);

	295 }

	296 }

	297

	298 out_ref->len = output->length() - out_ref->begin;

	299 }

	300

	301 } // namespace

	302

	303 const char* RemoveURLWhitespace(const char* input, int input_len,

	304 CanonOutputT<char>* buffer,

	305 int* output_len) {

	306 return DoRemoveURLWhitespace(input, input_len, buffer, output_len);

	307 }

	308

	309 const char16* RemoveURLWhitespace(const char16* input, int input_len,

	310 CanonOutputT<char16>* buffer,

	311 int* output_len) {

	312 return DoRemoveURLWhitespace(input, input_len, buffer, output_len);

	313 }

	314

	315 char CanonicalSchemeChar(char16 ch) {

	316 if (ch >= 0x80)

	317 return 0; // Non-ASCII is not supported by schemes.

	318 return kSchemeCanonical[ch];

	319 }

	320

	321 bool CanonicalizeScheme(const char* spec,

	322 const url_parse::Component& scheme,

	323 CanonOutput* output,

	324 url_parse::Component* out_scheme) {

	325 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);

	326 }

	327

	328 bool CanonicalizeScheme(const char16* spec,

	329 const url_parse::Component& scheme,

	330 CanonOutput* output,

	331 url_parse::Component* out_scheme) {

	332 return DoScheme<char16, char16>(spec, scheme, output, out_scheme);

	333 }

	334

	335 bool CanonicalizeUserInfo(const char* username_source,

	336 const url_parse::Component& username,

	337 const char* password_source,

	338 const url_parse::Component& password,

	339 CanonOutput* output,

	340 url_parse::Component* out_username,

	341 url_parse::Component* out_password) {

	342 return DoUserInfo<char, unsigned char>(

	343 username_source, username, password_source, password,

	344 output, out_username, out_password);

	345 }

	346

	347 bool CanonicalizeUserInfo(const char16* username_source,

	348 const url_parse::Component& username,

	349 const char16* password_source,

	350 const url_parse::Component& password,

	351 CanonOutput* output,

	352 url_parse::Component* out_username,

	353 url_parse::Component* out_password) {

	354 return DoUserInfo<char16, char16>(

	355 username_source, username, password_source, password,

	356 output, out_username, out_password);

	357 }

	358

	359 bool CanonicalizePort(const char* spec,

	360 const url_parse::Component& port,

	361 int default_port_for_scheme,

	362 CanonOutput* output,

	363 url_parse::Component* out_port) {

	364 return DoPort<char, unsigned char>(spec, port,

	365 default_port_for_scheme,

	366 output, out_port);

	367 }

	368

	369 bool CanonicalizePort(const char16* spec,

	370 const url_parse::Component& port,

	371 int default_port_for_scheme,

	372 CanonOutput* output,

	373 url_parse::Component* out_port) {

	374 return DoPort<char16, char16>(spec, port, default_port_for_scheme,

	375 output, out_port);

	376 }

	377

	378 void CanonicalizeRef(const char* spec,

	379 const url_parse::Component& ref,

	380 CanonOutput* output,

	381 url_parse::Component* out_ref) {

	382 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);

	383 }

	384

	385 void CanonicalizeRef(const char16* spec,

	386 const url_parse::Component& ref,

	387 CanonOutput* output,

	388 url_parse::Component* out_ref) {

	389 DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);

	390 }

	391

	392 } // namespace url_canon

OLD	NEW

« no previous file with comments | « url/url_canon.h ('k') | url/url_canon_filesystemurl.cc » ('j') | no next file with comments »