url/url_canon.h - Issue 13821004: Move googleurl into the Chrome repo.

Side by Side Diff: url/url_canon.h

Issue 13821004: Move googleurl into the Chrome repo. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright 2007, Google Inc.

	2 // All rights reserved.

	3 //

	4 // Redistribution and use in source and binary forms, with or without

	5 // modification, are permitted provided that the following conditions are

	6 // met:

	7 //

	8 // * Redistributions of source code must retain the above copyright

	9 // notice, this list of conditions and the following disclaimer.

	10 // * Redistributions in binary form must reproduce the above

	11 // copyright notice, this list of conditions and the following disclaimer

	12 // in the documentation and/or other materials provided with the

	13 // distribution.

	14 // * Neither the name of Google Inc. nor the names of its

	15 // contributors may be used to endorse or promote products derived from

	16 // this software without specific prior written permission.

	17 //

	18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	29 #ifndef GOOGLEURL_SRC_URL_CANON_H__

	30 #define GOOGLEURL_SRC_URL_CANON_H__

	31

	32 #include <string.h>

	33 #include <stdlib.h>

	34

	35 #include "base/string16.h"

	36 #include "googleurl/src/url_common.h"

	37 #include "googleurl/src/url_parse.h"

	38

	39 namespace url_canon {

	40

	41 // Canonicalizer output -------------------------------------------------------

	42

	// Base class for the canonicalizer output. It maintains a buffer and
	// supports simple resizing and append operations on it.
	//
	// It is VERY IMPORTANT that no virtual function calls be made on the common
	// code path. There are only two virtual functions: the destructor, and
	// Resize(), which is called when the existing buffer is not big enough.
	// The derived class is in charge of setting up the buffer that this class
	// then manages.
	template<typename T>
	class CanonOutputT {
	 public:
	  CanonOutputT() : buffer_(NULL), buffer_len_(0), cur_len_(0) {
	  }
	  virtual ~CanonOutputT() {
	  }

	  // Implemented to resize the buffer. This function should update the buffer
	  // pointer to point to the new buffer, and any old data up to |cur_len_| in
	  // the buffer must be copied over.
	  //
	  // The new size |sz| must be larger than buffer_len_.
	  virtual void Resize(int sz) = 0;

	  // Accessor for returning a character at a given position. The input offset
	  // must be in the valid range. Returns T (not char) so that characters of
	  // wide outputs are not truncated.
	  inline T at(int offset) const {
	    return buffer_[offset];
	  }

	  // Sets the character at the given position. The given position MUST be less
	  // than the length().
	  inline void set(int offset, T ch) {
	    buffer_[offset] = ch;
	  }

	  // Returns the number of characters currently in the buffer.
	  inline int length() const {
	    return cur_len_;
	  }

	  // Returns the current capacity of the buffer. The length() is the number of
	  // characters that have been declared to be written, but the capacity() is
	  // the number that can be written without reallocation. If the caller must
	  // write many characters at once, it can make sure there is enough capacity,
	  // write the data, then use set_length() to declare the new length().
	  int capacity() const {
	    return buffer_len_;
	  }

	  // Called by the user of this class to get the output. The output will NOT
	  // be NULL-terminated. Call length() to get the length.
	  const T* data() const {
	    return buffer_;
	  }
	  T* data() {
	    return buffer_;
	  }

	  // Shortens the URL to the new length. Used for "backing up" when processing
	  // relative paths. This can also be used if an external function writes a lot
	  // of data to the buffer (when using the "Raw" version below) beyond the end,
	  // to declare the new length.
	  //
	  // This MUST NOT be used to expand the size of the buffer beyond capacity().
	  void set_length(int new_len) {
	    cur_len_ = new_len;
	  }

	  // This is the most performance critical function, since it is called for
	  // every character. If the buffer is full and cannot be grown, the character
	  // is dropped.
	  void push_back(T ch) {
	    // In VC2005, putting this common case first speeds up execution
	    // dramatically because this branch is predicted as taken.
	    if (cur_len_ < buffer_len_) {
	      buffer_[cur_len_] = ch;
	      cur_len_++;
	      return;
	    }

	    // Grow the buffer to hold at least one more item. Hopefully we won't have
	    // to do this very often.
	    if (!Grow(1))
	      return;

	    // Actually do the insertion.
	    buffer_[cur_len_] = ch;
	    cur_len_++;
	  }

	  // Appends the |str_len| characters starting at |str| to the output. If the
	  // buffer cannot be grown enough to hold them, nothing is appended.
	  void Append(const T* str, int str_len) {
	    // Compare against the free space instead of computing
	    // cur_len_ + str_len, which could overflow for a very large |str_len|.
	    const int available = buffer_len_ - cur_len_;
	    if (str_len > available) {
	      if (!Grow(str_len - available))
	        return;
	    }
	    for (int i = 0; i < str_len; i++)
	      buffer_[cur_len_ + i] = str[i];
	    cur_len_ += str_len;
	  }

	 protected:
	  // Grows the given buffer so that it can fit at least |min_additional|
	  // characters. Returns true if the buffer could be resized, false if the
	  // required size would be too large.
	  bool Grow(int min_additional) {
	    static const int kMinBufferLen = 16;
	    int new_len = (buffer_len_ == 0) ? kMinBufferLen : buffer_len_;
	    do {
	      if (new_len >= (1 << 30))  // Prevent overflow below.
	        return false;
	      new_len *= 2;
	      // new_len never drops below buffer_len_, so the subtraction cannot
	      // overflow, unlike buffer_len_ + min_additional.
	    } while (new_len - buffer_len_ < min_additional);
	    Resize(new_len);
	    return true;
	  }

	  T* buffer_;
	  int buffer_len_;

	  // Used characters in the buffer.
	  int cur_len_;
	};

	165

	166 // Simple implementation of the CanonOutput using new[]. This class

	167 // also supports a static buffer so if it is allocated on the stack, most

	168 // URLs can be canonicalized with no heap allocations.

	169 template<typename T, int fixed_capacity = 1024>

	170 class RawCanonOutputT : public CanonOutputT<T> {

	171 public:

	172 RawCanonOutputT() : CanonOutputT<T>() {

	173 this->buffer_ = fixed_buffer_;

	174 this->buffer_len_ = fixed_capacity;

	175 }

	176 virtual ~RawCanonOutputT() {

	177 if (this->buffer_ != fixed_buffer_)

	178 delete[] this->buffer_;

	179 }

	180

	181 virtual void Resize(int sz) {

	182 T* new_buf = new T[sz];

	183 memcpy(new_buf, this->buffer_,

	184 sizeof(T) * (this->cur_len_ < sz ? this->cur_len_ : sz));

	185 if (this->buffer_ != fixed_buffer_)

	186 delete[] this->buffer_;

	187 this->buffer_ = new_buf;

	188 this->buffer_len_ = sz;

	189 }

	190

	191 protected:

	192 T fixed_buffer_[fixed_capacity];

	193 };

	194

	195 // Normally, all canonicalization output is in narrow characters. We support

	196 // the templates so it can also be used internally if a wide buffer is

	197 // required.

	198 typedef CanonOutputT<char> CanonOutput;

	199 typedef CanonOutputT<char16> CanonOutputW;

	200

	201 template<int fixed_capacity>

	202 class RawCanonOutput : public RawCanonOutputT<char, fixed_capacity> {};

	203 template<int fixed_capacity>

	204 class RawCanonOutputW : public RawCanonOutputT<char16, fixed_capacity> {};

	205

	206 // Character set converter ----------------------------------------------------

	207 //

	208 // Converts query strings into a custom encoding. The embedder can supply an

	209 // implementation of this class to interface with their own character set

	210 // conversion libraries.

	211 //

	212 // Embedders will want to see the unit test for the ICU version.

	213

	214 class CharsetConverter {

	215 public:

	216 CharsetConverter() {}

	217 virtual ~CharsetConverter() {}

	218

	219 // Converts the given input string from UTF-16 to whatever output format the

	220 // converter supports. This is used only for the query encoding conversion,

	221 // which does not fail. Instead, the converter should insert "invalid

	222 // character" characters in the output for invalid sequences, and do the

	223 // best it can.

	224 //

	225 // If the input contains a character not representable in the output

	226 // character set, the converter should append the HTML entity sequence in

	227 // decimal, (such as "你") with escaping of the ampersand, number

	228 // sign, and semicolon (in the previous example it would be

	229 // "%26%2320320%3B"). This rule is based on what IE does in this situation.

	230 virtual void ConvertFromUTF16(const char16* input,

	231 int input_len,

	232 CanonOutput* output) = 0;

	233 };

	234

	235 // Whitespace -----------------------------------------------------------------

	236

	237 // Searches for whitespace that should be removed from the middle of URLs, and

	238 // removes it. Removed whitespace are tabs and newlines, but NOT spaces. Spaces

	239 // are preserved, which is what most browsers do. A pointer to the output will

	240 // be returned, and the length of that output will be in |output_len|.

	241 //

	242 // This should be called before parsing if whitespace removal is desired (which

	243 // it normally is when you are canonicalizing).

	244 //

	245 // If no whitespace is removed, this function will not use the buffer and will

	246 // return a pointer to the input, to avoid the extra copy. If modification is

	247 // required, the given |buffer| will be used and the returned pointer will

	248 // point to the beginning of the buffer.

	249 //

	250 // Therefore, callers should not use the buffer, since it may actually be empty;

	251 // use the computed pointer and |*output_len| instead.

	252 GURL_API const char* RemoveURLWhitespace(const char* input, int input_len,

	253 CanonOutputT<char>* buffer,

	254 int* output_len);

	255 GURL_API const char16* RemoveURLWhitespace(const char16* input, int input_len,

	256 CanonOutputT<char16>* buffer,

	257 int* output_len);

	258

	259 // IDN ------------------------------------------------------------------------

	260

	261 // Converts the Unicode input representing a hostname to ASCII using IDN rules.

	262 // The output must fall in the ASCII range, but will be encoded in UTF-16.

	263 //

	264 // On success, the output will be filled with the ASCII host name and it will

	265 // return true. Unlike most other canonicalization functions, this assumes that

	266 // the output is empty. The beginning of the host will be at offset 0, and

	267 // the length of the output will be set to the length of the new host name.

	268 //

	269 // On error, returns false. The output in this case is undefined.

	270 GURL_API bool IDNToASCII(const char16* src, int src_len, CanonOutputW* output);

	271

	272 // Piece-by-piece canonicalizers ----------------------------------------------

	273 //

	274 // These individual canonicalizers append the canonicalized versions of the

	275 // corresponding URL component to the given CanonOutput. The spec and the

	276 // previously-identified range of that component are the input. The range of

	277 // the canonicalized component will be written to the output component.

	278 //

	279 // These functions all append to the output so they can be chained. Make sure

	280 // the output is empty when you start.

	281 //

	282 // These functions return boolean values indicating success. On failure, they

	283 // will attempt to write something reasonable to the output so that, if

	284 // displayed to the user, they will recognise it as something that's messed up.

	285 // Nothing more should ever be done with these invalid URLs, however.

	286

	287 // Scheme: Appends the scheme and colon to the URL. The output component will

	288 // indicate the range of characters up to but not including the colon.

	289 //

	290 // Canonical URLs always have a scheme. If the scheme is not present in the

	291 // input, this will just write the colon to indicate an empty scheme. Does not

	292 // append slashes which will be needed before any authority components for most

	293 // URLs.

	294 //

	295 // The 8-bit version requires UTF-8 encoding.

	296 GURL_API bool CanonicalizeScheme(const char* spec,

	297 const url_parse::Component& scheme,

	298 CanonOutput* output,

	299 url_parse::Component* out_scheme);

	300 GURL_API bool CanonicalizeScheme(const char16* spec,

	301 const url_parse::Component& scheme,

	302 CanonOutput* output,

	303 url_parse::Component* out_scheme);

	304

	305 // User info: username/password. If present, this will add the delimiters so

	306 // the output will be "<username>:<password>@" or "<username>@". Empty

	307 // username/password pairs, or empty passwords, will get converted to

	308 // nonexistent in the canonical version.

	309 //

	310 // The components for the username and password refer to ranges in the

	311 // respective source strings. Usually, these will be the same string, which

	312 // is legal as long as the two components don't overlap.

	313 //

	314 // The 8-bit version requires UTF-8 encoding.

	315 GURL_API bool CanonicalizeUserInfo(const char* username_source,

	316 const url_parse::Component& username,

	317 const char* password_source,

	318 const url_parse::Component& password,

	319 CanonOutput* output,

	320 url_parse::Component* out_username,

	321 url_parse::Component* out_password);

	322 GURL_API bool CanonicalizeUserInfo(const char16* username_source,

	323 const url_parse::Component& username,

	324 const char16* password_source,

	325 const url_parse::Component& password,

	326 CanonOutput* output,

	327 url_parse::Component* out_username,

	328 url_parse::Component* out_password);

	329

	330

	331 // This structure holds detailed state exported from the IP/Host canonicalizers.

	332 // Additional fields may be added as callers require them.

	333 struct CanonHostInfo {

	334 CanonHostInfo() : family(NEUTRAL), num_ipv4_components(0), out_host() {}

	335

	336 // Convenience function to test if family is an IP address.

	337 bool IsIPAddress() const { return family == IPV4 \|\| family == IPV6; }

	338

	339 // This field summarizes how the input was classified by the canonicalizer.

	340 enum Family {

	341 NEUTRAL, // - Doesn't resemble an IP address. As far as the IP

	342 // canonicalizer is concerned, it should be treated as a

	343 // hostname.

	344 BROKEN, // - Almost an IP, but was not canonicalized. This could be an

	345 // IPv4 address where truncation occurred, or something

	346 // containing the special characters :[] which did not parse

	347 // as an IPv6 address. Never attempt to connect to this

	348 // address, because it might actually succeed!

	349 IPV4, // - Successfully canonicalized as an IPv4 address.

	350 IPV6, // - Successfully canonicalized as an IPv6 address.

	351 };

	352 Family family;

	353

	354 // If \|family\| is IPV4, then this is the number of nonempty dot-separated

	355 // components in the input text, from 1 to 4. If \|family\| is not IPV4,

	356 // this value is undefined.

	357 int num_ipv4_components;

	358

	359 // Location of host within the canonicalized output.

	360 // CanonicalizeIPAddress() only sets this field if \|family\| is IPV4 or IPV6.

	361 // CanonicalizeHostVerbose() always sets it.

	362 url_parse::Component out_host;

	363

	364 // \|address\| contains the parsed IP Address (if any) in its first

	365 // AddressLength() bytes, in network order. If IsIPAddress() is false

	366 // AddressLength() will return zero and the content of \|address\| is undefined.

	367 unsigned char address[16];

	368

	369 // Convenience function to calculate the length of an IP address corresponding

	370 // to the current IP version in \|family\|, if any. For use with \|address\|.

	371 int AddressLength() const {

	372 return family == IPV4 ? 4 : (family == IPV6 ? 16 : 0);

	373 }

	374 };

	375

	376

	377 // Host.

	378 //

	379 // The 8-bit version requires UTF-8 encoding. Use this version when you only

	380 // need to know whether canonicalization succeeded.

	381 GURL_API bool CanonicalizeHost(const char* spec,

	382 const url_parse::Component& host,

	383 CanonOutput* output,

	384 url_parse::Component* out_host);

	385 GURL_API bool CanonicalizeHost(const char16* spec,

	386 const url_parse::Component& host,

	387 CanonOutput* output,

	388 url_parse::Component* out_host);

	389

	390 // Extended version of CanonicalizeHost, which returns additional information.

	391 // Use this when you need to know whether the hostname was an IP address.

	392 // A successful return is indicated by host_info->family != BROKEN. See the

	393 // definition of CanonHostInfo above for details.

	394 GURL_API void CanonicalizeHostVerbose(const char* spec,

	395 const url_parse::Component& host,

	396 CanonOutput* output,

	397 CanonHostInfo* host_info);

	398 GURL_API void CanonicalizeHostVerbose(const char16* spec,

	399 const url_parse::Component& host,

	400 CanonOutput* output,

	401 CanonHostInfo* host_info);

	402

	403

	404 // IP addresses.

	405 //

	406 // Tries to interpret the given host name as an IPv4 or IPv6 address. If it is

	407 // an IP address, it will canonicalize it as such, appending it to |output|.

	408 // Additional status information is returned via the |*host_info| parameter.

	409 // See the definition of CanonHostInfo above for details.

	410 //

	411 // This is called AUTOMATICALLY from the host canonicalizer, which ensures that

	412 // the input is unescaped and name-prepped, etc. It should not normally be

	413 // necessary or wise to call this directly.

	414 GURL_API void CanonicalizeIPAddress(const char* spec,

	415 const url_parse::Component& host,

	416 CanonOutput* output,

	417 CanonHostInfo* host_info);

	418 GURL_API void CanonicalizeIPAddress(const char16* spec,

	419 const url_parse::Component& host,

	420 CanonOutput* output,

	421 CanonHostInfo* host_info);

	422

	423 // Port: this function will add the colon for the port if a port is present.

	424 // The caller can pass url_parse::PORT_UNSPECIFIED as the

	425 // default_port_for_scheme argument if there is no default port.

	426 //

	427 // The 8-bit version requires UTF-8 encoding.

	428 GURL_API bool CanonicalizePort(const char* spec,

	429 const url_parse::Component& port,

	430 int default_port_for_scheme,

	431 CanonOutput* output,

	432 url_parse::Component* out_port);

	433 GURL_API bool CanonicalizePort(const char16* spec,

	434 const url_parse::Component& port,

	435 int default_port_for_scheme,

	436 CanonOutput* output,

	437 url_parse::Component* out_port);

	438

	439 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED

	440 // if the scheme is unknown.

	441 GURL_API int DefaultPortForScheme(const char* scheme, int scheme_len);

	442

	443 // Path. If the input does not begin in a slash (including if the input is

	444 // empty), we'll prepend a slash to the path to make it canonical.

	445 //

	446 // The 8-bit version assumes UTF-8 encoding, but does not verify the validity

	447 // of the UTF-8 (i.e., you can have invalid UTF-8 sequences, invalid

	448 // characters, etc.). Normally, URLs will come in as UTF-16, so this isn't

	449 // an issue. Somebody giving us an 8-bit path is responsible for generating

	450 // the path that the server expects (we'll escape high-bit characters), so

	451 // if something is invalid, it's their problem.

	452 GURL_API bool CanonicalizePath(const char* spec,

	453 const url_parse::Component& path,

	454 CanonOutput* output,

	455 url_parse::Component* out_path);

	456 GURL_API bool CanonicalizePath(const char16* spec,

	457 const url_parse::Component& path,

	458 CanonOutput* output,

	459 url_parse::Component* out_path);

	460

	461 // Canonicalizes the input as a file path. This is like CanonicalizePath except

	462 // that it also handles Windows drive specs. For example, the path can begin

	463 // with "c|\" and it will get properly canonicalized to "C:/".

	464 // The string will be appended to |output| and |out_path| will be updated.

	465 //

	466 // The 8-bit version requires UTF-8 encoding.

	467 GURL_API bool FileCanonicalizePath(const char* spec,

	468 const url_parse::Component& path,

	469 CanonOutput* output,

	470 url_parse::Component* out_path);

	471 GURL_API bool FileCanonicalizePath(const char16* spec,

	472 const url_parse::Component& path,

	473 CanonOutput* output,

	474 url_parse::Component* out_path);

	475

	476 // Query: Prepends the ? if needed.

	477 //

	478 // The 8-bit version requires the input to be UTF-8 encoding. Incorrectly

	479 // encoded characters (in UTF-8 or UTF-16) will be replaced with the Unicode

	480 // "invalid character." This function can not fail, we always just try to do

	481 // our best for crazy input here since web pages can set it themselves.

	482 //

	483 // This will convert the given input into the output encoding that the given

	484 // character set converter object provides. The converter will only be called

	485 // if necessary; for ASCII input, no conversions are necessary.

	486 //

	487 // The converter can be NULL. In this case, the output encoding will be UTF-8.

	488 GURL_API void CanonicalizeQuery(const char* spec,

	489 const url_parse::Component& query,

	490 CharsetConverter* converter,

	491 CanonOutput* output,

	492 url_parse::Component* out_query);

	493 GURL_API void CanonicalizeQuery(const char16* spec,

	494 const url_parse::Component& query,

	495 CharsetConverter* converter,

	496 CanonOutput* output,

	497 url_parse::Component* out_query);

	498

	499 // Ref: Prepends the # if needed. The output will be UTF-8 (this is the only

	500 // canonicalizer that does not produce ASCII output). The output is

	501 // guaranteed to be valid UTF-8.

	502 //

	503 // This function will not fail. If the input is invalid UTF-8/UTF-16, we'll use

	504 // the "Unicode replacement character" for the confusing bits and copy the rest.

	505 GURL_API void CanonicalizeRef(const char* spec,

	506 const url_parse::Component& path,

	507 CanonOutput* output,

	508 url_parse::Component* out_path);

	509 GURL_API void CanonicalizeRef(const char16* spec,

	510 const url_parse::Component& path,

	511 CanonOutput* output,

	512 url_parse::Component* out_path);

	513

	514 // Full canonicalizer ---------------------------------------------------------

	515 //

	516 // These functions replace any string contents, rather than append as above.

	517 // See the above piece-by-piece functions for information specific to

	518 // canonicalizing individual components.

	519 //

	520 // The output will be ASCII except the reference fragment, which may be UTF-8.

	521 //

	522 // The 8-bit versions require UTF-8 encoding.

	523

	524 // Use for standard URLs with authorities and paths.

	525 GURL_API bool CanonicalizeStandardURL(const char* spec,

	526 int spec_len,

	527 const url_parse::Parsed& parsed,

	528 CharsetConverter* query_converter,

	529 CanonOutput* output,

	530 url_parse::Parsed* new_parsed);

	531 GURL_API bool CanonicalizeStandardURL(const char16* spec,

	532 int spec_len,

	533 const url_parse::Parsed& parsed,

	534 CharsetConverter* query_converter,

	535 CanonOutput* output,

	536 url_parse::Parsed* new_parsed);

	537

	538 // Use for file URLs.

	539 GURL_API bool CanonicalizeFileURL(const char* spec,

	540 int spec_len,

	541 const url_parse::Parsed& parsed,

	542 CharsetConverter* query_converter,

	543 CanonOutput* output,

	544 url_parse::Parsed* new_parsed);

	545 GURL_API bool CanonicalizeFileURL(const char16* spec,

	546 int spec_len,

	547 const url_parse::Parsed& parsed,

	548 CharsetConverter* query_converter,

	549 CanonOutput* output,

	550 url_parse::Parsed* new_parsed);

	551

	552 // Use for filesystem URLs.

	553 GURL_API bool CanonicalizeFileSystemURL(const char* spec,

	554 int spec_len,

	555 const url_parse::Parsed& parsed,

	556 CharsetConverter* query_converter,

	557 CanonOutput* output,

	558 url_parse::Parsed* new_parsed);

	559 GURL_API bool CanonicalizeFileSystemURL(const char16* spec,

	560 int spec_len,

	561 const url_parse::Parsed& parsed,

	562 CharsetConverter* query_converter,

	563 CanonOutput* output,

	564 url_parse::Parsed* new_parsed);

	565

	566 // Use for path URLs such as javascript. This does not modify the path in any

	567 // way, for example, by escaping it.

	568 GURL_API bool CanonicalizePathURL(const char* spec,

	569 int spec_len,

	570 const url_parse::Parsed& parsed,

	571 CanonOutput* output,

	572 url_parse::Parsed* new_parsed);

	573 GURL_API bool CanonicalizePathURL(const char16* spec,

	574 int spec_len,

	575 const url_parse::Parsed& parsed,

	576 CanonOutput* output,

	577 url_parse::Parsed* new_parsed);

	578

	579 // Use for mailto URLs. This "canonicalizes" the url into a path and query

	580 // component. It does not attempt to merge "to" fields. It uses UTF-8 for

	581 // the query encoding if there is a query. This is because a mailto URL is

	582 // really intended for an external mail program, and the encoding of a page,

	583 // etc. which would influence a query encoding normally are irrelevant.

	584 GURL_API bool CanonicalizeMailtoURL(const char* spec,

	585 int spec_len,

	586 const url_parse::Parsed& parsed,

	587 CanonOutput* output,

	588 url_parse::Parsed* new_parsed);

	589 GURL_API bool CanonicalizeMailtoURL(const char16* spec,

	590 int spec_len,

	591 const url_parse::Parsed& parsed,

	592 CanonOutput* output,

	593 url_parse::Parsed* new_parsed);

	594

	595 // Part replacer --------------------------------------------------------------

	596

	// Internal structure holding one source-string pointer per URL component.
	// The basic canonicalization functions use it so that component replacement
	// (different strings for different components) can share a code path with
	// regular canonicalization (the same string for each component).
	//
	// A url_parse::Parsed structure usually goes along with this. Its components
	// identify offsets within these strings, so the components can all live in
	// the same string or be spread arbitrarily across different ones.
	//
	// This structure does not own any data. It is the caller's responsibility to
	// ensure that the data the pointers point to stays in scope and is not
	// modified.
	template<typename CHAR>
	struct URLComponentSource {
	  // Normally used by callers wishing to replace components. Every pointer
	  // starts out NULL, which means "no replacement"; the caller then overrides
	  // the components it wants to replace.
	  URLComponentSource()
	      : scheme(NULL), username(NULL), password(NULL), host(NULL),
	        port(NULL), path(NULL), query(NULL), ref(NULL) {
	  }

	  // Normally used internally: points every component at the same spec,
	  // |default_value|.
	  explicit URLComponentSource(const CHAR* default_value)
	      : scheme(default_value), username(default_value),
	        password(default_value), host(default_value),
	        port(default_value), path(default_value),
	        query(default_value), ref(default_value) {
	  }

	  const CHAR* scheme;
	  const CHAR* username;
	  const CHAR* password;
	  const CHAR* host;
	  const CHAR* port;
	  const CHAR* path;
	  const CHAR* query;
	  const CHAR* ref;
	};

	648

	649 // This structure encapsulates information on modifying a URL. Each component

	650 // may either be left unchanged, replaced, or deleted.

	651 //

	652 // By default, each component is unchanged. For those components that should be

	653 // modified, call either Set* or Clear* to modify it.

	654 //

	655 // The string passed to Set* functions DOES NOT GET COPIED AND MUST BE KEPT

	656 // IN SCOPE BY THE CALLER for as long as this object exists!

	657 //

	658 // Prefer the 8-bit replacement version if possible since it is more efficient.

	659 template<typename CHAR>

	660 class Replacements {

	661 public:

	662 Replacements() {

	663 }

	664

	665 // Scheme

	666 void SetScheme(const CHAR* s, const url_parse::Component& comp) {

	667 sources_.scheme = s;

	668 components_.scheme = comp;

	669 }

	670 // Note: we don't have a ClearScheme since this doesn't make any sense.

	671 bool IsSchemeOverridden() const { return sources_.scheme != NULL; }

	672

	673 // Username

	674 void SetUsername(const CHAR* s, const url_parse::Component& comp) {

	675 sources_.username = s;

	676 components_.username = comp;

	677 }

	678 void ClearUsername() {

	679 sources_.username = Placeholder();

	680 components_.username = url_parse::Component();

	681 }

	682 bool IsUsernameOverridden() const { return sources_.username != NULL; }

	683

	684 // Password

	685 void SetPassword(const CHAR* s, const url_parse::Component& comp) {

	686 sources_.password = s;

	687 components_.password = comp;

	688 }

	689 void ClearPassword() {

	690 sources_.password = Placeholder();

	691 components_.password = url_parse::Component();

	692 }

	693 bool IsPasswordOverridden() const { return sources_.password != NULL; }

	694

	695 // Host

	696 void SetHost(const CHAR* s, const url_parse::Component& comp) {

	697 sources_.host = s;

	698 components_.host = comp;

	699 }

	700 void ClearHost() {

	701 sources_.host = Placeholder();

	702 components_.host = url_parse::Component();

	703 }

	704 bool IsHostOverridden() const { return sources_.host != NULL; }

	705

	706 // Port

	707 void SetPort(const CHAR* s, const url_parse::Component& comp) {

	708 sources_.port = s;

	709 components_.port = comp;

	710 }

	711 void ClearPort() {

	712 sources_.port = Placeholder();

	713 components_.port = url_parse::Component();

	714 }

	715 bool IsPortOverridden() const { return sources_.port != NULL; }

	716

	717 // Path

	718 void SetPath(const CHAR* s, const url_parse::Component& comp) {

	719 sources_.path = s;

	720 components_.path = comp;

	721 }

	722 void ClearPath() {

	723 sources_.path = Placeholder();

	724 components_.path = url_parse::Component();

	725 }

	726 bool IsPathOverridden() const { return sources_.path != NULL; }

	727

	728 // Query

	729 void SetQuery(const CHAR* s, const url_parse::Component& comp) {

	730 sources_.query = s;

	731 components_.query = comp;

	732 }

	733 void ClearQuery() {

	734 sources_.query = Placeholder();

	735 components_.query = url_parse::Component();

	736 }

	737 bool IsQueryOverridden() const { return sources_.query != NULL; }

	738

	739 // Ref

	740 void SetRef(const CHAR* s, const url_parse::Component& comp) {

	741 sources_.ref = s;

	742 components_.ref = comp;

	743 }

	// Requests deletion of the ref: the source becomes the non-NULL
	// Placeholder() string and the component is reset to a default-constructed
	// url_parse::Component (the "delete component" row of the state table near
	// the member variables).
	744 void ClearRef() {

	745 sources_.ref = Placeholder();

	746 components_.ref = url_parse::Component();

	747 }

	// True when the ref source pointer is non-NULL. ClearRef() always leaves it
	// non-NULL; SetRef() does so when given a non-NULL |s|.
	748 bool IsRefOverridden() const { return sources_.ref != NULL; }

	749

	750 // Getters for the internal data. See the variables below for how the

	751 // information is encoded.

	// Read-only reference to sources_, the per-component source string
	// pointers written by the Set*/Clear* methods.
	752 const URLComponentSource<CHAR>& sources() const { return sources_; }

	// Read-only reference to components_, the per-component url_parse::Component
	// values written by the Set*/Clear* methods.
	753 const url_parse::Parsed& components() const { return components_; }

	754

	755 private:

	756 // Returns a pointer to a static empty string that is used as a placeholder

	757 // to indicate a component should be deleted (see the state table below).
	// The Clear*() methods store this pointer as the component's source so that
	// the source is non-NULL even though there is no replacement text.

	758 const CHAR* Placeholder() {

	759 static const CHAR empty_string = 0;  // A lone terminator: the empty string.

	760 return &empty_string;

	761 }

	762

	763 // We support three states:

	764 //

	765 // Action                 | Source                Component

	766 // -----------------------+--------------------------------------------------

	767 // Don't change component | NULL                  (unused)

	768 // Replace component      | (replacement string)  (replacement component)

	769 // Delete component       | (non-NULL)            (invalid component: (0,-1))

	770 //

	771 // We use a pointer to the empty string for the source when the component

	772 // should be deleted.

	773 URLComponentSource<CHAR> sources_;

	774 url_parse::Parsed components_;

	775 };

	776

	777 // The base must be an 8-bit canonical URL.

	778 GURL_API bool ReplaceStandardURL(const char* base,

	779 const url_parse::Parsed& base_parsed,

	780 const Replacements<char>& replacements,

	781 CharsetConverter* query_converter,

	782 CanonOutput* output,

	783 url_parse::Parsed* new_parsed);

	784 GURL_API bool ReplaceStandardURL(const char* base,

	785 const url_parse::Parsed& base_parsed,

	786 const Replacements<char16>& replacements,

	787 CharsetConverter* query_converter,

	788 CanonOutput* output,

	789 url_parse::Parsed* new_parsed);

	790

	791 // Filesystem URLs can only have the path, query, or ref replaced.

	792 // All other components will be ignored.

	793 GURL_API bool ReplaceFileSystemURL(const char* base,

	794 const url_parse::Parsed& base_parsed,

	795 const Replacements<char>& replacements,

	796 CharsetConverter* query_converter,

	797 CanonOutput* output,

	798 url_parse::Parsed* new_parsed);

	799 GURL_API bool ReplaceFileSystemURL(const char* base,

	800 const url_parse::Parsed& base_parsed,

	801 const Replacements<char16>& replacements,

	802 CharsetConverter* query_converter,

	803 CanonOutput* output,

	804 url_parse::Parsed* new_parsed);

	805

	806 // Replacing some parts of a file URL is not permitted. Everything except

	807 // the host, path, query, and ref will be ignored.

	808 GURL_API bool ReplaceFileURL(const char* base,

	809 const url_parse::Parsed& base_parsed,

	810 const Replacements<char>& replacements,

	811 CharsetConverter* query_converter,

	812 CanonOutput* output,

	813 url_parse::Parsed* new_parsed);

	814 GURL_API bool ReplaceFileURL(const char* base,

	815 const url_parse::Parsed& base_parsed,

	816 const Replacements<char16>& replacements,

	817 CharsetConverter* query_converter,

	818 CanonOutput* output,

	819 url_parse::Parsed* new_parsed);

	820

	821 // Path URLs can only have the scheme and path replaced. All other components

	822 // will be ignored.

	823 GURL_API bool ReplacePathURL(const char* base,

	824 const url_parse::Parsed& base_parsed,

	825 const Replacements<char>& replacements,

	826 CanonOutput* output,

	827 url_parse::Parsed* new_parsed);

	828 GURL_API bool ReplacePathURL(const char* base,

	829 const url_parse::Parsed& base_parsed,

	830 const Replacements<char16>& replacements,

	831 CanonOutput* output,

	832 url_parse::Parsed* new_parsed);

	833

	834 // Mailto URLs can only have the scheme, path, and query replaced.

	835 // All other components will be ignored.

	836 GURL_API bool ReplaceMailtoURL(const char* base,

	837 const url_parse::Parsed& base_parsed,

	838 const Replacements<char>& replacements,

	839 CanonOutput* output,

	840 url_parse::Parsed* new_parsed);

	841 GURL_API bool ReplaceMailtoURL(const char* base,

	842 const url_parse::Parsed& base_parsed,

	843 const Replacements<char16>& replacements,

	844 CanonOutput* output,

	845 url_parse::Parsed* new_parsed);

	846

	847 // Relative URL ---------------------------------------------------------------

	848

	849 // Given an input URL or URL fragment |fragment|, determines if it is a

	850 // relative or absolute URL and places the result into |*is_relative|. If it is

	851 // relative, the relevant portion of the URL will be placed into

	852 // |*relative_component| (there may have been trimmed whitespace, for example).

	853 // This value is passed to ResolveRelativeURL. If the input is not relative,

	854 // this value is UNDEFINED (it may be changed by the function).

	855 //

	856 // Returns true on success (we successfully determined the URL is relative or

	857 // not). Failure means that the combination of URLs doesn't make any sense.

	858 //

	859 // The base URL should always be canonical, therefore is ASCII.

	860 GURL_API bool IsRelativeURL(const char* base,

	861 const url_parse::Parsed& base_parsed,

	862 const char* fragment,

	863 int fragment_len,

	864 bool is_base_hierarchical,

	865 bool* is_relative,

	866 url_parse::Component* relative_component);

	867 GURL_API bool IsRelativeURL(const char* base,

	868 const url_parse::Parsed& base_parsed,

	869 const char16* fragment,

	870 int fragment_len,

	871 bool is_base_hierarchical,

	872 bool* is_relative,

	873 url_parse::Component* relative_component);

	874

	875 // Given a canonical parsed source URL, a URL fragment known to be relative,

	876 // and the identified relevant portion of the relative URL (computed by

	877 // IsRelativeURL), this produces a new parsed canonical URL in |output| and

	878 // |out_parsed|.

	879 //

	880 // It also requires a flag indicating whether the base URL is a file: URL

	881 // which triggers additional logic.

	882 //

	883 // The base URL should be canonical and have a host (may be empty for file

	884 // URLs) and a path. If it doesn't have these, we can't resolve relative

	885 // URLs off of it and will return the base as the output with an error flag.

	886 // Because it is canonical, it should also be ASCII.

	887 //

	888 // The query charset converter follows the same rules as CanonicalizeQuery.

	889 //

	890 // Returns true on success. On failure, the output will be "something

	891 // reasonable" that will be consistent and valid, just probably not what

	892 // was intended by the web page author or caller.

	893 GURL_API bool ResolveRelativeURL(const char* base_url,

	894 const url_parse::Parsed& base_parsed,

	895 bool base_is_file,

	896 const char* relative_url,

	897 const url_parse::Component& relative_component,

	898 CharsetConverter* query_converter,

	899 CanonOutput* output,

	900 url_parse::Parsed* out_parsed);

	901 GURL_API bool ResolveRelativeURL(const char* base_url,

	902 const url_parse::Parsed& base_parsed,

	903 bool base_is_file,

	904 const char16* relative_url,

	905 const url_parse::Component& relative_component,

	906 CharsetConverter* query_converter,

	907 CanonOutput* output,

	908 url_parse::Parsed* out_parsed);

	909

	910 } // namespace url_canon

	911

	912 #endif // GOOGLEURL_SRC_URL_CANON_H__

OLD	NEW

« no previous file with comments | « url/gurl_unittest.cc ('k') | url/url_canon_etc.cc » ('j') | no next file with comments »