| Index: url/url_canon_etc.cc
|
| ===================================================================
|
| --- url/url_canon_etc.cc (revision 0)
|
| +++ url/url_canon_etc.cc (revision 0)
|
| @@ -0,0 +1,392 @@
|
| +// Copyright 2007, Google Inc.
|
| +// All rights reserved.
|
| +//
|
| +// Redistribution and use in source and binary forms, with or without
|
| +// modification, are permitted provided that the following conditions are
|
| +// met:
|
| +//
|
| +// * Redistributions of source code must retain the above copyright
|
| +// notice, this list of conditions and the following disclaimer.
|
| +// * Redistributions in binary form must reproduce the above
|
| +// copyright notice, this list of conditions and the following disclaimer
|
| +// in the documentation and/or other materials provided with the
|
| +// distribution.
|
| +// * Neither the name of Google Inc. nor the names of its
|
| +// contributors may be used to endorse or promote products derived from
|
| +// this software without specific prior written permission.
|
| +//
|
| +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| +
|
| +// Canonicalizers for random bits that aren't big enough for their own files.
|
| +
|
| +#include <string.h>
|
| +
|
| +#include "googleurl/src/url_canon.h"
|
| +#include "googleurl/src/url_canon_internal.h"
|
| +
|
| +namespace url_canon {
|
| +
|
| +namespace {
|
| +
|
| +// Returns true if the given character should be removed from the middle of a
|
| +// URL.
|
| +inline bool IsRemovableURLWhitespace(int ch) {
|
| + return ch == '\r' || ch == '\n' || ch == '\t';
|
| +}
|
| +
|
| +// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
|
| +// It sucks that we have to do this, since this takes about 13% of the total URL
|
| +// canonicalization time.
|
| +template<typename CHAR>
|
| +const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
|
| + CanonOutputT<CHAR>* buffer,
|
| + int* output_len) {
|
| + // Fast verification that there's nothing that needs removal. This is the 99%
|
| + // case, so we want it to be fast and don't care about impacting the speed
|
| + // when we do find whitespace.
|
| + int found_whitespace = false;
|
| + for (int i = 0; i < input_len; i++) {
|
| + if (!IsRemovableURLWhitespace(input[i]))
|
| + continue;
|
| + found_whitespace = true;
|
| + break;
|
| + }
|
| +
|
| + if (!found_whitespace) {
|
| + // Didn't find any whitespace, we don't need to do anything. We can just
|
| + // return the input as the output.
|
| + *output_len = input_len;
|
| + return input;
|
| + }
|
| +
|
| + // Remove the whitespace into the new buffer and return it.
|
| + for (int i = 0; i < input_len; i++) {
|
| + if (!IsRemovableURLWhitespace(input[i]))
|
| + buffer->push_back(input[i]);
|
| + }
|
| + *output_len = buffer->length();
|
| + return buffer->data();
|
| +}
|
| +
|
| +// Contains the canonical version of each possible input letter in the scheme
|
| +// (basically, lower-cased). The corresponding entry will be 0 if the letter
|
| +// is not allowed in a scheme.
|
| +const char kSchemeCanonical[0x80] = {
|
| +// 00-1f: all are invalid
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
| +// ' ' ! " # $ % & ' ( ) * + , - . /
|
| + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0,
|
| +// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
|
| + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 ,
|
| +// @ A B C D E F G H I J K L M N O
|
| + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
|
| +// P Q R S T U V W X Y Z [ \ ] ^ _
|
| + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0,
|
| +// ` a b c d e f g h i j k l m n o
|
| + 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
|
| +// p q r s t u v w x y z { | } ~
|
| + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 };
|
| +
|
| +// This could be a table lookup as well by setting the high bit for each
|
| +// valid character, but it's only called once per URL, and it makes the lookup
|
| +// table easier to read not having extra stuff in it.
|
| +inline bool IsSchemeFirstChar(unsigned char c) {
|
| + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
|
| +}
|
| +
|
| +template<typename CHAR, typename UCHAR>
|
| +bool DoScheme(const CHAR* spec,
|
| + const url_parse::Component& scheme,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_scheme) {
|
| + if (scheme.len <= 0) {
|
| + // Scheme is unspecified or empty, convert to empty by appending a colon.
|
| + *out_scheme = url_parse::Component(output->length(), 0);
|
| + output->push_back(':');
|
| + return true;
|
| + }
|
| +
|
| + // The output scheme starts from the current position.
|
| + out_scheme->begin = output->length();
|
| +
|
| + // Danger: it's important that this code does not strip any characters: it
|
| + // only emits the canonical version (be it valid or escaped) of each of
|
| + // the input characters. Stripping would put it out of sync with
|
| + // url_util::FindAndCompareScheme, which could cause some security checks on
|
| + // schemes to be incorrect.
|
| + bool success = true;
|
| + int end = scheme.end();
|
| + for (int i = scheme.begin; i < end; i++) {
|
| + UCHAR ch = static_cast<UCHAR>(spec[i]);
|
| + char replacement = 0;
|
| + if (ch < 0x80) {
|
| + if (i == scheme.begin) {
|
| + // Need to do a special check for the first letter of the scheme.
|
| + if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
|
| + replacement = kSchemeCanonical[ch];
|
| + } else {
|
| + replacement = kSchemeCanonical[ch];
|
| + }
|
| + }
|
| +
|
| + if (replacement) {
|
| + output->push_back(replacement);
|
| + } else if (ch == '%') {
|
| + // Canonicalizing the scheme multiple times should lead to the same
|
| + // result. Since invalid characters will be escaped, we need to preserve
|
| + // the percent to avoid multiple escaping. The scheme will be invalid.
|
| + success = false;
|
| + output->push_back('%');
|
| + } else {
|
| + // Invalid character, store it but mark this scheme as invalid.
|
| + success = false;
|
| +
|
| + // This will escape the output and also handle encoding issues.
|
| + // Ignore the return value since we already failed.
|
| + AppendUTF8EscapedChar(spec, &i, end, output);
|
| + }
|
| + }
|
| +
|
| + // The output scheme ends with the the current position, before appending
|
| + // the colon.
|
| + out_scheme->len = output->length() - out_scheme->begin;
|
| + output->push_back(':');
|
| + return success;
|
| +}
|
| +
|
| +// The username and password components reference ranges in the corresponding
|
| +// *_spec strings. Typically, these specs will be the same (we're
|
| +// canonicalizing a single source string), but may be different when
|
| +// replacing components.
|
| +template<typename CHAR, typename UCHAR>
|
| +bool DoUserInfo(const CHAR* username_spec,
|
| + const url_parse::Component& username,
|
| + const CHAR* password_spec,
|
| + const url_parse::Component& password,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_username,
|
| + url_parse::Component* out_password) {
|
| + if (username.len <= 0 && password.len <= 0) {
|
| + // Common case: no user info. We strip empty username/passwords.
|
| + *out_username = url_parse::Component();
|
| + *out_password = url_parse::Component();
|
| + return true;
|
| + }
|
| +
|
| + // Write the username.
|
| + out_username->begin = output->length();
|
| + if (username.len > 0) {
|
| + // This will escape characters not valid for the username.
|
| + AppendStringOfType(&username_spec[username.begin], username.len,
|
| + CHAR_USERINFO, output);
|
| + }
|
| + out_username->len = output->length() - out_username->begin;
|
| +
|
| + // When there is a password, we need the separator. Note that we strip
|
| + // empty but specified passwords.
|
| + if (password.len > 0) {
|
| + output->push_back(':');
|
| + out_password->begin = output->length();
|
| + AppendStringOfType(&password_spec[password.begin], password.len,
|
| + CHAR_USERINFO, output);
|
| + out_password->len = output->length() - out_password->begin;
|
| + } else {
|
| + *out_password = url_parse::Component();
|
| + }
|
| +
|
| + output->push_back('@');
|
| + return true;
|
| +}
|
| +
|
| +// Helper functions for converting port integers to strings.
|
| +inline void WritePortInt(char* output, int output_len, int port) {
|
| + _itoa_s(port, output, output_len, 10);
|
| +}
|
| +
|
| +// This function will prepend the colon if there will be a port.
|
| +template<typename CHAR, typename UCHAR>
|
| +bool DoPort(const CHAR* spec,
|
| + const url_parse::Component& port,
|
| + int default_port_for_scheme,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_port) {
|
| + int port_num = url_parse::ParsePort(spec, port);
|
| + if (port_num == url_parse::PORT_UNSPECIFIED ||
|
| + port_num == default_port_for_scheme) {
|
| + *out_port = url_parse::Component();
|
| + return true; // Leave port empty.
|
| + }
|
| +
|
| + if (port_num == url_parse::PORT_INVALID) {
|
| + // Invalid port: We'll copy the text from the input so the user can see
|
| + // what the error was, and mark the URL as invalid by returning false.
|
| + output->push_back(':');
|
| + out_port->begin = output->length();
|
| + AppendInvalidNarrowString(spec, port.begin, port.end(), output);
|
| + out_port->len = output->length() - out_port->begin;
|
| + return false;
|
| + }
|
| +
|
| + // Convert port number back to an integer. Max port value is 5 digits, and
|
| + // the Parsed::ExtractPort will have made sure the integer is in range.
|
| + const int buf_size = 6;
|
| + char buf[buf_size];
|
| + WritePortInt(buf, buf_size, port_num);
|
| +
|
| + // Append the port number to the output, preceeded by a colon.
|
| + output->push_back(':');
|
| + out_port->begin = output->length();
|
| + for (int i = 0; i < buf_size && buf[i]; i++)
|
| + output->push_back(buf[i]);
|
| +
|
| + out_port->len = output->length() - out_port->begin;
|
| + return true;
|
| +}
|
| +
|
| +template<typename CHAR, typename UCHAR>
|
| +void DoCanonicalizeRef(const CHAR* spec,
|
| + const url_parse::Component& ref,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_ref) {
|
| + if (ref.len < 0) {
|
| + // Common case of no ref.
|
| + *out_ref = url_parse::Component();
|
| + return;
|
| + }
|
| +
|
| + // Append the ref separator. Note that we need to do this even when the ref
|
| + // is empty but present.
|
| + output->push_back('#');
|
| + out_ref->begin = output->length();
|
| +
|
| + // Now iterate through all the characters, converting to UTF-8 and validating.
|
| + int end = ref.end();
|
| + for (int i = ref.begin; i < end; i++) {
|
| + if (spec[i] == 0) {
|
| + // IE just strips NULLs, so we do too.
|
| + continue;
|
| + } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
|
| + // Unline IE seems to, we escape control characters. This will probably
|
| + // make the reference fragment unusable on a web page, but people
|
| + // shouldn't be using control characters in their anchor names.
|
| + AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
|
| + } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
|
| + // Normal ASCII characters are just appended.
|
| + output->push_back(static_cast<char>(spec[i]));
|
| + } else {
|
| + // Non-ASCII characters are appended unescaped, but only when they are
|
| + // valid. Invalid Unicode characters are replaced with the "invalid
|
| + // character" as IE seems to (ReadUTFChar puts the unicode replacement
|
| + // character in the output on failure for us).
|
| + unsigned code_point;
|
| + ReadUTFChar(spec, &i, end, &code_point);
|
| + AppendUTF8Value(code_point, output);
|
| + }
|
| + }
|
| +
|
| + out_ref->len = output->length() - out_ref->begin;
|
| +}
|
| +
|
| +} // namespace
|
| +
|
| +const char* RemoveURLWhitespace(const char* input, int input_len,
|
| + CanonOutputT<char>* buffer,
|
| + int* output_len) {
|
| + return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
|
| +}
|
| +
|
| +const char16* RemoveURLWhitespace(const char16* input, int input_len,
|
| + CanonOutputT<char16>* buffer,
|
| + int* output_len) {
|
| + return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
|
| +}
|
| +
|
| +char CanonicalSchemeChar(char16 ch) {
|
| + if (ch >= 0x80)
|
| + return 0; // Non-ASCII is not supported by schemes.
|
| + return kSchemeCanonical[ch];
|
| +}
|
| +
|
| +bool CanonicalizeScheme(const char* spec,
|
| + const url_parse::Component& scheme,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_scheme) {
|
| + return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
|
| +}
|
| +
|
| +bool CanonicalizeScheme(const char16* spec,
|
| + const url_parse::Component& scheme,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_scheme) {
|
| + return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
|
| +}
|
| +
|
| +bool CanonicalizeUserInfo(const char* username_source,
|
| + const url_parse::Component& username,
|
| + const char* password_source,
|
| + const url_parse::Component& password,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_username,
|
| + url_parse::Component* out_password) {
|
| + return DoUserInfo<char, unsigned char>(
|
| + username_source, username, password_source, password,
|
| + output, out_username, out_password);
|
| +}
|
| +
|
| +bool CanonicalizeUserInfo(const char16* username_source,
|
| + const url_parse::Component& username,
|
| + const char16* password_source,
|
| + const url_parse::Component& password,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_username,
|
| + url_parse::Component* out_password) {
|
| + return DoUserInfo<char16, char16>(
|
| + username_source, username, password_source, password,
|
| + output, out_username, out_password);
|
| +}
|
| +
|
| +bool CanonicalizePort(const char* spec,
|
| + const url_parse::Component& port,
|
| + int default_port_for_scheme,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_port) {
|
| + return DoPort<char, unsigned char>(spec, port,
|
| + default_port_for_scheme,
|
| + output, out_port);
|
| +}
|
| +
|
| +bool CanonicalizePort(const char16* spec,
|
| + const url_parse::Component& port,
|
| + int default_port_for_scheme,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_port) {
|
| + return DoPort<char16, char16>(spec, port, default_port_for_scheme,
|
| + output, out_port);
|
| +}
|
| +
|
| +void CanonicalizeRef(const char* spec,
|
| + const url_parse::Component& ref,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_ref) {
|
| + DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
|
| +}
|
| +
|
| +void CanonicalizeRef(const char16* spec,
|
| + const url_parse::Component& ref,
|
| + CanonOutput* output,
|
| + url_parse::Component* out_ref) {
|
| + DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
|
| +}
|
| +
|
| +} // namespace url_canon
|
|
|
| Property changes on: url/url_canon_etc.cc
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|