url/url_canon_query.cc - Issue 13821004: Move googleurl into the Chrome repo.

Side by Side Diff: url/url_canon_query.cc

Issue 13821004: Move googleurl into the Chrome repo. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright 2007, Google Inc.

	2 // All rights reserved.

	3 //

	4 // Redistribution and use in source and binary forms, with or without

	5 // modification, are permitted provided that the following conditions are

	6 // met:

	7 //

	8 // * Redistributions of source code must retain the above copyright

	9 // notice, this list of conditions and the following disclaimer.

	10 // * Redistributions in binary form must reproduce the above

	11 // copyright notice, this list of conditions and the following disclaimer

	12 // in the documentation and/or other materials provided with the

	13 // distribution.

	14 // * Neither the name of Google Inc. nor the names of its

	15 // contributors may be used to endorse or promote products derived from

	16 // this software without specific prior written permission.

	17 //

	18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	29

	30 #include "googleurl/src/url_canon.h"

	31 #include "googleurl/src/url_canon_internal.h"

	32

	33 // Query canonicalization in IE

	34 // ----------------------------

	35 // IE is very permissive for query parameters specified in links on the page

	36 // (in contrast to links that it constructs itself based on form data). It does

	37 // not unescape any character. It does not reject any escape sequence (be they

	38 // invalid like "%2y" or freaky like %00).

	39 //

	40 // IE only escapes spaces and nothing else. Embedded NULLs, tabs (0x09),

	41 // LF (0x0a), and CR (0x0d) are removed (this probably happens at an earlier

	42 // layer since they are removed from all portions of the URL). All other

	43 // characters are passed unmodified. Invalid UTF-16 sequences are preserved as

	44 // well, with each character in the input being converted to UTF-8. It is the

	45 // server's job to make sense of this invalid query.

	46 //

	47 // Invalid multibyte sequences (for example, invalid UTF-8 on a UTF-8 page)

	48 // are converted to the invalid character and sent as unescaped UTF-8 (0xef,

	49 // 0xbf, 0xbd). This may not be canonicalization; the parser may generate these

	50 // strings before the URL handler ever sees them.

	51 //

	52 // Our query canonicalization

	53 // --------------------------

	54 // We escape all non-ASCII characters and control characters, like Firefox.

	55 // This is more conformant to the URL spec, and there do not seem to be many

	56 // problems relating to Firefox's behavior.

	57 //

	58 // Like IE, we will never unescape (although the application may want to try

	59 // unescaping to present the user with a more understandable URL). We will

	60 // replace all invalid sequences (including invalid UTF-16 sequences, which IE

	61 // doesn't) with the "invalid character," and we will escape it.

	62

	63 namespace url_canon {

	64

	65 namespace {

	66

	67 // Returns true if the characters starting at \|begin\| and going until \|end\|

	68 // (non-inclusive) are all representable in 7-bits.

	69 template<typename CHAR, typename UCHAR>

	70 bool IsAllASCII(const CHAR* spec, const url_parse::Component& query) {

	71 int end = query.end();

	72 for (int i = query.begin; i < end; i++) {

	73 if (static_cast<UCHAR>(spec[i]) >= 0x80)

	74 return false;

	75 }

	76 return true;

	77 }

	78

	79 // Appends the given string to the output, escaping characters that do not

	80 // match the given \|type\| in SharedCharTypes. This version will accept 8 or 16

	81 // bit characters, but assumes that they have only 7-bit values. It also assumes

	82 // that all UTF-8 values are correct, so doesn't bother checking

	83 template<typename CHAR>

	84 void AppendRaw8BitQueryString(const CHAR* source, int length,

	85 CanonOutput* output) {

	86 for (int i = 0; i < length; i++) {

	87 if (!IsQueryChar(static_cast<unsigned char>(source[i])))

	88 AppendEscapedChar(static_cast<unsigned char>(source[i]), output);

	89 else // Doesn't need escaping.

	90 output->push_back(static_cast<char>(source[i]));

	91 }

	92 }

	93

	94 // Runs the converter on the given UTF-8 input. Since the converter expects

	95 // UTF-16, we have to convert first. The converter must be non-NULL.

	96 void RunConverter(const char* spec,

	97 const url_parse::Component& query,

	98 CharsetConverter* converter,

	99 CanonOutput* output) {

	100 // This function will replace any misencoded values with the invalid

	101 // character. This is what we want so we don't have to check for error.

	102 RawCanonOutputW<1024> utf16;

	103 ConvertUTF8ToUTF16(&spec[query.begin], query.len, &utf16);

	104 converter->ConvertFromUTF16(utf16.data(), utf16.length(), output);

	105 }

	106

	107 // Runs the converter with the given UTF-16 input. We don't have to do

	108 // anything, but this overriddden function allows us to use the same code

	109 // for both UTF-8 and UTF-16 input.

	110 void RunConverter(const char16* spec,

	111 const url_parse::Component& query,

	112 CharsetConverter* converter,

	113 CanonOutput* output) {

	114 converter->ConvertFromUTF16(&spec[query.begin], query.len, output);

	115 }

	116

	117 template<typename CHAR, typename UCHAR>

	118 void DoConvertToQueryEncoding(const CHAR* spec,

	119 const url_parse::Component& query,

	120 CharsetConverter* converter,

	121 CanonOutput* output) {

	122 if (IsAllASCII<CHAR, UCHAR>(spec, query)) {

	123 // Easy: the input can just appended with no character set conversions.

	124 AppendRaw8BitQueryString(&spec[query.begin], query.len, output);

	125

	126 } else {

	127 // Harder: convert to the proper encoding first.

	128 if (converter) {

	129 // Run the converter to get an 8-bit string, then append it, escaping

	130 // necessary values.

	131 RawCanonOutput<1024> eight_bit;

	132 RunConverter(spec, query, converter, &eight_bit);

	133 AppendRaw8BitQueryString(eight_bit.data(), eight_bit.length(), output);

	134

	135 } else {

	136 // No converter, do our own UTF-8 conversion.

	137 AppendStringOfType(&spec[query.begin], query.len, CHAR_QUERY, output);

	138 }

	139 }

	140 }

	141

	142 template<typename CHAR, typename UCHAR>

	143 void DoCanonicalizeQuery(const CHAR* spec,

	144 const url_parse::Component& query,

	145 CharsetConverter* converter,

	146 CanonOutput* output,

	147 url_parse::Component* out_query) {

	148 if (query.len < 0) {

	149 *out_query = url_parse::Component();

	150 return;

	151 }

	152

	153 output->push_back('?');

	154 out_query->begin = output->length();

	155

	156 DoConvertToQueryEncoding<CHAR, UCHAR>(spec, query, converter, output);

	157

	158 out_query->len = output->length() - out_query->begin;

	159 }

	160

	161 } // namespace

	162

	163 void CanonicalizeQuery(const char* spec,

	164 const url_parse::Component& query,

	165 CharsetConverter* converter,

	166 CanonOutput* output,

	167 url_parse::Component* out_query) {

	168 DoCanonicalizeQuery<char, unsigned char>(spec, query, converter,

	169 output, out_query);

	170 }

	171

	172 void CanonicalizeQuery(const char16* spec,

	173 const url_parse::Component& query,

	174 CharsetConverter* converter,

	175 CanonOutput* output,

	176 url_parse::Component* out_query) {

	177 DoCanonicalizeQuery<char16, char16>(spec, query, converter,

	178 output, out_query);

	179 }

	180

	181 void ConvertUTF16ToQueryEncoding(const char16* input,

	182 const url_parse::Component& query,

	183 CharsetConverter* converter,

	184 CanonOutput* output) {

	185 DoConvertToQueryEncoding<char16, char16>(input, query,

	186 converter, output);

	187 }

	188

	189 } // namespace url_canon

OLD	NEW

« no previous file with comments | « url/url_canon_pathurl.cc ('k') | url/url_canon_relative.cc » ('j') | no next file with comments »