googleurl/src/url_canon_host.cc - Issue 160589: All host names with nonascii characters (cyrillic) + escapable characters (,)...

Side by Side Diff: googleurl/src/url_canon_host.cc

Issue 160589: All host names with nonascii characters (cyrillic) + escapable characters (,)... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2007, Google Inc.	1 // Copyright 2007, Google Inc.

2 // All rights reserved.	2 // All rights reserved.

3 //	3 //

4 // Redistribution and use in source and binary forms, with or without	4 // Redistribution and use in source and binary forms, with or without

5 // modification, are permitted provided that the following conditions are	5 // modification, are permitted provided that the following conditions are

6 // met:	6 // met:

7 //	7 //

8 // * Redistributions of source code must retain the above copyright	8 // * Redistributions of source code must retain the above copyright

9 // notice, this list of conditions and the following disclaimer.	9 // notice, this list of conditions and the following disclaimer.

10 // * Redistributions in binary form must reproduce the above	10 // * Redistributions in binary form must reproduce the above

(...skipping 116 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
127 // canonicalized host since we know nothing weird can happen (escaped	127 // canonicalized host since we know nothing weird can happen (escaped

128 // characters could be unescaped to non-7-bit, so they have to be treated	128 // characters could be unescaped to non-7-bit, so they have to be treated

129 // with suspicion at this point). It does not use the \|has_non_ascii\| flag.	129 // with suspicion at this point). It does not use the \|has_non_ascii\| flag.

130 //	130 //

131 // * When the caller has an 8-bit string that may need unescaping.	131 // * When the caller has an 8-bit string that may need unescaping.

132 // DoComplexHost calls us in this situation to do unescaping and validation.	132 // DoComplexHost calls us in this situation to do unescaping and validation.

133 // After this, it may do other IDN operations depending on the value of the	133 // After this, it may do other IDN operations depending on the value of the

134 // \|*has_non_ascii\| flag.	134 // \|*has_non_ascii\| flag.

135 //	135 //

136 // The return value indicates if the output is a potentially valid host name.	136 // The return value indicates if the output is a potentially valid host name.

137 template<typename CHAR>	137 template<typename INCHAR, typename OUTCHAR>

138 bool DoSimpleHost(const CHAR* host, int host_len, CanonOutput* output,	138 bool DoSimpleHost(const INCHAR* host,

	139 int host_len,

	140 CanonOutputT<OUTCHAR>* output,

139 bool* has_non_ascii) {	141 bool* has_non_ascii) {

140 *has_non_ascii = false;	142 *has_non_ascii = false;

141	143

142 bool success = true;	144 bool success = true;

143 for (int i = 0; i < host_len; i++) {	145 for (int i = 0; i < host_len; ++i) {

144 unsigned char source = static_cast<unsigned char>(host[i]);	146 unsigned int source = host[i];

145 if (source == '%') {	147 if (source == '%') {

146 // Handle unescaping. This will replace \|source\| with the unescaped char.	148 // Unescape first, if possible.

147 if (!DecodeEscaped(host, &i, host_len, &source)) {	149 // Source will be used only if decode operation was successful.

	150 if (!DecodeEscaped(host, &i, host_len,

	151 reinterpret_cast<unsigned char*>(&source))) {

148 // Invalid escaped character. There is nothing that can make this	152 // Invalid escaped character. There is nothing that can make this

149 // host valid. We append an escaped percent so the URL looks reasonable	153 // host valid. We append an escaped percent so the URL looks reasonable

150 // and mark as failed.	154 // and mark as failed.

151 AppendEscapedChar('%', output);	155 AppendEscapedChar('%', output);

152 success = false;	156 success = false;

153 continue;	157 continue;

154 }	158 }

155 }	159 }

156	160

157 if (source >= 0x80) {	161 if (source < 0x80) {

158 // Handle non-ASCII.

159 *has_non_ascii = true;

160 output->push_back(source);

161 } else {

162 // We have ASCII input, we can use our lookup table.	162 // We have ASCII input, we can use our lookup table.

163 unsigned char replacement = kHostCharLookup[source];	163 unsigned char replacement = kHostCharLookup[source];

164 if (!replacement) {	164 if (!replacement) {

165 // Invalid character, add it as percent-escaped and mark as failed.	165 // Invalid character, add it as percent-escaped and mark as failed.

166 AppendEscapedChar(source, output);	166 AppendEscapedChar(source, output);

167 success = false;	167 success = false;

168 } else if (replacement == kEsc) {	168 } else if (replacement == kEsc) {

169 // This character is valid but should be escaped.	169 // This character is valid but should be escaped.

170 AppendEscapedChar(source, output);	170 AppendEscapedChar(source, output);

171 } else {	171 } else {

172 // Common case, the given character is valid in a hostname, the lookup	172 // Common case, the given character is valid in a hostname, the lookup

173 // table tells us the canonical representation of that character (lower	173 // table tells us the canonical representation of that character (lower

174 // cased).	174 // cased).

175 output->push_back(replacement);	175 output->push_back(replacement);

176 }	176 }

	177 } else {

	178 // It's a non-ascii char. Just push it to the output.

	179 // In case where we have char16 input, and char output it's safe to

	180 // cast char16->char only if input string was converted to ASCII.

	181 output->push_back(static_cast<OUTCHAR>(source));

	182 *has_non_ascii = true;

177 }	183 }

178 }	184 }

	185

179 return success;	186 return success;

180 }	187 }

181	188

182 // Canonicalizes a host that requires IDN conversion. Returns true on success.	189 // Canonicalizes a host that requires IDN conversion. Returns true on success.

183 bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {	190 bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {

	191 // We need to escape URL before doing IDN conversion, since Punycode strings

	192 // cannot be escaped after they are created.

	193 RawCanonOutputW<kTempHostBufferLen> url_escaped_host;

	194 bool has_non_ascii;

	195 DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);

	196

184 StackBufferW wide_output;	197 StackBufferW wide_output;

185 if (!IDNToASCII(src, src_len, &wide_output)) {	198 if (!IDNToASCII(url_escaped_host.data(),

	199 url_escaped_host.length(),

	200 &wide_output)) {

186 // Some error, give up. This will write some reasonable looking	201 // Some error, give up. This will write some reasonable looking

187 // representation of the string to the output.	202 // representation of the string to the output.

188 AppendInvalidNarrowString(src, 0, src_len, output);	203 AppendInvalidNarrowString(src, 0, src_len, output);

189 return false;	204 return false;

190 }	205 }

191	206

192 // Now we check the ASCII output like a normal host. It will also handle	207 // Now we check the ASCII output like a normal host. It will also handle

193 // unescaping. Although we unescaped everything before this function call, if	208 // unescaping. Although we unescaped everything before this function call, if

194 // somebody does %00 as fullwidth, ICU will convert this to ASCII.	209 // somebody does %00 as fullwidth, ICU will convert this to ASCII.

195 bool has_non_ascii;	210 bool success = DoSimpleHost(wide_output.data(),

196 bool success = DoSimpleHost<char16>(wide_output.data(),	211 wide_output.length(),

197 wide_output.length(),	212 output, &has_non_ascii);

198 output, &has_non_ascii);

199 DCHECK(!has_non_ascii);	213 DCHECK(!has_non_ascii);

200 return success;	214 return success;

201 }	215 }

202	216

203 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to	217 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to

204 // UTF-16. The has_escaped flag should be set if the input string requires	218 // UTF-16. The has_escaped flag should be set if the input string requires

205 // unescaping.	219 // unescaping.

206 bool DoComplexHost(const char* host, int host_len,	220 bool DoComplexHost(const char* host, int host_len,

207 bool has_non_ascii, bool has_escaped, CanonOutput* output) {	221 bool has_non_ascii, bool has_escaped, CanonOutput* output) {

208 // Save the current position in the output. We may write stuff and rewind it	222 // Save the current position in the output. We may write stuff and rewind it

(...skipping 169 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
378 }	392 }

379	393

380 void CanonicalizeHostVerbose(const char16* spec,	394 void CanonicalizeHostVerbose(const char16* spec,

381 const url_parse::Component& host,	395 const url_parse::Component& host,

382 CanonOutput* output,	396 CanonOutput* output,

383 CanonHostInfo *host_info) {	397 CanonHostInfo *host_info) {

384 DoHost<char16, char16>(spec, host, output, host_info);	398 DoHost<char16, char16>(spec, host, output, host_info);

385 }	399 }

386	400

387 } // namespace url_canon	401 } // namespace url_canon

OLD	NEW

« no previous file with comments | « googleurl/src/gurl_test_main.cc ('k') | googleurl/src/url_canon_internal.h » ('j') | no next file with comments »