OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (C) 2004, 2007, 2008, 2011, 2012 Apple Inc. All rights reserved. | |
3 * Copyright (C) 2012 Research In Motion Limited. All rights reserved. | |
4 * Copyright (C) 2008, 2009, 2011 Google Inc. All rights reserved. | |
5 * | |
6 * Redistribution and use in source and binary forms, with or without | |
7 * modification, are permitted provided that the following conditions | |
8 * are met: | |
9 * 1. Redistributions of source code must retain the above copyright | |
10 * notice, this list of conditions and the following disclaimer. | |
11 * 2. Redistributions in binary form must reproduce the above copyright | |
12 * notice, this list of conditions and the following disclaimer in the | |
13 * documentation and/or other materials provided with the distribution. | |
14 * | |
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR | |
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 */ | |
27 | |
28 #include "config.h" | |
29 #include "weborigin/KURL.h" | |
30 | |
31 #include "weborigin/KnownPorts.h" | |
32 #include "wtf/HashMap.h" | |
33 #include "wtf/StdLibExtras.h" | |
34 #include "wtf/text/CString.h" | |
35 #include "wtf/text/StringHash.h" | |
36 #include "wtf/text/StringUTF8Adaptor.h" | |
37 #include "wtf/text/TextEncoding.h" | |
38 #include <algorithm> | |
39 #include <url/url_util.h> | |
40 #ifndef NDEBUG | |
41 #include <stdio.h> | |
42 #endif | |
43 | |
44 namespace WebCore { | |
45 | |
// Sentinel that KURL::port() reports for ports it cannot parse or that lie
// above maximumValidPortNumber.
static const int invalidPortNumber = 0xFFFF;
// Largest parsed port value that KURL::port() returns unchanged.
static const int maximumValidPortNumber = 0xFFFE;
48 | |
49 static void assertProtocolIsGood(const char* protocol) | |
50 { | |
51 #ifndef NDEBUG | |
52 const char* p = protocol; | |
53 while (*p) { | |
54 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); | |
55 ++p; | |
56 } | |
57 #endif | |
58 } | |
59 | |
60 // Note: You must ensure that |spec| is a valid canonicalized URL before calling
this function. | |
61 static const char* asURLChar8Subtle(const String& spec) | |
62 { | |
63 ASSERT(spec.is8Bit()); | |
64 // characters8 really return characters in Latin-1, but because we canonical
ize | |
65 // URL strings, we know that everything before the fragment identifier will | |
66 // actually be ASCII, which means this cast is safe as long as you don't loo
k | |
67 // at the fragment component. | |
68 return reinterpret_cast<const char*>(spec.characters8()); | |
69 } | |
70 | |
71 // Returns the characters for the given string, or a pointer to a static empty | |
72 // string if the input string is null. This will always ensure we have a non- | |
73 // null character pointer since ReplaceComponents has special meaning for null. | |
74 static const char* charactersOrEmpty(const StringUTF8Adaptor& string) | |
75 { | |
76 static const char zero = 0; | |
77 return string.data() ? string.data() : &zero; | |
78 } | |
79 | |
// True when |c| is an ASCII letter, the only kind of character allowed to
// start a scheme.
static bool isSchemeFirstChar(char c)
{
    if (c >= 'a' && c <= 'z')
        return true;
    return c >= 'A' && c <= 'Z';
}
84 | |
85 static bool isSchemeChar(char c) | |
86 { | |
87 return isSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' || c == '-
' || c == '+'; | |
88 } | |
89 | |
90 static bool isUnicodeEncoding(const WTF::TextEncoding* encoding) | |
91 { | |
92 return encoding->encodingForFormSubmission() == UTF8Encoding(); | |
93 } | |
94 | |
95 namespace { | |
96 | |
97 class KURLCharsetConverter : public url_canon::CharsetConverter { | |
98 public: | |
99 // The encoding parameter may be 0, but in this case the object must not be
called. | |
100 explicit KURLCharsetConverter(const WTF::TextEncoding* encoding) | |
101 : m_encoding(encoding) | |
102 { | |
103 } | |
104 | |
105 virtual void ConvertFromUTF16(const url_parse::UTF16Char* input, int inputLe
ngth, url_canon::CanonOutput* output) | |
106 { | |
107 CString encoded = m_encoding->normalizeAndEncode(String(input, inputLeng
th), WTF::URLEncodedEntitiesForUnencodables); | |
108 output->Append(encoded.data(), static_cast<int>(encoded.length())); | |
109 } | |
110 | |
111 private: | |
112 const WTF::TextEncoding* m_encoding; | |
113 }; | |
114 | |
115 } // namespace | |
116 | |
117 bool isValidProtocol(const String& protocol) | |
118 { | |
119 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) | |
120 if (protocol.isEmpty()) | |
121 return false; | |
122 if (!isSchemeFirstChar(protocol[0])) | |
123 return false; | |
124 unsigned protocolLength = protocol.length(); | |
125 for (unsigned i = 1; i < protocolLength; i++) { | |
126 if (!isSchemeChar(protocol[i])) | |
127 return false; | |
128 } | |
129 return true; | |
130 } | |
131 | |
132 String KURL::strippedForUseAsReferrer() const | |
133 { | |
134 KURL referrer(*this); | |
135 referrer.setUser(String()); | |
136 referrer.setPass(String()); | |
137 referrer.removeFragmentIdentifier(); | |
138 return referrer.string(); | |
139 } | |
140 | |
141 bool KURL::isLocalFile() const | |
142 { | |
143 // Including feed here might be a bad idea since drag and drop uses this che
ck | |
144 // and including feed would allow feeds to potentially let someone's blog | |
145 // read the contents of the clipboard on a drag, even without a drop. | |
146 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function. | |
147 return protocolIs("file"); | |
148 } | |
149 | |
150 bool protocolIsJavaScript(const String& url) | |
151 { | |
152 return protocolIs(url, "javascript"); | |
153 } | |
154 | |
155 const KURL& blankURL() | |
156 { | |
157 DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank")); | |
158 return staticBlankURL; | |
159 } | |
160 | |
161 bool KURL::isBlankURL() const | |
162 { | |
163 return protocolIs("about"); | |
164 } | |
165 | |
166 String KURL::elidedString() const | |
167 { | |
168 if (string().length() <= 1024) | |
169 return string(); | |
170 | |
171 return string().left(511) + "..." + string().right(510); | |
172 } | |
173 | |
174 // Initializes with a string representing an absolute URL. No encoding | |
175 // information is specified. This generally happens when a KURL is converted | |
176 // to a string and then converted back. In this case, the URL is already | |
177 // canonical and in proper escaped form so needs no encoding. We treat it as | |
178 // UTF-8 just in case. | |
179 KURL::KURL(ParsedURLStringTag, const String& url) | |
180 { | |
181 if (!url.isNull()) | |
182 init(KURL(), url, 0); | |
183 else { | |
184 // WebCore expects us to preserve the nullness of strings when this | |
185 // constructor is used. In all other cases, it expects a non-null | |
186 // empty string, which is what init() will create. | |
187 m_isValid = false; | |
188 m_protocolIsInHTTPFamily = false; | |
189 } | |
190 } | |
191 | |
192 KURL KURL::createIsolated(ParsedURLStringTag, const String& url) | |
193 { | |
194 // FIXME: We should be able to skip this extra copy and created an | |
195 // isolated KURL more efficiently. | |
196 return KURL(ParsedURLString, url).copy(); | |
197 } | |
198 | |
199 // Constructs a new URL given a base URL and a possibly relative input URL. | |
200 // This assumes UTF-8 encoding. | |
201 KURL::KURL(const KURL& base, const String& relative) | |
202 { | |
203 init(base, relative, 0); | |
204 } | |
205 | |
206 // Constructs a new URL given a base URL and a possibly relative input URL. | |
207 // Any query portion of the relative URL will be encoded in the given encoding. | |
208 KURL::KURL(const KURL& base, const String& relative, const WTF::TextEncoding& en
coding) | |
209 { | |
210 init(base, relative, &encoding.encodingForFormSubmission()); | |
211 } | |
212 | |
213 KURL::KURL(const AtomicString& canonicalString, const url_parse::Parsed& parsed,
bool isValid) | |
214 : m_isValid(isValid) | |
215 , m_protocolIsInHTTPFamily(false) | |
216 , m_parsed(parsed) | |
217 , m_string(canonicalString) | |
218 { | |
219 initProtocolIsInHTTPFamily(); | |
220 initInnerURL(); | |
221 } | |
222 | |
223 KURL::KURL(WTF::HashTableDeletedValueType) | |
224 : m_isValid(false) | |
225 , m_protocolIsInHTTPFamily(false) | |
226 , m_string(WTF::HashTableDeletedValue) | |
227 { | |
228 } | |
229 | |
230 KURL::KURL(const KURL& other) | |
231 : m_isValid(other.m_isValid) | |
232 , m_protocolIsInHTTPFamily(other.m_protocolIsInHTTPFamily) | |
233 , m_parsed(other.m_parsed) | |
234 , m_string(other.m_string) | |
235 { | |
236 if (other.m_innerURL.get()) | |
237 m_innerURL = adoptPtr(new KURL(other.m_innerURL->copy())); | |
238 } | |
239 | |
240 KURL& KURL::operator=(const KURL& other) | |
241 { | |
242 m_isValid = other.m_isValid; | |
243 m_protocolIsInHTTPFamily = other.m_protocolIsInHTTPFamily; | |
244 m_parsed = other.m_parsed; | |
245 m_string = other.m_string; | |
246 if (other.m_innerURL) | |
247 m_innerURL = adoptPtr(new KURL(other.m_innerURL->copy())); | |
248 else | |
249 m_innerURL.clear(); | |
250 return *this; | |
251 } | |
252 | |
253 KURL KURL::copy() const | |
254 { | |
255 KURL result; | |
256 result.m_isValid = m_isValid; | |
257 result.m_protocolIsInHTTPFamily = m_protocolIsInHTTPFamily; | |
258 result.m_parsed = m_parsed; | |
259 result.m_string = m_string.isolatedCopy(); | |
260 if (result.m_innerURL) | |
261 result.m_innerURL = adoptPtr(new KURL(m_innerURL->copy())); | |
262 return result; | |
263 } | |
264 | |
265 bool KURL::isNull() const | |
266 { | |
267 return m_string.isNull(); | |
268 } | |
269 | |
270 bool KURL::isEmpty() const | |
271 { | |
272 return m_string.isEmpty(); | |
273 } | |
274 | |
275 bool KURL::isValid() const | |
276 { | |
277 return m_isValid; | |
278 } | |
279 | |
280 bool KURL::hasPort() const | |
281 { | |
282 return hostEnd() < pathStart(); | |
283 } | |
284 | |
285 bool KURL::protocolIsInHTTPFamily() const | |
286 { | |
287 return m_protocolIsInHTTPFamily; | |
288 } | |
289 | |
290 bool KURL::hasPath() const | |
291 { | |
292 // Note that http://www.google.com/" has a path, the path is "/". This can | |
293 // return false only for invalid or nonstandard URLs. | |
294 return m_parsed.path.len >= 0; | |
295 } | |
296 | |
297 // We handle "parameters" separated by a semicolon, while KURL.cpp does not, | |
298 // which can lead to different results in some cases. | |
299 String KURL::lastPathComponent() const | |
300 { | |
301 if (!m_isValid) | |
302 return stringForInvalidComponent(); | |
303 ASSERT(!m_string.isNull()); | |
304 | |
305 // When the output ends in a slash, WebCore has different expectations than | |
306 // the GoogleURL library. For "/foo/bar/" the library will return the empty | |
307 // string, but WebCore wants "bar". | |
308 url_parse::Component path = m_parsed.path; | |
309 if (path.len > 0 && m_string[path.end() - 1] == '/') | |
310 path.len--; | |
311 | |
312 url_parse::Component file; | |
313 if (m_string.is8Bit()) | |
314 url_parse::ExtractFileName(asURLChar8Subtle(m_string), path, &file); | |
315 else | |
316 url_parse::ExtractFileName(m_string.characters16(), path, &file); | |
317 | |
318 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns | |
319 // a null string when the path is empty, which we duplicate here. | |
320 if (!file.is_nonempty()) | |
321 return String(); | |
322 return componentString(file); | |
323 } | |
324 | |
325 String KURL::protocol() const | |
326 { | |
327 return componentString(m_parsed.scheme); | |
328 } | |
329 | |
330 String KURL::host() const | |
331 { | |
332 return componentString(m_parsed.host); | |
333 } | |
334 | |
335 // Returns 0 when there is no port. | |
336 // | |
337 // We treat URL's with out-of-range port numbers as invalid URLs, and they will | |
338 // be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but | |
339 // return invalidPortNumber from this port() function, so we mirror that behavio
r here. | |
340 unsigned short KURL::port() const | |
341 { | |
342 if (!m_isValid || m_parsed.port.len <= 0) | |
343 return 0; | |
344 ASSERT(!m_string.isNull()); | |
345 int port = m_string.is8Bit() ? | |
346 url_parse::ParsePort(asURLChar8Subtle(m_string), m_parsed.port) : | |
347 url_parse::ParsePort(m_string.characters16(), m_parsed.port); | |
348 ASSERT(port != url_parse::PORT_UNSPECIFIED); // Checked port.len <= 0 before
. | |
349 | |
350 if (port == url_parse::PORT_INVALID || port > maximumValidPortNumber) // Mim
ic KURL::port() | |
351 port = invalidPortNumber; | |
352 | |
353 return static_cast<unsigned short>(port); | |
354 } | |
355 | |
356 String KURL::pass() const | |
357 { | |
358 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns | |
359 // a null string when the password is empty, which we duplicate here. | |
360 if (!m_parsed.password.is_nonempty()) | |
361 return String(); | |
362 return componentString(m_parsed.password); | |
363 } | |
364 | |
365 String KURL::user() const | |
366 { | |
367 return componentString(m_parsed.username); | |
368 } | |
369 | |
370 String KURL::fragmentIdentifier() const | |
371 { | |
372 // Empty but present refs ("foo.com/bar#") should result in the empty | |
373 // string, which componentString will produce. Nonexistent refs | |
374 // should be the null string. | |
375 if (!m_parsed.ref.is_valid()) | |
376 return String(); | |
377 return componentString(m_parsed.ref); | |
378 } | |
379 | |
380 bool KURL::hasFragmentIdentifier() const | |
381 { | |
382 return m_parsed.ref.len >= 0; | |
383 } | |
384 | |
385 String KURL::baseAsString() const | |
386 { | |
387 // FIXME: There is probably a more efficient way to do this? | |
388 return m_string.left(pathAfterLastSlash()); | |
389 } | |
390 | |
391 String KURL::query() const | |
392 { | |
393 if (m_parsed.query.len >= 0) | |
394 return componentString(m_parsed.query); | |
395 | |
396 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns | |
397 // an empty string when the query is empty rather than a null (not sure | |
398 // which is right). | |
399 // Returns a null if the query is not specified, instead of empty. | |
400 if (m_parsed.query.is_valid()) | |
401 return emptyString(); | |
402 return String(); | |
403 } | |
404 | |
405 String KURL::path() const | |
406 { | |
407 return componentString(m_parsed.path); | |
408 } | |
409 | |
410 bool KURL::setProtocol(const String& protocol) | |
411 { | |
412 // Firefox and IE remove everything after the first ':'. | |
413 int separatorPosition = protocol.find(':'); | |
414 String newProtocol = protocol.substring(0, separatorPosition); | |
415 StringUTF8Adaptor newProtocolUTF8(newProtocol); | |
416 | |
417 // If KURL is given an invalid scheme, it returns failure without modifying | |
418 // the URL at all. This is in contrast to most other setters which modify | |
419 // the URL and set "m_isValid." | |
420 url_canon::RawCanonOutputT<char> canonProtocol; | |
421 url_parse::Component protocolComponent; | |
422 if (!url_canon::CanonicalizeScheme(newProtocolUTF8.data(), url_parse::Compon
ent(0, newProtocolUTF8.length()), &canonProtocol, &protocolComponent) | |
423 || !protocolComponent.is_nonempty()) | |
424 return false; | |
425 | |
426 url_canon::Replacements<char> replacements; | |
427 replacements.SetScheme(charactersOrEmpty(newProtocolUTF8), url_parse::Compon
ent(0, newProtocolUTF8.length())); | |
428 replaceComponents(replacements); | |
429 | |
430 // isValid could be false but we still return true here. This is because | |
431 // WebCore or JS scripts can build up a URL by setting individual | |
432 // components, and a JS exception is based on the return value of this | |
433 // function. We want to throw the exception and stop the script only when | |
434 // its trying to set a bad protocol, and not when it maybe just hasn't | |
435 // finished building up its final scheme. | |
436 return true; | |
437 } | |
438 | |
439 void KURL::setHost(const String& host) | |
440 { | |
441 StringUTF8Adaptor hostUTF8(host); | |
442 url_canon::Replacements<char> replacements; | |
443 replacements.SetHost(charactersOrEmpty(hostUTF8), url_parse::Component(0, ho
stUTF8.length())); | |
444 replaceComponents(replacements); | |
445 } | |
446 | |
447 static String parsePortFromStringPosition(const String& value, unsigned portStar
t) | |
448 { | |
449 // "008080junk" needs to be treated as port "8080" and "000" as "0". | |
450 size_t length = value.length(); | |
451 unsigned portEnd = portStart; | |
452 while (isASCIIDigit(value[portEnd]) && portEnd < length) | |
453 ++portEnd; | |
454 while (value[portStart] == '0' && portStart < portEnd - 1) | |
455 ++portStart; | |
456 | |
457 // Required for backwards compat. | |
458 // https://www.w3.org/Bugs/Public/show_bug.cgi?id=23463 | |
459 if (portStart == portEnd) | |
460 return "0"; | |
461 | |
462 return value.substring(portStart, portEnd - portStart); | |
463 } | |
464 | |
465 void KURL::setHostAndPort(const String& hostAndPort) | |
466 { | |
467 size_t separator = hostAndPort.find(':'); | |
468 if (!separator) | |
469 return; | |
470 | |
471 if (separator == kNotFound) { | |
472 url_canon::Replacements<char> replacements; | |
473 StringUTF8Adaptor hostUTF8(hostAndPort); | |
474 replacements.SetHost(charactersOrEmpty(hostUTF8), url_parse::Component(0
, hostUTF8.length())); | |
475 replaceComponents(replacements); | |
476 return; | |
477 } | |
478 | |
479 String host = hostAndPort.substring(0, separator); | |
480 String port = parsePortFromStringPosition(hostAndPort, separator + 1); | |
481 | |
482 StringUTF8Adaptor hostUTF8(host); | |
483 StringUTF8Adaptor portUTF8(port); | |
484 | |
485 url_canon::Replacements<char> replacements; | |
486 replacements.SetHost(charactersOrEmpty(hostUTF8), url_parse::Component(0, ho
stUTF8.length())); | |
487 replacements.SetPort(charactersOrEmpty(portUTF8), url_parse::Component(0, po
rtUTF8.length())); | |
488 replaceComponents(replacements); | |
489 } | |
490 | |
491 void KURL::removePort() | |
492 { | |
493 if (!hasPort()) | |
494 return; | |
495 url_canon::Replacements<char> replacements; | |
496 replacements.ClearPort(); | |
497 replaceComponents(replacements); | |
498 } | |
499 | |
500 void KURL::setPort(const String& port) | |
501 { | |
502 String parsedPort = parsePortFromStringPosition(port, 0); | |
503 setPort(parsedPort.toUInt()); | |
504 } | |
505 | |
506 void KURL::setPort(unsigned short port) | |
507 { | |
508 if (isDefaultPortForProtocol(port, protocol())) { | |
509 removePort(); | |
510 return; | |
511 } | |
512 | |
513 String portString = String::number(port); | |
514 ASSERT(portString.is8Bit()); | |
515 | |
516 url_canon::Replacements<char> replacements; | |
517 replacements.SetPort(reinterpret_cast<const char*>(portString.characters8())
, url_parse::Component(0, portString.length())); | |
518 replaceComponents(replacements); | |
519 } | |
520 | |
521 void KURL::setUser(const String& user) | |
522 { | |
523 // This function is commonly called to clear the username, which we | |
524 // normally don't have, so we optimize this case. | |
525 if (user.isEmpty() && !m_parsed.username.is_valid()) | |
526 return; | |
527 | |
528 // The canonicalizer will clear any usernames that are empty, so we | |
529 // don't have to explicitly call ClearUsername() here. | |
530 StringUTF8Adaptor userUTF8(user); | |
531 url_canon::Replacements<char> replacements; | |
532 replacements.SetUsername(charactersOrEmpty(userUTF8), url_parse::Component(0
, userUTF8.length())); | |
533 replaceComponents(replacements); | |
534 } | |
535 | |
536 void KURL::setPass(const String& pass) | |
537 { | |
538 // This function is commonly called to clear the password, which we | |
539 // normally don't have, so we optimize this case. | |
540 if (pass.isEmpty() && !m_parsed.password.is_valid()) | |
541 return; | |
542 | |
543 // The canonicalizer will clear any passwords that are empty, so we | |
544 // don't have to explicitly call ClearUsername() here. | |
545 StringUTF8Adaptor passUTF8(pass); | |
546 url_canon::Replacements<char> replacements; | |
547 replacements.SetPassword(charactersOrEmpty(passUTF8), url_parse::Component(0
, passUTF8.length())); | |
548 replaceComponents(replacements); | |
549 } | |
550 | |
551 void KURL::setFragmentIdentifier(const String& fragment) | |
552 { | |
553 // This function is commonly called to clear the ref, which we | |
554 // normally don't have, so we optimize this case. | |
555 if (fragment.isNull() && !m_parsed.ref.is_valid()) | |
556 return; | |
557 | |
558 StringUTF8Adaptor fragmentUTF8(fragment); | |
559 | |
560 url_canon::Replacements<char> replacements; | |
561 if (fragment.isNull()) | |
562 replacements.ClearRef(); | |
563 else | |
564 replacements.SetRef(charactersOrEmpty(fragmentUTF8), url_parse::Componen
t(0, fragmentUTF8.length())); | |
565 replaceComponents(replacements); | |
566 } | |
567 | |
568 void KURL::removeFragmentIdentifier() | |
569 { | |
570 url_canon::Replacements<char> replacements; | |
571 replacements.ClearRef(); | |
572 replaceComponents(replacements); | |
573 } | |
574 | |
575 void KURL::setQuery(const String& query) | |
576 { | |
577 StringUTF8Adaptor queryUTF8(query); | |
578 url_canon::Replacements<char> replacements; | |
579 if (query.isNull()) { | |
580 // KURL.cpp sets to null to clear any query. | |
581 replacements.ClearQuery(); | |
582 } else if (query.length() > 0 && query[0] == '?') { | |
583 // WebCore expects the query string to begin with a question mark, but | |
584 // GoogleURL doesn't. So we trim off the question mark when setting. | |
585 replacements.SetQuery(charactersOrEmpty(queryUTF8), url_parse::Component
(1, queryUTF8.length() - 1)); | |
586 } else { | |
587 // When set with the empty string or something that doesn't begin with | |
588 // a question mark, KURL.cpp will add a question mark for you. The only | |
589 // way this isn't compatible is if you call this function with an empty | |
590 // string. KURL.cpp will leave a '?' with nothing following it in the | |
591 // URL, whereas we'll clear it. | |
592 // FIXME We should eliminate this difference. | |
593 replacements.SetQuery(charactersOrEmpty(queryUTF8), url_parse::Component
(0, queryUTF8.length())); | |
594 } | |
595 replaceComponents(replacements); | |
596 } | |
597 | |
598 void KURL::setPath(const String& path) | |
599 { | |
600 // Empty paths will be canonicalized to "/", so we don't have to worry | |
601 // about calling ClearPath(). | |
602 StringUTF8Adaptor pathUTF8(path); | |
603 url_canon::Replacements<char> replacements; | |
604 replacements.SetPath(charactersOrEmpty(pathUTF8), url_parse::Component(0, pa
thUTF8.length())); | |
605 replaceComponents(replacements); | |
606 } | |
607 | |
608 String decodeURLEscapeSequences(const String& string) | |
609 { | |
610 return decodeURLEscapeSequences(string, UTF8Encoding()); | |
611 } | |
612 | |
613 // In KURL.cpp's implementation, this is called by every component getter. | |
614 // It will unescape every character, including '\0'. This is scary, and may | |
615 // cause security holes. We never call this function for components, and | |
616 // just return the ASCII versions instead. | |
617 // | |
618 // This function is also used to decode javascript: URLs and as a general | |
619 // purpose unescaping function. | |
620 // | |
621 // FIXME These should be merged to the KURL.cpp implementation. | |
622 String decodeURLEscapeSequences(const String& string, const WTF::TextEncoding& e
ncoding) | |
623 { | |
624 // FIXME We can probably use KURL.cpp's version of this function | |
625 // without modification. However, I'm concerned about | |
626 // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old | |
627 // custom code for now. Using their version will also fix the bug that | |
628 // we ignore the encoding. | |
629 // | |
630 // FIXME b/1350291: This does not get called very often. We just convert | |
631 // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of | |
632 // sucks, and we don't use the encoding properly, which will make some | |
633 // obscure anchor navigations fail. | |
634 StringUTF8Adaptor stringUTF8(string); | |
635 url_canon::RawCanonOutputT<url_parse::UTF16Char> unescaped; | |
636 url_util::DecodeURLEscapeSequences(stringUTF8.data(), stringUTF8.length(), &
unescaped); | |
637 return StringImpl::create8BitIfPossible(reinterpret_cast<UChar*>(unescaped.d
ata()), unescaped.length()); | |
638 } | |
639 | |
640 String encodeWithURLEscapeSequences(const String& notEncodedString) | |
641 { | |
642 CString utf8 = UTF8Encoding().normalizeAndEncode(notEncodedString, WTF::URLE
ncodedEntitiesForUnencodables); | |
643 | |
644 url_canon::RawCanonOutputT<char> buffer; | |
645 int inputLength = utf8.length(); | |
646 if (buffer.length() < inputLength * 3) | |
647 buffer.Resize(inputLength * 3); | |
648 | |
649 url_util::EncodeURIComponent(utf8.data(), inputLength, &buffer); | |
650 String escaped(buffer.data(), buffer.length()); | |
651 // Unescape '/'; it's safe and much prettier. | |
652 escaped.replace("%2F", "/"); | |
653 return escaped; | |
654 } | |
655 | |
656 bool KURL::isHierarchical() const | |
657 { | |
658 if (m_string.isNull() || !m_parsed.scheme.is_nonempty()) | |
659 return false; | |
660 return m_string.is8Bit() ? | |
661 url_util::IsStandard(asURLChar8Subtle(m_string), m_parsed.scheme) : | |
662 url_util::IsStandard(m_string.characters16(), m_parsed.scheme); | |
663 } | |
664 | |
665 #ifndef NDEBUG | |
666 void KURL::print() const | |
667 { | |
668 printf("%s\n", m_string.utf8().data()); | |
669 } | |
670 #endif | |
671 | |
672 bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b) | |
673 { | |
674 // Compute the length of each URL without its ref. Note that the reference | |
675 // begin (if it exists) points to the character *after* the '#', so we need | |
676 // to subtract one. | |
677 int aLength = a.m_string.length(); | |
678 if (a.m_parsed.ref.len >= 0) | |
679 aLength = a.m_parsed.ref.begin - 1; | |
680 | |
681 int bLength = b.m_string.length(); | |
682 if (b.m_parsed.ref.len >= 0) | |
683 bLength = b.m_parsed.ref.begin - 1; | |
684 | |
685 if (aLength != bLength) | |
686 return false; | |
687 | |
688 const String& aString = a.m_string; | |
689 const String& bString = b.m_string; | |
690 // FIXME: Abstraction this into a function in WTFString.h. | |
691 for (int i = 0; i < aLength; ++i) { | |
692 if (aString[i] != bString[i]) | |
693 return false; | |
694 } | |
695 return true; | |
696 } | |
697 | |
698 unsigned KURL::hostStart() const | |
699 { | |
700 return m_parsed.CountCharactersBefore(url_parse::Parsed::HOST, false); | |
701 } | |
702 | |
703 unsigned KURL::hostEnd() const | |
704 { | |
705 return m_parsed.CountCharactersBefore(url_parse::Parsed::PORT, true); | |
706 } | |
707 | |
708 unsigned KURL::pathStart() const | |
709 { | |
710 return m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false); | |
711 } | |
712 | |
713 unsigned KURL::pathEnd() const | |
714 { | |
715 return m_parsed.CountCharactersBefore(url_parse::Parsed::QUERY, true); | |
716 } | |
717 | |
718 unsigned KURL::pathAfterLastSlash() const | |
719 { | |
720 if (m_string.isNull()) | |
721 return 0; | |
722 if (!m_isValid || !m_parsed.path.is_valid()) | |
723 return m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false); | |
724 url_parse::Component filename; | |
725 if (m_string.is8Bit()) | |
726 url_parse::ExtractFileName(asURLChar8Subtle(m_string), m_parsed.path, &f
ilename); | |
727 else | |
728 url_parse::ExtractFileName(m_string.characters16(), m_parsed.path, &file
name); | |
729 return filename.begin; | |
730 } | |
731 | |
732 bool protocolIs(const String& url, const char* protocol) | |
733 { | |
734 assertProtocolIsGood(protocol); | |
735 if (url.isNull()) | |
736 return false; | |
737 if (url.is8Bit()) | |
738 return url_util::FindAndCompareScheme(asURLChar8Subtle(url), url.length(
), protocol, 0); | |
739 return url_util::FindAndCompareScheme(url.characters16(), url.length(), prot
ocol, 0); | |
740 } | |
741 | |
742 void KURL::init(const KURL& base, const String& relative, const WTF::TextEncodin
g* queryEncoding) | |
743 { | |
744 if (!relative.isNull() && relative.is8Bit()) { | |
745 StringUTF8Adaptor relativeUTF8(relative); | |
746 init(base, relativeUTF8.data(), relativeUTF8.length(), queryEncoding); | |
747 } else | |
748 init(base, relative.characters16(), relative.length(), queryEncoding); | |
749 initProtocolIsInHTTPFamily(); | |
750 initInnerURL(); | |
751 } | |
752 | |
753 template <typename CHAR> | |
754 void KURL::init(const KURL& base, const CHAR* relative, int relativeLength, cons
t WTF::TextEncoding* queryEncoding) | |
755 { | |
756 // As a performance optimization, we do not use the charset converter | |
757 // if encoding is UTF-8 or other Unicode encodings. Note that this is | |
758 // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be more | |
759 // efficient with no charset converter object because it can do UTF-8 | |
760 // internally with no extra copies. | |
761 | |
762 // We feel free to make the charset converter object every time since it's | |
763 // just a wrapper around a reference. | |
764 KURLCharsetConverter charsetConverterObject(queryEncoding); | |
765 KURLCharsetConverter* charsetConverter = (!queryEncoding || isUnicodeEncodin
g(queryEncoding)) ? 0 : &charsetConverterObject; | |
766 | |
767 StringUTF8Adaptor baseUTF8(base.string()); | |
768 | |
769 url_canon::RawCanonOutputT<char> output; | |
770 m_isValid = url_util::ResolveRelative(baseUTF8.data(), baseUTF8.length(), ba
se.m_parsed, relative, relativeLength, charsetConverter, &output, &m_parsed); | |
771 | |
772 // See FIXME in KURLPrivate in the header. If canonicalization has not | |
773 // changed the string, we can avoid an extra allocation by using assignment. | |
774 m_string = AtomicString::fromUTF8(output.data(), output.length()); | |
775 } | |
776 | |
777 void KURL::initInnerURL() | |
778 { | |
779 if (!m_isValid) { | |
780 m_innerURL.clear(); | |
781 return; | |
782 } | |
783 if (url_parse::Parsed* innerParsed = m_parsed.inner_parsed()) | |
784 m_innerURL = adoptPtr(new KURL(ParsedURLString, m_string.substring(inner
Parsed->scheme.begin, innerParsed->Length() - innerParsed->scheme.begin))); | |
785 else | |
786 m_innerURL.clear(); | |
787 } | |
788 | |
789 template<typename CHAR> | |
790 bool internalProtocolIs(const url_parse::Component& scheme, const CHAR* spec, co
nst char* protocol) | |
791 { | |
792 const CHAR* begin = spec + scheme.begin; | |
793 const CHAR* end = begin + scheme.len; | |
794 | |
795 while (begin != end && *protocol) { | |
796 ASSERT(toASCIILower(*protocol) == *protocol); | |
797 if (toASCIILower(*begin++) != *protocol++) | |
798 return false; | |
799 } | |
800 | |
801 // Both strings are equal (ignoring case) if and only if all of the characte
rs were equal, | |
802 // and the end of both has been reached. | |
803 return begin == end && !*protocol; | |
804 } | |
805 | |
806 template<typename CHAR> | |
807 bool checkIfProtocolIsInHTTPFamily(const url_parse::Component& scheme, const CHA
R* spec) | |
808 { | |
809 if (scheme.len == 4) | |
810 return internalProtocolIs(scheme, spec, "http"); | |
811 if (scheme.len == 5) | |
812 return internalProtocolIs(scheme, spec, "https"); | |
813 return false; | |
814 } | |
815 | |
816 void KURL::initProtocolIsInHTTPFamily() | |
817 { | |
818 if (!m_isValid) { | |
819 m_protocolIsInHTTPFamily = false; | |
820 return; | |
821 } | |
822 | |
823 ASSERT(!m_string.isNull()); | |
824 m_protocolIsInHTTPFamily = m_string.is8Bit() ? | |
825 checkIfProtocolIsInHTTPFamily(m_parsed.scheme, m_string.characters8()) : | |
826 checkIfProtocolIsInHTTPFamily(m_parsed.scheme, m_string.characters16()); | |
827 } | |
828 | |
829 bool KURL::protocolIs(const char* protocol) const | |
830 { | |
831 assertProtocolIsGood(protocol); | |
832 | |
833 // JavaScript URLs are "valid" and should be executed even if KURL decides t
hey are invalid. | |
834 // The free function protocolIsJavaScript() should be used instead. | |
835 // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASS
ERT(strcmp(protocol, "javascript")); | |
836 | |
837 if (m_string.isNull() || m_parsed.scheme.len <= 0) | |
838 return *protocol == '\0'; | |
839 | |
840 return m_string.is8Bit() ? | |
841 internalProtocolIs(m_parsed.scheme, m_string.characters8(), protocol) : | |
842 internalProtocolIs(m_parsed.scheme, m_string.characters16(), protocol); | |
843 } | |
844 | |
845 String KURL::stringForInvalidComponent() const | |
846 { | |
847 if (m_string.isNull()) | |
848 return String(); | |
849 return emptyString(); | |
850 } | |
851 | |
852 String KURL::componentString(const url_parse::Component& component) const | |
853 { | |
854 if (!m_isValid || component.len <= 0) | |
855 return stringForInvalidComponent(); | |
856 // begin and len are in terms of bytes which do not match | |
857 // if string() is UTF-16 and input contains non-ASCII characters. | |
858 // However, the only part in urlString that can contain non-ASCII | |
859 // characters is 'ref' at the end of the string. In that case, | |
860 // begin will always match the actual value and len (in terms of | |
861 // byte) will be longer than what's needed by 'mid'. However, mid | |
862 // truncates len to avoid go past the end of a string so that we can | |
863 // get away without doing anything here. | |
864 return string().substring(component.begin, component.len); | |
865 } | |
866 | |
867 template<typename CHAR> | |
868 void KURL::replaceComponents(const url_canon::Replacements<CHAR>& replacements) | |
869 { | |
870 url_canon::RawCanonOutputT<char> output; | |
871 url_parse::Parsed newParsed; | |
872 | |
873 StringUTF8Adaptor utf8(m_string); | |
874 m_isValid = url_util::ReplaceComponents(utf8.data(), utf8.length(), m_parsed
, replacements, 0, &output, &newParsed); | |
875 | |
876 m_parsed = newParsed; | |
877 m_string = AtomicString::fromUTF8(output.data(), output.length()); | |
878 } | |
879 | |
880 bool KURL::isSafeToSendToAnotherThread() const | |
881 { | |
882 return m_string.isSafeToSendToAnotherThread() | |
883 && (!m_innerURL || m_innerURL->isSafeToSendToAnotherThread()); | |
884 } | |
885 | |
886 } // namespace WebCore | |
OLD | NEW |