Index: webkit/child/site_isolation_policy.cc |
diff --git a/webkit/child/site_isolation_policy.cc b/webkit/child/site_isolation_policy.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..0fd8ab0e5016bbcd7a1d67ee8def7e728e0632d6 |
--- /dev/null |
+++ b/webkit/child/site_isolation_policy.cc |
@@ -0,0 +1,502 @@ |
+// Copyright (c) 2013 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "webkit/child/site_isolation_policy.h" |
+ |
+#include "base/basictypes.h" |
+#include "base/logging.h" |
+#include "base/metrics/histogram.h" |
+#include "base/strings/string_util.h" |
+#include "net/base/registry_controlled_domains/registry_controlled_domain.h" |
+#include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h" |
+#include "third_party/WebKit/public/platform/WebString.h" |
+#include "third_party/WebKit/public/platform/WebURL.h" |
+#include "third_party/WebKit/public/platform/WebURLRequest.h" |
+#include "third_party/WebKit/public/platform/WebURLResponse.h" |
+#include "third_party/WebKit/public/web/WebDocument.h" |
+#include "third_party/WebKit/public/web/WebFrame.h" |
+#include "third_party/WebKit/public/web/WebFrameClient.h" |
+#include "third_party/WebKit/public/web/WebSecurityOrigin.h" |
+ |
+using WebKit::WebURLResponse; |
+using WebKit::WebURLRequest; |
+using WebKit::WebURL; |
+using WebKit::WebString; |
+using WebKit::WebDocument; |
+ |
+namespace webkit_glue { |
+ |
+std::map<unsigned, WebURLRequest::TargetType> |
nasko
2013/08/06 17:29:27
Why are these needed in this file? They are alread
dsjang
2013/08/07 00:19:07
Done.
|
+ SiteIsolationPolicy::id_target_map_; |
+std::map<std::string, ResponseMetaData> |
+ SiteIsolationPolicy::url_responsedata_map_; |
+std::map<unsigned, std::string> SiteIsolationPolicy::id_url_map_; |
+ |
+void SiteIsolationPolicy::WillSendRequest( |
+ unsigned identifier, |
+ WebURLRequest::TargetType target_type) { |
+ // This happens when the original request is redirected. |
+ if (id_target_map_.count(identifier) != 0) { |
+ // This check actually can fail. If it is, which target_type do we |
+ // have to record between the old one and the new one? When |
+ // redirection happens, target_type becomes 2. TODO(dsjang): |
+ // let's disable this code and see what happens on onclickads.com |
+ // for googleads JavaScript code assigned to an image. |
+ if (id_target_map_[identifier] != target_type) { |
+ id_target_map_[identifier] = target_type; |
+ } |
+ } |
+ id_target_map_[identifier] = target_type; |
+} |
+ |
+void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame, |
+ unsigned identifier, |
+ const WebURLResponse& response) { |
+ |
nasko
2013/08/06 17:29:27
nit: no need for empty line here.
dsjang
2013/08/07 00:19:07
Done.
|
+ DCHECK(id_target_map_.count(identifier) == 1); |
+ |
+ UMA_HISTOGRAM_COUNTS("XSDP.ALL", 1); |
+ |
+ GURL response_url = response.url(); |
+ WebURLRequest::TargetType target_type = id_target_map_[identifier]; |
+ id_target_map_.erase(identifier); |
+ |
+ // See if this is for navigation. If it is, let it pass. |
+ if (IsFrameNotCommitted(frame)) { |
+ LOG(INFO) << "SiteIsolationPolicy.FrameNotCommitted"; |
+ return; |
+ } |
+ |
+ GURL frame_origin(frame->document().securityOrigin().toString().utf8()); |
+ |
+ // TODO(dsjang): Find out all non-network scheme here. |
+ // If it's the data: scheme, we can let it pass through. |
+ if (IsSafeScheme(frame_origin)) { |
+ LOG(INFO) << "SiteIsolationPolicy.SafeScheme:" << frame_origin; |
+ return; |
+ } |
+ |
+ if (IsSameSite(frame_origin, response_url)) { |
+ LOG(INFO) << "SiteIsolationPolicy.SameSite:" << frame_origin << "," |
+ << response_url; |
+ return; |
+ } |
+ |
+ ResponseMetaData::CanonicalMimeType canonical_mime_type = |
+ GetCanonicalMimeType(response); |
+ |
+ if (canonical_mime_type == ResponseMetaData::IsOthers) { |
+ LOG(INFO) << "SiteIsolationPolicy.mimetype:" << frame_origin << "," |
+ << response_url << "," << response.mimeType().utf8(); |
+ return; |
+ } |
+ |
+ std::string access_control_origin = response |
+ .httpHeaderField( |
+ WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8(); |
+ |
+ if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) { |
+ LOG(INFO) << "SiteIsolationPolicy.CorsisSafe:"; |
nasko
2013/08/06 17:29:27
nit: CorsIsSafe
dsjang
2013/08/07 00:19:07
Done.
|
+ return; |
+ } |
+ |
+ // Real data collection starts from here. |
+ // |
+ // XSDP.XSD.%MIMECODE is a shortened name for |
+ // XSDP.All.NNav.NSafeScheme.NSMIMEType.NSCORS.%MIMECODE from now |
nasko
2013/08/06 17:29:27
What is the meaning of this string? Putting a comm
dsjang
2013/08/07 00:19:07
Done.
|
+ // on. |
+ |
+ LOG(INFO) << "SiteIsolationPolicy.XSD!!!:" << response_url; |
+ |
+ ResponseMetaData metaData; |
+ metaData.frame_origin = frame_origin.spec(); |
+ metaData.response_url = response_url.spec(); |
+ metaData.identifier = identifier; |
+ metaData.target_type = target_type; |
+ metaData.canonical_mime_type = canonical_mime_type; |
+ metaData.http_status_code = response.httpStatusCode(); |
+ |
+ url_responsedata_map_[metaData.response_url] = metaData; |
+ id_url_map_[identifier] = metaData.response_url; |
+ |
+ return; |
+} |
+ |
+void SiteIsolationPolicy::DidReceiveData(const char* data, |
+ int length, |
+ WebURL& web_response_url) { |
+ GURL response_url(web_response_url); |
+ |
+ std::string response_url_str = response_url.spec(); |
+ if (url_responsedata_map_.count(response_url_str) == 0) |
nasko
2013/08/06 17:29:27
Is this a valid case? When will we have seen a req
dsjang
2013/08/07 00:19:07
url_responsedata_map_ only maintains url for cross
|
+ return; |
+ |
+ // Record the length of the first received network packet to see if |
+ // it's enough for sniffing. |
+ UMA_HISTOGRAM_COUNTS("XSDP.XSD.DataLength", length); |
+ |
+ DCHECK(url_responsedata_map_.count(response_url_str) == 1); |
+ ResponseMetaData metaData = url_responsedata_map_[response_url_str]; |
+ url_responsedata_map_.erase(response_url_str); |
+ |
+ std::string uma_bucket_name("XSDP.XSD."); |
+ uma_bucket_name.append(ResponseMetaData::CanonicalMimeTypeToString( |
+ metaData.canonical_mime_type)); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ |
+ // TODO(dsjang): sometimes the length of payload can be not enough to do |
+ // correct content sniffing. If that happens, put it into a buffer |
+ // so that we can do it later. |
+ bool verified_for_blocking = false; |
+ switch (metaData.canonical_mime_type) { |
+ case ResponseMetaData::IsHTML: |
+ if (SniffForHTML(data, length)) { |
+ uma_bucket_name.append(".Verified"); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ verified_for_blocking = true; |
+ } |
+ break; |
+ case ResponseMetaData::IsXML: |
+ if (SniffForXML(data, length)) { |
+ uma_bucket_name.append(".Verified"); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ verified_for_blocking = true; |
+ } |
+ break; |
+ case ResponseMetaData::IsJSON: |
+ if (SniffForJSON(data, length)) { |
+ uma_bucket_name.append(".Verified"); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ verified_for_blocking = true; |
+ } |
+ break; |
+ case ResponseMetaData::IsPlain: |
+ if (SniffForHTML(data, length)) { |
+ uma_bucket_name.append(".Verified.HTML"); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ verified_for_blocking = true; |
+ } else if (SniffForXML(data, length)) { |
+ uma_bucket_name.append(".Verified.XML"); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ verified_for_blocking = true; |
+ } else if (SniffForJSON(data, length)) { |
+ uma_bucket_name.append(".Verified.JSON"); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ verified_for_blocking = true; |
+ } |
+ break; |
+ case ResponseMetaData::IsOthers: |
+ DCHECK(false); |
+ break; |
+ } |
+ |
+ // We block these. See how many of them have unaffected status code. |
+ if (verified_for_blocking) { |
+ if (UnaffectedStatusCode(metaData.http_status_code)) { |
+ // This is a blocking that does not affect the browser behavior |
+ // by the following reasons : 1) this is not a binary object |
+ // (such as an image) since this is sniffed as a text |
+ // document. 2) then, this blocking only breaks the renderer |
+ // behavior only if it is either JavaScript or CSS. However, the |
+ // renderer doesn't use the contents of JS/CSS with unaffected |
+ // status code(e.g, 404). *) the renderer is expected not to use |
+ // the cross-site document content for purposes other than |
+ // JS/CSS (e.g, XHR). |
+ uma_bucket_name.append(".UnaffectedStatusCode."); |
+ std::stringstream stat_code_strm; |
+ LOG(INFO) << "Blocked:UNAFFECTED STAT CODE:" << metaData.http_status_code; |
+ stat_code_strm << metaData.http_status_code; |
+ uma_bucket_name.append(stat_code_strm.str()); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ } else { |
+ LOG(INFO) << "Blocked:AFFECTED STAT CODE:" << metaData.http_status_code; |
+ // This blocking can be disruptive if it was actually JS, and |
+ // requested for JS. |
+ uma_bucket_name.append(".NUnaffectedStatusCode."); |
+ uma_bucket_name.append( |
+ ResponseMetaData::TargetTypeToString(metaData.target_type)); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ if (SniffForJS(data, length)) { |
+ // This shows if this blocking can be JS. |
+ uma_bucket_name.append(".MaybeJS"); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ } |
+ } |
+ } else { |
+ LOG(INFO) << "Not Blocked:sniffing failed:"; |
+ // Not blocked. How many of them can be JS? This is only useful |
+ // for studying non-blocked documents. |
+ if (SniffForJS(data, length)) { |
+ uma_bucket_name.append(".NVerified.MaybeJS"); |
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1); |
+ } |
+ } |
+} |
+ |
+void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) { |
+ id_target_map_.erase(identifier); |
+ if (id_url_map_.count(identifier) > 0) { |
+ url_responsedata_map_.erase(id_url_map_[identifier]); |
+ id_url_map_.erase(identifier); |
+ } |
+} |
+ |
+void SiteIsolationPolicy::DidFinishResourceLoad( |
+ WebKit::WebURL& web_response_url) { |
+ GURL response_url(web_response_url); |
+ |
+ if (url_responsedata_map_.count(response_url.spec()) > 0) { |
+ ResponseMetaData meta_data = url_responsedata_map_[response_url.spec()]; |
+ url_responsedata_map_.erase(response_url.spec()); |
+ id_target_map_.erase(meta_data.identifier); |
+ id_url_map_.erase(meta_data.identifier); |
+ } |
+} |
+ |
+ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType( |
+ const WebURLResponse& response) { |
+ // RFC 2045 says: "The type, subtype, and parameter names are not |
+ // case sensitive." If you have a MIME type of text/plain that's a |
+ // type of text and a subtype of plain. So, per the spec, these are |
+ // not case sensitive. |
+ std::string mime_type = response.mimeType().utf8(); |
+ StringToLowerASCII(&mime_type); |
nasko
2013/08/06 17:29:27
The mime_type string is UTF8, yet you are using AS
dsjang
2013/08/07 00:19:07
I found that all the crawled mime types are origin
|
+ |
+ const char* const document_mime_types[] = { |
+ "text/html", "text/xml", "application/rss+xml", "application/xml", |
+ "application/json", "text/x-json", "text/json", "text/plain"}; |
+ size_t i = 0; |
+ for (i = 0; i < 8; ++i) { |
+ if (!strcmp(document_mime_types[i], mime_type.data())) { |
nasko
2013/08/06 17:29:27
This is unsafe comparison, you should be bounding
dsjang
2013/08/07 00:19:07
Switched to std::string::operator==().
On 2013/08
|
+ break; |
+ } |
+ } |
+ |
+ if (i == 0) { |
nasko
2013/08/06 17:29:27
Using constants like these seems unclean to me. Is
dsjang
2013/08/07 00:19:07
Done.
|
+ return ResponseMetaData::IsHTML; |
+ } else if (1 <= i && i < 4) { |
+ return ResponseMetaData::IsXML; |
+ } else if (4 <= i && i < 7) { |
+ return ResponseMetaData::IsJSON; |
+ } else if (i == 7) { |
+ return ResponseMetaData::IsPlain; |
+ } else { |
+ return ResponseMetaData::IsOthers; |
+ } |
+} |
+ |
+bool SiteIsolationPolicy::IsSafeScheme(GURL& url) { |
nasko
2013/08/06 17:29:27
What about blob URLs? Are those safe too?
dsjang
2013/08/07 00:19:07
Switched to blacklisting from whitelisting.
On 20
|
+ return url.scheme() == "data"; |
+} |
+ |
+bool SiteIsolationPolicy::IsSameSite(GURL& frame_origin, GURL& response_url) { |
+ if (frame_origin.scheme() != response_url.scheme()) |
+ return false; |
+ |
+ // Extract the effective domains (public suffix plus one) of the |
+ // urls. |
+ std::string frame_domain = |
+ net::registry_controlled_domains::GetDomainAndRegistry( |
+ frame_origin, |
+ net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES); |
nasko
2013/08/06 17:29:27
Is there a reason we are deviating from how SiteIn
dsjang
2013/08/07 00:19:07
I thought allowing private registries here enables
|
+ std::string response_domain = |
+ net::registry_controlled_domains::GetDomainAndRegistry( |
+ response_url, |
+ net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES); |
+ |
+ return frame_domain == response_domain; |
+} |
+ |
+bool SiteIsolationPolicy::IsFrameNotCommitted(WebKit::WebFrame* frame) { |
nasko
2013/08/06 17:29:27
This name is a bit confusing, can you put a descri
dsjang
2013/08/07 00:19:07
Done.
|
+ return frame->provisionalDataSource() != NULL; |
+} |
+ |
+bool SiteIsolationPolicy::IsValidCorsHeaderSet( |
+ GURL& frame_origin, |
+ GURL& website_origin, |
+ std::string access_control_origin) { |
+ |
+ size_t access_control_origin_len = access_control_origin.size(); |
+ |
+ // TODO(dsjang): Is this actually true? The server seems to return |
+ // an empty string or "null". |
+ if (access_control_origin_len == 0) |
+ return false; |
+ |
+ // Strip quotes off from it. This is non-standard practice, but many |
nasko
2013/08/06 17:29:27
Don't we have code that parses the header that we
dsjang
2013/08/07 00:19:07
Done.
|
+ // websites use quote strings surrounding the actual header value. |
+ if (access_control_origin_len > 2) { |
+ char first = access_control_origin[0]; |
+ char last = access_control_origin[access_control_origin_len - 1]; |
+ if ((first == '\"' && last == '\"') || (first == '\'' && last == '\'')) { |
+ access_control_origin = |
+ access_control_origin.substr(1, access_control_origin_len - 2); |
+ } |
+ } |
+ |
+ // TODO(dsjang): * is not allowed for the response from a request |
+ // with cookies. This allows for more than what the renderer will |
+ // eventually be able to receive, so we won't see illegal cross-site |
+ // documents alllowed by this. We have to have t a way to see if |
+ // this response is from a cookie-tagged request or not in the |
+ // future. |
+ if (access_control_origin == "*") |
nasko
2013/08/06 17:29:27
You talk about cookie-tagged requests, but there i
dsjang
2013/08/07 00:19:07
Yes, it has to be incorporated here in the future,
|
+ return true; |
+ |
+ // TODO(dsjang): The CORS spec only treats a fully specified URL, |
+ // not just a domain here. Confirm this ad-hoc rule to be |
+ // correct. If this doesn't start with a scheme(http://, https://), |
nasko
2013/08/06 17:29:27
This sounds scary. Is this correct?
dsjang
2013/08/07 00:19:07
This was actually scarily wrong :-). I found out t
|
+ // it inherits the site's scheme. |
+ if (access_control_origin.find("http://") != 0 && |
+ access_control_origin.find("https://") != 0) { |
+ access_control_origin.insert(0, website_origin.scheme() + "://"); |
+ } |
+ |
+ LOG(ERROR) << access_control_origin; |
+ |
+ // We don't use Webkit's |
+ // frame->securityOrigin().canAccess(WebSecurityOrigin::createFromString(acc |
+ // ess_control_origin)))here since their .canAccess works in terms of origins, |
+ // not sites. For example, when frame is sub.a.com and it is not allowed |
+ // to access a document with sub1.a.com. But under Site Isolation, |
+ // it's allowed. |
+ |
+ // TODO(dsjang): examine createFromString()'s behavior for a URL |
+ // containing * in it. |
+ WebKit::WebSecurityOrigin cors_security_origin = |
+ WebKit::WebSecurityOrigin::createFromString( |
+ WebKit::WebString::fromUTF8(access_control_origin)); |
+ GURL cors_origin(cors_security_origin.toString().utf8()); |
+ |
+ LOG(ERROR) << cors_security_origin.toString().utf8(); |
+ return IsSameSite(frame_origin, cors_origin); |
+} |
+ |
+bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) { |
+ // TODO(dsjang): The content sniffer used by Chrome and Firefox are |
+ // using "<!--" as one of the HTML signatures, but it also appears |
+ // in valid JavaScript, considered as well-formed JS by the browser. |
+ // Since we do not want to block any JS, we exclude it from our HTML |
+ // signatures. This can weaken our document block policy, but we can |
+ // break less websites. |
+ const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec |
+ "<script", // HTML5 spec, Mozilla |
+ "<html", // HTML5 spec, Mozilla |
+ "<head", // HTML5 spec, Mozilla |
+ "<iframe", // Mozilla |
+ "<h1", // Mozilla |
+ "<div", // Mozilla |
+ "<font", // Mozilla |
+ "<table", // Mozilla |
+ "<a", // Mozilla |
+ "<style", // Mozilla |
+ "<title", // Mozilla |
+ "<b", // Mozilla |
+ "<body", // Mozilla |
+ "<br", "<p" // Mozilla |
+ }; |
+ return DoSignatureMatching( |
+ data, length, html_signatures, arraysize(html_signatures)); |
+} |
+ |
+bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) { |
+ const char* xml_signatures[] = {"<?xml" // Mozilla |
+ }; |
+ return DoSignatureMatching( |
+ data, length, xml_signatures, arraysize(xml_signatures)); |
+} |
+ |
+bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) { |
+ // TODO(dsjang): We have to come up with a better way to sniff |
+ // JSON. However, even RE cannot help us that much due to the fact |
+ // that we don't do full parsing. This DFA finds 1) {, 2) "(or'), |
+ // 3) : in the order. This is intentionally not using a regular |
+ // expression library so that we can make the trusted code base as |
+ // small as possible. |
+ int state = 0; |
+ for (size_t i = 0; i < length && state < 3; ++i, ++data) { |
+ char c = *data; |
+ if (c == ' ' || c == '\t' || c == '\r' || c == '\n') |
+ continue; |
+ |
+ switch (state) { |
+ case 0: |
+ if (c == '{') |
+ state = 1; |
nasko
2013/08/06 17:29:27
Please use symbolic names or describe what those n
dsjang
2013/08/07 00:19:07
Done.
|
+ else |
+ state = 4; |
+ break; |
+ case 1: |
+ if (c == '\"' || c == '\'') |
+ state = 2; |
+ else |
+ state = 4; |
+ break; |
+ case 2: |
+ if (c == ':') { |
+ state = 3; |
+ } |
+ break; |
+ default: |
+ break; |
+ } |
+ } |
+ return state == 3; |
+} |
+ |
+bool SiteIsolationPolicy::DoSignatureMatching(const char* data, |
+ size_t length, |
+ const char* signatures[], |
+ size_t arr_size) { |
+ for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) { |
+ const char* signature = signatures[sig_index]; |
+ size_t signature_length = strlen(signature); |
+ size_t i = 0; |
+ // Skip the white characters at the beginning of the document. |
+ for (i = 0; i < length; ++i) { |
+ char c = *data; |
+ if (!(c == ' ' || c == '\r' || c == '\n' || c == '\t')) { |
+ break; |
+ } |
+ ++data; |
+ } |
+ length = length - i; |
+ if (length < signature_length) |
+ continue; |
+ if (base::strncasecmp(signature, data, signature_length) == 0) { |
+ return true; |
+ } |
+ } |
+ return false; |
+} |
+ |
+bool SiteIsolationPolicy::UnaffectedStatusCode(int status_code) { |
+ // Chrome only uses the content of a response with one of these |
+ // status codes for CSS/JavaScript. For images, Chrome just ignores |
+ // status code. |
+ const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302, |
+ 303, 305, 306, 307}; |
+ for (size_t i = 0; i < 12; ++i) { |
+ if (renderable_status_code[i] == status_code) |
+ return false; |
+ } |
+ return true; |
+} |
+ |
+bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) { |
+ // TODO(dsjang): This is a real hacking. The only purpose of this |
+ // function is to try to see if there's any possibility that this |
+ // data can be JavaScript.(superset of JS). This function will be |
+ // removed for the production code. |
+ |
+ // Search for "var " for JS detection. :-) |
+ for (size_t i = 0; i < length - 3; ++i) { |
+ if (strncmp(data, "var ", 4) == 0) { |
+ return true; |
+ } |
+ ++data; |
+ } |
+ return false; |
+} |
+} |
nasko
2013/08/06 17:29:27
Leave an empty line and put a // namespace comment
|