Index: webkit/child/site_isolation_policy.cc |
diff --git a/webkit/child/site_isolation_policy.cc b/webkit/child/site_isolation_policy.cc |
new file mode 100644 |
index 0000000000000000000000000000000000000000..c8e7984885f651b59ad0b630acb39768376081e9 |
--- /dev/null |
+++ b/webkit/child/site_isolation_policy.cc |
@@ -0,0 +1,512 @@ |
+// Copyright (c) 2013 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "webkit/child/site_isolation_policy.h" |
+ |
+#include "base/basictypes.h" |
+#include "base/logging.h" |
+#include "base/metrics/histogram.h" |
+#include "base/strings/string_util.h" |
+#include "net/base/registry_controlled_domains/registry_controlled_domain.h" |
+#include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h" |
+#include "third_party/WebKit/public/platform/WebString.h" |
+#include "third_party/WebKit/public/platform/WebURL.h" |
+#include "third_party/WebKit/public/platform/WebURLRequest.h" |
+#include "third_party/WebKit/public/platform/WebURLResponse.h" |
+#include "third_party/WebKit/public/web/WebDocument.h" |
+#include "third_party/WebKit/public/web/WebFrame.h" |
+#include "third_party/WebKit/public/web/WebFrameClient.h" |
+#include "third_party/WebKit/public/web/WebSecurityOrigin.h" |
+ |
using base::strncasecmp;
using WebKit::WebDocument;
using WebKit::WebString;
using WebKit::WebURL;
using WebKit::WebURLRequest;
using WebKit::WebURLResponse;
Charlie Reis
2013/08/07 21:02:02
These should be alphabetized.
dsjang
2013/08/08 21:21:01
Done.
|
+ |
+namespace webkit_glue { |
+ |
+std::map<unsigned, WebURLRequest::TargetType> |
+ SiteIsolationPolicy::id_target_map_; |
+std::map<std::string, ResponseMetaData> |
+ SiteIsolationPolicy::url_responsedata_map_; |
+std::map<unsigned, std::string> SiteIsolationPolicy::id_url_map_; |
+ |
+void SiteIsolationPolicy::WillSendRequest( |
+ unsigned identifier, |
+ WebURLRequest::TargetType target_type) { |
+ // This happens when the original request is redirected. |
+ if (id_target_map_.count(identifier) != 0) { |
+ // This check actually can fail. If it is, which target_type do we |
+ // have to record between the old one and the new one? When |
+ // redirection happens, target_type becomes 2. TODO(dsjang): |
+ // let's disable this code and see what happens on onclickads.com |
+ // for googleads JavaScript code assigned to an image. To disable |
+ // this, we need a guarntee that target_type is always erased at |
+ // the end of a transaction. |
+ if (id_target_map_[identifier] != target_type) { |
+ id_target_map_[identifier] = target_type; |
Charlie Reis
2013/08/07 21:02:02
I can't understand this comment or code. It looks
dsjang
2013/08/08 21:21:01
Done.
|
+ } |
+ } |
+ id_target_map_[identifier] = target_type; |
+} |
+ |
+void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame, |
+ unsigned identifier, |
+ const WebURLResponse& response) { |
+ DCHECK(id_target_map_.count(identifier) == 1); |
+ |
+ UMA_HISTOGRAM_COUNTS("XSDP.ALL", 1); |
+ |
+ GURL response_url = response.url(); |
+ WebURLRequest::TargetType target_type = id_target_map_[identifier]; |
+ id_target_map_.erase(identifier); |
+ |
+ // See if this is for navigation. If it is, let it pass. |
+ if (IsFrameNotCommitted(frame)) { |
+ LOG(INFO) << "SiteIsolationPolicy.FrameNotCommitted"; |
+ return; |
+ } |
+ |
+ GURL frame_origin(frame->document().securityOrigin().toString().utf8()); |
+ |
+ // TODO(dsjang): Find out all network related schemes here. |
+ if (!IsNetworkScheme(frame_origin)) { |
+ LOG(INFO) << "SiteIsolationPolicy.NotNetworkScheme:" << frame_origin; |
+ return; |
+ } |
+ |
+ if (IsSameSite(frame_origin, response_url)) { |
+ LOG(INFO) << "SiteIsolationPolicy.SameSite:" << frame_origin << "," |
+ << response_url; |
+ return; |
+ } |
+ |
+ ResponseMetaData::CanonicalMimeType canonical_mime_type = |
+ GetCanonicalMimeType(response); |
+ |
+ if (canonical_mime_type == ResponseMetaData::IsOthers) { |
+ LOG(INFO) << "SiteIsolationPolicy.mimetype:" << frame_origin << "," |
+ << response_url << ",[" << response.mimeType().utf8() << "]"; |
+ return; |
+ } |
+ |
+ // There was a possiblity that a CORS request preceded by a |
+ // pre-flight request does not have "Access-Control-Allow-Origin" |
+ // header. But it turns out that every CORS request should have the |
+ // header no matter what CORS request it is. Therefore, if this is a |
+ // CORS request, it has this header. |
+ std::string access_control_origin = response |
+ .httpHeaderField( |
+ WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8(); |
+ |
+ if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) { |
+ LOG(INFO) << "SiteIsolationPolicy.CorsIsSafe:"; |
+ return; |
+ } |
+ |
+ // Real XSD data collection starts from here. |
+ LOG(INFO) << "SiteIsolationPolicy.XSD!!!:" << canonical_mime_type << |
+ ":" << response_url; |
+ |
+ // TODO(dsjang): Apply X-Content-Type option here. |
+ ResponseMetaData resp_data; |
+ resp_data.frame_origin = frame_origin.spec(); |
+ resp_data.response_url = response_url.spec(); |
+ resp_data.identifier = identifier; |
+ resp_data.target_type = target_type; |
+ resp_data.canonical_mime_type = canonical_mime_type; |
+ resp_data.http_status_code = response.httpStatusCode(); |
+ |
+ url_responsedata_map_[resp_data.response_url] = resp_data; |
+ id_url_map_[identifier] = resp_data.response_url; |
+ |
+ return; |
+} |
+ |
+void SiteIsolationPolicy::DidReceiveData(const char* data, |
+ int length, |
+ WebURL& web_response_url) { |
Charlie Reis
2013/08/09 00:39:03
Add a comment to the .h file that there's a risk t
|
+ // We only record XSDs whose content is actually non-zero. |
+ GURL response_url(web_response_url); |
+ |
+ std::string response_url_str = response_url.spec(); |
+ if (url_responsedata_map_.count(response_url_str) == 0) |
+ return; |
+ |
+ DCHECK(url_responsedata_map_.count(response_url_str) == 1); |
+ ResponseMetaData resp_data = url_responsedata_map_[response_url_str]; |
+ url_responsedata_map_.erase(response_url_str); |
+ |
+ // Record the length of the first received network packet to see if |
+ // it's enough for sniffing. |
+ UMA_HISTOGRAM_COUNTS("XSDP.XSD.DataLength", length); |
Charlie Reis
2013/08/07 21:02:02
Why would we need to collect this?
dsjang
2013/08/08 21:21:01
I wanted to see if that's not the case that most o
|
+ |
+ // Record the entire number of responses with a specific mime |
+ // type(text/html, text/xml, etc). |
+ UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.MimeType", |
+ resp_data.canonical_mime_type, |
+ ResponseMetaData::IsOthers + 1); |
+ |
+ // TODO(dsjang): sometimes the length of payload can be not enough to do |
+ // correct content sniffing. If that happens, put it into a buffer |
+ // so that we can do it later. |
+ bool verified_for_blocking = false; |
+ ResponseMetaData::CanonicalMimeType sniffed_type = |
+ ResponseMetaData::IsOthers; |
+ |
+ switch (resp_data.canonical_mime_type) { |
+ // Record the number of responses whose content is sniffed for |
+ // what its mime type claims it to be. For example, we apply a |
+ // HTML sniffer for a document tagged with text/html here, and |
+ // increments the count of "XSDP.XSD.HTML.Verified". |
+ case ResponseMetaData::IsHTML: |
+ if (SniffForHTML(data, length)) { |
+ UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.HTML.Verified", 1); |
+ verified_for_blocking = true; |
+ } |
+ break; |
+ case ResponseMetaData::IsXML: |
+ if (SniffForXML(data, length)) { |
+ UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.XML.Verified", 1); |
+ verified_for_blocking = true; |
+ } |
+ break; |
+ case ResponseMetaData::IsJSON: |
+ if (SniffForJSON(data, length)) { |
+ UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.JSON.Verified", 1); |
+ verified_for_blocking = true; |
+ } |
+ break; |
+ case ResponseMetaData::IsPlain: |
+ if (SniffForHTML(data, length)) { |
+ sniffed_type = ResponseMetaData::IsHTML; |
+ verified_for_blocking = true; |
+ } else if (SniffForXML(data, length)) { |
+ sniffed_type = ResponseMetaData::IsXML; |
+ verified_for_blocking = true; |
+ } else if (SniffForJSON(data, length)) { |
+ sniffed_type = ResponseMetaData::IsJSON; |
+ verified_for_blocking = true; |
+ } |
+ UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.MimeType.Plain.Verified", |
+ sniffed_type, |
+ ResponseMetaData::IsJSON + 1); |
+ break; |
+ case ResponseMetaData::IsOthers: |
+ DCHECK(false); |
+ break; |
+ } |
+ |
+ // We block these. See how many of them have unaffected status code. |
+ if (verified_for_blocking) { |
+ if (IsErrorStatusCode(resp_data.http_status_code)) { |
+ // This is a blocking that does not affect the browser behavior |
+ // by the following reasons : 1) this is not a binary object |
+ // (such as an image) since this is sniffed as a text |
+ // document. 2) then, this blocking only breaks the renderer |
+ // behavior only if it is either JavaScript or CSS. However, the |
+ // renderer doesn't use the contents of JS/CSS with unaffected |
+ // status code(e.g, 404). *) the renderer is expected not to use |
+ // the cross-site document content for purposes other than |
+ // JS/CSS (e.g, XHR). |
+ UMA_HISTOGRAM_COUNTS("XSDP.XSD.Blocked.ErrorStatusCode", 1); |
+ } else { |
+ // This is the case that a blocked response is with a non-error |
+ // status code, so this blocking can be actually disruptive. |
+ UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.Blocked.NormalStatusCode", |
+ resp_data.target_type, WebURLRequest::TargetIsUnspecified + 1); |
+ } |
+ } else { |
+ LOG(INFO) << "Not Blocked:sniffing failed:"; |
+ // Not blocked, but How many of them can be JS? This is only |
+ // useful for studying non-blocked documents. |
+ if (SniffForJS(data, length)) { |
+ UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.NotBlocked.MaybeJS", |
+ resp_data.target_type, |
+ WebURLRequest::TargetIsUnspecified + 1); |
+ } |
+ } |
+} |
+ |
+void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) { |
+ id_target_map_.erase(identifier); |
+ if (id_url_map_.count(identifier) > 0) { |
+ url_responsedata_map_.erase(id_url_map_[identifier]); |
+ id_url_map_.erase(identifier); |
+ } |
+} |
+ |
+void SiteIsolationPolicy::DidFinishResourceLoadForUrl( |
+ const WebKit::WebURL& web_response_url) { |
+ GURL response_url(web_response_url); |
+ |
+ if (url_responsedata_map_.count(response_url.spec()) > 0) { |
+ ResponseMetaData meta_data = url_responsedata_map_[response_url.spec()]; |
+ url_responsedata_map_.erase(response_url.spec()); |
+ id_target_map_.erase(meta_data.identifier); |
+ id_url_map_.erase(meta_data.identifier); |
+ } |
+} |
+ |
+ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType( |
+ const WebURLResponse& response) { |
+ static const char TEXT_HTML[] = "text/html"; |
+ static const char TEXT_XML[] = "text/xml"; |
+ static const char APP_RSS_XML[] = "application/rss+xml"; |
+ static const char APP_XML[] = "application/xml"; |
+ static const char APP_JSON[] = "application/json"; |
+ static const char TEXT_XJSON[] = "text/x-json"; |
+ static const char TEXT_JSON[] = "text/json"; |
+ static const char TEXT_PLAIN[] = "text/json"; |
+ |
+ const std::string mime_type = response.mimeType().utf8(); |
+ |
+ LOG(ERROR) << "mimetype:" << mime_type << "==[" << TEXT_HTML << "]"; |
+ |
+ // These are a thorough list of the mime types crawled over the top |
+ // 50k sites related to HTML, XML, JSON, Plain. |
+ if (LowerCaseEqualsASCII(mime_type, TEXT_HTML)) { |
+ return ResponseMetaData::IsHTML; |
+ } else if (LowerCaseEqualsASCII(mime_type, TEXT_XML) || |
+ LowerCaseEqualsASCII(mime_type, APP_RSS_XML) || |
+ LowerCaseEqualsASCII(mime_type, APP_XML)) { |
+ return ResponseMetaData::IsXML; |
+ } else if (LowerCaseEqualsASCII(mime_type, APP_JSON) || |
+ LowerCaseEqualsASCII(mime_type, TEXT_XJSON) || |
+ LowerCaseEqualsASCII(mime_type, TEXT_JSON)) { |
+ return ResponseMetaData::IsJSON; |
+ } else if (LowerCaseEqualsASCII(mime_type, TEXT_PLAIN)) { |
+ return ResponseMetaData::IsPlain; |
+ } else { |
+ return ResponseMetaData::IsOthers; |
+ } |
+} |
+ |
+bool SiteIsolationPolicy::IsNetworkScheme(GURL& url) { |
+ // We exclude ftp:// from here. FTP doesn't provide a Content-Type |
+ // header which our policy depends on, so we cannot protect any |
+ // document from FTP servers. |
+ return url.SchemeIs("http") || url.SchemeIs("https"); |
+} |
+ |
+bool SiteIsolationPolicy::IsSameSite(GURL& frame_origin, GURL& response_url) { |
+ if (frame_origin.scheme() != response_url.scheme()) |
+ return false; |
+ |
+ // Extract the effective domains (public suffix plus one) of the |
+ // urls. |
+ |
+ // TODO(dsjang): Is there any reason why we don't use |
+ // net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES |
+ // instead of |
+ // net::registry_controlled_domains::EXCLUSE_PRIVATE_REGISTRIES? If |
+ // we allow sites to use their private registries, they can use |
+ // "finer grained" sites than only using public ones. |
+ std::string frame_domain = |
+ net::registry_controlled_domains::GetDomainAndRegistry( |
+ frame_origin, |
+ net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); |
+ std::string response_domain = |
+ net::registry_controlled_domains::GetDomainAndRegistry( |
+ response_url, |
+ net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); |
+ |
+ return frame_domain == response_domain; |
+} |
+ |
+bool SiteIsolationPolicy::IsFrameNotCommitted(WebKit::WebFrame* frame) { |
+ // When a navigation starts, frame->provisionalDataSource() is set |
+ // to a not-null value which stands for the request made for the |
+ // navigation. As soon as the network request is committed to the |
+ // frame, frame->provisionalDataSource() is converted to null, and |
+ // the committed data source is moved to frame->dataSource(). This |
+ // is the most reliable way to detect whether the frame is in |
+ // navigation or not by far. |
+ return frame->provisionalDataSource() != NULL; |
+} |
+ |
+bool SiteIsolationPolicy::IsValidCorsHeaderSet( |
+ GURL& frame_origin, |
+ GURL& website_origin, |
+ std::string access_control_origin) { |
+ |
+ size_t access_control_origin_len = access_control_origin.size(); |
+ |
+ // TODO(dsjang): Is this actually true? The server seems to return |
+ // an empty string or "null". |
+ if (access_control_origin_len == 0) |
+ return false; |
+ |
+ // Many websites are sending back "\"*\"" instead of "*". This is |
+ // non-standard practice, and seems not supported by the |
+ // brwoser. Refer to |
+ // CrossOriginAccessControl::passesAccessControlCheck(). |
+ |
+ // TODO(dsjang): * is not allowed for the response from a request |
+ // with cookies. This allows for more than what the renderer will |
+ // eventually be able to receive, so we won't see illegal cross-site |
+ // documents alllowed by this. We have to have t a way to see if |
+ // this response is from a cookie-tagged request or not in the |
+ // future. |
+ if (access_control_origin == "*") |
+ return true; |
+ |
+ // TODO(dsjang): The CORS spec only treats a fully specified URL, |
+ // except for "*", but many websites are using just a domain for |
+ // access_control_origin, and this is blocked by Webkit's CORS logic |
+ // here : CrossOriginAccessControl::passesAccessControlCheck() |
+ |
+ // We don't use Webkit's existing CORS policy implementation since |
+ // their policy works in terms of origins, not sites. For |
+ // example, when frame is sub.a.com and it is not allowed to access |
+ // a document with sub1.a.com. But under Site Isolation, it's |
+ // allowed. |
+ |
+ // TODO(dsjang): examine createFromString()'s behavior for a URL |
+ // containing * in it. |
+ WebKit::WebSecurityOrigin cors_security_origin = |
+ WebKit::WebSecurityOrigin::createFromString( |
+ WebKit::WebString::fromUTF8(access_control_origin)); |
+ GURL cors_origin(cors_security_origin.toString().utf8()); |
+ |
+ LOG(ERROR) << cors_security_origin.toString().utf8(); |
+ return IsSameSite(frame_origin, cors_origin); |
+} |
+ |
+bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) { |
+ // TODO(dsjang): The content sniffer used by Chrome and Firefox are |
+ // using "<!--" as one of the HTML signatures, but it also appears |
+ // in valid JavaScript, considered as well-formed JS by the browser. |
+ // Since we do not want to block any JS, we exclude it from our HTML |
+ // signatures. This can weaken our document block policy, but we can |
+ // break less websites. |
+ const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec |
+ "<script", // HTML5 spec, Mozilla |
+ "<html", // HTML5 spec, Mozilla |
+ "<head", // HTML5 spec, Mozilla |
+ "<iframe", // Mozilla |
+ "<h1", // Mozilla |
+ "<div", // Mozilla |
+ "<font", // Mozilla |
+ "<table", // Mozilla |
+ "<a", // Mozilla |
+ "<style", // Mozilla |
+ "<title", // Mozilla |
+ "<b", // Mozilla |
+ "<body", // Mozilla |
+ "<br", "<p" // Mozilla |
+ }; |
+ return DoSignatureMatching( |
+ data, length, html_signatures, arraysize(html_signatures)); |
+} |
+ |
+bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) { |
+ const char* xml_signatures[] = {"<?xml" // Mozilla |
+ }; |
+ return DoSignatureMatching( |
+ data, length, xml_signatures, arraysize(xml_signatures)); |
+} |
+ |
+bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) { |
+ // TODO(dsjang): We have to come up with a better way to sniff |
+ // JSON. However, even RE cannot help us that much due to the fact |
+ // that we don't do full parsing. This DFA starts with state 0, and |
+ // finds 1) {, 2) "or', 3) : in the order. This is intentionally not |
+ // using a regular expression library so that we can make the |
+ // trusted code base as small as possible. State 4 is a dead state. |
+ const int INIT_ST = 0; |
+ const int LBRACE_ST = 1; |
+ const int LQUOTE_ST = 2; |
+ const int COLON_ST = 3; |
+ const int DEAD_ST = 4; |
+ |
+ int state = INIT_ST; |
+ for (size_t i = 0; i < length && state < COLON_ST; ++i, ++data) { |
+ const char c = *data; |
+ if (c == ' ' || c == '\t' || c == '\r' || c == '\n') |
+ continue; |
+ |
+ switch (state) { |
+ case INIT_ST: |
+ if (c == '{') |
+ state = LBRACE_ST; |
+ else |
+ state = DEAD_ST; |
+ break; |
+ case LBRACE_ST: |
+ if (c == '\"' || c == '\'') |
+ state = LQUOTE_ST; |
+ else |
+ state = DEAD_ST; |
+ break; |
+ case LQUOTE_ST: |
+ if (c == ':') { |
+ state = COLON_ST; |
+ } |
+ break; |
+ default: |
+ break; |
+ } |
+ } |
+ return state == COLON_ST; |
+} |
+ |
+bool SiteIsolationPolicy::DoSignatureMatching(const char* data, |
+ size_t length, |
+ const char* signatures[], |
+ size_t arr_size) { |
+ for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) { |
+ const char* signature = signatures[sig_index]; |
+ size_t signature_length = strlen(signature); |
+ size_t i = 0; |
+ // Skip the white characters at the beginning of the document. |
+ for (i = 0; i < length; ++i) { |
+ char c = *data; |
+ if (!(c == ' ' || c == '\r' || c == '\n' || c == '\t')) { |
+ break; |
+ } |
+ ++data; |
+ } |
+ length = length - i; |
+ if (length < signature_length) |
+ continue; |
+ if (!base::strncasecmp(signature, data, signature_length)) { |
+ return true; |
+ } |
+ } |
+ return false; |
+} |
+ |
+bool SiteIsolationPolicy::IsErrorStatusCode(int status_code) { |
+ // Chrome only uses the content of a response with one of these |
+ // status codes for CSS/JavaScript. For images, Chrome just ignores |
+ // status code. |
+ const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302, |
+ 303, 305, 306, 307}; |
+ for (size_t i = 0; i < arraysize(renderable_status_code); ++i) { |
+ if (renderable_status_code[i] == status_code) |
+ return false; |
+ } |
+ return true; |
+} |
+ |
+bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) { |
+ // TODO(dsjang): This is a real hacking. The only purpose of this |
+ // function is to try to see if there's any possibility that this |
+ // data can be JavaScript.(superset of JS). This function will be |
+ // removed for the production code. |
+ |
+ // Search for "var " for JS detection. :-) |
+ for (size_t i = 0; i < length - 3; ++i) { |
+ if (strncmp(data, "var ", 4) == 0) { |
+ return true; |
+ } |
+ ++data; |
+ } |
+ return false; |
+} |
+ |
+} // namespace webkit_glue |