Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(275)

Unified Diff: webkit/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr
Patch Set: Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: webkit/child/site_isolation_policy.cc
diff --git a/webkit/child/site_isolation_policy.cc b/webkit/child/site_isolation_policy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0fd8ab0e5016bbcd7a1d67ee8def7e728e0632d6
--- /dev/null
+++ b/webkit/child/site_isolation_policy.cc
@@ -0,0 +1,502 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "webkit/child/site_isolation_policy.h"
+
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "base/metrics/histogram.h"
+#include "base/strings/string_util.h"
+#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
+#include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
+#include "third_party/WebKit/public/platform/WebString.h"
+#include "third_party/WebKit/public/platform/WebURL.h"
+#include "third_party/WebKit/public/platform/WebURLRequest.h"
+#include "third_party/WebKit/public/platform/WebURLResponse.h"
+#include "third_party/WebKit/public/web/WebDocument.h"
+#include "third_party/WebKit/public/web/WebFrame.h"
+#include "third_party/WebKit/public/web/WebFrameClient.h"
+#include "third_party/WebKit/public/web/WebSecurityOrigin.h"
+
+using WebKit::WebURLResponse;
+using WebKit::WebURLRequest;
+using WebKit::WebURL;
+using WebKit::WebString;
+using WebKit::WebDocument;
+
+namespace webkit_glue {
+
+std::map<unsigned, WebURLRequest::TargetType>
nasko 2013/08/06 17:29:27 Why are these needed in this file? They are alread
dsjang 2013/08/07 00:19:07 Done.
+ SiteIsolationPolicy::id_target_map_;
+std::map<std::string, ResponseMetaData>
+ SiteIsolationPolicy::url_responsedata_map_;
+std::map<unsigned, std::string> SiteIsolationPolicy::id_url_map_;
+
+void SiteIsolationPolicy::WillSendRequest(
+ unsigned identifier,
+ WebURLRequest::TargetType target_type) {
+ // This happens when the original request is redirected.
+ if (id_target_map_.count(identifier) != 0) {
+ // This check actually can fail. If it is, which target_type do we
+ // have to record between the old one and the new one? When
+ // redirection happens, target_type becomes 2. TODO(dsjang):
+ // let's disable this code and see what happens on onclickads.com
+ // for googleads JavaScript code assigned to an image.
+ if (id_target_map_[identifier] != target_type) {
+ id_target_map_[identifier] = target_type;
+ }
+ }
+ id_target_map_[identifier] = target_type;
+}
+
+void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame,
+ unsigned identifier,
+ const WebURLResponse& response) {
+
nasko 2013/08/06 17:29:27 nit: no need for empty line here.
dsjang 2013/08/07 00:19:07 Done.
+ DCHECK(id_target_map_.count(identifier) == 1);
+
+ UMA_HISTOGRAM_COUNTS("XSDP.ALL", 1);
+
+ GURL response_url = response.url();
+ WebURLRequest::TargetType target_type = id_target_map_[identifier];
+ id_target_map_.erase(identifier);
+
+ // See if this is for navigation. If it is, let it pass.
+ if (IsFrameNotCommitted(frame)) {
+ LOG(INFO) << "SiteIsolationPolicy.FrameNotCommitted";
+ return;
+ }
+
+ GURL frame_origin(frame->document().securityOrigin().toString().utf8());
+
+ // TODO(dsjang): Find out all non-network scheme here.
+ // If it's the data: scheme, we can let it pass through.
+ if (IsSafeScheme(frame_origin)) {
+ LOG(INFO) << "SiteIsolationPolicy.SafeScheme:" << frame_origin;
+ return;
+ }
+
+ if (IsSameSite(frame_origin, response_url)) {
+ LOG(INFO) << "SiteIsolationPolicy.SameSite:" << frame_origin << ","
+ << response_url;
+ return;
+ }
+
+ ResponseMetaData::CanonicalMimeType canonical_mime_type =
+ GetCanonicalMimeType(response);
+
+ if (canonical_mime_type == ResponseMetaData::IsOthers) {
+ LOG(INFO) << "SiteIsolationPolicy.mimetype:" << frame_origin << ","
+ << response_url << "," << response.mimeType().utf8();
+ return;
+ }
+
+ std::string access_control_origin = response
+ .httpHeaderField(
+ WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8();
+
+ if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) {
+ LOG(INFO) << "SiteIsolationPolicy.CorsisSafe:";
nasko 2013/08/06 17:29:27 nit: CorsIsSafe
dsjang 2013/08/07 00:19:07 Done.
+ return;
+ }
+
+ // Real data collection starts from here.
+ //
+ // XSDP.XSD.%MIMECODE is a shortened name for
+ // XSDP.All.NNav.NSafeScheme.NSMIMEType.NSCORS.%MIMECODE from now
nasko 2013/08/06 17:29:27 What is the meaning of this string? Putting a comm
dsjang 2013/08/07 00:19:07 Done.
+ // on.
+
+ LOG(INFO) << "SiteIsolationPolicy.XSD!!!:" << response_url;
+
+ ResponseMetaData metaData;
+ metaData.frame_origin = frame_origin.spec();
+ metaData.response_url = response_url.spec();
+ metaData.identifier = identifier;
+ metaData.target_type = target_type;
+ metaData.canonical_mime_type = canonical_mime_type;
+ metaData.http_status_code = response.httpStatusCode();
+
+ url_responsedata_map_[metaData.response_url] = metaData;
+ id_url_map_[identifier] = metaData.response_url;
+
+ return;
+}
+
+void SiteIsolationPolicy::DidReceiveData(const char* data,
+ int length,
+ WebURL& web_response_url) {
+ GURL response_url(web_response_url);
+
+ std::string response_url_str = response_url.spec();
+ if (url_responsedata_map_.count(response_url_str) == 0)
nasko 2013/08/06 17:29:27 Is this a valid case? When will we have seen a req
dsjang 2013/08/07 00:19:07 url_responsedata_map_ only maintains url for cross
+ return;
+
+ // Record the length of the first received network packet to see if
+ // it's enough for sniffing.
+ UMA_HISTOGRAM_COUNTS("XSDP.XSD.DataLength", length);
+
+ DCHECK(url_responsedata_map_.count(response_url_str) == 1);
+ ResponseMetaData metaData = url_responsedata_map_[response_url_str];
+ url_responsedata_map_.erase(response_url_str);
+
+ std::string uma_bucket_name("XSDP.XSD.");
+ uma_bucket_name.append(ResponseMetaData::CanonicalMimeTypeToString(
+ metaData.canonical_mime_type));
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+
+ // TODO(dsjang): sometimes the length of payload can be not enough to do
+ // correct content sniffing. If that happens, put it into a buffer
+ // so that we can do it later.
+ bool verified_for_blocking = false;
+ switch (metaData.canonical_mime_type) {
+ case ResponseMetaData::IsHTML:
+ if (SniffForHTML(data, length)) {
+ uma_bucket_name.append(".Verified");
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ verified_for_blocking = true;
+ }
+ break;
+ case ResponseMetaData::IsXML:
+ if (SniffForXML(data, length)) {
+ uma_bucket_name.append(".Verified");
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ verified_for_blocking = true;
+ }
+ break;
+ case ResponseMetaData::IsJSON:
+ if (SniffForJSON(data, length)) {
+ uma_bucket_name.append(".Verified");
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ verified_for_blocking = true;
+ }
+ break;
+ case ResponseMetaData::IsPlain:
+ if (SniffForHTML(data, length)) {
+ uma_bucket_name.append(".Verified.HTML");
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ verified_for_blocking = true;
+ } else if (SniffForXML(data, length)) {
+ uma_bucket_name.append(".Verified.XML");
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ verified_for_blocking = true;
+ } else if (SniffForJSON(data, length)) {
+ uma_bucket_name.append(".Verified.JSON");
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ verified_for_blocking = true;
+ }
+ break;
+ case ResponseMetaData::IsOthers:
+ DCHECK(false);
+ break;
+ }
+
+ // We block these. See how many of them have unaffected status code.
+ if (verified_for_blocking) {
+ if (UnaffectedStatusCode(metaData.http_status_code)) {
+ // This is a blocking that does not affect the browser behavior
+ // by the following reasons : 1) this is not a binary object
+ // (such as an image) since this is sniffed as a text
+ // document. 2) then, this blocking only breaks the renderer
+ // behavior only if it is either JavaScript or CSS. However, the
+ // renderer doesn't use the contents of JS/CSS with unaffected
+ // status code(e.g, 404). *) the renderer is expected not to use
+ // the cross-site document content for purposes other than
+ // JS/CSS (e.g, XHR).
+ uma_bucket_name.append(".UnaffectedStatusCode.");
+ std::stringstream stat_code_strm;
+ LOG(INFO) << "Blocked:UNAFFECTED STAT CODE:" << metaData.http_status_code;
+ stat_code_strm << metaData.http_status_code;
+ uma_bucket_name.append(stat_code_strm.str());
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ } else {
+ LOG(INFO) << "Blocked:AFFECTED STAT CODE:" << metaData.http_status_code;
+ // This blocking can be disruptive if it was actually JS, and
+ // requested for JS.
+ uma_bucket_name.append(".NUnaffectedStatusCode.");
+ uma_bucket_name.append(
+ ResponseMetaData::TargetTypeToString(metaData.target_type));
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ if (SniffForJS(data, length)) {
+ // This shows if this blocking can be JS.
+ uma_bucket_name.append(".MaybeJS");
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ }
+ }
+ } else {
+ LOG(INFO) << "Not Blocked:sniffing failed:";
+ // Not blocked. How many of them can be JS? This is only useful
+ // for studying non-blocked documents.
+ if (SniffForJS(data, length)) {
+ uma_bucket_name.append(".NVerified.MaybeJS");
+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
+ }
+ }
+}
+
+void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) {
+ id_target_map_.erase(identifier);
+ if (id_url_map_.count(identifier) > 0) {
+ url_responsedata_map_.erase(id_url_map_[identifier]);
+ id_url_map_.erase(identifier);
+ }
+}
+
+void SiteIsolationPolicy::DidFinishResourceLoad(
+ WebKit::WebURL& web_response_url) {
+ GURL response_url(web_response_url);
+
+ if (url_responsedata_map_.count(response_url.spec()) > 0) {
+ ResponseMetaData meta_data = url_responsedata_map_[response_url.spec()];
+ url_responsedata_map_.erase(response_url.spec());
+ id_target_map_.erase(meta_data.identifier);
+ id_url_map_.erase(meta_data.identifier);
+ }
+}
+
+ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType(
+ const WebURLResponse& response) {
+ // RFC 2045 says: "The type, subtype, and parameter names are not
+ // case sensitive." If you have a MIME type of text/plain that's a
+ // type of text and a subtype of plain. So, per the spec, these are
+ // not case sensitive.
+ std::string mime_type = response.mimeType().utf8();
+ StringToLowerASCII(&mime_type);
nasko 2013/08/06 17:29:27 The mime_type string is UTF8, yet you are using AS
dsjang 2013/08/07 00:19:07 I found that all the crawled mime types are origin
+
+ const char* const document_mime_types[] = {
+ "text/html", "text/xml", "application/rss+xml", "application/xml",
+ "application/json", "text/x-json", "text/json", "text/plain"};
+ size_t i = 0;
+ for (i = 0; i < 8; ++i) {
+ if (!strcmp(document_mime_types[i], mime_type.data())) {
nasko 2013/08/06 17:29:27 This is unsafe comparison, you should be bounding
dsjang 2013/08/07 00:19:07 Switched to std::string::operator==(). On 2013/08
+ break;
+ }
+ }
+
+ if (i == 0) {
nasko 2013/08/06 17:29:27 Using constants like these seems unclean to me. Is
dsjang 2013/08/07 00:19:07 Done.
+ return ResponseMetaData::IsHTML;
+ } else if (1 <= i && i < 4) {
+ return ResponseMetaData::IsXML;
+ } else if (4 <= i && i < 7) {
+ return ResponseMetaData::IsJSON;
+ } else if (i == 7) {
+ return ResponseMetaData::IsPlain;
+ } else {
+ return ResponseMetaData::IsOthers;
+ }
+}
+
+bool SiteIsolationPolicy::IsSafeScheme(GURL& url) {
nasko 2013/08/06 17:29:27 What about blob URLs? Are those safe too?
dsjang 2013/08/07 00:19:07 Switched to blacklisting from whitelisting. On 20
+ return url.scheme() == "data";
+}
+
+bool SiteIsolationPolicy::IsSameSite(GURL& frame_origin, GURL& response_url) {
+ if (frame_origin.scheme() != response_url.scheme())
+ return false;
+
+ // Extract the effective domains (public suffix plus one) of the
+ // urls.
+ std::string frame_domain =
+ net::registry_controlled_domains::GetDomainAndRegistry(
+ frame_origin,
+ net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
nasko 2013/08/06 17:29:27 Is there a reason we are deviating from how SiteIn
dsjang 2013/08/07 00:19:07 I thought allowing private registries here enables
+ std::string response_domain =
+ net::registry_controlled_domains::GetDomainAndRegistry(
+ response_url,
+ net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
+
+ return frame_domain == response_domain;
+}
+
+bool SiteIsolationPolicy::IsFrameNotCommitted(WebKit::WebFrame* frame) {
nasko 2013/08/06 17:29:27 This name is a bit confusing, can you put a descri
dsjang 2013/08/07 00:19:07 Done.
+ return frame->provisionalDataSource() != NULL;
+}
+
+bool SiteIsolationPolicy::IsValidCorsHeaderSet(
+ GURL& frame_origin,
+ GURL& website_origin,
+ std::string access_control_origin) {
+
+ size_t access_control_origin_len = access_control_origin.size();
+
+ // TODO(dsjang): Is this actually true? The server seems to return
+ // an empty string or "null".
+ if (access_control_origin_len == 0)
+ return false;
+
+ // Strip quotes off from it. This is non-standard practice, but many
nasko 2013/08/06 17:29:27 Don't we have code that parses the header that we
dsjang 2013/08/07 00:19:07 Done.
+ // websites use quote strings surrounding the actual header value.
+ if (access_control_origin_len > 2) {
+ char first = access_control_origin[0];
+ char last = access_control_origin[access_control_origin_len - 1];
+ if ((first == '\"' && last == '\"') || (first == '\'' && last == '\'')) {
+ access_control_origin =
+ access_control_origin.substr(1, access_control_origin_len - 2);
+ }
+ }
+
+ // TODO(dsjang): * is not allowed for the response from a request
+ // with cookies. This allows for more than what the renderer will
+ // eventually be able to receive, so we won't see illegal cross-site
+ // documents alllowed by this. We have to have t a way to see if
+ // this response is from a cookie-tagged request or not in the
+ // future.
+ if (access_control_origin == "*")
nasko 2013/08/06 17:29:27 You talk about cookie-tagged requests, but there i
dsjang 2013/08/07 00:19:07 Yes, it has to be incorporated here in the future,
+ return true;
+
+ // TODO(dsjang): The CORS spec only treats a fully specified URL,
+ // not just a domain here. Confirm this ad-hoc rule to be
+ // correct. If this doesn't start with a scheme(http://, https://),
nasko 2013/08/06 17:29:27 This sounds scary. Is this correct?
dsjang 2013/08/07 00:19:07 This was actually scarily wrong :-). I found out t
+ // it inherits the site's scheme.
+ if (access_control_origin.find("http://") != 0 &&
+ access_control_origin.find("https://") != 0) {
+ access_control_origin.insert(0, website_origin.scheme() + "://");
+ }
+
+ LOG(ERROR) << access_control_origin;
+
+ // We don't use Webkit's
+ // frame->securityOrigin().canAccess(WebSecurityOrigin::createFromString(acc
+ // ess_control_origin)))here since their .canAccess works in terms of origins,
+ // not sites. For example, when frame is sub.a.com and it is not allowed
+ // to access a document with sub1.a.com. But under Site Isolation,
+ // it's allowed.
+
+ // TODO(dsjang): examine createFromString()'s behavior for a URL
+ // containing * in it.
+ WebKit::WebSecurityOrigin cors_security_origin =
+ WebKit::WebSecurityOrigin::createFromString(
+ WebKit::WebString::fromUTF8(access_control_origin));
+ GURL cors_origin(cors_security_origin.toString().utf8());
+
+ LOG(ERROR) << cors_security_origin.toString().utf8();
+ return IsSameSite(frame_origin, cors_origin);
+}
+
+bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {
+ // TODO(dsjang): The content sniffer used by Chrome and Firefox are
+ // using "<!--" as one of the HTML signatures, but it also appears
+ // in valid JavaScript, considered as well-formed JS by the browser.
+ // Since we do not want to block any JS, we exclude it from our HTML
+ // signatures. This can weaken our document block policy, but we can
+ // break less websites.
+ const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec
+ "<script", // HTML5 spec, Mozilla
+ "<html", // HTML5 spec, Mozilla
+ "<head", // HTML5 spec, Mozilla
+ "<iframe", // Mozilla
+ "<h1", // Mozilla
+ "<div", // Mozilla
+ "<font", // Mozilla
+ "<table", // Mozilla
+ "<a", // Mozilla
+ "<style", // Mozilla
+ "<title", // Mozilla
+ "<b", // Mozilla
+ "<body", // Mozilla
+ "<br", "<p" // Mozilla
+ };
+ return DoSignatureMatching(
+ data, length, html_signatures, arraysize(html_signatures));
+}
+
+bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {
+ const char* xml_signatures[] = {"<?xml" // Mozilla
+ };
+ return DoSignatureMatching(
+ data, length, xml_signatures, arraysize(xml_signatures));
+}
+
+bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {
+ // TODO(dsjang): We have to come up with a better way to sniff
+ // JSON. However, even RE cannot help us that much due to the fact
+ // that we don't do full parsing. This DFA finds 1) {, 2) "(or'),
+ // 3) : in the order. This is intentionally not using a regular
+ // expression library so that we can make the trusted code base as
+ // small as possible.
+ int state = 0;
+ for (size_t i = 0; i < length && state < 3; ++i, ++data) {
+ char c = *data;
+ if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
+ continue;
+
+ switch (state) {
+ case 0:
+ if (c == '{')
+ state = 1;
nasko 2013/08/06 17:29:27 Please use symbolic names or describe what those n
dsjang 2013/08/07 00:19:07 Done.
+ else
+ state = 4;
+ break;
+ case 1:
+ if (c == '\"' || c == '\'')
+ state = 2;
+ else
+ state = 4;
+ break;
+ case 2:
+ if (c == ':') {
+ state = 3;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ return state == 3;
+}
+
+bool SiteIsolationPolicy::DoSignatureMatching(const char* data,
+ size_t length,
+ const char* signatures[],
+ size_t arr_size) {
+ for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
+ const char* signature = signatures[sig_index];
+ size_t signature_length = strlen(signature);
+ size_t i = 0;
+ // Skip the white characters at the beginning of the document.
+ for (i = 0; i < length; ++i) {
+ char c = *data;
+ if (!(c == ' ' || c == '\r' || c == '\n' || c == '\t')) {
+ break;
+ }
+ ++data;
+ }
+ length = length - i;
+ if (length < signature_length)
+ continue;
+ if (base::strncasecmp(signature, data, signature_length) == 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool SiteIsolationPolicy::UnaffectedStatusCode(int status_code) {
+ // Chrome only uses the content of a response with one of these
+ // status codes for CSS/JavaScript. For images, Chrome just ignores
+ // status code.
+ const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,
+ 303, 305, 306, 307};
+ for (size_t i = 0; i < 12; ++i) {
+ if (renderable_status_code[i] == status_code)
+ return false;
+ }
+ return true;
+}
+
+bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {
+ // TODO(dsjang): This is a real hacking. The only purpose of this
+ // function is to try to see if there's any possibility that this
+ // data can be JavaScript.(superset of JS). This function will be
+ // removed for the production code.
+
+ // Search for "var " for JS detection. :-)
+ for (size_t i = 0; i < length - 3; ++i) {
+ if (strncmp(data, "var ", 4) == 0) {
+ return true;
+ }
+ ++data;
+ }
+ return false;
+}
+}
nasko 2013/08/06 17:29:27 Leave an empty line and put a // namespace comment

Powered by Google App Engine
This is Rietveld 408576698