webkit/child/site_isolation_policy.cc - Issue 22254005: UMA data collector for cross-site documents(XSD)

Unified Diff: webkit/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr

Patch Set: Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: webkit/child/site_isolation_policy.cc

diff --git a/webkit/child/site_isolation_policy.cc b/webkit/child/site_isolation_policy.cc

new file mode 100644

index 0000000000000000000000000000000000000000..0fd8ab0e5016bbcd7a1d67ee8def7e728e0632d6

--- /dev/null

+++ b/webkit/child/site_isolation_policy.cc

@@ -0,0 +1,502 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "webkit/child/site_isolation_policy.h"

+#include "base/basictypes.h"

+#include "base/logging.h"

+#include "base/metrics/histogram.h"

+#include "base/strings/string_util.h"

+#include "net/base/registry_controlled_domains/registry_controlled_domain.h"

+#include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"

+#include "third_party/WebKit/public/platform/WebString.h"

+#include "third_party/WebKit/public/platform/WebURL.h"

+#include "third_party/WebKit/public/platform/WebURLRequest.h"

+#include "third_party/WebKit/public/platform/WebURLResponse.h"

+#include "third_party/WebKit/public/web/WebDocument.h"

+#include "third_party/WebKit/public/web/WebFrame.h"

+#include "third_party/WebKit/public/web/WebFrameClient.h"

+#include "third_party/WebKit/public/web/WebSecurityOrigin.h"

+using WebKit::WebURLResponse;

+using WebKit::WebURLRequest;

+using WebKit::WebURL;

+using WebKit::WebString;

+using WebKit::WebDocument;

+namespace webkit_glue {

+std::map<unsigned, WebURLRequest::TargetType>

nasko 2013/08/06 17:29:27 Why are these needed in this file? They are alread

dsjang 2013/08/07 00:19:07 Done.

+ SiteIsolationPolicy::id_target_map_;

+std::map<std::string, ResponseMetaData>

+ SiteIsolationPolicy::url_responsedata_map_;

+std::map<unsigned, std::string> SiteIsolationPolicy::id_url_map_;

+void SiteIsolationPolicy::WillSendRequest(

+ unsigned identifier,

+ WebURLRequest::TargetType target_type) {

+ // This happens when the original request is redirected.

+ if (id_target_map_.count(identifier) != 0) {

+ // This check actually can fail. If it is, which target_type do we

+ // have to record between the old one and the new one? When

+ // redirection happens, target_type becomes 2. TODO(dsjang):

+ // let's disable this code and see what happens on onclickads.com

+ // for googleads JavaScript code assigned to an image.

+ if (id_target_map_[identifier] != target_type) {

+ id_target_map_[identifier] = target_type;

+ }

+ id_target_map_[identifier] = target_type;

+void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame,

+ unsigned identifier,

+ const WebURLResponse& response) {

nasko 2013/08/06 17:29:27 nit: no need for empty line here.

dsjang 2013/08/07 00:19:07 Done.

+ DCHECK(id_target_map_.count(identifier) == 1);

+ UMA_HISTOGRAM_COUNTS("XSDP.ALL", 1);

+ GURL response_url = response.url();

+ WebURLRequest::TargetType target_type = id_target_map_[identifier];

+ id_target_map_.erase(identifier);

+ // See if this is for navigation. If it is, let it pass.

+ if (IsFrameNotCommitted(frame)) {

+ LOG(INFO) << "SiteIsolationPolicy.FrameNotCommitted";

+ return;

+ }

+ GURL frame_origin(frame->document().securityOrigin().toString().utf8());

+ // TODO(dsjang): Find out all non-network scheme here.

+ // If it's the data: scheme, we can let it pass through.

+ if (IsSafeScheme(frame_origin)) {

+ LOG(INFO) << "SiteIsolationPolicy.SafeScheme:" << frame_origin;

+ return;

+ }

+ if (IsSameSite(frame_origin, response_url)) {

+ LOG(INFO) << "SiteIsolationPolicy.SameSite:" << frame_origin << ","

+ << response_url;

+ return;

+ }

+ ResponseMetaData::CanonicalMimeType canonical_mime_type =

+ GetCanonicalMimeType(response);

+ if (canonical_mime_type == ResponseMetaData::IsOthers) {

+ LOG(INFO) << "SiteIsolationPolicy.mimetype:" << frame_origin << ","

+ << response_url << "," << response.mimeType().utf8();

+ return;

+ }

+ std::string access_control_origin = response

+ .httpHeaderField(

+ WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8();

+ if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) {

+ LOG(INFO) << "SiteIsolationPolicy.CorsisSafe:";

nasko 2013/08/06 17:29:27 nit: CorsIsSafe

dsjang 2013/08/07 00:19:07 Done.

+ return;

+ }

+ // Real data collection starts from here.

+ //

+ // XSDP.XSD.%MIMECODE is a shortened name for

+ // XSDP.All.NNav.NSafeScheme.NSMIMEType.NSCORS.%MIMECODE from now

nasko 2013/08/06 17:29:27 What is the meaning of this string? Putting a comm

dsjang 2013/08/07 00:19:07 Done.

+ // on.

+ LOG(INFO) << "SiteIsolationPolicy.XSD!!!:" << response_url;

+ ResponseMetaData metaData;

+ metaData.frame_origin = frame_origin.spec();

+ metaData.response_url = response_url.spec();

+ metaData.identifier = identifier;

+ metaData.target_type = target_type;

+ metaData.canonical_mime_type = canonical_mime_type;

+ metaData.http_status_code = response.httpStatusCode();

+ url_responsedata_map_[metaData.response_url] = metaData;

+ id_url_map_[identifier] = metaData.response_url;

+ return;

+void SiteIsolationPolicy::DidReceiveData(const char* data,

+ int length,

+ WebURL& web_response_url) {

+ GURL response_url(web_response_url);

+ std::string response_url_str = response_url.spec();

+ if (url_responsedata_map_.count(response_url_str) == 0)

nasko 2013/08/06 17:29:27 Is this a valid case? When will we have seen a req

dsjang 2013/08/07 00:19:07 url_responsedata_map_ only maintains url for cross

+ return;

+ // Record the length of the first received network packet to see if

+ // it's enough for sniffing.

+ UMA_HISTOGRAM_COUNTS("XSDP.XSD.DataLength", length);

+ DCHECK(url_responsedata_map_.count(response_url_str) == 1);

+ ResponseMetaData metaData = url_responsedata_map_[response_url_str];

+ url_responsedata_map_.erase(response_url_str);

+ std::string uma_bucket_name("XSDP.XSD.");

+ uma_bucket_name.append(ResponseMetaData::CanonicalMimeTypeToString(

+ metaData.canonical_mime_type));

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ // TODO(dsjang): sometimes the length of payload can be not enough to do

+ // correct content sniffing. If that happens, put it into a buffer

+ // so that we can do it later.

+ bool verified_for_blocking = false;

+ switch (metaData.canonical_mime_type) {

+ case ResponseMetaData::IsHTML:

+ if (SniffForHTML(data, length)) {

+ uma_bucket_name.append(".Verified");

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ verified_for_blocking = true;

+ }

+ break;

+ case ResponseMetaData::IsXML:

+ if (SniffForXML(data, length)) {

+ uma_bucket_name.append(".Verified");

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ verified_for_blocking = true;

+ }

+ break;

+ case ResponseMetaData::IsJSON:

+ if (SniffForJSON(data, length)) {

+ uma_bucket_name.append(".Verified");

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ verified_for_blocking = true;

+ }

+ break;

+ case ResponseMetaData::IsPlain:

+ if (SniffForHTML(data, length)) {

+ uma_bucket_name.append(".Verified.HTML");

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ verified_for_blocking = true;

+ } else if (SniffForXML(data, length)) {

+ uma_bucket_name.append(".Verified.XML");

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ verified_for_blocking = true;

+ } else if (SniffForJSON(data, length)) {

+ uma_bucket_name.append(".Verified.JSON");

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ verified_for_blocking = true;

+ }

+ break;

+ case ResponseMetaData::IsOthers:

+ DCHECK(false);

+ break;

+ }

+ // We block these. See how many of them have unaffected status code.

+ if (verified_for_blocking) {

+ if (UnaffectedStatusCode(metaData.http_status_code)) {

+ // This is a blocking that does not affect the browser behavior

+ // by the following reasons : 1) this is not a binary object

+ // (such as an image) since this is sniffed as a text

+ // document. 2) then, this blocking only breaks the renderer

+ // behavior only if it is either JavaScript or CSS. However, the

+ // renderer doesn't use the contents of JS/CSS with unaffected

+ // status code(e.g, 404). *) the renderer is expected not to use

+ // the cross-site document content for purposes other than

+ // JS/CSS (e.g, XHR).

+ uma_bucket_name.append(".UnaffectedStatusCode.");

+ std::stringstream stat_code_strm;

+ LOG(INFO) << "Blocked:UNAFFECTED STAT CODE:" << metaData.http_status_code;

+ stat_code_strm << metaData.http_status_code;

+ uma_bucket_name.append(stat_code_strm.str());

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ } else {

+ LOG(INFO) << "Blocked:AFFECTED STAT CODE:" << metaData.http_status_code;

+ // This blocking can be disruptive if it was actually JS, and

+ // requested for JS.

+ uma_bucket_name.append(".NUnaffectedStatusCode.");

+ uma_bucket_name.append(

+ ResponseMetaData::TargetTypeToString(metaData.target_type));

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ if (SniffForJS(data, length)) {

+ // This shows if this blocking can be JS.

+ uma_bucket_name.append(".MaybeJS");

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ }

+ } else {

+ LOG(INFO) << "Not Blocked:sniffing failed:";

+ // Not blocked. How many of them can be JS? This is only useful

+ // for studying non-blocked documents.

+ if (SniffForJS(data, length)) {

+ uma_bucket_name.append(".NVerified.MaybeJS");

+ UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);

+ }

+void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) {

+ id_target_map_.erase(identifier);

+ if (id_url_map_.count(identifier) > 0) {

+ url_responsedata_map_.erase(id_url_map_[identifier]);

+ id_url_map_.erase(identifier);

+ }

+void SiteIsolationPolicy::DidFinishResourceLoad(

+ WebKit::WebURL& web_response_url) {

+ GURL response_url(web_response_url);

+ if (url_responsedata_map_.count(response_url.spec()) > 0) {

+ ResponseMetaData meta_data = url_responsedata_map_[response_url.spec()];

+ url_responsedata_map_.erase(response_url.spec());

+ id_target_map_.erase(meta_data.identifier);

+ id_url_map_.erase(meta_data.identifier);

+ }

+ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType(

+ const WebURLResponse& response) {

+ // RFC 2045 says: "The type, subtype, and parameter names are not

+ // case sensitive." If you have a MIME type of text/plain that's a

+ // type of text and a subtype of plain. So, per the spec, these are

+ // not case sensitive.

+ std::string mime_type = response.mimeType().utf8();

+ StringToLowerASCII(&mime_type);

nasko 2013/08/06 17:29:27 The mime_type string is UTF8, yet you are using AS

dsjang 2013/08/07 00:19:07 I found that all the crawled mime types are origin

+ const char* const document_mime_types[] = {

+ "text/html", "text/xml", "application/rss+xml", "application/xml",

+ "application/json", "text/x-json", "text/json", "text/plain"};

+ size_t i = 0;

+ for (i = 0; i < 8; ++i) {

+ if (!strcmp(document_mime_types[i], mime_type.data())) {

nasko 2013/08/06 17:29:27 This is unsafe comparison, you should be bounding

dsjang 2013/08/07 00:19:07 Switched to std::string::operator==(). On 2013/08

+ break;

+ }

+ if (i == 0) {

nasko 2013/08/06 17:29:27 Using constants like these seems unclean to me. Is

dsjang 2013/08/07 00:19:07 Done.

+ return ResponseMetaData::IsHTML;

+ } else if (1 <= i && i < 4) {

+ return ResponseMetaData::IsXML;

+ } else if (4 <= i && i < 7) {

+ return ResponseMetaData::IsJSON;

+ } else if (i == 7) {

+ return ResponseMetaData::IsPlain;

+ } else {

+ return ResponseMetaData::IsOthers;

+ }

+bool SiteIsolationPolicy::IsSafeScheme(GURL& url) {

nasko 2013/08/06 17:29:27 What about blob URLs? Are those safe too?

dsjang 2013/08/07 00:19:07 Switched to blacklisting from whitelisting. On 20

+ return url.scheme() == "data";

+bool SiteIsolationPolicy::IsSameSite(GURL& frame_origin, GURL& response_url) {

+ if (frame_origin.scheme() != response_url.scheme())

+ return false;

+ // Extract the effective domains (public suffix plus one) of the

+ // urls.

+ std::string frame_domain =

+ net::registry_controlled_domains::GetDomainAndRegistry(

+ frame_origin,

+ net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);

nasko 2013/08/06 17:29:27 Is there a reason we are deviating from how SiteIn

dsjang 2013/08/07 00:19:07 I thought allowing private registries here enables

+ std::string response_domain =

+ net::registry_controlled_domains::GetDomainAndRegistry(

+ response_url,

+ net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);

+ return frame_domain == response_domain;

+bool SiteIsolationPolicy::IsFrameNotCommitted(WebKit::WebFrame* frame) {

nasko 2013/08/06 17:29:27 This name is a bit confusing, can you put a descri

dsjang 2013/08/07 00:19:07 Done.

+ return frame->provisionalDataSource() != NULL;

+bool SiteIsolationPolicy::IsValidCorsHeaderSet(

+ GURL& frame_origin,

+ GURL& website_origin,

+ std::string access_control_origin) {

+ size_t access_control_origin_len = access_control_origin.size();

+ // TODO(dsjang): Is this actually true? The server seems to return

+ // an empty string or "null".

+ if (access_control_origin_len == 0)

+ return false;

+ // Strip quotes off from it. This is non-standard practice, but many

nasko 2013/08/06 17:29:27 Don't we have code that parses the header that we

dsjang 2013/08/07 00:19:07 Done.

+ // websites use quote strings surrounding the actual header value.

+ if (access_control_origin_len > 2) {

+ char first = access_control_origin[0];

+ char last = access_control_origin[access_control_origin_len - 1];

+ if ((first == '\"' && last == '\"') || (first == '\'' && last == '\'')) {

+ access_control_origin =

+ access_control_origin.substr(1, access_control_origin_len - 2);

+ }

+ // TODO(dsjang): * is not allowed for the response from a request

+ // with cookies. This allows for more than what the renderer will

+ // eventually be able to receive, so we won't see illegal cross-site

+ // documents alllowed by this. We have to have t a way to see if

+ // this response is from a cookie-tagged request or not in the

+ // future.

+ if (access_control_origin == "*")

nasko 2013/08/06 17:29:27 You talk about cookie-tagged requests, but there i

dsjang 2013/08/07 00:19:07 Yes, it has to be incorporated here in the future,

+ return true;

+ // TODO(dsjang): The CORS spec only treats a fully specified URL,

+ // not just a domain here. Confirm this ad-hoc rule to be

+ // correct. If this doesn't start with a scheme(http://, https://),

nasko 2013/08/06 17:29:27 This sounds scary. Is this correct?

dsjang 2013/08/07 00:19:07 This was actually scarily wrong :-). I found out t

+ // it inherits the site's scheme.

+ if (access_control_origin.find("http://") != 0 &&

+ access_control_origin.find("https://") != 0) {

+ access_control_origin.insert(0, website_origin.scheme() + "://");

+ }

+ LOG(ERROR) << access_control_origin;

+ // We don't use Webkit's

+ // frame->securityOrigin().canAccess(WebSecurityOrigin::createFromString(acc

+ // ess_control_origin)))here since their .canAccess works in terms of origins,

+ // not sites. For example, when frame is sub.a.com and it is not allowed

+ // to access a document with sub1.a.com. But under Site Isolation,

+ // it's allowed.

+ // TODO(dsjang): examine createFromString()'s behavior for a URL

+ // containing * in it.

+ WebKit::WebSecurityOrigin cors_security_origin =

+ WebKit::WebSecurityOrigin::createFromString(

+ WebKit::WebString::fromUTF8(access_control_origin));

+ GURL cors_origin(cors_security_origin.toString().utf8());

+ LOG(ERROR) << cors_security_origin.toString().utf8();

+ return IsSameSite(frame_origin, cors_origin);

+bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {

+ // TODO(dsjang): The content sniffer used by Chrome and Firefox are

+ // using "<!--" as one of the HTML signatures, but it also appears

+ // in valid JavaScript, considered as well-formed JS by the browser.

+ // Since we do not want to block any JS, we exclude it from our HTML

+ // signatures. This can weaken our document block policy, but we can

+ // break less websites.

+ const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec

+ "<script", // HTML5 spec, Mozilla

+ "<html", // HTML5 spec, Mozilla

+ "<head", // HTML5 spec, Mozilla

+ "<iframe", // Mozilla

+ "<h1", // Mozilla

+ "<div", // Mozilla

+ "<font", // Mozilla

+ "<table", // Mozilla

+ "<a", // Mozilla

+ "<style", // Mozilla

+ "<title", // Mozilla

+ "<b", // Mozilla

+ "<body", // Mozilla

+ "<br", "<p" // Mozilla

+ };

+ return DoSignatureMatching(

+ data, length, html_signatures, arraysize(html_signatures));

+bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {

+ const char* xml_signatures[] = {"<?xml" // Mozilla

+ };

+ return DoSignatureMatching(

+ data, length, xml_signatures, arraysize(xml_signatures));

+bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {

+ // TODO(dsjang): We have to come up with a better way to sniff

+ // JSON. However, even RE cannot help us that much due to the fact

+ // that we don't do full parsing. This DFA finds 1) {, 2) "(or'),

+ // 3) : in the order. This is intentionally not using a regular

+ // expression library so that we can make the trusted code base as

+ // small as possible.

+ int state = 0;

+ for (size_t i = 0; i < length && state < 3; ++i, ++data) {

+ char c = *data;

+ if (c == ' ' || c == '\t' || c == '\r' || c == '\n')

+ continue;

+ switch (state) {

+ case 0:

+ if (c == '{')

+ state = 1;

nasko 2013/08/06 17:29:27 Please use symbolic names or describe what those n

dsjang 2013/08/07 00:19:07 Done.

+ else

+ state = 4;

+ break;

+ case 1:

+ if (c == '\"' || c == '\'')

+ state = 2;

+ else

+ state = 4;

+ break;

+ case 2:

+ if (c == ':') {

+ state = 3;

+ }

+ break;

+ default:

+ break;

+ }

+ return state == 3;

+bool SiteIsolationPolicy::DoSignatureMatching(const char* data,

+ size_t length,

+ const char* signatures[],

+ size_t arr_size) {

+ for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {

+ const char* signature = signatures[sig_index];

+ size_t signature_length = strlen(signature);

+ size_t i = 0;

+ // Skip the white characters at the beginning of the document.

+ for (i = 0; i < length; ++i) {

+ char c = *data;

+ if (!(c == ' ' || c == '\r' || c == '\n' || c == '\t')) {

+ break;

+ }

+ ++data;

+ }

+ length = length - i;

+ if (length < signature_length)

+ continue;

+ if (base::strncasecmp(signature, data, signature_length) == 0) {

+ return true;

+ }

+ return false;

+bool SiteIsolationPolicy::UnaffectedStatusCode(int status_code) {

+ // Chrome only uses the content of a response with one of these

+ // status codes for CSS/JavaScript. For images, Chrome just ignores

+ // status code.

+ const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,

+ 303, 305, 306, 307};

+ for (size_t i = 0; i < 12; ++i) {

+ if (renderable_status_code[i] == status_code)

+ return false;

+ }

+ return true;

+bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {

+ // TODO(dsjang): This is a real hacking. The only purpose of this

+ // function is to try to see if there's any possibility that this

+ // data can be JavaScript.(superset of JS). This function will be

+ // removed for the production code.

+ // Search for "var " for JS detection. :-)

+ for (size_t i = 0; i < length - 3; ++i) {

+ if (strncmp(data, "var ", 4) == 0) {

+ return true;

+ }

+ ++data;

+ }

+ return false;

nasko 2013/08/06 17:29:27 Leave an empty line and put a // namespace comment

« webkit/child/site_isolation_policy.h ('K') | « webkit/child/site_isolation_policy.h ('k') | webkit/child/site_isolation_policy_unittest.cc » ('j') | no next file with comments »