Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(34)

Unified Diff: webkit/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr
Patch Set: "X-Content-Type-Options: nosniff" rule is added. Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: webkit/child/site_isolation_policy.cc
diff --git a/webkit/child/site_isolation_policy.cc b/webkit/child/site_isolation_policy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..061561f8210c984199a201d3faed9d6181ad19bb
--- /dev/null
+++ b/webkit/child/site_isolation_policy.cc
@@ -0,0 +1,531 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "webkit/child/site_isolation_policy.h"
+
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "base/metrics/histogram.h"
+#include "base/strings/string_util.h"
+#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
+#include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
+#include "third_party/WebKit/public/platform/WebString.h"
+#include "third_party/WebKit/public/platform/WebURL.h"
+#include "third_party/WebKit/public/platform/WebURLRequest.h"
+#include "third_party/WebKit/public/platform/WebURLResponse.h"
+#include "third_party/WebKit/public/web/WebDocument.h"
+#include "third_party/WebKit/public/web/WebFrame.h"
+#include "third_party/WebKit/public/web/WebFrameClient.h"
+#include "third_party/WebKit/public/web/WebSecurityOrigin.h"
+
+using WebKit::WebDocument;
+using WebKit::WebString;
+using WebKit::WebURL;
+using WebKit::WebURLResponse;
+using WebKit::WebURLRequest;
+
+
+namespace webkit_glue {
+
+// Out-of-line default constructor; it performs no explicit member
+// initialization.
+ResponseMetaData::ResponseMetaData() {
+}
+
+// Remembers the target type of the request |identifier| the first time the
+// request is seen.
+void SiteIsolationPolicy::WillSendRequest(
+    unsigned identifier,
+    WebURLRequest::TargetType target_type) {
+  TargetTypeMap* target_types = GetIdTargetMap();
+
+  // An identifier that is already recorded belongs to a request that was
+  // redirected and is being issued again. Such a request always reports
+  // TargetIsSubresource no matter what the original target type was, so the
+  // value recorded first is kept.
+  if (target_types->count(identifier) != 0)
+    return;
+
+  (*target_types)[identifier] = target_type;
+}
+
+// Examines the response of the request |identifier|. When the response is a
+// cross-site document with a blockable MIME type that is not allowed by a CORS
+// header, its metadata is stored so that DidReceiveData() can sniff the
+// payload later. The target type recorded by WillSendRequest() is consumed
+// here in every case.
+void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame,
+                                             unsigned identifier,
+                                             const WebURLResponse& response) {
+  TargetTypeMap* id_target_map = GetIdTargetMap();
+  DCHECK_EQ(id_target_map->count(identifier), 1U);
+
+  UMA_HISTOGRAM_COUNTS("SiteIsolation.AllResponses", 1);
+
+  GURL response_url = response.url();
+  WebURLRequest::TargetType target_type = (*id_target_map)[identifier];
+  id_target_map->erase(identifier);
+
+  // See if this is for navigation. If it is, don't block it, under the
+  // assumption that we will put it in an appropriate process.
+  if (IsFrameNavigating(frame))
+    return;
+
+  GURL frame_origin(frame->document().securityOrigin().toString());
+
+  // Only frames loaded over http(s) are covered by the policy.
+  if (!IsBlockableScheme(frame_origin))
+    return;
+
+  // Same-site responses are never blocked.
+  if (IsSameSite(frame_origin, response_url))
+    return;
+
+  ResponseMetaData::CanonicalMimeType canonical_mime_type =
+      GetCanonicalMimeType(response);
+
+  // Only HTML, XML, JSON and plain text documents are of interest.
+  if (canonical_mime_type == ResponseMetaData::Others)
+    return;
+
+  // Every CORS request should have the Access-Control-Allow-Origin header even
+  // if it is preceded by a pre-flight request. Therefore, if this is a CORS
+  // request, it has this header. response.httpHeaderField() internally uses
+  // case-insensitive matching for the header name.
+  std::string access_control_origin = response.httpHeaderField(
+      WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8();
+
+  if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin))
+    return;
+
+  // Real XSD data collection starts from here.
+  std::string no_sniff = response.httpHeaderField(
+      WebKit::WebString::fromUTF8("X-Content-Type-Options")).utf8();
+
+  ResponseMetaData resp_data;
+  resp_data.frame_origin = frame_origin.spec();
+  resp_data.response_url = response_url;
+  resp_data.request_identifier = identifier;
+  resp_data.target_type = target_type;
+  resp_data.canonical_mime_type = canonical_mime_type;
+  resp_data.http_status_code = response.httpStatusCode();
+  // The "nosniff" directive is matched case-insensitively so that values such
+  // as "NoSniff" are honored as well.
+  resp_data.no_sniff = LowerCaseEqualsASCII(no_sniff, "nosniff");
+
+  UrlResponseMetaDataMap* url_responsedata_map = GetUrlResponseMetaDataMap();
+  IdUrlMap* id_url_map = GetIdUrlMap();
+
+  // The metadata is keyed by URL; the identifier-to-URL entry lets
+  // DidFinishResourceLoad() find it again from the request identifier.
+  (*url_responsedata_map)[resp_data.response_url] = resp_data;
+  (*id_url_map)[identifier] = resp_data.response_url;
+}
+
+// These macros are defined here so that we prevent code size bloat-up due to
+// the UMA_HISTOGRAM_* macros. Similar logic is used for recording UMA stats for
+// different MIME types, but we cannot create a helper function for this since
+// UMA_HISTOGRAM_* macros do not accept variables as their bucket names. As a
Charlie Reis 2013/08/13 21:09:03 nit: variables
dsjang 2013/08/13 21:49:52 Done.
+// solution, macros are used instead to capture the repeated pattern for
+// recording UMA stats.
+// TODO(dsjang): this is only needed for collecting UMA stat. Will be deleted
+// when this class is used for actual blocking.
+
+#define SITE_ISOLATION_POLICY_COUNT_BLOCK(BUCKET_PREFIX) \
+ UMA_HISTOGRAM_COUNTS( BUCKET_PREFIX ".Blocked", 1); \
+ if (renderable_status_code) { \
+ UMA_HISTOGRAM_ENUMERATION( \
+ BUCKET_PREFIX ".Blocked.RenderableStatusCode", \
+ resp_data.target_type, \
+ WebURLRequest::TargetIsUnspecified + 1); \
+ } else { \
+ UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".Blocked.NonRenderableStatusCode",1);\
+ }
+
+#define SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(BUCKET_PREFIX) \
+ UMA_HISTOGRAM_COUNTS( BUCKET_PREFIX ".NoSniffBlocked", 1); \
+ if (renderable_status_code) { \
+ UMA_HISTOGRAM_ENUMERATION( \
+ BUCKET_PREFIX ".NoSniffBlocked.RenderableStatusCode", \
+ resp_data.target_type, \
+ WebURLRequest::TargetIsUnspecified + 1); \
+ } else { \
+ UMA_HISTOGRAM_ENUMERATION( \
+ BUCKET_PREFIX ".NoSniffBlocked.NonRenderableStatusCode", \
+ resp_data.target_type, \
+ WebURLRequest::TargetIsUnspecified + 1); \
+ }
+
+#define SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(BUCKET_PREFIX) \
+ UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked", 1); \
+ if (is_sniffed_for_js) \
+ UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked.MaybeJS", 1); \
+
+#define SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SNIFF_EXPR,BUCKET_PREFIX) \
+ if (SNIFF_EXPR) { \
+ SITE_ISOLATION_POLICY_COUNT_BLOCK(BUCKET_PREFIX) \
+ } else { \
+ if (resp_data.no_sniff) { \
+ SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(BUCKET_PREFIX) \
+ } else { \
+ SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(BUCKET_PREFIX) \
+ } \
+ }
+
+// Sniffs the data received for |web_response_url| and records UMA statistics
+// on whether the cross-site document would be blocked. URLs for which
+// DidReceiveResponse() stored no metadata are ignored; the stored metadata is
+// consumed by this call.
+void SiteIsolationPolicy::DidReceiveData(const char* data,
+                                         int length,
+                                         WebURL& web_response_url) {
+  GURL url(web_response_url);
+  UrlResponseMetaDataMap* metadata_by_url = GetUrlResponseMetaDataMap();
+
+  // Nothing to do unless metadata was registered for this URL.
+  if (!metadata_by_url->count(url))
+    return;
+
+  // Take the metadata out of the map.
+  ResponseMetaData resp_data = (*metadata_by_url)[url];
+  metadata_by_url->erase(url);
+
+  // Length of the first received network packet, recorded to learn whether it
+  // is enough for sniffing.
+  UMA_HISTOGRAM_COUNTS("SiteIsolation.XSD.DataLength", length);
+
+  // Number of cross-site document responses per canonical mime type
+  // (text/html, text/xml, etc).
+  UMA_HISTOGRAM_ENUMERATION("SiteIsolation.XSD.MimeType",
+                            resp_data.canonical_mime_type,
+                            ResponseMetaData::MaxCanonicalMimeType);
+
+  // NOTE: |resp_data|, |renderable_status_code| and |is_sniffed_for_js| are
+  // referenced by name from the SITE_ISOLATION_POLICY_* macros used below, so
+  // these locals must keep their names.
+
+  // Content is blocked when it is sniffed as HTML/JSON/XML. Blocking a
+  // response with an error status code is not disruptive because: 1) the
+  // content is text, not a binary object such as an image; 2) blocking could
+  // only break the renderer if the content were JavaScript or CSS, and the
+  // renderer does not use JS/CSS delivered with such status codes (e.g. 404);
+  // 3) the renderer is not expected to use a cross-site document for anything
+  // other than JS/CSS (e.g. XHR).
+  bool renderable_status_code =
+      IsRenderableStatusCodeForDocument(resp_data.http_status_code);
+
+  // Only used for false-negative analysis of resources that are not blocked.
+  bool is_sniffed_for_js = SniffForJS(data, length);
+
+  // Apply the sniffer matching the claimed mime type and record the outcome.
+  // Whenever the sniffer agrees with the mime type, the response is counted as
+  // blocked.
+  switch (resp_data.canonical_mime_type) {
+    case ResponseMetaData::HTML:
+      SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForHTML(data, length),
+                                            "SiteIsolation.XSD.HTML");
+      break;
+
+    case ResponseMetaData::XML:
+      SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForXML(data, length),
+                                            "SiteIsolation.XSD.XML");
+      break;
+
+    case ResponseMetaData::JSON:
+      SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForJSON(data, length),
+                                            "SiteIsolation.XSD.JSON");
+      break;
+
+    case ResponseMetaData::Plain:
+      // text/plain is tried against every document sniffer in turn.
+      if (SniffForHTML(data, length)) {
+        SITE_ISOLATION_POLICY_COUNT_BLOCK("SiteIsolation.XSD.Plain.HTML");
+        break;
+      }
+      if (SniffForXML(data, length)) {
+        SITE_ISOLATION_POLICY_COUNT_BLOCK("SiteIsolation.XSD.Plain.XML");
+        break;
+      }
+      if (SniffForJSON(data, length)) {
+        SITE_ISOLATION_POLICY_COUNT_BLOCK("SiteIsolation.XSD.Plain.JSON");
+        break;
+      }
+      // Nothing is recorded for plain text that does not look like JS either.
+      if (!is_sniffed_for_js)
+        break;
+      if (resp_data.no_sniff) {
+        SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK("SiteIsolation.XSD.Plain");
+      } else {
+        SITE_ISOLATION_POLICY_COUNT_NOTBLOCK("SiteIsolation.XSD.Plain");
+      }
+      break;
+
+    default:
+      NOTREACHED() <<
+          "Not a blockable mime type. This mime type shouldn't reach here.";
+      break;
+  }
+}
+
+// Undefine every helper macro defined for DidReceiveData() so that none of
+// them leaks into the rest of the translation unit.
+#undef SITE_ISOLATION_POLICY_COUNT_NOTBLOCK
+#undef SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK
+#undef SITE_ISOLATION_POLICY_SNIFF_AND_COUNT
+#undef SITE_ISOLATION_POLICY_COUNT_BLOCK
+
+
+// Drops all bookkeeping kept for the request |identifier| once its load has
+// finished: its target type, its identifier-to-URL entry and the response
+// metadata stored under that URL.
+void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) {
+  TargetTypeMap* id_target_map = GetIdTargetMap();
+  UrlResponseMetaDataMap* url_responsedata_map = GetUrlResponseMetaDataMap();
+  IdUrlMap* id_url_map = GetIdUrlMap();
+
+  id_target_map->erase(identifier);
+
+  // Only clean up when a URL was recorded for this identifier. Checking first
+  // also keeps operator[] from inserting a default entry for unknown
+  // identifiers.
+  if (id_url_map->count(identifier)) {
+    url_responsedata_map->erase((*id_url_map)[identifier]);
+    id_url_map->erase(identifier);
+  }
+}
+
+// Drops all bookkeeping kept for the response from |web_response_url| once its
+// load has finished: the response metadata stored under the URL and the
+// entries kept for the request identifier recorded in that metadata.
+void SiteIsolationPolicy::DidFinishResourceLoadForUrl(
+    const WebKit::WebURL& web_response_url) {
+  GURL response_url(web_response_url);
+
+  TargetTypeMap* id_target_map = GetIdTargetMap();
+  UrlResponseMetaDataMap* url_responsedata_map = GetUrlResponseMetaDataMap();
+  IdUrlMap* id_url_map = GetIdUrlMap();
+
+  // Only clean up when metadata exists for this URL. Checking first also keeps
+  // operator[] from inserting (and then reading) a default-constructed entry
+  // for unknown URLs.
+  if (url_responsedata_map->count(response_url)) {
+    ResponseMetaData meta_data = (*url_responsedata_map)[response_url];
+    url_responsedata_map->erase(response_url);
+    id_target_map->erase(meta_data.request_identifier);
+    id_url_map->erase(meta_data.request_identifier);
+  }
+}
+
+// Maps the MIME type of |response| to one of the document categories handled
+// by this policy. The comparison ignores ASCII case; any type that is not
+// listed below is reported as ResponseMetaData::Others.
+ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType(
+    const WebURLResponse& response) {
+  // This is a thorough list of the mime types related to HTML, XML, JSON and
+  // plain text that were crawled over the top 50k sites.
+  static const char kTextHtml[] = "text/html";
+  static const char kTextPlain[] = "text/plain";
+  static const char kAppJson[] = "application/json";
+  static const char kTextJson[] = "text/json";
+  static const char kTextXjson[] = "text/x-json";
+  static const char kTextXml[] = "text/xml";
+  static const char kAppRssXml[] = "application/rss+xml";
+  static const char kAppXml[] = "application/xml";
+
+  const std::string mime_type = response.mimeType().utf8();
+
+  if (LowerCaseEqualsASCII(mime_type, kTextHtml))
+    return ResponseMetaData::HTML;
+
+  if (LowerCaseEqualsASCII(mime_type, kTextPlain))
+    return ResponseMetaData::Plain;
+
+  const bool is_json = LowerCaseEqualsASCII(mime_type, kAppJson) ||
+                       LowerCaseEqualsASCII(mime_type, kTextJson) ||
+                       LowerCaseEqualsASCII(mime_type, kTextXjson);
+  if (is_json)
+    return ResponseMetaData::JSON;
+
+  const bool is_xml = LowerCaseEqualsASCII(mime_type, kTextXml) ||
+                      LowerCaseEqualsASCII(mime_type, kAppRssXml) ||
+                      LowerCaseEqualsASCII(mime_type, kAppXml);
+  if (is_xml)
+    return ResponseMetaData::XML;
+
+  return ResponseMetaData::Others;
+}
+
+// Returns true when |url| uses the http or https scheme.
+bool SiteIsolationPolicy::IsBlockableScheme(const GURL& url) {
+  // ftp:// is deliberately left out: FTP provides no Content-Type header,
+  // which this policy depends on, so documents from FTP servers cannot be
+  // protected.
+  if (url.SchemeIs("http"))
+    return true;
+  return url.SchemeIs("https");
+}
+
+// Returns true when |frame_origin| and |response_url| use the same scheme and
+// belong to the same site. An invalid URL is never considered same-site.
+bool SiteIsolationPolicy::IsSameSite(const GURL& frame_origin,
+                                     const GURL& response_url) {
Charlie Reis 2013/08/13 21:09:03 Please add a check for whether either URL is inval
dsjang 2013/08/13 21:49:52 Done.
+  // An invalid URL cannot be attributed to any site.
+  if (!frame_origin.is_valid() || !response_url.is_valid())
+    return false;
+
+  if (frame_origin.scheme() != response_url.scheme())
+    return false;
+
+  // SameDomainOrHost() extracts the effective domains (public suffix plus one)
+  // from the two URLs and compares them.
+  // TODO(dsjang): use INCLUDE_PRIVATE_REGISTRIES when http://crbug.com/7988 is
+  // fixed.
+  return net::registry_controlled_domains::SameDomainOrHost(
+      frame_origin,
+      response_url,
+      net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
+}
+
+// Returns true while |frame| has a provisional data source, i.e. while a
+// navigation is in progress.
+bool SiteIsolationPolicy::IsFrameNavigating(WebKit::WebFrame* frame) {
+  // A navigation sets frame->provisionalDataSource() to a non-null value that
+  // stands for the request made for the navigation. Once the network request
+  // is committed to the frame, the provisional data source becomes null again
+  // and the committed one is available through frame->dataSource(). This is
+  // the most reliable way to tell whether the frame is navigating.
+  const bool has_provisional_data_source =
+      frame->provisionalDataSource() != NULL;
+  return has_provisional_data_source;
+}
+
+// WebKit's existing CORS implementation is not reused here because it decides
+// in terms of origins, whereas this policy decides in terms of sites. For
+// example, WebKit does not allow a frame from sub.a.com to access a document
+// from sub1.a.com, but under Site Isolation that access is allowed.
+//
+// Returns true when |access_control_origin| is "*" or names a URL on the same
+// site as |frame_origin|. |website_origin| is not consulted.
+bool SiteIsolationPolicy::IsValidCorsHeaderSet(
+    GURL& frame_origin,
+    GURL& website_origin,
+    std::string access_control_origin) {
+  // Many websites send back "\"*\"" instead of "*". This is a non-standard
+  // practice and is not supported by Chrome. Refer to
+  // CrossOriginAccessControl::passesAccessControlCheck().
+
+  // TODO(dsjang): * is not allowed for the response from a request with
+  // cookies. This allows more than what the renderer will eventually be able
+  // to receive, so we won't see illegal cross-site documents allowed by this.
+  // We have to find a way to see if this response is from a cookie-tagged
+  // request or not in the future.
+  const bool allows_any_origin = (access_control_origin == "*");
+  if (allows_any_origin)
+    return true;
+
+  // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
+  // "*", but many websites use just a domain for access_control_origin, and
+  // this is blocked by WebKit's CORS logic in
+  // CrossOriginAccessControl::passesAccessControlCheck(). A GURL created from
+  // a URL containing * in the domain part reports is_valid() as false.
+  return IsSameSite(frame_origin, GURL(access_control_origin));
+}
+
+// A slightly modified variant of |net::SniffForHTML|. Returns true when
+// |data| starts with one of the HTML signatures listed below, as decided by
+// MatchesSignature().
+bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {
+  // The content sniffers of Chrome and Firefox also treat "<!--" as an HTML
+  // signature, but that token appears in valid JavaScript which the browser
+  // considers well-formed. Since no JS should be blocked, it is left out here.
+  // This can weaken the document blocking policy, but breaks fewer websites.
+  // TODO(dsjang): parameterize |net::SniffForHTML| with an option that decides
+  // whether to include <!-- or not, so that we can remove this function.
+  const char* signatures[] = {
+    "<!DOCTYPE html",  // HTML5 spec
+    "<script",         // HTML5 spec, Mozilla
+    "<html",           // HTML5 spec, Mozilla
+    "<head",           // HTML5 spec, Mozilla
+    "<iframe",         // Mozilla
+    "<h1",             // Mozilla
+    "<div",            // Mozilla
+    "<font",           // Mozilla
+    "<table",          // Mozilla
+    "<a",              // Mozilla
+    "<style",          // Mozilla
+    "<title",          // Mozilla
+    "<b",              // Mozilla
+    "<body",           // Mozilla
+    "<br",             // Mozilla
+    "<p"               // Mozilla
+  };
+  const size_t signature_count = arraysize(signatures);
+  return MatchesSignature(data, length, signatures, signature_count);
+}
+
+// Returns true when |data| starts with the XML declaration signature, as
+// decided by MatchesSignature().
+bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {
+  // TODO(dsjang): Chrome's mime_sniffer uses strncasecmp() for this
+  // signature, although XML is case-sensitive. Should only documents starting
+  // with the exact string <?xml be blocked, rather than also <?XML ?
+  const char* signatures[] = {
+    "<?xml"  // Mozilla
+  };
+  const size_t signature_count = arraysize(signatures);
+  return MatchesSignature(data, length, signatures, signature_count);
+}
+
+// Returns true when |data| looks like the beginning of a JSON object: ignoring
+// spaces, tabs, CRs and LFs, a '{' followed by a quote character and later a
+// ':'.
+bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {
+  // TODO(dsjang): We have to come up with a better way to sniff JSON. However,
+  // even RE cannot help us that much due to the fact that we don't do full
+  // parsing. This DFA starts in kInit and looks for {, "/' and : in that
+  // order. We're avoiding adding a dependency on a regular expression library.
+  enum State {
+    kInit,       // Nothing significant seen yet.
+    kLeftBrace,  // Saw '{'.
+    kLeftQuote,  // Saw '{' followed by a quote character.
+    kColon,      // Saw ':' after the quote: accepted.
+    kDead        // Saw an unexpected character: rejected.
+  };
+
+  State state = kInit;
+  for (size_t pos = 0;
+       pos < length && state != kColon && state != kDead;
+       ++pos) {
+    const char ch = data[pos];
+    const bool is_whitespace =
+        ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
+    if (is_whitespace)
+      continue;
+
+    switch (state) {
+      case kInit:
+        state = (ch == '{') ? kLeftBrace : kDead;
+        break;
+      case kLeftBrace:
+        state = (ch == '\"' || ch == '\'') ? kLeftQuote : kDead;
+        break;
+      case kLeftQuote:
+        // Any other character is skipped while waiting for the colon.
+        if (ch == ':')
+          state = kColon;
+        break;
+      default:
+        // The terminal states end the loop before reaching this switch.
+        NOTREACHED();
+        break;
+    }
+  }
+  return state == kColon;
+}
+
+// Returns true when the content of |raw_data|, after skipping leading spaces,
+// CRs, LFs and tabs, starts with any of the |arr_size| strings in
+// |signatures|. The comparison is done with base::strncasecmp().
+bool SiteIsolationPolicy::MatchesSignature(const char* raw_data,
+                                           size_t raw_length,
+                                           const char* signatures[],
+                                           size_t arr_size) {
+  // Locate the first character that is not whitespace.
+  size_t offset = 0;
+  while (offset < raw_length) {
+    const char ch = raw_data[offset];
+    if (ch != ' ' && ch != '\r' && ch != '\n' && ch != '\t')
+      break;
+    ++offset;
+  }
+
+  // The document contains no non-whitespace character at all.
+  if (offset >= raw_length)
+    return false;
+
+  const char* content = raw_data + offset;
+  const size_t content_length = raw_length - offset;
+
+  for (size_t i = 0; i < arr_size; ++i) {
+    const char* candidate = signatures[i];
+    const size_t candidate_length = strlen(candidate);
+
+    // A signature longer than the remaining content can never match.
+    if (content_length >= candidate_length &&
+        !base::strncasecmp(candidate, content, candidate_length)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true when |status_code| is one of the HTTP status codes listed
+// below.
+bool SiteIsolationPolicy::IsRenderableStatusCodeForDocument(int status_code) {
+  // Chrome only uses the content of a response with one of these status codes
+  // for CSS/JavaScript. For images, Chrome just ignores the status code.
+  switch (status_code) {
+    case 200:
+    case 201:
+    case 202:
+    case 203:
+    case 206:
+    case 300:
+    case 301:
+    case 302:
+    case 303:
+    case 305:
+    case 306:
+    case 307:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true when the first |length| bytes of |data| contain the string
+// "var " at any position.
+bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {
+  // TODO(dsjang): This is a real hack. The only purpose of this function is to
+  // try to see if there's any possibility that this data can be JavaScript
+  // (superset of JS). This function will be removed once UMA stats are
+  // gathered.
+  static const char kVarKeyword[] = "var ";
+  const size_t kVarKeywordLength = sizeof(kVarKeyword) - 1;
+
+  // |length| is unsigned, so bail out before subtracting: for inputs shorter
+  // than the keyword the difference would wrap around to a huge value and the
+  // loop below would read far past the end of |data|.
+  if (length < kVarKeywordLength)
+    return false;
+
+  // Search for "var " for JS detection. The last start position that still
+  // leaves room for the whole keyword is |length - kVarKeywordLength|.
+  const size_t last_start = length - kVarKeywordLength;
+  for (size_t i = 0; i <= last_start; ++i) {
+    if (strncmp(data + i, kVarKeyword, kVarKeywordLength) == 0)
+      return true;
+  }
+  return false;
+}
+
+// Returns the map from request identifier to target type. The map is declared
+// through CR_DEFINE_STATIC_LOCAL, so every call presumably yields the same
+// instance.
+TargetTypeMap* SiteIsolationPolicy::GetIdTargetMap() {
+  CR_DEFINE_STATIC_LOCAL(TargetTypeMap, target_types, ());
+  return &target_types;
+}
+
+// Returns the map from response URL to the metadata stored for that response.
+// The map is declared through CR_DEFINE_STATIC_LOCAL, so every call presumably
+// yields the same instance.
+UrlResponseMetaDataMap* SiteIsolationPolicy::GetUrlResponseMetaDataMap() {
+  CR_DEFINE_STATIC_LOCAL(UrlResponseMetaDataMap, metadata_by_url, ());
+  return &metadata_by_url;
+}
+
+// Returns the map from request identifier to response URL. The map is declared
+// through CR_DEFINE_STATIC_LOCAL, so every call presumably yields the same
+// instance.
+IdUrlMap* SiteIsolationPolicy::GetIdUrlMap() {
+  CR_DEFINE_STATIC_LOCAL(IdUrlMap, urls_by_id, ());
+  return &urls_by_id;
+}
+
+} // namespace webkit_glue

Powered by Google App Engine
This is Rietveld 408576698