webkit/child/site_isolation_policy.cc - Issue 22254005: UMA data collector for cross-site documents(XSD)

Side by Side Diff: webkit/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr

Patch Set: switched to using UMA_HISTOGRAM_ENUMERATION from COUNTS Created 7 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "webkit/child/site_isolation_policy.h"

	6

	7 #include "base/basictypes.h"

	8 #include "base/logging.h"

	9 #include "base/metrics/histogram.h"

	10 #include "base/strings/string_util.h"

	11 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"

	12 #include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"

	13 #include "third_party/WebKit/public/platform/WebString.h"

	14 #include "third_party/WebKit/public/platform/WebURL.h"

	15 #include "third_party/WebKit/public/platform/WebURLRequest.h"

	16 #include "third_party/WebKit/public/platform/WebURLResponse.h"

	17 #include "third_party/WebKit/public/web/WebDocument.h"

	18 #include "third_party/WebKit/public/web/WebFrame.h"

	19 #include "third_party/WebKit/public/web/WebFrameClient.h"

	20 #include "third_party/WebKit/public/web/WebSecurityOrigin.h"

	21

	22 using base::strncasecmp;

	23 using WebKit::WebURLResponse;

	24 using WebKit::WebURLRequest;

	25 using WebKit::WebURL;

	26 using WebKit::WebString;

	27 using WebKit::WebDocument;
	Charlie Reis 2013/08/07 21:02:02 These should be alphabetized. dsjang 2013/08/08 21:21:01 On 2013/08/07 21:02:02, creis wrote: > These should be alphabetized. Done.
	28

	29 namespace webkit_glue {

	30

	31 std::map<unsigned, WebURLRequest::TargetType>

	32 SiteIsolationPolicy::id_target_map_;

	33 std::map<std::string, ResponseMetaData>

	34 SiteIsolationPolicy::url_responsedata_map_;

	35 std::map<unsigned, std::string> SiteIsolationPolicy::id_url_map_;

	36

	37 void SiteIsolationPolicy::WillSendRequest(

	38 unsigned identifier,

	39 WebURLRequest::TargetType target_type) {

	40 // This happens when the original request is redirected.

	41 if (id_target_map_.count(identifier) != 0) {

	42 // This check actually can fail. If it is, which target_type do we

	43 // have to record between the old one and the new one? When

	44 // redirection happens, target_type becomes 2. TODO(dsjang):

	45 // let's disable this code and see what happens on onclickads.com

	46 // for googleads JavaScript code assigned to an image. To disable

	47 // this, we need a guarntee that target_type is always erased at

	48 // the end of a transaction.

	49 if (id_target_map_[identifier] != target_type) {

	50 id_target_map_[identifier] = target_type;
	Charlie Reis 2013/08/07 21:02:02 I can't understand this comment or code. It looks I can't understand this comment or code. It looks like you're going to set id_target_map_[identifier] to target_type regardless below, so why bother with this? dsjang 2013/08/08 21:21:01 Done. Show quoted text On 2013/08/07 21:02:02, creis wrote: > I can't understand this comment or code. It looks like you're going to set > id_target_map_[identifier] to target_type regardless below, so why bother with > this? Done.
	51 }

	52 }

	53 id_target_map_[identifier] = target_type;

	54 }

	55

	56 void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame,

	57 unsigned identifier,

	58 const WebURLResponse& response) {

	59 DCHECK(id_target_map_.count(identifier) == 1);

	60

	61 UMA_HISTOGRAM_COUNTS("XSDP.ALL", 1);

	62

	63 GURL response_url = response.url();

	64 WebURLRequest::TargetType target_type = id_target_map_[identifier];

	65 id_target_map_.erase(identifier);

	66

	67 // See if this is for navigation. If it is, let it pass.

	68 if (IsFrameNotCommitted(frame)) {

	69 LOG(INFO) << "SiteIsolationPolicy.FrameNotCommitted";

	70 return;

	71 }

	72

	73 GURL frame_origin(frame->document().securityOrigin().toString().utf8());

	74

	75 // TODO(dsjang): Find out all network related schemes here.

	76 if (!IsNetworkScheme(frame_origin)) {

	77 LOG(INFO) << "SiteIsolationPolicy.NotNetworkScheme:" << frame_origin;

	78 return;

	79 }

	80

	81 if (IsSameSite(frame_origin, response_url)) {

	82 LOG(INFO) << "SiteIsolationPolicy.SameSite:" << frame_origin << ","

	83 << response_url;

	84 return;

	85 }

	86

	87 ResponseMetaData::CanonicalMimeType canonical_mime_type =

	88 GetCanonicalMimeType(response);

	89

	90 if (canonical_mime_type == ResponseMetaData::IsOthers) {

	91 LOG(INFO) << "SiteIsolationPolicy.mimetype:" << frame_origin << ","

	92 << response_url << ",[" << response.mimeType().utf8() << "]";

	93 return;

	94 }

	95

	96 // There was a possiblity that a CORS request preceded by a

	97 // pre-flight request does not have "Access-Control-Allow-Origin"

	98 // header. But it turns out that every CORS request should have the

	99 // header no matter what CORS request it is. Therefore, if this is a

	100 // CORS request, it has this header.

	101 std::string access_control_origin = response

	102 .httpHeaderField(

	103 WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8();

	104

	105 if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) {

	106 LOG(INFO) << "SiteIsolationPolicy.CorsIsSafe:";

	107 return;

	108 }

	109

	110 // Real XSD data collection starts from here.

	111 LOG(INFO) << "SiteIsolationPolicy.XSD!!!:" << canonical_mime_type <<

	112 ":" << response_url;

	113

	114 // TODO(dsjang): Apply X-Content-Type option here.

	115 ResponseMetaData resp_data;

	116 resp_data.frame_origin = frame_origin.spec();

	117 resp_data.response_url = response_url.spec();

	118 resp_data.identifier = identifier;

	119 resp_data.target_type = target_type;

	120 resp_data.canonical_mime_type = canonical_mime_type;

	121 resp_data.http_status_code = response.httpStatusCode();

	122

	123 url_responsedata_map_[resp_data.response_url] = resp_data;

	124 id_url_map_[identifier] = resp_data.response_url;

	125

	126 return;

	127 }

	128

	129 void SiteIsolationPolicy::DidReceiveData(const char* data,

	130 int length,

	131 WebURL& web_response_url) {
	Charlie Reis 2013/08/09 00:39:03 Add a comment to the .h file that there's a risk t Add a comment to the .h file that there's a risk that multiple requests can be made for the same URL, but we're ignoring it because the response will almost always be the same.
	132 // We only record XSDs whose content is actually non-zero.

	133 GURL response_url(web_response_url);

	134

	135 std::string response_url_str = response_url.spec();

	136 if (url_responsedata_map_.count(response_url_str) == 0)

	137 return;

	138

	139 DCHECK(url_responsedata_map_.count(response_url_str) == 1);

	140 ResponseMetaData resp_data = url_responsedata_map_[response_url_str];

	141 url_responsedata_map_.erase(response_url_str);

	142

	143 // Record the length of the first received network packet to see if

	144 // it's enough for sniffing.

	145 UMA_HISTOGRAM_COUNTS("XSDP.XSD.DataLength", length);
	Charlie Reis 2013/08/07 21:02:02 Why would we need to collect this? Why would we need to collect this? dsjang 2013/08/08 21:21:01 I wanted to see if that's not the case that most o I wanted to see if that's not the case that most of the cross-site documents are passed due to the short length of the first network packet. This might not be useful. On 2013/08/07 21:02:02, creis wrote: Show quoted text > Why would we need to collect this?
	146

	147 // Record the entire number of responses with a specific mime

	148 // type(text/html, text/xml, etc).

	149 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.MimeType",

	150 resp_data.canonical_mime_type,

	151 ResponseMetaData::IsOthers + 1);

	152

	153 // TODO(dsjang): sometimes the length of payload can be not enough to do

	154 // correct content sniffing. If that happens, put it into a buffer

	155 // so that we can do it later.

	156 bool verified_for_blocking = false;

	157 ResponseMetaData::CanonicalMimeType sniffed_type =

	158 ResponseMetaData::IsOthers;

	159

	160 switch (resp_data.canonical_mime_type) {

	161 // Record the number of responses whose content is sniffed for

	162 // what its mime type claims it to be. For example, we apply a

	163 // HTML sniffer for a document tagged with text/html here, and

	164 // increments the count of "XSDP.XSD.HTML.Verified".

	165 case ResponseMetaData::IsHTML:

	166 if (SniffForHTML(data, length)) {

	167 UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.HTML.Verified", 1);

	168 verified_for_blocking = true;

	169 }

	170 break;

	171 case ResponseMetaData::IsXML:

	172 if (SniffForXML(data, length)) {

	173 UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.XML.Verified", 1);

	174 verified_for_blocking = true;

	175 }

	176 break;

	177 case ResponseMetaData::IsJSON:

	178 if (SniffForJSON(data, length)) {

	179 UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.JSON.Verified", 1);

	180 verified_for_blocking = true;

	181 }

	182 break;

	183 case ResponseMetaData::IsPlain:

	184 if (SniffForHTML(data, length)) {

	185 sniffed_type = ResponseMetaData::IsHTML;

	186 verified_for_blocking = true;

	187 } else if (SniffForXML(data, length)) {

	188 sniffed_type = ResponseMetaData::IsXML;

	189 verified_for_blocking = true;

	190 } else if (SniffForJSON(data, length)) {

	191 sniffed_type = ResponseMetaData::IsJSON;

	192 verified_for_blocking = true;

	193 }

	194 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.MimeType.Plain.Verified",

	195 sniffed_type,

	196 ResponseMetaData::IsJSON + 1);

	197 break;

	198 case ResponseMetaData::IsOthers:

	199 DCHECK(false);

	200 break;

	201 }

	202

	203 // We block these. See how many of them have unaffected status code.

	204 if (verified_for_blocking) {

	205 if (IsErrorStatusCode(resp_data.http_status_code)) {

	206 // This is a blocking that does not affect the browser behavior

	207 // by the following reasons : 1) this is not a binary object

	208 // (such as an image) since this is sniffed as a text

	209 // document. 2) then, this blocking only breaks the renderer

	210 // behavior only if it is either JavaScript or CSS. However, the

	211 // renderer doesn't use the contents of JS/CSS with unaffected

	212 // status code(e.g, 404). *) the renderer is expected not to use

	213 // the cross-site document content for purposes other than

	214 // JS/CSS (e.g, XHR).

	215 UMA_HISTOGRAM_COUNTS("XSDP.XSD.Blocked.ErrorStatusCode", 1);

	216 } else {

	217 // This is the case that a blocked response is with a non-error

	218 // status code, so this blocking can be actually disruptive.

	219 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.Blocked.NormalStatusCode",

	220 resp_data.target_type, WebURLRequest::TargetIsUnspecified + 1);

	221 }

	222 } else {

	223 LOG(INFO) << "Not Blocked:sniffing failed:";

	224 // Not blocked, but How many of them can be JS? This is only

	225 // useful for studying non-blocked documents.

	226 if (SniffForJS(data, length)) {

	227 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.NotBlocked.MaybeJS",

	228 resp_data.target_type,

	229 WebURLRequest::TargetIsUnspecified + 1);

	230 }

	231 }

	232 }

	233

	234 void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) {

	235 id_target_map_.erase(identifier);

	236 if (id_url_map_.count(identifier) > 0) {

	237 url_responsedata_map_.erase(id_url_map_[identifier]);

	238 id_url_map_.erase(identifier);

	239 }

	240 }

	241

	242 void SiteIsolationPolicy::DidFinishResourceLoadForUrl(

	243 const WebKit::WebURL& web_response_url) {

	244 GURL response_url(web_response_url);

	245

	246 if (url_responsedata_map_.count(response_url.spec()) > 0) {

	247 ResponseMetaData meta_data = url_responsedata_map_[response_url.spec()];

	248 url_responsedata_map_.erase(response_url.spec());

	249 id_target_map_.erase(meta_data.identifier);

	250 id_url_map_.erase(meta_data.identifier);

	251 }

	252 }

	253

	254 ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType(

	255 const WebURLResponse& response) {

	256 static const char TEXT_HTML[] = "text/html";

	257 static const char TEXT_XML[] = "text/xml";

	258 static const char APP_RSS_XML[] = "application/rss+xml";

	259 static const char APP_XML[] = "application/xml";

	260 static const char APP_JSON[] = "application/json";

	261 static const char TEXT_XJSON[] = "text/x-json";

	262 static const char TEXT_JSON[] = "text/json";

	263 static const char TEXT_PLAIN[] = "text/json";

	264

	265 const std::string mime_type = response.mimeType().utf8();

	266

	267 LOG(ERROR) << "mimetype:" << mime_type << "==[" << TEXT_HTML << "]";

	268

	269 // These are a thorough list of the mime types crawled over the top

	270 // 50k sites related to HTML, XML, JSON, Plain.

	271 if (LowerCaseEqualsASCII(mime_type, TEXT_HTML)) {

	272 return ResponseMetaData::IsHTML;

	273 } else if (LowerCaseEqualsASCII(mime_type, TEXT_XML) \|\|

	274 LowerCaseEqualsASCII(mime_type, APP_RSS_XML) \|\|

	275 LowerCaseEqualsASCII(mime_type, APP_XML)) {

	276 return ResponseMetaData::IsXML;

	277 } else if (LowerCaseEqualsASCII(mime_type, APP_JSON) \|\|

	278 LowerCaseEqualsASCII(mime_type, TEXT_XJSON) \|\|

	279 LowerCaseEqualsASCII(mime_type, TEXT_JSON)) {

	280 return ResponseMetaData::IsJSON;

	281 } else if (LowerCaseEqualsASCII(mime_type, TEXT_PLAIN)) {

	282 return ResponseMetaData::IsPlain;

	283 } else {

	284 return ResponseMetaData::IsOthers;

	285 }

	286 }

	287

	288 bool SiteIsolationPolicy::IsNetworkScheme(GURL& url) {

	289 // We exclude ftp:// from here. FTP doesn't provide a Content-Type

	290 // header which our policy depends on, so we cannot protect any

	291 // document from FTP servers.

	292 return url.SchemeIs("http") \|\| url.SchemeIs("https");

	293 }

	294

	295 bool SiteIsolationPolicy::IsSameSite(GURL& frame_origin, GURL& response_url) {

	296 if (frame_origin.scheme() != response_url.scheme())

	297 return false;

	298

	299 // Extract the effective domains (public suffix plus one) of the

	300 // urls.

	301

	302 // TODO(dsjang): Is there any reason why we don't use

	303 // net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES

	304 // instead of

	305 // net::registry_controlled_domains::EXCLUSE_PRIVATE_REGISTRIES? If

	306 // we allow sites to use their private registries, they can use

	307 // "finer grained" sites than only using public ones.

	308 std::string frame_domain =

	309 net::registry_controlled_domains::GetDomainAndRegistry(

	310 frame_origin,

	311 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);

	312 std::string response_domain =

	313 net::registry_controlled_domains::GetDomainAndRegistry(

	314 response_url,

	315 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);

	316

	317 return frame_domain == response_domain;

	318 }

	319

	320 bool SiteIsolationPolicy::IsFrameNotCommitted(WebKit::WebFrame* frame) {

	321 // When a navigation starts, frame->provisionalDataSource() is set

	322 // to a not-null value which stands for the request made for the

	323 // navigation. As soon as the network request is committed to the

	324 // frame, frame->provisionalDataSource() is converted to null, and

	325 // the committed data source is moved to frame->dataSource(). This

	326 // is the most reliable way to detect whether the frame is in

	327 // navigation or not by far.

	328 return frame->provisionalDataSource() != NULL;

	329 }

	330

	331 bool SiteIsolationPolicy::IsValidCorsHeaderSet(

	332 GURL& frame_origin,

	333 GURL& website_origin,

	334 std::string access_control_origin) {

	335

	336 size_t access_control_origin_len = access_control_origin.size();

	337

	338 // TODO(dsjang): Is this actually true? The server seems to return

	339 // an empty string or "null".

	340 if (access_control_origin_len == 0)

	341 return false;

	342

	343 // Many websites are sending back "\"\"" instead of "". This is

	344 // non-standard practice, and seems not supported by the

	345 // brwoser. Refer to

	346 // CrossOriginAccessControl::passesAccessControlCheck().

	347

	348 // TODO(dsjang): * is not allowed for the response from a request

	349 // with cookies. This allows for more than what the renderer will

	350 // eventually be able to receive, so we won't see illegal cross-site

	351 // documents alllowed by this. We have to have t a way to see if

	352 // this response is from a cookie-tagged request or not in the

	353 // future.

	354 if (access_control_origin == "*")

	355 return true;

	356

	357 // TODO(dsjang): The CORS spec only treats a fully specified URL,

	358 // except for "*", but many websites are using just a domain for

	359 // access_control_origin, and this is blocked by Webkit's CORS logic

	360 // here : CrossOriginAccessControl::passesAccessControlCheck()

	361

	362 // We don't use Webkit's existing CORS policy implementation since

	363 // their policy works in terms of origins, not sites. For

	364 // example, when frame is sub.a.com and it is not allowed to access

	365 // a document with sub1.a.com. But under Site Isolation, it's

	366 // allowed.

	367

	368 // TODO(dsjang): examine createFromString()'s behavior for a URL

	369 // containing * in it.

	370 WebKit::WebSecurityOrigin cors_security_origin =

	371 WebKit::WebSecurityOrigin::createFromString(

	372 WebKit::WebString::fromUTF8(access_control_origin));

	373 GURL cors_origin(cors_security_origin.toString().utf8());

	374

	375 LOG(ERROR) << cors_security_origin.toString().utf8();

	376 return IsSameSite(frame_origin, cors_origin);

	377 }

	378

	379 bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {

	380 // TODO(dsjang): The content sniffer used by Chrome and Firefox are

	381 // using "<!--" as one of the HTML signatures, but it also appears

	382 // in valid JavaScript, considered as well-formed JS by the browser.

	383 // Since we do not want to block any JS, we exclude it from our HTML

	384 // signatures. This can weaken our document block policy, but we can

	385 // break less websites.

	386 const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec

	387 "<script", // HTML5 spec, Mozilla

	388 "<html", // HTML5 spec, Mozilla

	389 "<head", // HTML5 spec, Mozilla

	390 "<iframe", // Mozilla

	391 "<h1", // Mozilla

	392 "<div", // Mozilla

	393 "<font", // Mozilla

	394 "<table", // Mozilla

	395 "<a", // Mozilla

	396 "<style", // Mozilla

	397 "<title", // Mozilla

	398 "<b", // Mozilla

	399 "<body", // Mozilla

	400 "<br", "<p" // Mozilla

	401 };

	402 return DoSignatureMatching(

	403 data, length, html_signatures, arraysize(html_signatures));

	404 }

	405

	406 bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {

	407 const char* xml_signatures[] = {"<?xml" // Mozilla

	408 };

	409 return DoSignatureMatching(

	410 data, length, xml_signatures, arraysize(xml_signatures));

	411 }

	412

	413 bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {

	414 // TODO(dsjang): We have to come up with a better way to sniff

	415 // JSON. However, even RE cannot help us that much due to the fact

	416 // that we don't do full parsing. This DFA starts with state 0, and

	417 // finds 1) {, 2) "or', 3) : in the order. This is intentionally not

	418 // using a regular expression library so that we can make the

	419 // trusted code base as small as possible. State 4 is a dead state.

	420 const int INIT_ST = 0;

	421 const int LBRACE_ST = 1;

	422 const int LQUOTE_ST = 2;

	423 const int COLON_ST = 3;

	424 const int DEAD_ST = 4;

	425

	426 int state = INIT_ST;

	427 for (size_t i = 0; i < length && state < COLON_ST; ++i, ++data) {

	428 const char c = *data;

	429 if (c == ' ' \|\| c == '\t' \|\| c == '\r' \|\| c == '\n')

	430 continue;

	431

	432 switch (state) {

	433 case INIT_ST:

	434 if (c == '{')

	435 state = LBRACE_ST;

	436 else

	437 state = DEAD_ST;

	438 break;

	439 case LBRACE_ST:

	440 if (c == '\"' \|\| c == '\'')

	441 state = LQUOTE_ST;

	442 else

	443 state = DEAD_ST;

	444 break;

	445 case LQUOTE_ST:

	446 if (c == ':') {

	447 state = COLON_ST;

	448 }

	449 break;

	450 default:

	451 break;

	452 }

	453 }

	454 return state == COLON_ST;

	455 }

	456

	457 bool SiteIsolationPolicy::DoSignatureMatching(const char* data,

	458 size_t length,

	459 const char* signatures[],

	460 size_t arr_size) {

	461 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {

	462 const char* signature = signatures[sig_index];

	463 size_t signature_length = strlen(signature);

	464 size_t i = 0;

	465 // Skip the white characters at the beginning of the document.

	466 for (i = 0; i < length; ++i) {

	467 char c = *data;

	468 if (!(c == ' ' \|\| c == '\r' \|\| c == '\n' \|\| c == '\t')) {

	469 break;

	470 }

	471 ++data;

	472 }

	473 length = length - i;

	474 if (length < signature_length)

	475 continue;

	476 if (!base::strncasecmp(signature, data, signature_length)) {

	477 return true;

	478 }

	479 }

	480 return false;

	481 }

	482

	483 bool SiteIsolationPolicy::IsErrorStatusCode(int status_code) {

	484 // Chrome only uses the content of a response with one of these

	485 // status codes for CSS/JavaScript. For images, Chrome just ignores

	486 // status code.

	487 const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,

	488 303, 305, 306, 307};

	489 for (size_t i = 0; i < arraysize(renderable_status_code); ++i) {

	490 if (renderable_status_code[i] == status_code)

	491 return false;

	492 }

	493 return true;

	494 }

	495

	496 bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {

	497 // TODO(dsjang): This is a real hacking. The only purpose of this

	498 // function is to try to see if there's any possibility that this

	499 // data can be JavaScript.(superset of JS). This function will be

	500 // removed for the production code.

	501

	502 // Search for "var " for JS detection. :-)

	503 for (size_t i = 0; i < length - 3; ++i) {

	504 if (strncmp(data, "var ", 4) == 0) {

	505 return true;

	506 }

	507 ++data;

	508 }

	509 return false;

	510 }

	511

	512 } // namespace webkit_glue

OLD	NEW

« webkit/child/site_isolation_policy.h ('K') | « webkit/child/site_isolation_policy.h ('k') | webkit/child/site_isolation_policy_unittest.cc » ('j') | no next file with comments »