Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(319)

Side by Side Diff: webkit/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr
Patch Set: Revise comments and use LowerCaseEqualsASCII for the "nosniff" value Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "webkit/child/site_isolation_policy.h"
6
7 #include "base/basictypes.h"
8 #include "base/logging.h"
9 #include "base/metrics/histogram.h"
10 #include "base/strings/string_util.h"
11 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
12 #include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
13 #include "third_party/WebKit/public/platform/WebString.h"
14 #include "third_party/WebKit/public/platform/WebURL.h"
15 #include "third_party/WebKit/public/platform/WebURLRequest.h"
16 #include "third_party/WebKit/public/platform/WebURLResponse.h"
17 #include "third_party/WebKit/public/web/WebDocument.h"
18 #include "third_party/WebKit/public/web/WebFrame.h"
19 #include "third_party/WebKit/public/web/WebFrameClient.h"
20 #include "third_party/WebKit/public/web/WebSecurityOrigin.h"
21
22 using WebKit::WebDocument;
23 using WebKit::WebString;
24 using WebKit::WebURL;
25 using WebKit::WebURLResponse;
26 using WebKit::WebURLRequest;
27
28
29 namespace webkit_glue {
30
// Out-of-line default constructor. The body is empty and there is no
// initializer list, so no member is explicitly initialized here.
31 ResponseMetaData::ResponseMetaData() {}
32
33 void SiteIsolationPolicy::WillSendRequest(
34 unsigned identifier,
35 WebURLRequest::TargetType target_type) {
36 TargetTypeMap* id_target_map = GetIdTargetMap();
37 // When |identifier| already exists in the map, it means that this request has
38 // been redirected to issue another request. We don't overwrite the existing
39 // target_type since it becomes TargetIsSubresource no matter what the
40 // original target_type was.
41 if (!id_target_map->count(identifier))
42 (*id_target_map)[identifier] = target_type;
43 }
44
45 void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame,
46 unsigned identifier,
47 const WebURLResponse& response) {
48 TargetTypeMap* id_target_map = GetIdTargetMap();
49 DCHECK_EQ(id_target_map->count(identifier),1U);
50
51 UMA_HISTOGRAM_COUNTS("SiteIsolation.AllResponses", 1);
52
53 GURL response_url = response.url();
54 WebURLRequest::TargetType target_type = (*id_target_map)[identifier];
55 id_target_map->erase(identifier);
56
57 // See if this is for navigation. If it is, don't block it, under the
58 // assumption that we will put it in an appropriate process.
59 if (IsFrameNavigating(frame)) {
60 return;
61 }
62
63 GURL frame_origin(frame->document().securityOrigin().toString());
64
65 if (!IsBlockableScheme(frame_origin)) {
66 return;
67 }
68
69 if (IsSameSite(frame_origin, response_url)) {
70 return;
71 }
72
73 ResponseMetaData::CanonicalMimeType canonical_mime_type =
74 GetCanonicalMimeType(response);
75
76 if (canonical_mime_type == ResponseMetaData::Others) {
77 return;
78 }
79
80 // Every CORS request should have the Access-Control-Allow-Origin header even
81 // if it is preceded by a pre-flight request. Therefore, if this is a CORS
82 // request, it has this header. response.httpHeaderField() internally uses
83 // case-insensitive matching for the header name.
84 std::string access_control_origin = response.httpHeaderField(
85 WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8();
86
87 if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) {
88 return;
89 }
90
91 // Real XSD data collection starts from here.
92 const std::string no_sniff =
93 response.httpHeaderField(
94 WebKit::WebString::fromUTF8("X-Content-Type-Options")).utf8();
95
96 ResponseMetaData resp_data;
97 resp_data.frame_origin = frame_origin.spec();
98 resp_data.response_url = response_url;
99 resp_data.request_identifier = identifier;
100 resp_data.target_type = target_type;
101 resp_data.canonical_mime_type = canonical_mime_type;
102 resp_data.http_status_code = response.httpStatusCode();
103 // TODO(dsjang): Lowercase comparison can be dangerous for unicode. Confirm
104 // that this is memory safe.
105 resp_data.no_sniff = LowerCaseEqualsASCII(no_sniff, "nosniff");
106
107 UrlResponseMetaDataMap* url_responsedata_map = GetUrlResponseMetaDataMap();
108 IdUrlMap* id_url_map = GetIdUrlMap();
109
110 (*url_responsedata_map)[resp_data.response_url] = resp_data;
111 (*id_url_map)[identifier] = resp_data.response_url;
112 }
113
// Helper macros for recording the XSD UMA statistics in DidReceiveData().
//
// The same recording pattern is repeated for every MIME type, but it cannot
// be factored into a helper function: the UMA_HISTOGRAM_* macros do not accept
// variables as their bucket names, so each call site needs a constant string
// (this was also the answer given in review to "can this be done without
// macros?"). Macros capture the repeated pattern instead and keep the call
// sites small.
//
// Every macro expects the following locals to be in scope at the point of
// use: |resp_data| (ResponseMetaData), |renderable_status_code| (bool) and,
// for the NOTBLOCK variants, |is_sniffed_for_js| (bool).
//
// Each macro is wrapped in do { ... } while (0) so that it expands to exactly
// one statement and can safely be followed by ';' in any context, including
// an unbraced if/else.
//
// TODO(dsjang): this is only needed for collecting UMA stats. Will be deleted
// when this class is used for actual blocking.

// Counts a response that would be blocked because its content sniffed as the
// blockable type.
#define SITE_ISOLATION_POLICY_COUNT_BLOCK(BUCKET_PREFIX) \
  do { \
    UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".Blocked", 1); \
    if (renderable_status_code) { \
      UMA_HISTOGRAM_ENUMERATION( \
          BUCKET_PREFIX ".Blocked.RenderableStatusCode", \
          resp_data.target_type, \
          WebURLRequest::TargetIsUnspecified + 1); \
    } else { \
      UMA_HISTOGRAM_COUNTS( \
          BUCKET_PREFIX ".Blocked.NonRenderableStatusCode", 1); \
    } \
  } while (0)

// Counts a response that did not sniff as blockable but carries the
// "nosniff" option.
#define SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(BUCKET_PREFIX) \
  do { \
    UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NoSniffBlocked", 1); \
    if (renderable_status_code) { \
      UMA_HISTOGRAM_ENUMERATION( \
          BUCKET_PREFIX ".NoSniffBlocked.RenderableStatusCode", \
          resp_data.target_type, \
          WebURLRequest::TargetIsUnspecified + 1); \
    } else { \
      UMA_HISTOGRAM_ENUMERATION( \
          BUCKET_PREFIX ".NoSniffBlocked.NonRenderableStatusCode", \
          resp_data.target_type, \
          WebURLRequest::TargetIsUnspecified + 1); \
    } \
  } while (0)

// Counts a response that is let through, and separately those that look like
// they might be JavaScript.
#define SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(BUCKET_PREFIX) \
  do { \
    UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked", 1); \
    if (is_sniffed_for_js) { \
      UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked.MaybeJS", 1); \
    } \
  } while (0)

// Evaluates SNIFF_EXPR and records the outcome under BUCKET_PREFIX using the
// three macros above.
#define SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SNIFF_EXPR, BUCKET_PREFIX) \
  do { \
    if (SNIFF_EXPR) { \
      SITE_ISOLATION_POLICY_COUNT_BLOCK(BUCKET_PREFIX); \
    } else if (resp_data.no_sniff) { \
      SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(BUCKET_PREFIX); \
    } else { \
      SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(BUCKET_PREFIX); \
    } \
  } while (0)
162
163 void SiteIsolationPolicy::DidReceiveData(const char* data,
164 int length,
165 WebURL& web_response_url) {
166 GURL response_url(web_response_url);
167
168 UrlResponseMetaDataMap* url_responsedata_map = GetUrlResponseMetaDataMap();
169
170 if (url_responsedata_map->count(response_url) == 0)
171 return;
172
173 DCHECK_EQ(url_responsedata_map->count(response_url), 1U);
174 ResponseMetaData resp_data = (*url_responsedata_map)[response_url];
175 url_responsedata_map->erase(response_url);
176
177 // Record the length of the first received network packet to see if it's
178 // enough for sniffing.
179 UMA_HISTOGRAM_COUNTS("SiteIsolation.XSD.DataLength", length);
180
181 // Record the number of cross-site document responses with a specific mime
182 // type (text/html, text/xml, etc).
183 UMA_HISTOGRAM_ENUMERATION("SiteIsolation.XSD.MimeType",
184 resp_data.canonical_mime_type,
185 ResponseMetaData::MaxCanonicalMimeType);
186
187 // The content is blocked if it is sniffed for HTML/JSON/XML. When the blocked
188 // response is with an error status code, it is not disruptive by the
189 // following reasons : 1) the blocked content is not a binary object (such as
190 // an image) since it is sniffed for text; 2) then, this blocking only breaks
191 // the renderer behavior only if it is either JavaScript or CSS. However, the
192 // renderer doesn't use the contents of JS/CSS with unaffected status code
193 // (e.g, 404). 3) the renderer is expected not to use the cross-site document
194 // content for purposes other than JS/CSS (e.g, XHR).
195 bool renderable_status_code = IsRenderableStatusCodeForDocument(
196 resp_data.http_status_code);
197
198 // This is only used for false-negative analysis for non-blocked resources.
199 bool is_sniffed_for_js = SniffForJS(data, length);
200
201 // Record the number of responses whose content is sniffed for what its mime
202 // type claims it to be. For example, we apply a HTML sniffer for a document
203 // tagged with text/html here. Whenever this check becomes true, we'll block
204 // the response.
205 switch (resp_data.canonical_mime_type) {
206 case ResponseMetaData::HTML:
207 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForHTML(data, length),
208 "SiteIsolation.XSD.HTML");
209 break;
210 case ResponseMetaData::XML:
211 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForXML(data, length),
212 "SiteIsolation.XSD.XML");
213 break;
214 case ResponseMetaData::JSON:
215 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForJSON(data, length),
216 "SiteIsolation.XSD.JSON");
217 break;
218 case ResponseMetaData::Plain:
219 if (SniffForHTML(data, length)) {
220 SITE_ISOLATION_POLICY_COUNT_BLOCK(
221 "SiteIsolation.XSD.Plain.HTML");
222 } else if (SniffForXML(data, length)) {
223 SITE_ISOLATION_POLICY_COUNT_BLOCK(
224 "SiteIsolation.XSD.Plain.XML");
225 } else if (SniffForJSON(data, length)) {
226 SITE_ISOLATION_POLICY_COUNT_BLOCK(
227 "SiteIsolation.XSD.Plain.JSON");
228 } else if (is_sniffed_for_js) {
229 if (resp_data.no_sniff) {
230 SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(
231 "SiteIsolation.XSD.Plain");
232 } else {
233 SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(
234 "SiteIsolation.XSD.Plain");
235 }
236 }
237 break;
238 default :
239 NOTREACHED() <<
240 "Not a blockable mime type. This mime type shouldn't reach here.";
241 break;
242 }
243 }
244
// The counting macros are only meant for DidReceiveData(); undefine every one
// of them (including the NO_SNIFF variant) so none leaks into the rest of the
// file.
#undef SITE_ISOLATION_POLICY_SNIFF_AND_COUNT
#undef SITE_ISOLATION_POLICY_COUNT_NOTBLOCK
#undef SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK
#undef SITE_ISOLATION_POLICY_COUNT_BLOCK
248
249
250 void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) {
251 TargetTypeMap* id_target_map = GetIdTargetMap();
252 UrlResponseMetaDataMap* url_responsedata_map = GetUrlResponseMetaDataMap();
253 IdUrlMap* id_url_map = GetIdUrlMap();
254
255 id_target_map->erase(identifier);
256 if (!id_url_map->count(identifier)) {
257 url_responsedata_map->erase((*id_url_map)[identifier]);
258 id_url_map->erase(identifier);
259 }
260 }
261
262 void SiteIsolationPolicy::DidFinishResourceLoadForUrl(
263 const WebKit::WebURL& web_response_url) {
264 GURL response_url(web_response_url);
265
266 TargetTypeMap* id_target_map = GetIdTargetMap();
267 UrlResponseMetaDataMap* url_responsedata_map = GetUrlResponseMetaDataMap();
268 IdUrlMap* id_url_map = GetIdUrlMap();
269
270 if (!url_responsedata_map->count(response_url)) {
271 ResponseMetaData meta_data = (*url_responsedata_map)[response_url];
272 url_responsedata_map->erase(response_url);
273 id_target_map->erase(meta_data.request_identifier);
274 id_url_map->erase(meta_data.request_identifier);
275 }
276 }
277
278 ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType(
279 const WebURLResponse& response) {
280
281 // These are a thorough list of the mime types crawled over the top
282 // 50k sites related to HTML, XML, JSON, Plain.
283 static const char kTextHtml[] = "text/html";
darin (slow to review) 2013/08/14 05:23:27 nit: for constants like these, it is more canonica
dsjang 2013/08/14 20:47:03 Done.
284 static const char kTextXml[] = "text/xml";
285 static const char xAppRssXml[] = "application/rss+xml";
286 static const char kAppXml[] = "application/xml";
287 static const char kAppJson[] = "application/json";
288 static const char kTextJson[] = "text/json";
289 static const char kTextXjson[] = "text/x-json";
290 static const char kTextPlain[] = "text/plain";
291
292 const std::string mime_type = response.mimeType().utf8();
293
294 if (LowerCaseEqualsASCII(mime_type, kTextHtml)) {
295 return ResponseMetaData::HTML;
296 } else if (LowerCaseEqualsASCII(mime_type, kTextPlain)) {
darin (slow to review) 2013/08/14 05:23:27 nit: no need for "else" after "return"
dsjang 2013/08/14 20:47:03 Done.
297 return ResponseMetaData::Plain;
298 } else if (LowerCaseEqualsASCII(mime_type, kAppJson) ||
299 LowerCaseEqualsASCII(mime_type, kTextJson) ||
300 LowerCaseEqualsASCII(mime_type, kTextXjson)) {
301 return ResponseMetaData::JSON;
302 } else if (LowerCaseEqualsASCII(mime_type, kTextXml) ||
303 LowerCaseEqualsASCII(mime_type, xAppRssXml) ||
304 LowerCaseEqualsASCII(mime_type, kAppXml)) {
305 return ResponseMetaData::XML;
306 } else {
307 return ResponseMetaData::Others;
308 }
309 }
310
311 bool SiteIsolationPolicy::IsBlockableScheme(const GURL& url) {
312 // We exclude ftp:// from here. FTP doesn't provide a Content-Type
313 // header which our policy depends on, so we cannot protect any
314 // document from FTP servers.
315 return url.SchemeIs("http") || url.SchemeIs("https");
316 }
317
318 bool SiteIsolationPolicy::IsSameSite(const GURL& frame_origin,
319 const GURL& response_url) {
320
321 if (!frame_origin.is_valid() || !response_url.is_valid())
322 return false;
323
324 if (frame_origin.scheme() != response_url.scheme())
325 return false;
326
327 // SameDomainOrHost() extracts the effective domains (public suffix plus one)
328 // from the two URLs and compare them.
329 // TODO(dsjang): use INCLUDE_PRIVATE_REGISTRIES when http://crbug.com/7988 is
330 // fixed.
331 return net::registry_controlled_domains::SameDomainOrHost(
332 frame_origin,
333 response_url,
334 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
335 }
336
337 bool SiteIsolationPolicy::IsFrameNavigating(WebKit::WebFrame* frame) {
338 // When a navigation starts, frame->provisionalDataSource() is set
339 // to a not-null value which stands for the request made for the
340 // navigation. As soon as the network request is committed to the
341 // frame, frame->provisionalDataSource() is converted to null, and
342 // the committed data source is moved to frame->dataSource(). This
343 // is the most reliable way to detect whether the frame is in
344 // navigation or not.
345 return frame->provisionalDataSource() != NULL;
346 }
347
348 // We don't use Webkit's existing CORS policy implementation since
349 // their policy works in terms of origins, not sites. For example,
350 // when frame is sub.a.com and it is not allowed to access a document
351 // with sub1.a.com. But under Site Isolation, it's allowed.
352 bool SiteIsolationPolicy::IsValidCorsHeaderSet(
353 GURL& frame_origin,
354 GURL& website_origin,
355 std::string access_control_origin) {
356 // Many websites are sending back "\"*\"" instead of "*". This is
357 // non-standard practice, and not supported by Chrome. Refer to
358 // CrossOriginAccessControl::passesAccessControlCheck().
359
360 // TODO(dsjang): * is not allowed for the response from a request
361 // with cookies. This allows for more than what the renderer will
362 // eventually be able to receive, so we won't see illegal cross-site
363 // documents allowed by this. We have to find a way to see if this
364 // response is from a cookie-tagged request or not in the future.
365 if (access_control_origin == "*")
366 return true;
367
368 // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
369 // "*", but many websites are using just a domain for access_control_origin,
370 // and this is blocked by Webkit's CORS logic here :
371 // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
372 // is_valid() to false when it is created from a URL containing * in the
373 // domain part.
374
375 GURL cors_origin(access_control_origin);
376 return IsSameSite(frame_origin, cors_origin);
377 }
378
379 // This function is a slight modification of |net::SniffForHTML|.
380 bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {
381 // The content sniffer used by Chrome and Firefox are using "<!--"
382 // as one of the HTML signatures, but it also appears in valid
383 // JavaScript, considered as well-formed JS by the browser. Since
384 // we do not want to block any JS, we exclude it from our HTML
385 // signatures. This can weaken our document block policy, but we can
386 // break less websites.
387 // TODO(dsjang): parameterize |net::SniffForHTML| with an option
388 // that decides whether to include <!-- or not, so that we can
389 // remove this function.
390 const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec
391 "<script", // HTML5 spec, Mozilla
392 "<html", // HTML5 spec, Mozilla
393 "<head", // HTML5 spec, Mozilla
394 "<iframe", // Mozilla
395 "<h1", // Mozilla
396 "<div", // Mozilla
397 "<font", // Mozilla
398 "<table", // Mozilla
399 "<a", // Mozilla
400 "<style", // Mozilla
401 "<title", // Mozilla
402 "<b", // Mozilla
403 "<body", // Mozilla
404 "<br", "<p" // Mozilla
405 };
406 return MatchesSignature(
407 data, length, html_signatures, arraysize(html_signatures));
408 }
409
410 bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {
411 // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
412 // this signature. However, XML is case-sensitive. Don't we have to
413 // be more lenient only to block documents starting with the exact
414 // string <?xml rather than <?XML ?
415 const char* xml_signatures[] = {"<?xml" // Mozilla
416 };
417 return MatchesSignature(
418 data, length, xml_signatures, arraysize(xml_signatures));
419 }
420
421 bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {
422 // TODO(dsjang): We have to come up with a better way to sniff
423 // JSON. However, even RE cannot help us that much due to the fact
424 // that we don't do full parsing. This DFA starts with state 0, and
425 // finds {, "/' and : in that order. We're avoiding adding a
426 // dependency on a regular expression library.
427 const int kInitState = 0;
428 const int kLeftBraceState = 1;
429 const int kLeftQuoteState = 2;
430 const int kColonState = 3;
431 const int kDeadState = 4;
432
433 int state = kInitState;
434 for (size_t i = 0; i < length && state < kColonState; ++i) {
435 const char c = data[i];
436 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
437 continue;
438
439 switch (state) {
440 case kInitState:
441 if (c == '{')
442 state = kLeftBraceState;
443 else
444 state = kDeadState;
445 break;
446 case kLeftBraceState:
447 if (c == '\"' || c == '\'')
448 state = kLeftQuoteState;
449 else
450 state = kDeadState;
451 break;
452 case kLeftQuoteState:
453 if (c == ':')
454 state = kColonState;
455 break;
456 default:
457 NOTREACHED();
458 break;
459 }
460 }
461 return state == kColonState;
462 }
463
464 bool SiteIsolationPolicy::MatchesSignature(const char* raw_data,
465 size_t raw_length,
466 const char* signatures[],
467 size_t arr_size) {
468 size_t start = 0;
469 // Skip white characters at the beginning of the document.
470 for (start = 0; start < raw_length; ++start) {
471 char c = raw_data[start];
472 if (!(c == ' ' || c == '\r' || c == '\n' || c == '\t'))
473 break;
474 }
475
476 // There is no not-whitespace character in this document.
477 if (!(start < raw_length))
478 return false;
479
480 const char* data = raw_data + start;
481 size_t length = raw_length - start;
482
483 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
484 const char* signature = signatures[sig_index];
485 size_t signature_length = strlen(signature);
486
487 if (length < signature_length)
488 continue;
489
490 if (!base::strncasecmp(signature, data, signature_length))
491 return true;
492 }
493 return false;
494 }
495
496 bool SiteIsolationPolicy::IsRenderableStatusCodeForDocument(int status_code) {
497 // Chrome only uses the content of a response with one of these status codes
498 // for CSS/JavaScript. For images, Chrome just ignores status code.
499 const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,
500 303, 305, 306, 307};
501 for (size_t i = 0; i < arraysize(renderable_status_code); ++i) {
502 if (renderable_status_code[i] == status_code)
503 return true;
504 }
505 return false;
506 }
507
508 bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {
509 // TODO(dsjang): This is a real hack. The only purpose of this function is to
510 // try to see if there's any possibility that this data can be JavaScript
511 // (superset of JS). This function will be removed once UMA stats are
512 // gathered.
513
514 // Search for "var " for JS detection.
515 for (size_t i = 0; i < length - 3; ++i) {
516 if (strncmp(data + i, "var ", 4) == 0)
517 return true;
518 }
519 return false;
520 }
521
522 TargetTypeMap* SiteIsolationPolicy::GetIdTargetMap() {
523 CR_DEFINE_STATIC_LOCAL(TargetTypeMap, id_target_map_, ());
524 return &id_target_map_;
525 }
526
527 UrlResponseMetaDataMap* SiteIsolationPolicy::GetUrlResponseMetaDataMap() {
528 CR_DEFINE_STATIC_LOCAL(UrlResponseMetaDataMap, url_responsedata_map_, ());
529 return &url_responsedata_map_;
530 }
531
532 IdUrlMap* SiteIsolationPolicy::GetIdUrlMap() {
533 CR_DEFINE_STATIC_LOCAL(IdUrlMap, id_url_map_, ());
534 return &id_url_map_;
535 }
536
537 } // namespace webkit_glue
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698