Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(405)

Side by Side Diff: content/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr
Patch Set: blocking code gets simpler and testcase is moved to /content Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "content/child/site_isolation_policy.h"
6
7 #include "base/basictypes.h"
8 #include "base/logging.h"
9 #include "base/metrics/histogram.h"
10 #include "base/strings/string_util.h"
11 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
12 #include "net/http/http_response_headers.h"
13 #include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
14 #include "third_party/WebKit/public/platform/WebString.h"
15 #include "third_party/WebKit/public/platform/WebURL.h"
16 #include "third_party/WebKit/public/platform/WebURLRequest.h"
17 #include "third_party/WebKit/public/platform/WebURLResponse.h"
18 #include "third_party/WebKit/public/web/WebDocument.h"
19 #include "third_party/WebKit/public/web/WebFrame.h"
20 #include "third_party/WebKit/public/web/WebFrameClient.h"
21 #include "third_party/WebKit/public/web/WebSecurityOrigin.h"
22
23 using WebKit::WebDocument;
24 using WebKit::WebString;
25 using WebKit::WebURL;
26 using WebKit::WebURLResponse;
27 using WebKit::WebURLRequest;
28
29 namespace content {
30
31 namespace {
32
33 // MIME types recognized by GetCanonicalMimeType(); each trailing comment names the canonical type the constant maps to.
34 const char kTextHtml[] = "text/html";  // -> ResponseMetaData::HTML
35 const char kTextXml[] = "text/xml";  // -> ResponseMetaData::XML
36 const char xAppRssXml[] = "application/rss+xml";  // -> ResponseMetaData::XML. NOTE(review): name lacks the k prefix used by its siblings.
37 const char kAppXml[] = "application/xml";  // -> ResponseMetaData::XML
38 const char kAppJson[] = "application/json";  // -> ResponseMetaData::JSON
39 const char kTextJson[] = "text/json";  // -> ResponseMetaData::JSON
40 const char kTextXjson[] = "text/x-json";  // -> ResponseMetaData::JSON
41 const char kTextPlain[] = "text/plain";  // -> ResponseMetaData::Plain
42
43 } // anonymous namespace
44
45 SiteIsolationPolicy::ResponseMetaData::ResponseMetaData() {}  // Empty body: no member is explicitly initialized here; OnReceivedResponse() assigns the fields it uses before storing an instance.
46
47 void SiteIsolationPolicy::OnReceivedResponse(
48 int request_id,
49 GURL& frame_origin,
50 GURL& response_url,
51 ResourceType::Type resource_type,
52 const webkit_glue::ResourceResponseInfo& info) {
53 UMA_HISTOGRAM_COUNTS("SiteIsolation.AllResponses", 1);
54
55 // See if this is for navigation. If it is, don't block it, under the
56 // assumption that we will put it in an appropriate process.
57 if (ResourceType::IsFrame(resource_type))
58 return;
59
60 if (!IsBlockableScheme(response_url))
61 return;
62
63 if (IsSameSite(frame_origin, response_url))
64 return;
65
66 SiteIsolationPolicy::ResponseMetaData::CanonicalMimeType canonical_mime_type =
67 GetCanonicalMimeType(info.mime_type);
68
69 if (canonical_mime_type == SiteIsolationPolicy::ResponseMetaData::Others)
70 return;
71
72 // Every CORS request should have the Access-Control-Allow-Origin header even
73 // if it is preceded by a pre-flight request. Therefore, if this is a CORS
74 // request, it has this header. response.httpHeaderField() internally uses
75 // case-insensitive matching for the header name.
76 std::string access_control_origin;
77
78 // We can use a case-insensitive header name for EnumerateHeader().
79 info.headers->EnumerateHeader(
80 NULL, "access-control-allow-origin", &access_control_origin);
81 if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin))
82 return;
83
84 // Real XSD data collection starts from here.
85 std::string no_sniff;
86 info.headers->EnumerateHeader(NULL, "x-content-type-options", &no_sniff);
87
88 ResponseMetaData resp_data;
89 resp_data.frame_origin = frame_origin.spec();
90 resp_data.response_url = response_url;
91 resp_data.resource_type = resource_type;
92 resp_data.canonical_mime_type = canonical_mime_type;
93 resp_data.http_status_code = info.headers->response_code();
94 resp_data.no_sniff = LowerCaseEqualsASCII(no_sniff, "nosniff");
95
96 RequestIdToMetaDataMap* metadata_map = GetRequestIdToMetaDataMap();
97 (*metadata_map)[request_id] = resp_data;
98 }
99
100 // These macros are defined here to limit the code-size bloat caused by the
101 // UMA_HISTOGRAM_* macros. The same logic records UMA stats for each MIME
102 // type, but a helper function cannot be used because UMA_HISTOGRAM_* macros
103 // do not accept variables as bucket names. The macros expand inside
104 // OnReceivedData() and use its locals |result|, |renderable_status_code|,
105 // |resp_data| and |is_sniffed_for_js|. TODO(dsjang): this is only needed for
106 // collecting UMA stats. Will be deleted when this class is used for blocking.
107
108 #define SITE_ISOLATION_POLICY_COUNT_BLOCK(BUCKET_PREFIX) \
109 UMA_HISTOGRAM_COUNTS( BUCKET_PREFIX ".Blocked", 1); \
110 result = false; \
111 if (renderable_status_code) { \
112 UMA_HISTOGRAM_ENUMERATION( \
113 BUCKET_PREFIX ".Blocked.RenderableStatusCode", \
114 resp_data.resource_type, \
115 WebURLRequest::TargetIsUnspecified + 1); \
116 } else { \
117 UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".Blocked.NonRenderableStatusCode",1);\
118 }
119
120 #define SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(BUCKET_PREFIX) \
121 UMA_HISTOGRAM_COUNTS( BUCKET_PREFIX ".NoSniffBlocked", 1); \
122 result = false; \
123 if (renderable_status_code) { \
124 UMA_HISTOGRAM_ENUMERATION( \
125 BUCKET_PREFIX ".NoSniffBlocked.RenderableStatusCode", \
126 resp_data.resource_type, \
127 WebURLRequest::TargetIsUnspecified + 1); \
128 } else { \
129 UMA_HISTOGRAM_ENUMERATION( \
130 BUCKET_PREFIX ".NoSniffBlocked.NonRenderableStatusCode", \
131 resp_data.resource_type, \
132 WebURLRequest::TargetIsUnspecified + 1); \
133 }
134
135 #define SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(BUCKET_PREFIX) \
136 UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked", 1); \
137 if (is_sniffed_for_js) \
138 UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked.MaybeJS", 1); \
139
140 #define SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SNIFF_EXPR,BUCKET_PREFIX) \
141 if (SNIFF_EXPR) { \
142 SITE_ISOLATION_POLICY_COUNT_BLOCK(BUCKET_PREFIX) \
143 } else { \
144 if (resp_data.no_sniff) { \
145 SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(BUCKET_PREFIX) \
146 } else { \
147 SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(BUCKET_PREFIX) \
148 } \
149 }
150
151 bool SiteIsolationPolicy::OnReceivedData(
152 int request_id,
153 const char* data,
154 int length) {
155
156 RequestIdToMetaDataMap* metadata_map = GetRequestIdToMetaDataMap();
157 RequestIdToResultMap* result_map = GetRequestIdToResultMap();
158
159 // If there's an entry for |request_id| in blocked_map, this request's first
160 // data packet has already been examined. We can return the result here.
161 if (result_map->count(request_id) != 0)
162 return (*result_map)[request_id];
163
164 // If result_map doesn't have an entry for |request_id|, we're receiving the
165 // first data packet for request_id. If request_id is not registered, this
166 // request is identified as a non-target of our policy. So we return true.
167 if (metadata_map->count(request_id) == 0) {
168 // We set request_id to true so that we always return true for this request.
169 (*result_map)[request_id] = true;
170 return true;
171 }
172
173 // We now look at the first data packet received for request_id.
174 ResponseMetaData resp_data = (*metadata_map)[request_id];
175 metadata_map->erase(request_id);
176
177 // Record the length of the first received network packet to see if it's
178 // enough for sniffing.
179 UMA_HISTOGRAM_COUNTS("SiteIsolation.XSD.DataLength", length);
180
181 // Record the number of cross-site document responses with a specific mime
182 // type (text/html, text/xml, etc).
183 UMA_HISTOGRAM_ENUMERATION(
184 "SiteIsolation.XSD.MimeType",
185 resp_data.canonical_mime_type,
186 SiteIsolationPolicy::ResponseMetaData::MaxCanonicalMimeType);
187
188 // Store the result of cross-site document blocking analysis. True means we
189 // can return this document to the renderer, false means that we have to block
190 // the response data.
191 bool result = true;
192
193 // The content is blocked if it is sniffed for HTML/JSON/XML. When the blocked
194 // response is with an error status code, it is not disruptive by the
195 // following reasons : 1) the blocked content is not a binary object (such as
196 // an image) since it is sniffed for text; 2) then, this blocking only breaks
197 // the renderer behavior only if it is either JavaScript or CSS. However, the
198 // renderer doesn't use the contents of JS/CSS with unaffected status code
199 // (e.g, 404). 3) the renderer is expected not to use the cross-site document
200 // content for purposes other than JS/CSS (e.g, XHR).
201 bool renderable_status_code = IsRenderableStatusCodeForDocument(
202 resp_data.http_status_code);
203
204 // This is only used for false-negative analysis for non-blocked resources.
205 bool is_sniffed_for_js = SniffForJS(data, length);
206
207 // Record the number of responses whose content is sniffed for what its mime
208 // type claims it to be. For example, we apply a HTML sniffer for a document
209 // tagged with text/html here. Whenever this check becomes true, we'll block
210 // the response.
211 switch (resp_data.canonical_mime_type) {
212 case SiteIsolationPolicy::ResponseMetaData::HTML:
213 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForHTML(data, length),
214 "SiteIsolation.XSD.HTML");
215 break;
216 case SiteIsolationPolicy::ResponseMetaData::XML:
217 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForXML(data, length),
218 "SiteIsolation.XSD.XML");
219 break;
220 case SiteIsolationPolicy::ResponseMetaData::JSON:
221 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForJSON(data, length),
222 "SiteIsolation.XSD.JSON");
223 break;
224 case SiteIsolationPolicy::ResponseMetaData::Plain:
225 if (SniffForHTML(data, length)) {
226 SITE_ISOLATION_POLICY_COUNT_BLOCK(
227 "SiteIsolation.XSD.Plain.HTML");
228 } else if (SniffForXML(data, length)) {
229 SITE_ISOLATION_POLICY_COUNT_BLOCK(
230 "SiteIsolation.XSD.Plain.XML");
231 } else if (SniffForJSON(data, length)) {
232 SITE_ISOLATION_POLICY_COUNT_BLOCK(
233 "SiteIsolation.XSD.Plain.JSON");
234 } else if (is_sniffed_for_js) {
235 if (resp_data.no_sniff) {
236 SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(
237 "SiteIsolation.XSD.Plain");
238 } else {
239 SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(
240 "SiteIsolation.XSD.Plain");
241 }
242 }
243 break;
244 default :
245 NOTREACHED() <<
246 "Not a blockable mime type. This mime type shouldn't reach here.";
247 break;
248 }
249
250 (*result_map)[request_id] = result;
251 return result;
252 }
253
// The counting macros are private helpers of OnReceivedData(); undefine all
// four of them so that none leaks into the rest of the translation unit.
#undef SITE_ISOLATION_POLICY_COUNT_NOTBLOCK
#undef SITE_ISOLATION_POLICY_SNIFF_AND_COUNT
#undef SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK
#undef SITE_ISOLATION_POLICY_COUNT_BLOCK
257
258
Charlie Reis 2013/08/22 18:23:30 nit: Only one blank line between blocks/functions,
dsjang 2013/08/22 19:05:55 Done.
259 void SiteIsolationPolicy::OnRequestComplete(int request_id) {
260 RequestIdToMetaDataMap* metadata_map = GetRequestIdToMetaDataMap();
261 RequestIdToResultMap* result_map = GetRequestIdToResultMap();
262 metadata_map->erase(request_id);
263 result_map->erase(request_id);
264 }
265
266
267 SiteIsolationPolicy::ResponseMetaData::CanonicalMimeType
268 SiteIsolationPolicy::GetCanonicalMimeType(const std::string& mime_type) {
269 if (LowerCaseEqualsASCII(mime_type, kTextHtml)) {
270 return SiteIsolationPolicy::ResponseMetaData::HTML;
271 }
272
273 if (LowerCaseEqualsASCII(mime_type, kTextPlain)) {
274 return SiteIsolationPolicy::ResponseMetaData::Plain;
275 }
276
277 if (LowerCaseEqualsASCII(mime_type, kAppJson) ||
278 LowerCaseEqualsASCII(mime_type, kTextJson) ||
279 LowerCaseEqualsASCII(mime_type, kTextXjson)) {
280 return SiteIsolationPolicy::ResponseMetaData::JSON;
281 }
282
283 if (LowerCaseEqualsASCII(mime_type, kTextXml) ||
284 LowerCaseEqualsASCII(mime_type, xAppRssXml) ||
285 LowerCaseEqualsASCII(mime_type, kAppXml)) {
286 return SiteIsolationPolicy::ResponseMetaData::XML;
287 }
288
289 return SiteIsolationPolicy::ResponseMetaData::Others;
290
291 }
292
293 bool SiteIsolationPolicy::IsBlockableScheme(const GURL& url) {
294 // We exclude ftp:// from here. FTP doesn't provide a Content-Type
295 // header which our policy depends on, so we cannot protect any
296 // document from FTP servers.
297 return url.SchemeIs("http") || url.SchemeIs("https");
298 }
299
300 bool SiteIsolationPolicy::IsSameSite(const GURL& frame_origin,
301 const GURL& response_url) {
302
303 if (!frame_origin.is_valid() || !response_url.is_valid())
304 return false;
305
306 if (frame_origin.scheme() != response_url.scheme())
307 return false;
308
309 // SameDomainOrHost() extracts the effective domains (public suffix plus one)
310 // from the two URLs and compare them.
311 // TODO(dsjang): use INCLUDE_PRIVATE_REGISTRIES when http://crbug.com/7988 is
312 // fixed.
313 return net::registry_controlled_domains::SameDomainOrHost(
314 frame_origin,
315 response_url,
316 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
317 }
318
319 bool SiteIsolationPolicy::IsFrameNavigating(WebKit::WebFrame* frame) {
320 // When a navigation starts, frame->provisionalDataSource() is set
321 // to a not-null value which stands for the request made for the
322 // navigation. As soon as the network request is committed to the
323 // frame, frame->provisionalDataSource() is converted to null, and
324 // the committed data source is moved to frame->dataSource(). This
325 // is the most reliable way to detect whether the frame is in
326 // navigation or not.
327 return frame->provisionalDataSource() != NULL;
328 }
329
330 // We don't use Webkit's existing CORS policy implementation since
331 // their policy works in terms of origins, not sites. For example,
332 // when frame is sub.a.com and it is not allowed to access a document
333 // with sub1.a.com. But under Site Isolation, it's allowed.
334 bool SiteIsolationPolicy::IsValidCorsHeaderSet(
335 GURL& frame_origin,
336 GURL& website_origin,
337 std::string access_control_origin) {
338 // Many websites are sending back "\"*\"" instead of "*". This is
339 // non-standard practice, and not supported by Chrome. Refer to
340 // CrossOriginAccessControl::passesAccessControlCheck().
341
342 // TODO(dsjang): * is not allowed for the response from a request
343 // with cookies. This allows for more than what the renderer will
344 // eventually be able to receive, so we won't see illegal cross-site
345 // documents allowed by this. We have to find a way to see if this
346 // response is from a cookie-tagged request or not in the future.
347 if (access_control_origin == "*")
348 return true;
349
350 // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
351 // "*", but many websites are using just a domain for access_control_origin,
352 // and this is blocked by Webkit's CORS logic here :
353 // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
354 // is_valid() to false when it is created from a URL containing * in the
355 // domain part.
356
357 GURL cors_origin(access_control_origin);
358 return IsSameSite(frame_origin, cors_origin);
359 }
360
361 // This function is a slight modification of |net::SniffForHTML|.
362 bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {
363 // The content sniffer used by Chrome and Firefox are using "<!--"
364 // as one of the HTML signatures, but it also appears in valid
365 // JavaScript, considered as well-formed JS by the browser. Since
366 // we do not want to block any JS, we exclude it from our HTML
367 // signatures. This can weaken our document block policy, but we can
368 // break less websites.
369 // TODO(dsjang): parameterize |net::SniffForHTML| with an option
370 // that decides whether to include <!-- or not, so that we can
371 // remove this function.
372 const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec
373 "<script", // HTML5 spec, Mozilla
374 "<html", // HTML5 spec, Mozilla
375 "<head", // HTML5 spec, Mozilla
376 "<iframe", // Mozilla
377 "<h1", // Mozilla
378 "<div", // Mozilla
379 "<font", // Mozilla
380 "<table", // Mozilla
381 "<a", // Mozilla
382 "<style", // Mozilla
383 "<title", // Mozilla
384 "<b", // Mozilla
385 "<body", // Mozilla
386 "<br", "<p", // Mozilla
387 "<?xml" // Mozilla
388 };
389
390 if (MatchesSignature(
391 data, length, html_signatures, arraysize(html_signatures)))
392 return true;
393
394 // "<!--" is specially treated since web JS can use "<!--" "-->" pair for
395 // comments.
396 const char* comment_begins[] = {"<!--" };
397
398 if (MatchesSignature(
399 data, length, comment_begins, arraysize(comment_begins))) {
400 // Search for --> and do SniffForHTML after that. If we can find the
401 // comment's end, we start HTML sniffing from there again.
402 const char end_comment[] = "-->";
403 const size_t end_comment_size = strlen(end_comment);
404
405 for (size_t i = 0; i <= length - end_comment_size; ++i) {
406 if (!strncmp(data + i, end_comment, end_comment_size)) {
407 size_t skipped = i + end_comment_size;
408 return SniffForHTML(data + skipped, length - skipped);
409 }
410 }
411 }
412
413 return false;
414 }
415
416 bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {
417 // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
418 // this signature. However, XML is case-sensitive. Don't we have to
419 // be more lenient only to block documents starting with the exact
420 // string <?xml rather than <?XML ?
421 const char* xml_signatures[] = {"<?xml" // Mozilla
422 };
423 return MatchesSignature(
424 data, length, xml_signatures, arraysize(xml_signatures));
425 }
426
427 bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {
428 // TODO(dsjang): We have to come up with a better way to sniff
429 // JSON. However, even RE cannot help us that much due to the fact
430 // that we don't do full parsing. This DFA starts with state 0, and
431 // finds {, "/' and : in that order. We're avoiding adding a
432 // dependency on a regular expression library.
433 const int kInitState = 0;
434 const int kLeftBraceState = 1;
435 const int kLeftQuoteState = 2;
436 const int kColonState = 3;
437 const int kDeadState = 4;
438
439 int state = kInitState;
440 for (size_t i = 0; i < length && state < kColonState; ++i) {
441 const char c = data[i];
442 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
443 continue;
444
445 switch (state) {
446 case kInitState:
447 if (c == '{')
448 state = kLeftBraceState;
449 else
450 state = kDeadState;
451 break;
452 case kLeftBraceState:
453 if (c == '\"' || c == '\'')
454 state = kLeftQuoteState;
455 else
456 state = kDeadState;
457 break;
458 case kLeftQuoteState:
459 if (c == ':')
460 state = kColonState;
461 break;
462 default:
463 NOTREACHED();
464 break;
465 }
466 }
467 return state == kColonState;
468 }
469
470 bool SiteIsolationPolicy::MatchesSignature(const char* raw_data,
471 size_t raw_length,
472 const char* signatures[],
473 size_t arr_size) {
474 size_t start = 0;
475 // Skip white characters at the beginning of the document.
476 for (start = 0; start < raw_length; ++start) {
477 char c = raw_data[start];
478 if (!(c == ' ' || c == '\t' || c == '\r' || c == '\n'))
479 break;
480 }
481
482 // There is no not-whitespace character in this document.
483 if (!(start < raw_length))
484 return false;
485
486 const char* data = raw_data + start;
487 size_t length = raw_length - start;
488
489 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
490 const char* signature = signatures[sig_index];
491 size_t signature_length = strlen(signature);
492
493 if (length < signature_length)
494 continue;
495
496 if (!base::strncasecmp(signature, data, signature_length))
497 return true;
498 }
499 return false;
500 }
501
502 bool SiteIsolationPolicy::IsRenderableStatusCodeForDocument(int status_code) {
503 // Chrome only uses the content of a response with one of these status codes
504 // for CSS/JavaScript. For images, Chrome just ignores status code.
505 const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,
506 303, 305, 306, 307};
507 for (size_t i = 0; i < arraysize(renderable_status_code); ++i) {
508 if (renderable_status_code[i] == status_code)
509 return true;
510 }
511 return false;
512 }
513
514 bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {
515 // TODO(dsjang): This is a real hack. The only purpose of this function is to
516 // try to see if there's any possibility that this data can be JavaScript
517 // (superset of JS). This function will be removed once UMA stats are
518 // gathered.
519
520 // Search for "var " for JS detection.
521 for (size_t i = 0; i < length - 3; ++i) {
522 if (strncmp(data + i, "var ", 4) == 0)
523 return true;
524 }
525 return false;
526 }
527
// Returns a pointer to the map from request id to the ResponseMetaData that
// OnReceivedResponse() stores, OnReceivedData() consumes and
// OnRequestComplete() erases.
// NOTE(review): presumably CR_DEFINE_STATIC_LOCAL creates a single
// function-local static instance shared by all callers - confirm against the
// macro's definition.
528 SiteIsolationPolicy::RequestIdToMetaDataMap*
529 SiteIsolationPolicy::GetRequestIdToMetaDataMap() {
530 CR_DEFINE_STATIC_LOCAL(RequestIdToMetaDataMap, metadata_map_, ());
531 return &metadata_map_;
532 }
533
// Returns a pointer to the map from request id to the verdict computed by
// OnReceivedData() (true = allowed, false = blocked). Entries are erased by
// OnRequestComplete().
// NOTE(review): presumably CR_DEFINE_STATIC_LOCAL creates a single
// function-local static instance shared by all callers - confirm against the
// macro's definition.
534 SiteIsolationPolicy::RequestIdToResultMap*
535 SiteIsolationPolicy::GetRequestIdToResultMap() {
536 CR_DEFINE_STATIC_LOCAL(RequestIdToResultMap, result_map_, ());
537 return &result_map_;
538 }
539
540 } // namespace content
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698