Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(58)

Side by Side Diff: content/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr
Patch Set: Blocking code is moved to SiteIsolationPolicy from ResourceDispatcher. Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "content/child/site_isolation_policy.h"
6
7 #include "base/basictypes.h"
8 #include "base/command_line.h"
9 #include "base/logging.h"
10 #include "base/metrics/histogram.h"
11 #include "base/strings/string_util.h"
12 #include "content/public/common/content_switches.h"
13 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
14 #include "net/http/http_response_headers.h"
15 #include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
16 #include "third_party/WebKit/public/platform/WebString.h"
17 #include "third_party/WebKit/public/platform/WebURL.h"
18 #include "third_party/WebKit/public/platform/WebURLRequest.h"
19 #include "third_party/WebKit/public/platform/WebURLResponse.h"
20 #include "third_party/WebKit/public/web/WebDocument.h"
21 #include "third_party/WebKit/public/web/WebFrame.h"
22 #include "third_party/WebKit/public/web/WebFrameClient.h"
23 #include "third_party/WebKit/public/web/WebSecurityOrigin.h"
24
25 using WebKit::WebDocument;
26 using WebKit::WebString;
27 using WebKit::WebURL;
28 using WebKit::WebURLResponse;
29 using WebKit::WebURLRequest;
30
31 namespace content {
32
namespace {

// Spellings of the MIME types that this policy treats as documents.
// GetCanonicalMimeType() maps each of them onto a canonical category.

// HTML.
const char kTextHtml[] = "text/html";

// XML and XML-based feeds.
const char kTextXml[] = "text/xml";
const char xAppRssXml[] = "application/rss+xml";
const char kAppXml[] = "application/xml";

// JSON.
const char kAppJson[] = "application/json";
const char kTextJson[] = "text/json";
const char kTextXjson[] = "text/x-json";

// Plain text.
const char kTextPlain[] = "text/plain";

}  // namespace
46
47 SiteIsolationPolicy::ResponseMetaData::ResponseMetaData() {}
48
49 void SiteIsolationPolicy::OnReceivedResponse(
50 int request_id,
51 GURL& frame_origin,
52 GURL& response_url,
53 ResourceType::Type resource_type,
54 const webkit_glue::ResourceResponseInfo& info) {
55 UMA_HISTOGRAM_COUNTS("SiteIsolation.AllResponses", 1);
56
57 // See if this is for navigation. If it is, don't block it, under the
58 // assumption that we will put it in an appropriate process.
59 if (ResourceType::IsFrame(resource_type))
60 return;
61
62 if (!IsBlockableScheme(response_url))
63 return;
64
65 if (IsSameSite(frame_origin, response_url))
66 return;
67
68 SiteIsolationPolicy::ResponseMetaData::CanonicalMimeType canonical_mime_type =
69 GetCanonicalMimeType(info.mime_type);
70
71 if (canonical_mime_type == SiteIsolationPolicy::ResponseMetaData::Others)
72 return;
73
74 // Every CORS request should have the Access-Control-Allow-Origin header even
75 // if it is preceded by a pre-flight request. Therefore, if this is a CORS
76 // request, it has this header. response.httpHeaderField() internally uses
77 // case-insensitive matching for the header name.
78 std::string access_control_origin;
79
80 // We can use a case-insensitive header name for EnumerateHeader().
81 info.headers->EnumerateHeader(
82 NULL, "access-control-allow-origin", &access_control_origin);
83 if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin))
84 return;
85
86 // Real XSD data collection starts from here.
87 std::string no_sniff;
88 info.headers->EnumerateHeader(NULL, "x-content-type-options", &no_sniff);
89
90 ResponseMetaData resp_data;
91 resp_data.frame_origin = frame_origin.spec();
92 resp_data.response_url = response_url;
93 resp_data.resource_type = resource_type;
94 resp_data.canonical_mime_type = canonical_mime_type;
95 resp_data.http_status_code = info.headers->response_code();
96 resp_data.no_sniff = LowerCaseEqualsASCII(no_sniff, "nosniff");
97
98 RequestIdToMetaDataMap* metadata_map = GetRequestIdToMetaDataMap();
99 (*metadata_map)[request_id] = resp_data;
100 }
101
// Helper macros for recording the UMA stats of ShouldBlockResponse().
//
// The same recording pattern is repeated for every MIME type, but it cannot be
// moved into a helper function because the UMA_HISTOGRAM_* macros do not
// accept variables as their bucket names. Macros capture the pattern instead
// and keep the code size down.
//
// The macros expect these names to be in scope at the point of use:
//   result                  - bool, set to true when the response is blocked.
//   renderable_status_code  - bool, selects the *.RenderableStatusCode bucket.
//   is_sniffed_for_js       - bool, selects the *.NotBlocked.MaybeJS bucket.
//   resp_data               - provides |resource_type| and |no_sniff|.
// BUCKET_PREFIX must be a string literal; suffixes are appended by literal
// concatenation.
//
// Each macro is wrapped in do { } while (0) so that it expands to exactly one
// statement: it must be followed by ';' and is safe inside an unbraced
// if/else.
//
// TODO(dsjang): this is only needed for collecting UMA stat. Will be deleted
// when this class is used for actual blocking.

#define SITE_ISOLATION_POLICY_COUNT_BLOCK(BUCKET_PREFIX) \
  do { \
    UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".Blocked", 1); \
    result = true; \
    if (renderable_status_code) { \
      UMA_HISTOGRAM_ENUMERATION( \
          BUCKET_PREFIX ".Blocked.RenderableStatusCode", \
          resp_data.resource_type, \
          WebURLRequest::TargetIsUnspecified + 1); \
    } else { \
      UMA_HISTOGRAM_COUNTS( \
          BUCKET_PREFIX ".Blocked.NonRenderableStatusCode", 1); \
    } \
  } while (0)

#define SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(BUCKET_PREFIX) \
  do { \
    UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NoSniffBlocked", 1); \
    result = true; \
    if (renderable_status_code) { \
      UMA_HISTOGRAM_ENUMERATION( \
          BUCKET_PREFIX ".NoSniffBlocked.RenderableStatusCode", \
          resp_data.resource_type, \
          WebURLRequest::TargetIsUnspecified + 1); \
    } else { \
      UMA_HISTOGRAM_ENUMERATION( \
          BUCKET_PREFIX ".NoSniffBlocked.NonRenderableStatusCode", \
          resp_data.resource_type, \
          WebURLRequest::TargetIsUnspecified + 1); \
    } \
  } while (0)

#define SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(BUCKET_PREFIX) \
  do { \
    UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked", 1); \
    if (is_sniffed_for_js) { \
      UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked.MaybeJS", 1); \
    } \
  } while (0)

#define SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SNIFF_EXPR, BUCKET_PREFIX) \
  do { \
    if (SNIFF_EXPR) { \
      SITE_ISOLATION_POLICY_COUNT_BLOCK(BUCKET_PREFIX); \
    } else if (resp_data.no_sniff) { \
      SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(BUCKET_PREFIX); \
    } else { \
      SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(BUCKET_PREFIX); \
    } \
  } while (0)
152
153 bool SiteIsolationPolicy::ShouldBlockResponse(
154 int request_id,
155 const char* data,
156 int length,
157 std::string* alternative_data) {
158
159 RequestIdToMetaDataMap* metadata_map = GetRequestIdToMetaDataMap();
160 RequestIdToResultMap* result_map = GetRequestIdToResultMap();
161
162 // If there's an entry for |request_id| in blocked_map, this request's first
163 // data packet has already been examined. We can return the result here.
164 if (result_map->count(request_id) != 0) {
165 if ((*result_map)[request_id]) {
166 // When we block the resource, we also set an alternative data to be sent.
167 alternative_data->erase();
168 //alternative_data->insert(0, " ");
Charlie Reis 2013/08/22 23:05:22 This should be uncommented or removed. If I had t
169 return true;
170 }
171 return false;
172 }
173
174 // If result_map doesn't have an entry for |request_id|, we're receiving the
175 // first data packet for request_id. If request_id is not registered, this
176 // request is identified as a non-target of our policy. So we return true.
177 if (metadata_map->count(request_id) == 0) {
178 // We set request_id to true so that we always return true for this request.
179 (*result_map)[request_id] = false;
180 return false;
181 }
182
183 // We now look at the first data packet received for request_id.
184 ResponseMetaData resp_data = (*metadata_map)[request_id];
185 metadata_map->erase(request_id);
186
187 // Record the length of the first received network packet to see if it's
188 // enough for sniffing.
189 UMA_HISTOGRAM_COUNTS("SiteIsolation.XSD.DataLength", length);
190
191 // Record the number of cross-site document responses with a specific mime
192 // type (text/html, text/xml, etc).
193 UMA_HISTOGRAM_ENUMERATION(
194 "SiteIsolation.XSD.MimeType",
195 resp_data.canonical_mime_type,
196 SiteIsolationPolicy::ResponseMetaData::MaxCanonicalMimeType);
197
198 // Store the result of cross-site document blocking analysis. True means we
199 // can return this document to the renderer, false means that we have to block
200 // the response data.
201 bool result = false;
202
203 // The content is blocked if it is sniffed for HTML/JSON/XML. When the blocked
204 // response is with an error status code, it is not disruptive by the
205 // following reasons : 1) the blocked content is not a binary object (such as
206 // an image) since it is sniffed for text; 2) then, this blocking only breaks
207 // the renderer behavior only if it is either JavaScript or CSS. However, the
208 // renderer doesn't use the contents of JS/CSS with unaffected status code
209 // (e.g, 404). 3) the renderer is expected not to use the cross-site document
210 // content for purposes other than JS/CSS (e.g, XHR).
211 bool renderable_status_code = IsRenderableStatusCodeForDocument(
212 resp_data.http_status_code);
213
214 // This is only used for false-negative analysis for non-blocked resources.
215 bool is_sniffed_for_js = SniffForJS(data, length);
216
217 // Record the number of responses whose content is sniffed for what its mime
218 // type claims it to be. For example, we apply a HTML sniffer for a document
219 // tagged with text/html here. Whenever this check becomes true, we'll block
220 // the response.
221 switch (resp_data.canonical_mime_type) {
222 case SiteIsolationPolicy::ResponseMetaData::HTML:
223 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForHTML(data, length),
224 "SiteIsolation.XSD.HTML");
225 break;
226 case SiteIsolationPolicy::ResponseMetaData::XML:
227 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForXML(data, length),
228 "SiteIsolation.XSD.XML");
229 break;
230 case SiteIsolationPolicy::ResponseMetaData::JSON:
231 SITE_ISOLATION_POLICY_SNIFF_AND_COUNT(SniffForJSON(data, length),
232 "SiteIsolation.XSD.JSON");
233 break;
234 case SiteIsolationPolicy::ResponseMetaData::Plain:
235 if (SniffForHTML(data, length)) {
236 SITE_ISOLATION_POLICY_COUNT_BLOCK(
237 "SiteIsolation.XSD.Plain.HTML");
238 } else if (SniffForXML(data, length)) {
239 SITE_ISOLATION_POLICY_COUNT_BLOCK(
240 "SiteIsolation.XSD.Plain.XML");
241 } else if (SniffForJSON(data, length)) {
242 SITE_ISOLATION_POLICY_COUNT_BLOCK(
243 "SiteIsolation.XSD.Plain.JSON");
244 } else if (is_sniffed_for_js) {
245 if (resp_data.no_sniff) {
246 SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK(
247 "SiteIsolation.XSD.Plain");
248 } else {
249 SITE_ISOLATION_POLICY_COUNT_NOTBLOCK(
250 "SiteIsolation.XSD.Plain");
251 }
252 }
253 break;
254 default :
255 NOTREACHED() <<
256 "Not a blockable mime type. This mime type shouldn't reach here.";
257 break;
258 }
259
260 const CommandLine& command_line = *CommandLine::ForCurrentProcess();
261 if (!command_line.HasSwitch(switches::kBlockCrossSiteDocuments))
262 result = false;
263 (*result_map)[request_id] = result;
264
265 if (result) {
266 alternative_data->erase();
267 alternative_data->insert(0, " ");
268 LOG(ERROR) << resp_data.response_url
269 << " is blocked as an illegal cross-site document from "
270 << resp_data.frame_origin;
271
272 }
273 return result;
274 }
275
// The UMA helper macros are only meant for ShouldBlockResponse(); undefine all
// four of them so that none leaks into the rest of the file.
#undef SITE_ISOLATION_POLICY_SNIFF_AND_COUNT
#undef SITE_ISOLATION_POLICY_COUNT_NOTBLOCK
#undef SITE_ISOLATION_POLICY_COUNT_NO_SNIFF_BLOCK
#undef SITE_ISOLATION_POLICY_COUNT_BLOCK
279
280 void SiteIsolationPolicy::OnRequestComplete(int request_id) {
281 RequestIdToMetaDataMap* metadata_map = GetRequestIdToMetaDataMap();
282 RequestIdToResultMap* result_map = GetRequestIdToResultMap();
283 metadata_map->erase(request_id);
284 result_map->erase(request_id);
285 }
286
287 SiteIsolationPolicy::ResponseMetaData::CanonicalMimeType
288 SiteIsolationPolicy::GetCanonicalMimeType(const std::string& mime_type) {
289 if (LowerCaseEqualsASCII(mime_type, kTextHtml)) {
290 return SiteIsolationPolicy::ResponseMetaData::HTML;
291 }
292
293 if (LowerCaseEqualsASCII(mime_type, kTextPlain)) {
294 return SiteIsolationPolicy::ResponseMetaData::Plain;
295 }
296
297 if (LowerCaseEqualsASCII(mime_type, kAppJson) ||
298 LowerCaseEqualsASCII(mime_type, kTextJson) ||
299 LowerCaseEqualsASCII(mime_type, kTextXjson)) {
300 return SiteIsolationPolicy::ResponseMetaData::JSON;
301 }
302
303 if (LowerCaseEqualsASCII(mime_type, kTextXml) ||
304 LowerCaseEqualsASCII(mime_type, xAppRssXml) ||
305 LowerCaseEqualsASCII(mime_type, kAppXml)) {
306 return SiteIsolationPolicy::ResponseMetaData::XML;
307 }
308
309 return SiteIsolationPolicy::ResponseMetaData::Others;
310
311 }
312
313 bool SiteIsolationPolicy::IsBlockableScheme(const GURL& url) {
314 // We exclude ftp:// from here. FTP doesn't provide a Content-Type
315 // header which our policy depends on, so we cannot protect any
316 // document from FTP servers.
317 return url.SchemeIs("http") || url.SchemeIs("https");
318 }
319
320 bool SiteIsolationPolicy::IsSameSite(const GURL& frame_origin,
321 const GURL& response_url) {
322
323 if (!frame_origin.is_valid() || !response_url.is_valid())
324 return false;
325
326 if (frame_origin.scheme() != response_url.scheme())
327 return false;
328
329 // SameDomainOrHost() extracts the effective domains (public suffix plus one)
330 // from the two URLs and compare them.
331 // TODO(dsjang): use INCLUDE_PRIVATE_REGISTRIES when http://crbug.com/7988 is
332 // fixed.
333 return net::registry_controlled_domains::SameDomainOrHost(
334 frame_origin,
335 response_url,
336 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
337 }
338
339 bool SiteIsolationPolicy::IsFrameNavigating(WebKit::WebFrame* frame) {
340 // When a navigation starts, frame->provisionalDataSource() is set
341 // to a not-null value which stands for the request made for the
342 // navigation. As soon as the network request is committed to the
343 // frame, frame->provisionalDataSource() is converted to null, and
344 // the committed data source is moved to frame->dataSource(). This
345 // is the most reliable way to detect whether the frame is in
346 // navigation or not.
347 return frame->provisionalDataSource() != NULL;
348 }
349
350 // We don't use Webkit's existing CORS policy implementation since
351 // their policy works in terms of origins, not sites. For example,
352 // when frame is sub.a.com and it is not allowed to access a document
353 // with sub1.a.com. But under Site Isolation, it's allowed.
354 bool SiteIsolationPolicy::IsValidCorsHeaderSet(
355 GURL& frame_origin,
356 GURL& website_origin,
357 std::string access_control_origin) {
358 // Many websites are sending back "\"*\"" instead of "*". This is
359 // non-standard practice, and not supported by Chrome. Refer to
360 // CrossOriginAccessControl::passesAccessControlCheck().
361
362 // TODO(dsjang): * is not allowed for the response from a request
363 // with cookies. This allows for more than what the renderer will
364 // eventually be able to receive, so we won't see illegal cross-site
365 // documents allowed by this. We have to find a way to see if this
366 // response is from a cookie-tagged request or not in the future.
367 if (access_control_origin == "*")
368 return true;
369
370 // TODO(dsjang): The CORS spec only treats a fully specified URL, except for
371 // "*", but many websites are using just a domain for access_control_origin,
372 // and this is blocked by Webkit's CORS logic here :
373 // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
374 // is_valid() to false when it is created from a URL containing * in the
375 // domain part.
376
377 GURL cors_origin(access_control_origin);
378 return IsSameSite(frame_origin, cors_origin);
379 }
380
381 // This function is a slight modification of |net::SniffForHTML|.
382 bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {
383 // The content sniffer used by Chrome and Firefox are using "<!--"
384 // as one of the HTML signatures, but it also appears in valid
385 // JavaScript, considered as well-formed JS by the browser. Since
386 // we do not want to block any JS, we exclude it from our HTML
387 // signatures. This can weaken our document block policy, but we can
388 // break less websites.
389 // TODO(dsjang): parameterize |net::SniffForHTML| with an option
390 // that decides whether to include <!-- or not, so that we can
391 // remove this function.
392 const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec
393 "<script", // HTML5 spec, Mozilla
394 "<html", // HTML5 spec, Mozilla
395 "<head", // HTML5 spec, Mozilla
396 "<iframe", // Mozilla
397 "<h1", // Mozilla
398 "<div", // Mozilla
399 "<font", // Mozilla
400 "<table", // Mozilla
401 "<a", // Mozilla
402 "<style", // Mozilla
403 "<title", // Mozilla
404 "<b", // Mozilla
405 "<body", // Mozilla
406 "<br", "<p", // Mozilla
407 "<?xml" // Mozilla
408 };
409
410 if (MatchesSignature(
411 data, length, html_signatures, arraysize(html_signatures)))
412 return true;
413
414 // "<!--" is specially treated since web JS can use "<!--" "-->" pair for
415 // comments.
416 const char* comment_begins[] = {"<!--" };
417
418 if (MatchesSignature(
419 data, length, comment_begins, arraysize(comment_begins))) {
420 // Search for --> and do SniffForHTML after that. If we can find the
421 // comment's end, we start HTML sniffing from there again.
422 const char end_comment[] = "-->";
423 const size_t end_comment_size = strlen(end_comment);
424
425 for (size_t i = 0; i <= length - end_comment_size; ++i) {
426 if (!strncmp(data + i, end_comment, end_comment_size)) {
427 size_t skipped = i + end_comment_size;
428 return SniffForHTML(data + skipped, length - skipped);
429 }
430 }
431 }
432
433 return false;
434 }
435
436 bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {
437 // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
438 // this signature. However, XML is case-sensitive. Don't we have to
439 // be more lenient only to block documents starting with the exact
440 // string <?xml rather than <?XML ?
441 const char* xml_signatures[] = {"<?xml" // Mozilla
442 };
443 return MatchesSignature(
444 data, length, xml_signatures, arraysize(xml_signatures));
445 }
446
447 bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {
448 // TODO(dsjang): We have to come up with a better way to sniff
449 // JSON. However, even RE cannot help us that much due to the fact
450 // that we don't do full parsing. This DFA starts with state 0, and
451 // finds {, "/' and : in that order. We're avoiding adding a
452 // dependency on a regular expression library.
453 const int kInitState = 0;
454 const int kLeftBraceState = 1;
455 const int kLeftQuoteState = 2;
456 const int kColonState = 3;
457 const int kDeadState = 4;
458
459 int state = kInitState;
460 for (size_t i = 0; i < length && state < kColonState; ++i) {
461 const char c = data[i];
462 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
463 continue;
464
465 switch (state) {
466 case kInitState:
467 if (c == '{')
468 state = kLeftBraceState;
469 else
470 state = kDeadState;
471 break;
472 case kLeftBraceState:
473 if (c == '\"' || c == '\'')
474 state = kLeftQuoteState;
475 else
476 state = kDeadState;
477 break;
478 case kLeftQuoteState:
479 if (c == ':')
480 state = kColonState;
481 break;
482 default:
483 NOTREACHED();
484 break;
485 }
486 }
487 return state == kColonState;
488 }
489
490 bool SiteIsolationPolicy::MatchesSignature(const char* raw_data,
491 size_t raw_length,
492 const char* signatures[],
493 size_t arr_size) {
494 size_t start = 0;
495 // Skip white characters at the beginning of the document.
496 for (start = 0; start < raw_length; ++start) {
497 char c = raw_data[start];
498 if (!(c == ' ' || c == '\t' || c == '\r' || c == '\n'))
499 break;
500 }
501
502 // There is no not-whitespace character in this document.
503 if (!(start < raw_length))
504 return false;
505
506 const char* data = raw_data + start;
507 size_t length = raw_length - start;
508
509 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
510 const char* signature = signatures[sig_index];
511 size_t signature_length = strlen(signature);
512
513 if (length < signature_length)
514 continue;
515
516 if (!base::strncasecmp(signature, data, signature_length))
517 return true;
518 }
519 return false;
520 }
521
522 bool SiteIsolationPolicy::IsRenderableStatusCodeForDocument(int status_code) {
523 // Chrome only uses the content of a response with one of these status codes
524 // for CSS/JavaScript. For images, Chrome just ignores status code.
525 const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,
526 303, 305, 306, 307};
527 for (size_t i = 0; i < arraysize(renderable_status_code); ++i) {
528 if (renderable_status_code[i] == status_code)
529 return true;
530 }
531 return false;
532 }
533
534 bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {
535 // TODO(dsjang): This is a real hack. The only purpose of this function is to
536 // try to see if there's any possibility that this data can be JavaScript
537 // (superset of JS). This function will be removed once UMA stats are
538 // gathered.
539
540 // Search for "var " for JS detection.
541 for (size_t i = 0; i < length - 3; ++i) {
542 if (strncmp(data + i, "var ", 4) == 0)
543 return true;
544 }
545 return false;
546 }
547
548 SiteIsolationPolicy::RequestIdToMetaDataMap*
549 SiteIsolationPolicy::GetRequestIdToMetaDataMap() {
550 CR_DEFINE_STATIC_LOCAL(RequestIdToMetaDataMap, metadata_map_, ());
551 return &metadata_map_;
552 }
553
554 SiteIsolationPolicy::RequestIdToResultMap*
555 SiteIsolationPolicy::GetRequestIdToResultMap() {
556 CR_DEFINE_STATIC_LOCAL(RequestIdToResultMap, result_map_, ());
557 return &result_map_;
558 }
559
560 } // namespace content
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698