Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(590)

Side by Side Diff: webkit/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr
Patch Set: UMA Bucket names are reorganized. Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "webkit/child/site_isolation_policy.h"
6
7 #include "base/basictypes.h"
8 #include "base/logging.h"
9 #include "base/metrics/histogram.h"
10 #include "base/strings/string_util.h"
11 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
12 #include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
13 #include "third_party/WebKit/public/platform/WebString.h"
14 #include "third_party/WebKit/public/platform/WebURL.h"
15 #include "third_party/WebKit/public/platform/WebURLRequest.h"
16 #include "third_party/WebKit/public/platform/WebURLResponse.h"
17 #include "third_party/WebKit/public/web/WebDocument.h"
18 #include "third_party/WebKit/public/web/WebFrame.h"
19 #include "third_party/WebKit/public/web/WebFrameClient.h"
20 #include "third_party/WebKit/public/web/WebSecurityOrigin.h"
21
22 using base::strncasecmp;
23 using WebKit::WebDocument;
24 using WebKit::WebString;
25 using WebKit::WebURL;
26 using WebKit::WebURLResponse;
27 using WebKit::WebURLRequest;
28
29
30 namespace webkit_glue {
31
32 std::map<unsigned, WebURLRequest::TargetType>
33 SiteIsolationPolicy::id_target_map_;
34 std::map<std::string, ResponseMetaData>
35 SiteIsolationPolicy::url_responsedata_map_;
36 std::map<unsigned, std::string> SiteIsolationPolicy::id_url_map_;
37
// Remembers |target_type| for the request |identifier| in id_target_map_
// so that DidReceiveResponse() can look it up when the response arrives.
// An existing entry for |identifier| is never overwritten.
38 void SiteIsolationPolicy::WillSendRequest(
39 unsigned identifier,
40 WebURLRequest::TargetType target_type) {
41 // When identifier already exists in the map, it means that this
42 // request has been redirected to issue another request. We don't
43 // overwrite the existing target_type since it becomes
44 // TargetIsSubresource no matter what the original target_type was.
Charlie Reis 2013/08/09 00:39:03 Much clearer. Thanks.
dsjang 2013/08/09 01:31:23 Done.
45 if (!id_target_map_.count(identifier))
46 id_target_map_[identifier] = target_type;
47 }
48
// Inspects the response headers for request |identifier| and decides
// whether the response is a cross-site document (XSD) candidate. If it
// is, its metadata is stored in url_responsedata_map_ (keyed by the
// response URL spec) and id_url_map_ so that DidReceiveData() can sniff
// the first chunk of content. Nothing is stored when the frame is in
// navigation, the frame origin is not http(s), the response is same-site,
// the MIME type is classified as IsOthers, or the
// Access-Control-Allow-Origin header passes IsValidCorsHeaderSet().
49 void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame,
50 unsigned identifier,
51 const WebURLResponse& response) {
52 DCHECK_EQ(id_target_map_.count(identifier),1U);
53
54 UMA_HISTOGRAM_COUNTS("XSDP.ALL", 1);
Charlie Reis 2013/08/09 00:39:03 For the naming scheme, let's stick to SiteIsolatio
dsjang 2013/08/09 01:31:23 Done.
55
56 GURL response_url = response.url();
// The target type recorded by WillSendRequest() is consumed here: it is
// read and then removed from id_target_map_.
57 WebURLRequest::TargetType target_type = id_target_map_[identifier];
58 id_target_map_.erase(identifier);
59
60 // See if this is for navigation. If it is, let it pass.
Charlie Reis 2013/08/09 00:39:03 "let it pass" -> "don't block it, under the assump
dsjang 2013/08/09 01:31:23 Done.
61 if (IsFrameInNavigation(frame)) {
62 LOG(INFO) << "SiteIsolationPolicy.FrameInNavigation";
63 return;
64 }
65
66 GURL frame_origin(frame->document().securityOrigin().toString().utf8());
Charlie Reis 2013/08/09 00:39:03 I don't think you need the utf8() call here, do yo
dsjang 2013/08/09 01:31:23 Done.
67
68 // TODO(dsjang): Find out all network related schemes here.
Charlie Reis 2013/08/09 00:39:03 Is there more to be done here?
dsjang 2013/08/09 01:31:23 Done.
69 if (!IsNetworkScheme(frame_origin)) {
70 LOG(INFO) << "SiteIsolationPolicy.NotNetworkScheme:" << frame_origin;
71 return;
72 }
73
74 if (IsSameSite(frame_origin, response_url)) {
75 LOG(INFO) << "SiteIsolationPolicy.SameSite:" << frame_origin << ","
76 << response_url;
77 return;
78 }
79
80 ResponseMetaData::CanonicalMimeType canonical_mime_type =
81 GetCanonicalMimeType(response);
82
// Only HTML, XML, JSON and plain-text MIME types are tracked; everything
// else is ignored.
83 if (canonical_mime_type == ResponseMetaData::IsOthers) {
84 LOG(INFO) << "SiteIsolationPolicy.mimetype:" << frame_origin << ","
85 << response_url << ",[" << response.mimeType().utf8() << "]";
86 return;
87 }
88
89 // There was a possibility that a CORS request preceded by a
Charlie Reis 2013/08/09 00:39:03 Do we need this comment? Or perhaps we can just s
dsjang 2013/08/09 01:31:23 Done.
90 // pre-flight request does not have "Access-Control-Allow-Origin"
91 // header. But it turns out that every CORS request should have the
92 // header no matter what CORS request it is. Therefore, if this is a
93 // CORS request, it has this header.
94 std::string access_control_origin = response
95 .httpHeaderField(
Charlie Reis 2013/08/09 00:39:03 Style nit: Better to response and .httpHeaderField
dsjang 2013/08/09 01:31:23 Done.
96 WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8();
97
98 if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) {
99 LOG(INFO) << "SiteIsolationPolicy.CorsIsSafe:";
100 return;
101 }
102
103 // Real XSD data collection starts from here.
104 LOG(INFO) << "SiteIsolationPolicy.XSD:from header:" << canonical_mime_type <<
105 ":" << response_url;
106
107 // TODO(dsjang): Apply X-Content-Type option here.
Charlie Reis 2013/08/09 00:39:03 What does this mean?
dsjang 2013/08/09 01:31:23 I'm planning to detect X-Content-Type: nosniff hea
108 ResponseMetaData resp_data;
109 resp_data.frame_origin = frame_origin.spec();
110 resp_data.response_url = response_url.spec();
111 resp_data.identifier = identifier;
112 resp_data.target_type = target_type;
113 resp_data.canonical_mime_type = canonical_mime_type;
114 resp_data.http_status_code = response.httpStatusCode();
115
// NOTE(review): the metadata is keyed by the response URL spec, so a second
// in-flight request for the same URL overwrites the entry stored for the
// first one.
116 url_responsedata_map_[resp_data.response_url] = resp_data;
117 id_url_map_[identifier] = resp_data.response_url;
118
119 return;
120 }
121
// Review thread attached to this block (2013/08/09):
//   Charlie Reis: Introducing new macros is generally frowned upon:
//                 [comment truncated in the review page]
//   dsjang: Done.
//
// Histogram helpers for SiteIsolationPolicy::DidReceiveData(). They expand
// in the caller's scope and read its locals |resp_data|, |ok_status_code|
// and |is_sniffed_for_js|.
//
// BUCKET_PREFIX must be a string literal: the bucket name is formed by
// adjacent string literal concatenation. A space is kept between
// BUCKET_PREFIX and the following literal on purpose. Written as
// ""BUCKET_PREFIX".x", a C++11 lexer reads `""BUCKET_PREFIX` as a
// user-defined string literal and never substitutes the macro parameter.
//
// Each macro is wrapped in do { ... } while (0) so that it behaves as a
// single statement and must be followed by a semicolon.

// Records a response that would be blocked. OK-status responses are broken
// down by target type; error-status responses are only counted.
#define COUNT_BLOCK(BUCKET_PREFIX) \
  do { \
    UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".Blocked", 1); \
    if (ok_status_code) { \
      UMA_HISTOGRAM_ENUMERATION( \
          BUCKET_PREFIX ".Blocked.OKStatusCode", \
          resp_data.target_type, \
          WebURLRequest::TargetIsUnspecified + 1); \
    } else { \
      UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".Blocked.ErrorStatusCode", 1); \
    } \
  } while (0)

// Records a response that would not be blocked, and additionally whether
// its content looked like JavaScript.
#define COUNT_NOTBLOCK(BUCKET_PREFIX) \
  do { \
    UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked", 1); \
    if (is_sniffed_for_js) { \
      UMA_HISTOGRAM_COUNTS(BUCKET_PREFIX ".NotBlocked.MaybeJS", 1); \
    } \
  } while (0)

// Evaluates SNIFF_EXPR once and records the response as blocked when it is
// true, as not blocked otherwise.
#define SNIFF_AND_COUNT(SNIFF_EXPR, BUCKET_PREFIX) \
  do { \
    if (SNIFF_EXPR) { \
      COUNT_BLOCK(BUCKET_PREFIX); \
    } else { \
      COUNT_NOTBLOCK(BUCKET_PREFIX); \
    } \
  } while (0)
144
145 void SiteIsolationPolicy::DidReceiveData(const char* data,
146 int length,
147 WebURL& web_response_url) {
148 // We only record XSDs whose content is actually non-zero.
149 GURL response_url(web_response_url);
150
151 std::string response_url_str = response_url.spec();
152 if (url_responsedata_map_.count(response_url_str) == 0)
153 return;
154
155 DCHECK_EQ(url_responsedata_map_.count(response_url_str), 1U);
156 ResponseMetaData resp_data = url_responsedata_map_[response_url_str];
157 url_responsedata_map_.erase(response_url_str);
158
159 // Record the length of the first received network packet to see if
160 // it's enough for sniffing.
161 UMA_HISTOGRAM_COUNTS("XSDP.XSD.DataLength", length);
162
163 // Record the entire number of responses with a specific mime
164 // type(text/html, text/xml, etc).
165 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.MimeType",
166 resp_data.canonical_mime_type,
167 ResponseMetaData::MaxCanonicalMimeType);
168
169 // Blocking only happens when the content is sniffed for
170 // HTML/JSON/XML. So if the status code is an error status code, it
171 // is not disruptive by the following reasons : 1) the blocked
172 // content is not a binary object (such as an image) since it is
173 // sniffed as a text document. 2) then, this blocking only breaks
174 // the renderer behavior only if it is either JavaScript or
175 // CSS. However, the renderer doesn't use the contents of JS/CSS
176 // with unaffected status code(e.g, 404). *) the renderer is
177 // expected not to use the cross-site document content for purposes
178 // other than JS/CSS (e.g, XHR).
179 bool ok_status_code = !IsErrorStatusCode(resp_data.http_status_code);
180
181 // This is only used for measuring false-negative analysis for
182 // non-blocked resources.
183 bool is_sniffed_for_js = SniffForJS(data, length);
184
185 // Record the number of responses whose content is sniffed for what
186 // its mime type claims it to be. For example, we apply a HTML
187 // sniffer for a document tagged with text/html here. Whenever this
188 // check becomes true, we'll block the response.
189 switch (resp_data.canonical_mime_type) {
190 case ResponseMetaData::IsHTML:
191 SNIFF_AND_COUNT(SniffForHTML(data, length), "XSDP.XSD.MimeType.HTML");
192 break;
193 case ResponseMetaData::IsXML:
194 SNIFF_AND_COUNT(SniffForXML(data, length), "XSDP.XSD.MimeType.XML");
195 break;
196 case ResponseMetaData::IsJSON:
197 SNIFF_AND_COUNT(SniffForJSON(data, length), "XSDP.XSD.MimeType.JSON");
198 break;
199 case ResponseMetaData::IsPlain:
200 if (SniffForHTML(data, length)) {
201 COUNT_BLOCK("XSDP.XSD.MimeType.Plain.HTML");
202 } else if (SniffForXML(data, length)) {
203 COUNT_BLOCK("XSDP.XSD.MimeType.Plain.XML");
204 } else if (SniffForJSON(data, length)) {
205 COUNT_BLOCK("XSDP.XSD.MimeType.Plain.JSON");
206 } else if (is_sniffed_for_js) {
207 COUNT_NOTBLOCK("XSDP.XSD.MimeType.Plain");
208 }
209 break;
210 default :
211 DCHECK(false);
212 break;
213 }
214 }
215
216 void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) {
217 id_target_map_.erase(identifier);
218 if (!id_url_map_.count(identifier)) {
219 url_responsedata_map_.erase(id_url_map_[identifier]);
220 id_url_map_.erase(identifier);
221 }
222 }
223
224 void SiteIsolationPolicy::DidFinishResourceLoadForUrl(
225 const WebKit::WebURL& web_response_url) {
226 GURL response_url(web_response_url);
227
228 if (!url_responsedata_map_.count(response_url.spec())) {
229 ResponseMetaData meta_data = url_responsedata_map_[response_url.spec()];
230 url_responsedata_map_.erase(response_url.spec());
231 id_target_map_.erase(meta_data.identifier);
232 id_url_map_.erase(meta_data.identifier);
233 }
234 }
235
236 ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType(
237 const WebURLResponse& response) {
238 static const char TEXT_HTML[] = "text/html";
239 static const char TEXT_XML[] = "text/xml";
240 static const char APP_RSS_XML[] = "application/rss+xml";
241 static const char APP_XML[] = "application/xml";
242 static const char APP_JSON[] = "application/json";
243 static const char TEXT_XJSON[] = "text/x-json";
244 static const char TEXT_JSON[] = "text/json";
245 static const char TEXT_PLAIN[] = "text/json";
246
247 const std::string mime_type = response.mimeType().utf8();
248
249 LOG(ERROR) << "mimetype:" << mime_type << "==[" << TEXT_HTML << "]";
250
251 // These are a thorough list of the mime types crawled over the top
252 // 50k sites related to HTML, XML, JSON, Plain.
253 if (LowerCaseEqualsASCII(mime_type, TEXT_HTML)) {
254 return ResponseMetaData::IsHTML;
255 } else if (LowerCaseEqualsASCII(mime_type, TEXT_XML) ||
256 LowerCaseEqualsASCII(mime_type, APP_RSS_XML) ||
257 LowerCaseEqualsASCII(mime_type, APP_XML)) {
258 return ResponseMetaData::IsXML;
259 } else if (LowerCaseEqualsASCII(mime_type, APP_JSON) ||
260 LowerCaseEqualsASCII(mime_type, TEXT_XJSON) ||
261 LowerCaseEqualsASCII(mime_type, TEXT_JSON)) {
262 return ResponseMetaData::IsJSON;
263 } else if (LowerCaseEqualsASCII(mime_type, TEXT_PLAIN)) {
264 return ResponseMetaData::IsPlain;
265 } else {
266 return ResponseMetaData::IsOthers;
267 }
268 }
269
270 bool SiteIsolationPolicy::IsNetworkScheme(GURL& url) {
271 // We exclude ftp:// from here. FTP doesn't provide a Content-Type
272 // header which our policy depends on, so we cannot protect any
273 // document from FTP servers.
274 return url.SchemeIs("http") || url.SchemeIs("https");
275 }
276
277 bool SiteIsolationPolicy::IsSameSite(GURL& frame_origin, GURL& response_url) {
278 if (frame_origin.scheme() != response_url.scheme())
279 return false;
280
281 // Extract the effective domains (public suffix plus one) of the
282 // urls.
283
284 // TODO(dsjang): Is there any reason why we don't use
285 // net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES
286 // instead of
287 // net::registry_controlled_domains::EXCLUSE_PRIVATE_REGISTRIES? If
288 // we allow sites to use their private registries, they can use
289 // "finer grained" sites than only using public ones.
290 std::string frame_domain =
291 net::registry_controlled_domains::GetDomainAndRegistry(
292 frame_origin,
293 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
294 std::string response_domain =
295 net::registry_controlled_domains::GetDomainAndRegistry(
296 response_url,
297 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
298
299 return frame_domain == response_domain;
300 }
301
302 bool SiteIsolationPolicy::IsFrameInNavigation(WebKit::WebFrame* frame) {
303 // When a navigation starts, frame->provisionalDataSource() is set
304 // to a not-null value which stands for the request made for the
305 // navigation. As soon as the network request is committed to the
306 // frame, frame->provisionalDataSource() is converted to null, and
307 // the committed data source is moved to frame->dataSource(). This
308 // is the most reliable way to detect whether the frame is in
309 // navigation or not by far.
310 return frame->provisionalDataSource() != NULL;
311 }
312
313 bool SiteIsolationPolicy::IsValidCorsHeaderSet(
314 GURL& frame_origin,
315 GURL& website_origin,
316 std::string access_control_origin) {
317
318 size_t access_control_origin_len = access_control_origin.size();
319
320 // TODO(dsjang): Is this actually true? The server seems to return
321 // an empty string or "null".
322 if (access_control_origin_len == 0)
323 return false;
324
325 // Many websites are sending back "\"*\"" instead of "*". This is
326 // non-standard practice, and seems not supported by the
327 // brwoser. Refer to
328 // CrossOriginAccessControl::passesAccessControlCheck().
329
330 // TODO(dsjang): * is not allowed for the response from a request
331 // with cookies. This allows for more than what the renderer will
332 // eventually be able to receive, so we won't see illegal cross-site
333 // documents alllowed by this. We have to have t a way to see if
334 // this response is from a cookie-tagged request or not in the
335 // future.
336 if (access_control_origin == "*")
337 return true;
338
339 // TODO(dsjang): The CORS spec only treats a fully specified URL,
340 // except for "*", but many websites are using just a domain for
341 // access_control_origin, and this is blocked by Webkit's CORS logic
342 // here : CrossOriginAccessControl::passesAccessControlCheck()
343
344 // We don't use Webkit's existing CORS policy implementation since
345 // their policy works in terms of origins, not sites. For
346 // example, when frame is sub.a.com and it is not allowed to access
347 // a document with sub1.a.com. But under Site Isolation, it's
348 // allowed.
349
350 // TODO(dsjang): examine createFromString()'s behavior for a URL
351 // containing * in it.
352 WebKit::WebSecurityOrigin cors_security_origin =
353 WebKit::WebSecurityOrigin::createFromString(
354 WebKit::WebString::fromUTF8(access_control_origin));
355 GURL cors_origin(cors_security_origin.toString().utf8());
356
357 LOG(ERROR) << cors_security_origin.toString().utf8();
358 return IsSameSite(frame_origin, cors_origin);
359 }
360
361 bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {
362 // TODO(dsjang): The content sniffer used by Chrome and Firefox are
363 // using "<!--" as one of the HTML signatures, but it also appears
364 // in valid JavaScript, considered as well-formed JS by the browser.
365 // Since we do not want to block any JS, we exclude it from our HTML
366 // signatures. This can weaken our document block policy, but we can
367 // break less websites.
368 const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec
369 "<script", // HTML5 spec, Mozilla
370 "<html", // HTML5 spec, Mozilla
371 "<head", // HTML5 spec, Mozilla
372 "<iframe", // Mozilla
373 "<h1", // Mozilla
374 "<div", // Mozilla
375 "<font", // Mozilla
376 "<table", // Mozilla
377 "<a", // Mozilla
378 "<style", // Mozilla
379 "<title", // Mozilla
380 "<b", // Mozilla
381 "<body", // Mozilla
382 "<br", "<p" // Mozilla
383 };
384 return DoSignatureMatching(
385 data, length, html_signatures, arraysize(html_signatures));
386 }
387
388 bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {
389 const char* xml_signatures[] = {"<?xml" // Mozilla
390 };
391 return DoSignatureMatching(
392 data, length, xml_signatures, arraysize(xml_signatures));
393 }
394
395 bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {
396 // TODO(dsjang): We have to come up with a better way to sniff
397 // JSON. However, even RE cannot help us that much due to the fact
398 // that we don't do full parsing. This DFA starts with state 0, and
399 // finds 1) {, 2) "or', 3) : in the order. This is intentionally not
400 // using a regular expression library so that we can make the
401 // trusted code base as small as possible. State 4 is a dead state.
402 const int INIT_ST = 0;
403 const int LBRACE_ST = 1;
404 const int LQUOTE_ST = 2;
405 const int COLON_ST = 3;
406 const int DEAD_ST = 4;
407
408 int state = INIT_ST;
409 for (size_t i = 0; i < length && state < COLON_ST; ++i, ++data) {
410 const char c = *data;
411 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
412 continue;
413
414 switch (state) {
415 case INIT_ST:
416 if (c == '{')
417 state = LBRACE_ST;
418 else
419 state = DEAD_ST;
420 break;
421 case LBRACE_ST:
422 if (c == '\"' || c == '\'')
423 state = LQUOTE_ST;
424 else
425 state = DEAD_ST;
426 break;
427 case LQUOTE_ST:
428 if (c == ':') {
429 state = COLON_ST;
430 }
431 break;
432 default:
433 break;
434 }
435 }
436 return state == COLON_ST;
437 }
438
439 bool SiteIsolationPolicy::DoSignatureMatching(const char* data,
440 size_t length,
441 const char* signatures[],
442 size_t arr_size) {
443 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
444 const char* signature = signatures[sig_index];
445 size_t signature_length = strlen(signature);
446 size_t i = 0;
447 // Skip the white characters at the beginning of the document.
448 for (i = 0; i < length; ++i) {
449 char c = *data;
450 if (!(c == ' ' || c == '\r' || c == '\n' || c == '\t')) {
451 break;
452 }
453 ++data;
454 }
455 length = length - i;
456 if (length < signature_length)
457 continue;
458 if (!base::strncasecmp(signature, data, signature_length)) {
459 return true;
460 }
461 }
462 return false;
463 }
464
465 bool SiteIsolationPolicy::IsErrorStatusCode(int status_code) {
466 // Chrome only uses the content of a response with one of these
467 // status codes for CSS/JavaScript. For images, Chrome just ignores
468 // status code.
469 const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,
470 303, 305, 306, 307};
471 for (size_t i = 0; i < arraysize(renderable_status_code); ++i) {
472 if (renderable_status_code[i] == status_code)
473 return false;
474 }
475 return true;
476 }
477
478 bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {
479 // TODO(dsjang): This is a real hacking. The only purpose of this
480 // function is to try to see if there's any possibility that this
481 // data can be JavaScript.(superset of JS). This function will be
482 // removed for the production code.
483
484 // Search for "var " for JS detection. :-)
485 for (size_t i = 0; i < length - 3; ++i) {
486 if (strncmp(data, "var ", 4) == 0) {
487 return true;
488 }
489 ++data;
490 }
491 return false;
492 }
493
494 } // namespace webkit_glue
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698