Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(286)

Side by Side Diff: webkit/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr
Patch Set: switched to using UMA_HISTOGRAM_ENUMERATION from COUNTS Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "webkit/child/site_isolation_policy.h"
6
7 #include "base/basictypes.h"
8 #include "base/logging.h"
9 #include "base/metrics/histogram.h"
10 #include "base/strings/string_util.h"
11 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
12 #include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
13 #include "third_party/WebKit/public/platform/WebString.h"
14 #include "third_party/WebKit/public/platform/WebURL.h"
15 #include "third_party/WebKit/public/platform/WebURLRequest.h"
16 #include "third_party/WebKit/public/platform/WebURLResponse.h"
17 #include "third_party/WebKit/public/web/WebDocument.h"
18 #include "third_party/WebKit/public/web/WebFrame.h"
19 #include "third_party/WebKit/public/web/WebFrameClient.h"
20 #include "third_party/WebKit/public/web/WebSecurityOrigin.h"
21
22 using base::strncasecmp;
23 using WebKit::WebURLResponse;
24 using WebKit::WebURLRequest;
25 using WebKit::WebURL;
26 using WebKit::WebString;
27 using WebKit::WebDocument;
Charlie Reis 2013/08/07 21:02:02 These should be alphabetized.
dsjang 2013/08/08 21:21:01 Done.
28
29 namespace webkit_glue {
30
31 std::map<unsigned, WebURLRequest::TargetType>
32 SiteIsolationPolicy::id_target_map_;
33 std::map<std::string, ResponseMetaData>
34 SiteIsolationPolicy::url_responsedata_map_;
35 std::map<unsigned, std::string> SiteIsolationPolicy::id_url_map_;
36
37 void SiteIsolationPolicy::WillSendRequest(
38 unsigned identifier,
39 WebURLRequest::TargetType target_type) {
40 // This happens when the original request is redirected.
41 if (id_target_map_.count(identifier) != 0) {
42 // This check actually can fail. If it is, which target_type do we
43 // have to record between the old one and the new one? When
44 // redirection happens, target_type becomes 2. TODO(dsjang):
45 // let's disable this code and see what happens on onclickads.com
46 // for googleads JavaScript code assigned to an image. To disable
47 // this, we need a guarntee that target_type is always erased at
48 // the end of a transaction.
49 if (id_target_map_[identifier] != target_type) {
50 id_target_map_[identifier] = target_type;
Charlie Reis 2013/08/07 21:02:02 I can't understand this comment or code. It looks
dsjang 2013/08/08 21:21:01 Done.
51 }
52 }
53 id_target_map_[identifier] = target_type;
54 }
55
56 void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame,
57 unsigned identifier,
58 const WebURLResponse& response) {
59 DCHECK(id_target_map_.count(identifier) == 1);
60
61 UMA_HISTOGRAM_COUNTS("XSDP.ALL", 1);
62
63 GURL response_url = response.url();
64 WebURLRequest::TargetType target_type = id_target_map_[identifier];
65 id_target_map_.erase(identifier);
66
67 // See if this is for navigation. If it is, let it pass.
68 if (IsFrameNotCommitted(frame)) {
69 LOG(INFO) << "SiteIsolationPolicy.FrameNotCommitted";
70 return;
71 }
72
73 GURL frame_origin(frame->document().securityOrigin().toString().utf8());
74
75 // TODO(dsjang): Find out all network related schemes here.
76 if (!IsNetworkScheme(frame_origin)) {
77 LOG(INFO) << "SiteIsolationPolicy.NotNetworkScheme:" << frame_origin;
78 return;
79 }
80
81 if (IsSameSite(frame_origin, response_url)) {
82 LOG(INFO) << "SiteIsolationPolicy.SameSite:" << frame_origin << ","
83 << response_url;
84 return;
85 }
86
87 ResponseMetaData::CanonicalMimeType canonical_mime_type =
88 GetCanonicalMimeType(response);
89
90 if (canonical_mime_type == ResponseMetaData::IsOthers) {
91 LOG(INFO) << "SiteIsolationPolicy.mimetype:" << frame_origin << ","
92 << response_url << ",[" << response.mimeType().utf8() << "]";
93 return;
94 }
95
96 // There was a possiblity that a CORS request preceded by a
97 // pre-flight request does not have "Access-Control-Allow-Origin"
98 // header. But it turns out that every CORS request should have the
99 // header no matter what CORS request it is. Therefore, if this is a
100 // CORS request, it has this header.
101 std::string access_control_origin = response
102 .httpHeaderField(
103 WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8();
104
105 if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) {
106 LOG(INFO) << "SiteIsolationPolicy.CorsIsSafe:";
107 return;
108 }
109
110 // Real XSD data collection starts from here.
111 LOG(INFO) << "SiteIsolationPolicy.XSD!!!:" << canonical_mime_type <<
112 ":" << response_url;
113
114 // TODO(dsjang): Apply X-Content-Type option here.
115 ResponseMetaData resp_data;
116 resp_data.frame_origin = frame_origin.spec();
117 resp_data.response_url = response_url.spec();
118 resp_data.identifier = identifier;
119 resp_data.target_type = target_type;
120 resp_data.canonical_mime_type = canonical_mime_type;
121 resp_data.http_status_code = response.httpStatusCode();
122
123 url_responsedata_map_[resp_data.response_url] = resp_data;
124 id_url_map_[identifier] = resp_data.response_url;
125
126 return;
127 }
128
129 void SiteIsolationPolicy::DidReceiveData(const char* data,
130 int length,
131 WebURL& web_response_url) {
Charlie Reis 2013/08/09 00:39:03 Add a comment to the .h file that there's a risk t
132 // We only record XSDs whose content is actually non-zero.
133 GURL response_url(web_response_url);
134
135 std::string response_url_str = response_url.spec();
136 if (url_responsedata_map_.count(response_url_str) == 0)
137 return;
138
139 DCHECK(url_responsedata_map_.count(response_url_str) == 1);
140 ResponseMetaData resp_data = url_responsedata_map_[response_url_str];
141 url_responsedata_map_.erase(response_url_str);
142
143 // Record the length of the first received network packet to see if
144 // it's enough for sniffing.
145 UMA_HISTOGRAM_COUNTS("XSDP.XSD.DataLength", length);
Charlie Reis 2013/08/07 21:02:02 Why would we need to collect this?
dsjang 2013/08/08 21:21:01 I wanted to see if that's not the case that most o
146
147 // Record the entire number of responses with a specific mime
148 // type(text/html, text/xml, etc).
149 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.MimeType",
150 resp_data.canonical_mime_type,
151 ResponseMetaData::IsOthers + 1);
152
153 // TODO(dsjang): sometimes the length of payload can be not enough to do
154 // correct content sniffing. If that happens, put it into a buffer
155 // so that we can do it later.
156 bool verified_for_blocking = false;
157 ResponseMetaData::CanonicalMimeType sniffed_type =
158 ResponseMetaData::IsOthers;
159
160 switch (resp_data.canonical_mime_type) {
161 // Record the number of responses whose content is sniffed for
162 // what its mime type claims it to be. For example, we apply a
163 // HTML sniffer for a document tagged with text/html here, and
164 // increments the count of "XSDP.XSD.HTML.Verified".
165 case ResponseMetaData::IsHTML:
166 if (SniffForHTML(data, length)) {
167 UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.HTML.Verified", 1);
168 verified_for_blocking = true;
169 }
170 break;
171 case ResponseMetaData::IsXML:
172 if (SniffForXML(data, length)) {
173 UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.XML.Verified", 1);
174 verified_for_blocking = true;
175 }
176 break;
177 case ResponseMetaData::IsJSON:
178 if (SniffForJSON(data, length)) {
179 UMA_HISTOGRAM_COUNTS("XSDP.XSD.MimeType.JSON.Verified", 1);
180 verified_for_blocking = true;
181 }
182 break;
183 case ResponseMetaData::IsPlain:
184 if (SniffForHTML(data, length)) {
185 sniffed_type = ResponseMetaData::IsHTML;
186 verified_for_blocking = true;
187 } else if (SniffForXML(data, length)) {
188 sniffed_type = ResponseMetaData::IsXML;
189 verified_for_blocking = true;
190 } else if (SniffForJSON(data, length)) {
191 sniffed_type = ResponseMetaData::IsJSON;
192 verified_for_blocking = true;
193 }
194 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.MimeType.Plain.Verified",
195 sniffed_type,
196 ResponseMetaData::IsJSON + 1);
197 break;
198 case ResponseMetaData::IsOthers:
199 DCHECK(false);
200 break;
201 }
202
203 // We block these. See how many of them have unaffected status code.
204 if (verified_for_blocking) {
205 if (IsErrorStatusCode(resp_data.http_status_code)) {
206 // This is a blocking that does not affect the browser behavior
207 // by the following reasons : 1) this is not a binary object
208 // (such as an image) since this is sniffed as a text
209 // document. 2) then, this blocking only breaks the renderer
210 // behavior only if it is either JavaScript or CSS. However, the
211 // renderer doesn't use the contents of JS/CSS with unaffected
212 // status code(e.g, 404). *) the renderer is expected not to use
213 // the cross-site document content for purposes other than
214 // JS/CSS (e.g, XHR).
215 UMA_HISTOGRAM_COUNTS("XSDP.XSD.Blocked.ErrorStatusCode", 1);
216 } else {
217 // This is the case that a blocked response is with a non-error
218 // status code, so this blocking can be actually disruptive.
219 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.Blocked.NormalStatusCode",
220 resp_data.target_type, WebURLRequest::TargetIsUnspecified + 1);
221 }
222 } else {
223 LOG(INFO) << "Not Blocked:sniffing failed:";
224 // Not blocked, but How many of them can be JS? This is only
225 // useful for studying non-blocked documents.
226 if (SniffForJS(data, length)) {
227 UMA_HISTOGRAM_ENUMERATION("XSDP.XSD.NotBlocked.MaybeJS",
228 resp_data.target_type,
229 WebURLRequest::TargetIsUnspecified + 1);
230 }
231 }
232 }
233
234 void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) {
235 id_target_map_.erase(identifier);
236 if (id_url_map_.count(identifier) > 0) {
237 url_responsedata_map_.erase(id_url_map_[identifier]);
238 id_url_map_.erase(identifier);
239 }
240 }
241
242 void SiteIsolationPolicy::DidFinishResourceLoadForUrl(
243 const WebKit::WebURL& web_response_url) {
244 GURL response_url(web_response_url);
245
246 if (url_responsedata_map_.count(response_url.spec()) > 0) {
247 ResponseMetaData meta_data = url_responsedata_map_[response_url.spec()];
248 url_responsedata_map_.erase(response_url.spec());
249 id_target_map_.erase(meta_data.identifier);
250 id_url_map_.erase(meta_data.identifier);
251 }
252 }
253
254 ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType(
255 const WebURLResponse& response) {
256 static const char TEXT_HTML[] = "text/html";
257 static const char TEXT_XML[] = "text/xml";
258 static const char APP_RSS_XML[] = "application/rss+xml";
259 static const char APP_XML[] = "application/xml";
260 static const char APP_JSON[] = "application/json";
261 static const char TEXT_XJSON[] = "text/x-json";
262 static const char TEXT_JSON[] = "text/json";
263 static const char TEXT_PLAIN[] = "text/json";
264
265 const std::string mime_type = response.mimeType().utf8();
266
267 LOG(ERROR) << "mimetype:" << mime_type << "==[" << TEXT_HTML << "]";
268
269 // These are a thorough list of the mime types crawled over the top
270 // 50k sites related to HTML, XML, JSON, Plain.
271 if (LowerCaseEqualsASCII(mime_type, TEXT_HTML)) {
272 return ResponseMetaData::IsHTML;
273 } else if (LowerCaseEqualsASCII(mime_type, TEXT_XML) ||
274 LowerCaseEqualsASCII(mime_type, APP_RSS_XML) ||
275 LowerCaseEqualsASCII(mime_type, APP_XML)) {
276 return ResponseMetaData::IsXML;
277 } else if (LowerCaseEqualsASCII(mime_type, APP_JSON) ||
278 LowerCaseEqualsASCII(mime_type, TEXT_XJSON) ||
279 LowerCaseEqualsASCII(mime_type, TEXT_JSON)) {
280 return ResponseMetaData::IsJSON;
281 } else if (LowerCaseEqualsASCII(mime_type, TEXT_PLAIN)) {
282 return ResponseMetaData::IsPlain;
283 } else {
284 return ResponseMetaData::IsOthers;
285 }
286 }
287
288 bool SiteIsolationPolicy::IsNetworkScheme(GURL& url) {
289 // We exclude ftp:// from here. FTP doesn't provide a Content-Type
290 // header which our policy depends on, so we cannot protect any
291 // document from FTP servers.
292 return url.SchemeIs("http") || url.SchemeIs("https");
293 }
294
295 bool SiteIsolationPolicy::IsSameSite(GURL& frame_origin, GURL& response_url) {
296 if (frame_origin.scheme() != response_url.scheme())
297 return false;
298
299 // Extract the effective domains (public suffix plus one) of the
300 // urls.
301
302 // TODO(dsjang): Is there any reason why we don't use
303 // net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES
304 // instead of
305 // net::registry_controlled_domains::EXCLUSE_PRIVATE_REGISTRIES? If
306 // we allow sites to use their private registries, they can use
307 // "finer grained" sites than only using public ones.
308 std::string frame_domain =
309 net::registry_controlled_domains::GetDomainAndRegistry(
310 frame_origin,
311 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
312 std::string response_domain =
313 net::registry_controlled_domains::GetDomainAndRegistry(
314 response_url,
315 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
316
317 return frame_domain == response_domain;
318 }
319
320 bool SiteIsolationPolicy::IsFrameNotCommitted(WebKit::WebFrame* frame) {
321 // When a navigation starts, frame->provisionalDataSource() is set
322 // to a not-null value which stands for the request made for the
323 // navigation. As soon as the network request is committed to the
324 // frame, frame->provisionalDataSource() is converted to null, and
325 // the committed data source is moved to frame->dataSource(). This
326 // is the most reliable way to detect whether the frame is in
327 // navigation or not by far.
328 return frame->provisionalDataSource() != NULL;
329 }
330
331 bool SiteIsolationPolicy::IsValidCorsHeaderSet(
332 GURL& frame_origin,
333 GURL& website_origin,
334 std::string access_control_origin) {
335
336 size_t access_control_origin_len = access_control_origin.size();
337
338 // TODO(dsjang): Is this actually true? The server seems to return
339 // an empty string or "null".
340 if (access_control_origin_len == 0)
341 return false;
342
343 // Many websites are sending back "\"*\"" instead of "*". This is
344 // non-standard practice, and seems not supported by the
345 // brwoser. Refer to
346 // CrossOriginAccessControl::passesAccessControlCheck().
347
348 // TODO(dsjang): * is not allowed for the response from a request
349 // with cookies. This allows for more than what the renderer will
350 // eventually be able to receive, so we won't see illegal cross-site
351 // documents alllowed by this. We have to have t a way to see if
352 // this response is from a cookie-tagged request or not in the
353 // future.
354 if (access_control_origin == "*")
355 return true;
356
357 // TODO(dsjang): The CORS spec only treats a fully specified URL,
358 // except for "*", but many websites are using just a domain for
359 // access_control_origin, and this is blocked by Webkit's CORS logic
360 // here : CrossOriginAccessControl::passesAccessControlCheck()
361
362 // We don't use Webkit's existing CORS policy implementation since
363 // their policy works in terms of origins, not sites. For
364 // example, when frame is sub.a.com and it is not allowed to access
365 // a document with sub1.a.com. But under Site Isolation, it's
366 // allowed.
367
368 // TODO(dsjang): examine createFromString()'s behavior for a URL
369 // containing * in it.
370 WebKit::WebSecurityOrigin cors_security_origin =
371 WebKit::WebSecurityOrigin::createFromString(
372 WebKit::WebString::fromUTF8(access_control_origin));
373 GURL cors_origin(cors_security_origin.toString().utf8());
374
375 LOG(ERROR) << cors_security_origin.toString().utf8();
376 return IsSameSite(frame_origin, cors_origin);
377 }
378
379 bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {
380 // TODO(dsjang): The content sniffer used by Chrome and Firefox are
381 // using "<!--" as one of the HTML signatures, but it also appears
382 // in valid JavaScript, considered as well-formed JS by the browser.
383 // Since we do not want to block any JS, we exclude it from our HTML
384 // signatures. This can weaken our document block policy, but we can
385 // break less websites.
386 const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec
387 "<script", // HTML5 spec, Mozilla
388 "<html", // HTML5 spec, Mozilla
389 "<head", // HTML5 spec, Mozilla
390 "<iframe", // Mozilla
391 "<h1", // Mozilla
392 "<div", // Mozilla
393 "<font", // Mozilla
394 "<table", // Mozilla
395 "<a", // Mozilla
396 "<style", // Mozilla
397 "<title", // Mozilla
398 "<b", // Mozilla
399 "<body", // Mozilla
400 "<br", "<p" // Mozilla
401 };
402 return DoSignatureMatching(
403 data, length, html_signatures, arraysize(html_signatures));
404 }
405
406 bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {
407 const char* xml_signatures[] = {"<?xml" // Mozilla
408 };
409 return DoSignatureMatching(
410 data, length, xml_signatures, arraysize(xml_signatures));
411 }
412
413 bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {
414 // TODO(dsjang): We have to come up with a better way to sniff
415 // JSON. However, even RE cannot help us that much due to the fact
416 // that we don't do full parsing. This DFA starts with state 0, and
417 // finds 1) {, 2) "or', 3) : in the order. This is intentionally not
418 // using a regular expression library so that we can make the
419 // trusted code base as small as possible. State 4 is a dead state.
420 const int INIT_ST = 0;
421 const int LBRACE_ST = 1;
422 const int LQUOTE_ST = 2;
423 const int COLON_ST = 3;
424 const int DEAD_ST = 4;
425
426 int state = INIT_ST;
427 for (size_t i = 0; i < length && state < COLON_ST; ++i, ++data) {
428 const char c = *data;
429 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
430 continue;
431
432 switch (state) {
433 case INIT_ST:
434 if (c == '{')
435 state = LBRACE_ST;
436 else
437 state = DEAD_ST;
438 break;
439 case LBRACE_ST:
440 if (c == '\"' || c == '\'')
441 state = LQUOTE_ST;
442 else
443 state = DEAD_ST;
444 break;
445 case LQUOTE_ST:
446 if (c == ':') {
447 state = COLON_ST;
448 }
449 break;
450 default:
451 break;
452 }
453 }
454 return state == COLON_ST;
455 }
456
457 bool SiteIsolationPolicy::DoSignatureMatching(const char* data,
458 size_t length,
459 const char* signatures[],
460 size_t arr_size) {
461 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
462 const char* signature = signatures[sig_index];
463 size_t signature_length = strlen(signature);
464 size_t i = 0;
465 // Skip the white characters at the beginning of the document.
466 for (i = 0; i < length; ++i) {
467 char c = *data;
468 if (!(c == ' ' || c == '\r' || c == '\n' || c == '\t')) {
469 break;
470 }
471 ++data;
472 }
473 length = length - i;
474 if (length < signature_length)
475 continue;
476 if (!base::strncasecmp(signature, data, signature_length)) {
477 return true;
478 }
479 }
480 return false;
481 }
482
483 bool SiteIsolationPolicy::IsErrorStatusCode(int status_code) {
484 // Chrome only uses the content of a response with one of these
485 // status codes for CSS/JavaScript. For images, Chrome just ignores
486 // status code.
487 const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,
488 303, 305, 306, 307};
489 for (size_t i = 0; i < arraysize(renderable_status_code); ++i) {
490 if (renderable_status_code[i] == status_code)
491 return false;
492 }
493 return true;
494 }
495
496 bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {
497 // TODO(dsjang): This is a real hacking. The only purpose of this
498 // function is to try to see if there's any possibility that this
499 // data can be JavaScript.(superset of JS). This function will be
500 // removed for the production code.
501
502 // Search for "var " for JS detection. :-)
503 for (size_t i = 0; i < length - 3; ++i) {
504 if (strncmp(data, "var ", 4) == 0) {
505 return true;
506 }
507 ++data;
508 }
509 return false;
510 }
511
512 } // namespace webkit_glue
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698