Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(390)

Side by Side Diff: webkit/child/site_isolation_policy.cc

Issue 22254005: UMA data collector for cross-site documents(XSD) (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@lkgr
Patch Set: Created 7 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "webkit/child/site_isolation_policy.h"
6
7 #include "base/basictypes.h"
8 #include "base/logging.h"
9 #include "base/metrics/histogram.h"
10 #include "base/strings/string_util.h"
11 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
12 #include "third_party/WebKit/public/platform/WebHTTPHeaderVisitor.h"
13 #include "third_party/WebKit/public/platform/WebString.h"
14 #include "third_party/WebKit/public/platform/WebURL.h"
15 #include "third_party/WebKit/public/platform/WebURLRequest.h"
16 #include "third_party/WebKit/public/platform/WebURLResponse.h"
17 #include "third_party/WebKit/public/web/WebDocument.h"
18 #include "third_party/WebKit/public/web/WebFrame.h"
19 #include "third_party/WebKit/public/web/WebFrameClient.h"
20 #include "third_party/WebKit/public/web/WebSecurityOrigin.h"
21
22 using WebKit::WebURLResponse;
23 using WebKit::WebURLRequest;
24 using WebKit::WebURL;
25 using WebKit::WebString;
26 using WebKit::WebDocument;
27
28 namespace webkit_glue {
29
// Out-of-class definitions of SiteIsolationPolicy's static bookkeeping maps:
//   id_target_map_        : request identifier -> target type, written by
//                           WillSendRequest() and consumed by
//                           DidReceiveResponse().
//   url_responsedata_map_ : response URL spec -> ResponseMetaData, written by
//                           DidReceiveResponse() for cross-site documents.
//   id_url_map_           : request identifier -> response URL spec, used by
//                           DidFinishResourceLoad() to clean up.
30 std::map<unsigned, WebURLRequest::TargetType>
nasko 2013/08/06 17:29:27 Why are these needed in this file? They are alread
dsjang 2013/08/07 00:19:07 Done.
31 SiteIsolationPolicy::id_target_map_;
32 std::map<std::string, ResponseMetaData>
33 SiteIsolationPolicy::url_responsedata_map_;
34 std::map<unsigned, std::string> SiteIsolationPolicy::id_url_map_;
35
36 void SiteIsolationPolicy::WillSendRequest(
37 unsigned identifier,
38 WebURLRequest::TargetType target_type) {
39 // This happens when the original request is redirected.
40 if (id_target_map_.count(identifier) != 0) {
41 // This check actually can fail. If it is, which target_type do we
42 // have to record between the old one and the new one? When
43 // redirection happens, target_type becomes 2. TODO(dsjang):
44 // let's disable this code and see what happens on onclickads.com
45 // for googleads JavaScript code assigned to an image.
46 if (id_target_map_[identifier] != target_type) {
47 id_target_map_[identifier] = target_type;
48 }
49 }
50 id_target_map_[identifier] = target_type;
51 }
52
// Called when the response for request |identifier| is received in |frame|.
//
// Looks up and removes the target type recorded by WillSendRequest(), counts
// the response in "XSDP.ALL", and then returns without recording anything
// further when any of the following holds:
//   - the frame has a provisional data source (IsFrameNotCommitted()),
//   - the frame origin has a safe scheme (IsSafeScheme()),
//   - the frame origin and the response URL are same-site (IsSameSite()),
//   - the MIME type is not one of the document types (IsOthers),
//   - the Access-Control-Allow-Origin header allows the frame's site.
// Otherwise the response is treated as a cross-site document: a
// ResponseMetaData entry is stored in url_responsedata_map_ (keyed by the
// response URL spec) and id_url_map_, for DidReceiveData() to pick up.
53 void SiteIsolationPolicy::DidReceiveResponse(WebKit::WebFrame* frame,
54 unsigned identifier,
55 const WebURLResponse& response) {
56
nasko 2013/08/06 17:29:27 nit: no need for empty line here.
dsjang 2013/08/07 00:19:07 Done.
57 DCHECK(id_target_map_.count(identifier) == 1);
58
59 UMA_HISTOGRAM_COUNTS("XSDP.ALL", 1);
60
61 GURL response_url = response.url();
62 WebURLRequest::TargetType target_type = id_target_map_[identifier];
63 id_target_map_.erase(identifier);
64
65 // See if this is for navigation. If it is, let it pass.
66 if (IsFrameNotCommitted(frame)) {
67 LOG(INFO) << "SiteIsolationPolicy.FrameNotCommitted";
68 return;
69 }
70
71 GURL frame_origin(frame->document().securityOrigin().toString().utf8());
72
73 // TODO(dsjang): Find out all non-network scheme here.
74 // If it's the data: scheme, we can let it pass through.
75 if (IsSafeScheme(frame_origin)) {
76 LOG(INFO) << "SiteIsolationPolicy.SafeScheme:" << frame_origin;
77 return;
78 }
79
80 if (IsSameSite(frame_origin, response_url)) {
81 LOG(INFO) << "SiteIsolationPolicy.SameSite:" << frame_origin << ","
82 << response_url;
83 return;
84 }
85
86 ResponseMetaData::CanonicalMimeType canonical_mime_type =
87 GetCanonicalMimeType(response);
88
89 if (canonical_mime_type == ResponseMetaData::IsOthers) {
90 LOG(INFO) << "SiteIsolationPolicy.mimetype:" << frame_origin << ","
91 << response_url << "," << response.mimeType().utf8();
92 return;
93 }
94
95 std::string access_control_origin = response
96 .httpHeaderField(
97 WebKit::WebString::fromUTF8("Access-Control-Allow-Origin")).utf8();
98
99 if (IsValidCorsHeaderSet(frame_origin, response_url, access_control_origin)) {
100 LOG(INFO) << "SiteIsolationPolicy.CorsisSafe:";
nasko 2013/08/06 17:29:27 nit: CorsIsSafe
dsjang 2013/08/07 00:19:07 Done.
101 return;
102 }
103
104 // Real data collection starts from here.
105 //
106 // XSDP.XSD.%MIMECODE is a shortened name for
107 // XSDP.All.NNav.NSafeScheme.NSMIMEType.NSCORS.%MIMECODE from now
nasko 2013/08/06 17:29:27 What is the meaning of this string? Putting a comm
dsjang 2013/08/07 00:19:07 Done.
108 // on.
109
110 LOG(INFO) << "SiteIsolationPolicy.XSD!!!:" << response_url;
111
// Remember everything DidReceiveData() needs to classify the first data
// chunk of this response.
112 ResponseMetaData metaData;
113 metaData.frame_origin = frame_origin.spec();
114 metaData.response_url = response_url.spec();
115 metaData.identifier = identifier;
116 metaData.target_type = target_type;
117 metaData.canonical_mime_type = canonical_mime_type;
118 metaData.http_status_code = response.httpStatusCode();
119
// Keyed by URL spec: a second in-flight response for the same URL overwrites
// the earlier entry.
120 url_responsedata_map_[metaData.response_url] = metaData;
121 id_url_map_[identifier] = metaData.response_url;
122
123 return;
124 }
125
// Called with a chunk of response body (|data|, |length| bytes) for the
// response whose URL is |web_response_url|.
//
// Does nothing unless DidReceiveResponse() stored a ResponseMetaData entry
// for this URL. The entry is erased here, so only the first chunk seen for a
// recorded URL is analyzed. For that chunk this reports:
//   - "XSDP.XSD.DataLength" with the chunk length,
//   - "XSDP.XSD.<mime>" for the canonical MIME type,
//   - a ".Verified" (or ".Verified.<kind>" for text/plain) suffix when content
//     sniffing confirms the data looks like a document,
//   - for verified documents, a breakdown by status code or target type,
//   - for unverified documents, ".NVerified.MaybeJS" if it might be JS.
// |uma_bucket_name| is extended in place, so each later histogram name
// contains all suffixes appended before it.
//
// NOTE(review): the histogram name passed to UMA_HISTOGRAM_COUNTS varies
// between calls at the same call site below; confirm the macro supports
// non-constant names.
126 void SiteIsolationPolicy::DidReceiveData(const char* data,
127 int length,
128 WebURL& web_response_url) {
129 GURL response_url(web_response_url);
130
131 std::string response_url_str = response_url.spec();
132 if (url_responsedata_map_.count(response_url_str) == 0)
nasko 2013/08/06 17:29:27 Is this a valid case? When will we have seen a req
dsjang 2013/08/07 00:19:07 url_responsedata_map_ only maintains url for cross
133 return;
134
135 // Record the length of the first received network packet to see if
136 // it's enough for sniffing.
137 UMA_HISTOGRAM_COUNTS("XSDP.XSD.DataLength", length);
138
139 DCHECK(url_responsedata_map_.count(response_url_str) == 1);
140 ResponseMetaData metaData = url_responsedata_map_[response_url_str];
141 url_responsedata_map_.erase(response_url_str);
142
143 std::string uma_bucket_name("XSDP.XSD.");
144 uma_bucket_name.append(ResponseMetaData::CanonicalMimeTypeToString(
145 metaData.canonical_mime_type));
146 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
147
148 // TODO(dsjang): sometimes the length of payload can be not enough to do
149 // correct content sniffing. If that happens, put it into a buffer
150 // so that we can do it later.
151 bool verified_for_blocking = false;
152 switch (metaData.canonical_mime_type) {
153 case ResponseMetaData::IsHTML:
154 if (SniffForHTML(data, length)) {
155 uma_bucket_name.append(".Verified");
156 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
157 verified_for_blocking = true;
158 }
159 break;
160 case ResponseMetaData::IsXML:
161 if (SniffForXML(data, length)) {
162 uma_bucket_name.append(".Verified");
163 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
164 verified_for_blocking = true;
165 }
166 break;
167 case ResponseMetaData::IsJSON:
168 if (SniffForJSON(data, length)) {
169 uma_bucket_name.append(".Verified");
170 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
171 verified_for_blocking = true;
172 }
173 break;
// text/plain can carry any of the document types, so try each sniffer in
// turn and record which one matched.
174 case ResponseMetaData::IsPlain:
175 if (SniffForHTML(data, length)) {
176 uma_bucket_name.append(".Verified.HTML");
177 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
178 verified_for_blocking = true;
179 } else if (SniffForXML(data, length)) {
180 uma_bucket_name.append(".Verified.XML");
181 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
182 verified_for_blocking = true;
183 } else if (SniffForJSON(data, length)) {
184 uma_bucket_name.append(".Verified.JSON");
185 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
186 verified_for_blocking = true;
187 }
188 break;
// DidReceiveResponse() never stores an entry for IsOthers.
189 case ResponseMetaData::IsOthers:
190 DCHECK(false);
191 break;
192 }
193
194 // We block these. See how many of them have unaffected status code.
195 if (verified_for_blocking) {
196 if (UnaffectedStatusCode(metaData.http_status_code)) {
197 // This is a blocking that does not affect the browser behavior
198 // by the following reasons : 1) this is not a binary object
199 // (such as an image) since this is sniffed as a text
200 // document. 2) then, this blocking only breaks the renderer
201 // behavior only if it is either JavaScript or CSS. However, the
202 // renderer doesn't use the contents of JS/CSS with unaffected
203 // status code(e.g, 404). *) the renderer is expected not to use
204 // the cross-site document content for purposes other than
205 // JS/CSS (e.g, XHR).
206 uma_bucket_name.append(".UnaffectedStatusCode.");
207 std::stringstream stat_code_strm;
208 LOG(INFO) << "Blocked:UNAFFECTED STAT CODE:" << metaData.http_status_code;
209 stat_code_strm << metaData.http_status_code;
210 uma_bucket_name.append(stat_code_strm.str());
211 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
212 } else {
213 LOG(INFO) << "Blocked:AFFECTED STAT CODE:" << metaData.http_status_code;
214 // This blocking can be disruptive if it was actually JS, and
215 // requested for JS.
216 uma_bucket_name.append(".NUnaffectedStatusCode.");
217 uma_bucket_name.append(
218 ResponseMetaData::TargetTypeToString(metaData.target_type));
219 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
220 if (SniffForJS(data, length)) {
221 // This shows if this blocking can be JS.
222 uma_bucket_name.append(".MaybeJS");
223 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
224 }
225 }
226 } else {
227 LOG(INFO) << "Not Blocked:sniffing failed:";
228 // Not blocked. How many of them can be JS? This is only useful
229 // for studying non-blocked documents.
230 if (SniffForJS(data, length)) {
231 uma_bucket_name.append(".NVerified.MaybeJS");
232 UMA_HISTOGRAM_COUNTS(uma_bucket_name.data(), 1);
233 }
234 }
235 }
236
237 void SiteIsolationPolicy::DidFinishResourceLoad(unsigned identifier) {
238 id_target_map_.erase(identifier);
239 if (id_url_map_.count(identifier) > 0) {
240 url_responsedata_map_.erase(id_url_map_[identifier]);
241 id_url_map_.erase(identifier);
242 }
243 }
244
245 void SiteIsolationPolicy::DidFinishResourceLoad(
246 WebKit::WebURL& web_response_url) {
247 GURL response_url(web_response_url);
248
249 if (url_responsedata_map_.count(response_url.spec()) > 0) {
250 ResponseMetaData meta_data = url_responsedata_map_[response_url.spec()];
251 url_responsedata_map_.erase(response_url.spec());
252 id_target_map_.erase(meta_data.identifier);
253 id_url_map_.erase(meta_data.identifier);
254 }
255 }
256
// Maps the MIME type of |response| to a ResponseMetaData::CanonicalMimeType.
//
// The lowercased MIME type is compared for exact equality against the entries
// of |document_mime_types|; the index of the first match selects the result:
//   0      -> IsHTML   ("text/html")
//   1..3   -> IsXML    ("text/xml", "application/rss+xml", "application/xml")
//   4..6   -> IsJSON   ("application/json", "text/x-json", "text/json")
//   7      -> IsPlain  ("text/plain")
//   8      -> IsOthers (no entry matched; the loop ran to completion)
// The literal 8 in the loop and the index ranges below must be kept in sync
// with the contents and order of |document_mime_types| by hand.
257 ResponseMetaData::CanonicalMimeType SiteIsolationPolicy::GetCanonicalMimeType(
258 const WebURLResponse& response) {
259 // RFC 2045 says: "The type, subtype, and parameter names are not
260 // case sensitive." If you have a MIME type of text/plain that's a
261 // type of text and a subtype of plain. So, per the spec, these are
262 // not case sensitive.
263 std::string mime_type = response.mimeType().utf8();
264 StringToLowerASCII(&mime_type);
nasko 2013/08/06 17:29:27 The mime_type string is UTF8, yet you are using AS
dsjang 2013/08/07 00:19:07 I found that all the crawled mime types are origin
265
266 const char* const document_mime_types[] = {
267 "text/html", "text/xml", "application/rss+xml", "application/xml",
268 "application/json", "text/x-json", "text/json", "text/plain"};
269 size_t i = 0;
270 for (i = 0; i < 8; ++i) {
// NOTE(review): strcmp() needs a NUL-terminated string; confirm that
// std::string::data() guarantees one with the toolchain in use.
271 if (!strcmp(document_mime_types[i], mime_type.data())) {
nasko 2013/08/06 17:29:27 This is unsafe comparison, you should be bounding
dsjang 2013/08/07 00:19:07 Switched to std::string::operator==(). On 2013/08
272 break;
273 }
274 }
275
276 if (i == 0) {
nasko 2013/08/06 17:29:27 Using constants like these seems unclean to me. Is
dsjang 2013/08/07 00:19:07 Done.
277 return ResponseMetaData::IsHTML;
278 } else if (1 <= i && i < 4) {
279 return ResponseMetaData::IsXML;
280 } else if (4 <= i && i < 7) {
281 return ResponseMetaData::IsJSON;
282 } else if (i == 7) {
283 return ResponseMetaData::IsPlain;
284 } else {
285 return ResponseMetaData::IsOthers;
286 }
287 }
288
// Returns true when |url| has the "data" scheme, the only scheme treated as
// safe here. DidReceiveResponse() calls this with the frame origin and skips
// data collection when it returns true.
289 bool SiteIsolationPolicy::IsSafeScheme(GURL& url) {
nasko 2013/08/06 17:29:27 What about blob URLs? Are those safe too?
dsjang 2013/08/07 00:19:07 Switched to blacklisting from whitelisting. On 20
290 return url.scheme() == "data";
291 }
292
// Returns true when |frame_origin| and |response_url| belong to the same
// site: their schemes are equal and GetDomainAndRegistry() (with private
// registries included) yields the same string for both.
//
// NOTE(review): if GetDomainAndRegistry() can return an empty string (for
// example for hosts without a registrable domain), two such URLs would
// compare as same-site here — confirm this is intended.
293 bool SiteIsolationPolicy::IsSameSite(GURL& frame_origin, GURL& response_url) {
294 if (frame_origin.scheme() != response_url.scheme())
295 return false;
296
297 // Extract the effective domains (public suffix plus one) of the
298 // urls.
299 std::string frame_domain =
300 net::registry_controlled_domains::GetDomainAndRegistry(
301 frame_origin,
302 net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
nasko 2013/08/06 17:29:27 Is there a reason we are deviating from how SiteIn
dsjang 2013/08/07 00:19:07 I thought allowing private registries here enables
303 std::string response_domain =
304 net::registry_controlled_domains::GetDomainAndRegistry(
305 response_url,
306 net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
307
308 return frame_domain == response_domain;
309 }
310
// Returns true when |frame| currently has a provisional data source.
// DidReceiveResponse() uses this to recognize responses that belong to a
// navigation and lets them pass without data collection.
311 bool SiteIsolationPolicy::IsFrameNotCommitted(WebKit::WebFrame* frame) {
nasko 2013/08/06 17:29:27 This name is a bit confusing, can you put a descri
dsjang 2013/08/07 00:19:07 Done.
312 return frame->provisionalDataSource() != NULL;
313 }
314
// Decides whether the Access-Control-Allow-Origin header value
// |access_control_origin| allows |frame_origin| to read the response.
//
// Returns false for an empty header value and true for "*". Otherwise the
// value (with one pair of surrounding single or double quotes removed, and
// with |website_origin|'s scheme prepended if it starts with neither
// "http://" nor "https://") is turned into an origin and compared to
// |frame_origin| with IsSameSite(). |website_origin| is only used for its
// scheme. |access_control_origin| is taken by value because it is modified
// locally.
315 bool SiteIsolationPolicy::IsValidCorsHeaderSet(
316 GURL& frame_origin,
317 GURL& website_origin,
318 std::string access_control_origin) {
319
320 size_t access_control_origin_len = access_control_origin.size();
321
322 // TODO(dsjang): Is this actually true? The server seems to return
323 // an empty string or "null".
324 if (access_control_origin_len == 0)
325 return false;
326
327 // Strip quotes off from it. This is non-standard practice, but many
nasko 2013/08/06 17:29:27 Don't we have code that parses the header that we
dsjang 2013/08/07 00:19:07 Done.
328 // websites use quote strings surrounding the actual header value.
// The length check means a value consisting only of two quote characters is
// left untouched.
329 if (access_control_origin_len > 2) {
330 char first = access_control_origin[0];
331 char last = access_control_origin[access_control_origin_len - 1];
332 if ((first == '\"' && last == '\"') || (first == '\'' && last == '\'')) {
333 access_control_origin =
334 access_control_origin.substr(1, access_control_origin_len - 2);
335 }
336 }
337
338 // TODO(dsjang): * is not allowed for the response from a request
339 // with cookies. This allows for more than what the renderer will
340 // eventually be able to receive, so we won't see illegal cross-site
341 // documents allowed by this. We have to have a way to see if
342 // this response is from a cookie-tagged request or not in the
343 // future.
344 if (access_control_origin == "*")
nasko 2013/08/06 17:29:27 You talk about cookie-tagged requests, but there i
dsjang 2013/08/07 00:19:07 Yes, it has to be incorporated here in the future,
345 return true;
346
347 // TODO(dsjang): The CORS spec only treats a fully specified URL,
348 // not just a domain here. Confirm this ad-hoc rule to be
349 // correct. If this doesn't start with a scheme(http://, https://),
nasko 2013/08/06 17:29:27 This sounds scary. Is this correct?
dsjang 2013/08/07 00:19:07 This was actually scarily wrong :-). I found out t
350 // it inherits the site's scheme.
351 if (access_control_origin.find("http://") != 0 &&
352 access_control_origin.find("https://") != 0) {
353 access_control_origin.insert(0, website_origin.scheme() + "://");
354 }
355
// NOTE(review): this and the LOG(ERROR) further down look like debugging
// output logged at error severity — confirm whether they should remain.
356 LOG(ERROR) << access_control_origin;
357
358 // We don't use Webkit's
359 // frame->securityOrigin().canAccess(WebSecurityOrigin::createFromString(acc
360 // ess_control_origin)))here since their .canAccess works in terms of origins,
361 // not sites. For example, when frame is sub.a.com and it is not allowed
362 // to access a document with sub1.a.com. But under Site Isolation,
363 // it's allowed.
364
365 // TODO(dsjang): examine createFromString()'s behavior for a URL
366 // containing * in it.
367 WebKit::WebSecurityOrigin cors_security_origin =
368 WebKit::WebSecurityOrigin::createFromString(
369 WebKit::WebString::fromUTF8(access_control_origin));
370 GURL cors_origin(cors_security_origin.toString().utf8());
371
372 LOG(ERROR) << cors_security_origin.toString().utf8();
373 return IsSameSite(frame_origin, cors_origin);
374 }
375
376 bool SiteIsolationPolicy::SniffForHTML(const char* data, size_t length) {
377 // TODO(dsjang): The content sniffer used by Chrome and Firefox are
378 // using "<!--" as one of the HTML signatures, but it also appears
379 // in valid JavaScript, considered as well-formed JS by the browser.
380 // Since we do not want to block any JS, we exclude it from our HTML
381 // signatures. This can weaken our document block policy, but we can
382 // break less websites.
383 const char* html_signatures[] = {"<!DOCTYPE html", // HTML5 spec
384 "<script", // HTML5 spec, Mozilla
385 "<html", // HTML5 spec, Mozilla
386 "<head", // HTML5 spec, Mozilla
387 "<iframe", // Mozilla
388 "<h1", // Mozilla
389 "<div", // Mozilla
390 "<font", // Mozilla
391 "<table", // Mozilla
392 "<a", // Mozilla
393 "<style", // Mozilla
394 "<title", // Mozilla
395 "<b", // Mozilla
396 "<body", // Mozilla
397 "<br", "<p" // Mozilla
398 };
399 return DoSignatureMatching(
400 data, length, html_signatures, arraysize(html_signatures));
401 }
402
403 bool SiteIsolationPolicy::SniffForXML(const char* data, size_t length) {
404 const char* xml_signatures[] = {"<?xml" // Mozilla
405 };
406 return DoSignatureMatching(
407 data, length, xml_signatures, arraysize(xml_signatures));
408 }
409
// Returns true when |data| (|length| bytes) looks like the beginning of a
// JSON object: ignoring spaces, tabs, CR and LF, the first character is '{',
// the next is a single or double quote, and a ':' follows somewhere later.
//
// |state| values used by the scanner below:
//   0 - nothing seen yet, expecting '{'
//   1 - '{' seen, expecting a quote character
//   2 - quote seen, scanning forward for ':'
//   3 - ':' found; the data is accepted
//   4 - an unexpected character was seen; the data is rejected
// The loop stops as soon as |state| reaches 3 or 4.
410 bool SiteIsolationPolicy::SniffForJSON(const char* data, size_t length) {
411 // TODO(dsjang): We have to come up with a better way to sniff
412 // JSON. However, even RE cannot help us that much due to the fact
413 // that we don't do full parsing. This DFA finds 1) {, 2) "(or'),
414 // 3) : in the order. This is intentionally not using a regular
415 // expression library so that we can make the trusted code base as
416 // small as possible.
417 int state = 0;
418 for (size_t i = 0; i < length && state < 3; ++i, ++data) {
419 char c = *data;
420 if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
421 continue;
422
423 switch (state) {
424 case 0:
425 if (c == '{')
426 state = 1;
nasko 2013/08/06 17:29:27 Please use symbolic names or describe what those n
dsjang 2013/08/07 00:19:07 Done.
427 else
428 state = 4;
429 break;
430 case 1:
431 if (c == '\"' || c == '\'')
432 state = 2;
433 else
434 state = 4;
435 break;
436 case 2:
437 if (c == ':') {
438 state = 3;
439 }
440 break;
441 default:
442 break;
443 }
444 }
445 return state == 3;
446 }
447
448 bool SiteIsolationPolicy::DoSignatureMatching(const char* data,
449 size_t length,
450 const char* signatures[],
451 size_t arr_size) {
452 for (size_t sig_index = 0; sig_index < arr_size; ++sig_index) {
453 const char* signature = signatures[sig_index];
454 size_t signature_length = strlen(signature);
455 size_t i = 0;
456 // Skip the white characters at the beginning of the document.
457 for (i = 0; i < length; ++i) {
458 char c = *data;
459 if (!(c == ' ' || c == '\r' || c == '\n' || c == '\t')) {
460 break;
461 }
462 ++data;
463 }
464 length = length - i;
465 if (length < signature_length)
466 continue;
467 if (base::strncasecmp(signature, data, signature_length) == 0) {
468 return true;
469 }
470 }
471 return false;
472 }
473
474 bool SiteIsolationPolicy::UnaffectedStatusCode(int status_code) {
475 // Chrome only uses the content of a response with one of these
476 // status codes for CSS/JavaScript. For images, Chrome just ignores
477 // status code.
478 const int renderable_status_code[] = {200, 201, 202, 203, 206, 300, 301, 302,
479 303, 305, 306, 307};
480 for (size_t i = 0; i < 12; ++i) {
481 if (renderable_status_code[i] == status_code)
482 return false;
483 }
484 return true;
485 }
486
487 bool SiteIsolationPolicy::SniffForJS(const char* data, size_t length) {
488 // TODO(dsjang): This is a real hacking. The only purpose of this
489 // function is to try to see if there's any possibility that this
490 // data can be JavaScript.(superset of JS). This function will be
491 // removed for the production code.
492
493 // Search for "var " for JS detection. :-)
494 for (size_t i = 0; i < length - 3; ++i) {
495 if (strncmp(data, "var ", 4) == 0) {
496 return true;
497 }
498 ++data;
499 }
500 return false;
501 }
502 }
nasko 2013/08/06 17:29:27 Leave an empty line and put a // namespace comment
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698