OLD | NEW |
---|---|
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Detecting mime types is a tricky business because we need to balance | 5 // Detecting mime types is a tricky business because we need to balance |
6 // compatibility concerns with security issues. Here is a survey of how other | 6 // compatibility concerns with security issues. Here is a survey of how other |
7 // browsers behave and then a description of how we intend to behave. | 7 // browsers behave and then a description of how we intend to behave. |
8 // | 8 // |
9 // HTML payload, no Content-Type header: | 9 // HTML payload, no Content-Type header: |
10 // * IE 7: Render as HTML | 10 // * IE 7: Render as HTML |
(...skipping 191 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
202 MAGIC_HTML_TAG("table") // Mozilla | 202 MAGIC_HTML_TAG("table") // Mozilla |
203 MAGIC_HTML_TAG("a") // Mozilla | 203 MAGIC_HTML_TAG("a") // Mozilla |
204 MAGIC_HTML_TAG("style") // Mozilla | 204 MAGIC_HTML_TAG("style") // Mozilla |
205 MAGIC_HTML_TAG("title") // Mozilla | 205 MAGIC_HTML_TAG("title") // Mozilla |
206 MAGIC_HTML_TAG("b") // Mozilla | 206 MAGIC_HTML_TAG("b") // Mozilla |
207 MAGIC_HTML_TAG("body") // Mozilla | 207 MAGIC_HTML_TAG("body") // Mozilla |
208 MAGIC_HTML_TAG("br") | 208 MAGIC_HTML_TAG("br") |
209 MAGIC_HTML_TAG("p") // Mozilla | 209 MAGIC_HTML_TAG("p") // Mozilla |
210 }; | 210 }; |
211 | 211 |
212 static scoped_refptr<base::Histogram> UMASnifferHistogramGet(const char* name, | 212 static base::Histogram* UMASnifferHistogramGet(const char* name, |
213 int array_size) { | 213 int array_size) { |
214 scoped_refptr<base::Histogram> counter = | 214 base::Histogram* counter = |
215 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, | 215 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, |
216 base::Histogram::kUmaTargetedHistogramFlag); | 216 base::Histogram::kUmaTargetedHistogramFlag); |
217 return counter; | 217 return counter; |
218 } | 218 } |
219 | 219 |
220 // Compare content header to a magic number where magic_entry can contain '.' | 220 // Compare content header to a magic number where magic_entry can contain '.' |
221 // for single character of anything, allowing some bytes to be skipped. | 221 // for single character of anything, allowing some bytes to be skipped. |
222 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { | 222 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { |
223 while (len) { | 223 while (len) { |
224 if ((*magic_entry != '.') && (*magic_entry != *content)) | 224 if ((*magic_entry != '.') && (*magic_entry != *content)) |
(...skipping 76 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
301 *have_enough_content &= TruncateSize(512, &size); | 301 *have_enough_content &= TruncateSize(512, &size); |
302 | 302 |
303 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, | 303 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, |
304 // but with some modifications to better match the HTML5 spec. | 304 // but with some modifications to better match the HTML5 spec. |
305 const char* const end = content + size; | 305 const char* const end = content + size; |
306 const char* pos; | 306 const char* pos; |
307 for (pos = content; pos < end; ++pos) { | 307 for (pos = content; pos < end; ++pos) { |
308 if (!IsAsciiWhitespace(*pos)) | 308 if (!IsAsciiWhitespace(*pos)) |
309 break; | 309 break; |
310 } | 310 } |
311 scoped_refptr<base::Histogram> counter = | 311 static base::Histogram* counter(NULL); |
312 UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", | 312 if (!counter) |
313 arraysize(kSniffableTags)); | 313 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", |
314 arraysize(kSniffableTags)); | |
314 // |pos| now points to first non-whitespace character (or at end). | 315 // |pos| now points to first non-whitespace character (or at end). |
315 return CheckForMagicNumbers(pos, end - pos, | 316 return CheckForMagicNumbers(pos, end - pos, |
316 kSniffableTags, arraysize(kSniffableTags), | 317 kSniffableTags, arraysize(kSniffableTags), |
317 counter.get(), result); | 318 counter, result); |
318 } | 319 } |
319 | 320 |
320 // Returns true and sets result if the content matches any of kMagicNumbers. | 321 // Returns true and sets result if the content matches any of kMagicNumbers. |
321 // Clears have_enough_content if more data could possibly change the result. | 322 // Clears have_enough_content if more data could possibly change the result. |
322 static bool SniffForMagicNumbers(const char* content, | 323 static bool SniffForMagicNumbers(const char* content, |
323 size_t size, | 324 size_t size, |
324 bool* have_enough_content, | 325 bool* have_enough_content, |
325 std::string* result) { | 326 std::string* result) { |
326 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | 327 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); |
327 | 328 |
328 // Check our big table of Magic Numbers | 329 // Check our big table of Magic Numbers |
329 scoped_refptr<base::Histogram> counter = | 330 static base::Histogram* counter(NULL); |
330 UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", | 331 if (!counter) |
331 arraysize(kMagicNumbers)); | 332 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", |
333 arraysize(kMagicNumbers)); | |
332 return CheckForMagicNumbers(content, size, | 334 return CheckForMagicNumbers(content, size, |
333 kMagicNumbers, arraysize(kMagicNumbers), | 335 kMagicNumbers, arraysize(kMagicNumbers), |
334 counter.get(), result); | 336 counter, result); |
335 } | 337 } |
336 | 338 |
337 // Magic strings for sniffing XML content (e.g. XHTML) | 339 // Magic strings for sniffing XML content (e.g. XHTML) |
338 static const MagicNumber kMagicXML[] = { | 340 static const MagicNumber kMagicXML[] = { |
339 // We want to be very conservative in interpreting text/xml content as | 341 // We want to be very conservative in interpreting text/xml content as |
340 // XHTML -- we just want to sniff enough to make unit tests pass. | 342 // XHTML -- we just want to sniff enough to make unit tests pass. |
341 // So we match explicitly on this, and don't match other ways of writing | 343 // So we match explicitly on this, and don't match other ways of writing |
342 // it in semantically-equivalent ways. | 344 // it in semantically-equivalent ways. |
343 MAGIC_STRING("application/xhtml+xml", | 345 MAGIC_STRING("application/xhtml+xml", |
344 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") | 346 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") |
(...skipping 15 matching lines...) Expand all Loading... | |
360 std::string* result) { | 362 std::string* result) { |
361 // We allow at most 300 bytes of content before we expect the opening tag. | 363 // We allow at most 300 bytes of content before we expect the opening tag. |
362 *have_enough_content &= TruncateSize(300, &size); | 364 *have_enough_content &= TruncateSize(300, &size); |
363 const char* pos = content; | 365 const char* pos = content; |
364 const char* const end = content + size; | 366 const char* const end = content + size; |
365 | 367 |
366 // This loop iterates through tag-looking offsets in the file. | 368 // This loop iterates through tag-looking offsets in the file. |
367 // We want to skip XML processing instructions (of the form "<?xml ...") | 369 // We want to skip XML processing instructions (of the form "<?xml ...") |
368 // and stop at the first "plain" tag, then make a decision on the mime-type | 370 // and stop at the first "plain" tag, then make a decision on the mime-type |
369 // based on the name (or possibly attributes) of that tag. | 371 // based on the name (or possibly attributes) of that tag. |
370 scoped_refptr<base::Histogram> counter = | 372 static base::Histogram* counter(NULL); |
371 UMASnifferHistogramGet("mime_sniffer.kMagicXML2", | 373 if (!counter) |
372 arraysize(kMagicXML)); | 374 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", |
375 arraysize(kMagicXML)); | |
373 const int kMaxTagIterations = 5; | 376 const int kMaxTagIterations = 5; |
374 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { | 377 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { |
375 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); | 378 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); |
376 if (!pos) | 379 if (!pos) |
377 return false; | 380 return false; |
378 | 381 |
379 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) { | 382 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) { |
380 // Skip XML declarations. | 383 // Skip XML declarations. |
381 ++pos; | 384 ++pos; |
382 continue; | 385 continue; |
383 } else if (base::strncasecmp(pos, "<!DOCTYPE", | 386 } else if (base::strncasecmp(pos, "<!DOCTYPE", |
384 sizeof("<!DOCTYPE")-1) == 0) { | 387 sizeof("<!DOCTYPE")-1) == 0) { |
385 // Skip DOCTYPE declarations. | 388 // Skip DOCTYPE declarations. |
386 ++pos; | 389 ++pos; |
387 continue; | 390 continue; |
388 } | 391 } |
389 | 392 |
390 if (CheckForMagicNumbers(pos, end - pos, | 393 if (CheckForMagicNumbers(pos, end - pos, |
391 kMagicXML, arraysize(kMagicXML), | 394 kMagicXML, arraysize(kMagicXML), |
392 counter.get(), result)) | 395 counter, result)) |
393 return true; | 396 return true; |
394 | 397 |
395 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult | 398 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult |
396 // to identify. | 399 // to identify. |
397 | 400 |
398 // If we get here, we've hit an initial tag that hasn't matched one of the | 401 // If we get here, we've hit an initial tag that hasn't matched one of the |
399 // above tests. Abort. | 402 // above tests. Abort. |
400 return true; | 403 return true; |
401 } | 404 } |
402 | 405 |
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
444 // There is no consensus about exactly how to sniff for binary content. | 447 // There is no consensus about exactly how to sniff for binary content. |
445 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. | 448 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. |
446 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. | 449 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. |
447 // Here, we side with FF, but with a smaller buffer. This size was chosen | 450 // Here, we side with FF, but with a smaller buffer. This size was chosen |
448 // because it is small enough to comfortably fit into a single packet (after | 451 // because it is small enough to comfortably fit into a single packet (after |
449 // allowing for headers) and yet large enough to account for binary formats | 452 // allowing for headers) and yet large enough to account for binary formats |
450 // that have a significant amount of ASCII at the beginning (crbug.com/15314). | 453 // that have a significant amount of ASCII at the beginning (crbug.com/15314). |
451 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); | 454 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); |
452 | 455 |
453 // First, we look for a BOM. | 456 // First, we look for a BOM. |
454 scoped_refptr<base::Histogram> counter = | 457 static base::Histogram* counter(NULL); |
455 UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", | 458 if (!counter) |
456 arraysize(kByteOrderMark)); | 459 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", |
460 arraysize(kByteOrderMark)); | |
457 std::string unused; | 461 std::string unused; |
458 if (CheckForMagicNumbers(content, size, | 462 if (CheckForMagicNumbers(content, size, |
459 kByteOrderMark, arraysize(kByteOrderMark), | 463 kByteOrderMark, arraysize(kByteOrderMark), |
460 counter.get(), &unused)) { | 464 counter, &unused)) { |
461 // If there is BOM, we think the buffer is not binary. | 465 // If there is BOM, we think the buffer is not binary. |
462 result->assign("text/plain"); | 466 result->assign("text/plain"); |
463 return false; | 467 return false; |
464 } | 468 } |
465 | 469 |
466 // Next we look to see if any of the bytes "look binary." | 470 // Next we look to see if any of the bytes "look binary." |
467 for (size_t i = 0; i < size; ++i) { | 471 for (size_t i = 0; i < size; ++i) { |
468 // If we see a binary-looking byte, we think the content is binary. | 472 // If we see a binary-looking byte, we think the content is binary. |
469 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { | 473 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { |
470 result->assign("application/octet-stream"); | 474 result->assign("application/octet-stream"); |
(...skipping 15 matching lines...) Expand all Loading... | |
486 static const char* kUnknownMimeTypes[] = { | 490 static const char* kUnknownMimeTypes[] = { |
487 // Empty mime types are as unknown as they get. | 491 // Empty mime types are as unknown as they get. |
488 "", | 492 "", |
489 // The unknown/unknown type is popular and uninformative | 493 // The unknown/unknown type is popular and uninformative |
490 "unknown/unknown", | 494 "unknown/unknown", |
491 // The second most popular unknown mime type is application/unknown | 495 // The second most popular unknown mime type is application/unknown |
492 "application/unknown", | 496 "application/unknown", |
493 // Firefox rejects a mime type if it is exactly */* | 497 // Firefox rejects a mime type if it is exactly */* |
494 "*/*", | 498 "*/*", |
495 }; | 499 }; |
496 scoped_refptr<base::Histogram> counter = | 500 static base::Histogram* counter(NULL); |
497 UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", | 501 if (!counter) |
498 arraysize(kUnknownMimeTypes) + 1); | 502 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", |
503 arraysize(kUnknownMimeTypes) + 1); | |
499 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { | 504 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { |
500 if (mime_type == kUnknownMimeTypes[i]) { | 505 if (mime_type == kUnknownMimeTypes[i]) { |
501 counter->Add(i); | 506 counter->Add(i); |
502 return true; | 507 return true; |
503 } | 508 } |
504 } | 509 } |
505 if (mime_type.find('/') == std::string::npos) { | 510 if (mime_type.find('/') == std::string::npos) { |
506 // Firefox rejects a mime type if it does not contain a slash | 511 // Firefox rejects a mime type if it does not contain a slash |
507 counter->Add(arraysize(kUnknownMimeTypes)); | 512 counter->Add(arraysize(kUnknownMimeTypes)); |
508 return true; | 513 return true; |
509 } | 514 } |
510 return false; | 515 return false; |
511 } | 516 } |
512 | 517 |
513 // Returns true and sets result if the content appears to be a crx (chrome | 518 // Returns true and sets result if the content appears to be a crx (chrome |
514 // extension) file. | 519 // extension) file. |
515 // Clears have_enough_content if more data could possibly change the result. | 520 // Clears have_enough_content if more data could possibly change the result. |
516 static bool SniffCRX(const char* content, | 521 static bool SniffCRX(const char* content, |
517 size_t size, | 522 size_t size, |
518 const GURL& url, | 523 const GURL& url, |
519 const std::string& type_hint, | 524 const std::string& type_hint, |
520 bool* have_enough_content, | 525 bool* have_enough_content, |
521 std::string* result) { | 526 std::string* result) { |
522 scoped_refptr<base::Histogram> counter = | 527 static base::Histogram* counter(NULL); |
523 UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); | 528 if (!counter) |
529 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); | |
524 | 530 |
525 // Technically, the crx magic number is just Cr24, but the bytes after that | 531 // Technically, the crx magic number is just Cr24, but the bytes after that |
526 // are a version number which changes infrequently. Including it in the | 532 // are a version number which changes infrequently. Including it in the |
527 // sniffing gives us less room for error. If the version number ever changes, | 533 // sniffing gives us less room for error. If the version number ever changes, |
528 // we can just add an entry to this list. | 534 // we can just add an entry to this list. |
529 // | 535 // |
530 // TODO(aa): If we ever have another magic number, we'll want to pass a | 536 // TODO(aa): If we ever have another magic number, we'll want to pass a |
531 // histogram into CheckForMagicNumbers(), below, to see which one matched. | 537 // histogram into CheckForMagicNumbers(), below, to see which one matched. |
532 static const struct MagicNumber kCRXMagicNumbers[] = { | 538 static const struct MagicNumber kCRXMagicNumbers[] = { |
533 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") | 539 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") |
(...skipping 16 matching lines...) Expand all Loading... | |
550 NULL, result)) { | 556 NULL, result)) { |
551 counter->Add(2); | 557 counter->Add(2); |
552 } else { | 558 } else { |
553 return false; | 559 return false; |
554 } | 560 } |
555 | 561 |
556 return true; | 562 return true; |
557 } | 563 } |
558 | 564 |
559 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { | 565 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { |
560 scoped_refptr<base::Histogram> should_sniff_counter = | 566 base::Histogram* should_sniff_counter(NULL); |
561 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); | 567 if (!should_sniff_counter) |
ramant (doing other things)
2011/04/01 20:05:38
Should we make this static?
jar (doing other things)
2011/04/01 21:50:27
Done.
| |
568 should_sniff_counter = | |
569 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); | |
562 // We are willing to sniff the mime type for HTTP, HTTPS, and FTP | 570 // We are willing to sniff the mime type for HTTP, HTTPS, and FTP |
563 bool sniffable_scheme = url.is_empty() || | 571 bool sniffable_scheme = url.is_empty() || |
564 url.SchemeIs("http") || | 572 url.SchemeIs("http") || |
565 url.SchemeIs("https") || | 573 url.SchemeIs("https") || |
566 url.SchemeIs("ftp") || | 574 url.SchemeIs("ftp") || |
567 url.SchemeIsFile(); | 575 url.SchemeIsFile(); |
568 if (!sniffable_scheme) { | 576 if (!sniffable_scheme) { |
569 should_sniff_counter->Add(1); | 577 should_sniff_counter->Add(1); |
570 return false; | 578 return false; |
571 } | 579 } |
572 | 580 |
573 static const char* kSniffableTypes[] = { | 581 static const char* kSniffableTypes[] = { |
574 // Many web servers are misconfigured to send text/plain for many | 582 // Many web servers are misconfigured to send text/plain for many |
575 // different types of content. | 583 // different types of content. |
576 "text/plain", | 584 "text/plain", |
577 // We want to sniff application/octet-stream for | 585 // We want to sniff application/octet-stream for |
578 // application/x-chrome-extension, but nothing else. | 586 // application/x-chrome-extension, but nothing else. |
579 "application/octet-stream", | 587 "application/octet-stream", |
580 // XHTML and Atom/RSS feeds are often served as plain xml instead of | 588 // XHTML and Atom/RSS feeds are often served as plain xml instead of |
581 // their more specific mime types. | 589 // their more specific mime types. |
582 "text/xml", | 590 "text/xml", |
583 "application/xml", | 591 "application/xml", |
584 }; | 592 }; |
585 scoped_refptr<base::Histogram> counter = | 593 static base::Histogram* counter(NULL); |
586 UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", | 594 if (!counter) |
587 arraysize(kSniffableTypes) + 1); | 595 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", |
596 arraysize(kSniffableTypes) + 1); | |
588 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { | 597 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { |
589 if (mime_type == kSniffableTypes[i]) { | 598 if (mime_type == kSniffableTypes[i]) { |
590 counter->Add(i); | 599 counter->Add(i); |
591 should_sniff_counter->Add(2); | 600 should_sniff_counter->Add(2); |
592 return true; | 601 return true; |
593 } | 602 } |
594 } | 603 } |
595 if (IsUnknownMimeType(mime_type)) { | 604 if (IsUnknownMimeType(mime_type)) { |
596 // The web server didn't specify a content type or specified a mime | 605 // The web server didn't specify a content type or specified a mime |
597 // type that we ignore. | 606 // type that we ignore. |
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
671 // Now we look in our large table of magic numbers to see if we can find | 680 // Now we look in our large table of magic numbers to see if we can find |
672 // anything that matches the content. | 681 // anything that matches the content. |
673 if (SniffForMagicNumbers(content, content_size, | 682 if (SniffForMagicNumbers(content, content_size, |
674 &have_enough_content, result)) | 683 &have_enough_content, result)) |
675 return true; // We've matched a magic number. No more content needed. | 684 return true; // We've matched a magic number. No more content needed. |
676 | 685 |
677 return have_enough_content; | 686 return have_enough_content; |
678 } | 687 } |
679 | 688 |
680 } // namespace net | 689 } // namespace net |
OLD | NEW |