net/base/mime_sniffer.cc - Issue 2883833002: net: Do not sniff XHTML content

Side by Side Diff: net/base/mime_sniffer.cc

Issue 2883833002: net: Do not sniff XHTML content (Closed)

Patch Set: . Created 3 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Detecting mime types is a tricky business because we need to balance	5 // Detecting mime types is a tricky business because we need to balance

6 // compatibility concerns with security issues. Here is a survey of how other	6 // compatibility concerns with security issues. Here is a survey of how other

7 // browsers behave and then a description of how we intend to behave.	7 // browsers behave and then a description of how we intend to behave.

8 //	8 //

9 // HTML payload, no Content-Type header:	9 // HTML payload, no Content-Type header:

10 // * IE 7: Render as HTML	10 // * IE 7: Render as HTML

(...skipping 528 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
539 // document or not, so sniffing is completed.	539 // document or not, so sniffing is completed.

540 return true;	540 return true;

541 }	541 }

542	542

543 // Byte order marks	543 // Byte order marks

544 static const MagicNumber kMagicXML[] = {	544 static const MagicNumber kMagicXML[] = {

545 // We want to be very conservative in interpreting text/xml content as	545 // We want to be very conservative in interpreting text/xml content as

546 // XHTML -- we just want to sniff enough to make unit tests pass.	546 // XHTML -- we just want to sniff enough to make unit tests pass.

547 // So we match explicitly on this, and don't match other ways of writing	547 // So we match explicitly on this, and don't match other ways of writing

548 // it in semantically-equivalent ways.	548 // it in semantically-equivalent ways.

549 MAGIC_STRING("application/xhtml+xml",	549 MAGIC_STRING("application/xhtml+xml",
	tkent 2017/05/15 07:25:32 I think just removing this entry would be another option.
550 "<html xmlns=\"http://www.w3.org/1999/xhtml\""),	550 "<html xmlns=\"http://www.w3.org/1999/xhtml\""),

551 MAGIC_STRING("application/atom+xml", "<feed"),	551 MAGIC_STRING("application/atom+xml", "<feed"),

552 MAGIC_STRING("application/rss+xml", "<rss"), // UTF-8	552 MAGIC_STRING("application/rss+xml", "<rss"), // UTF-8

553 };	553 };

554	554

	555 static const MagicNumber kMagicXMLForApplicationXML[] = {

	556 MAGIC_STRING("application/atom+xml", "<feed"),

	557 MAGIC_STRING("application/rss+xml", "<rss"), // UTF-8

	558 };

	559

555 // Returns true and sets result if the content appears to contain XHTML or a	560 // Returns true and sets result if the content appears to contain XHTML or a

556 // feed.	561 // feed.

557 // Clears have_enough_content if more data could possibly change the result.	562 // Clears have_enough_content if more data could possibly change the result.

558 //	563 //

559 // TODO(evanm): this is similar but more conservative than what Safari does,	564 // TODO(evanm): this is similar but more conservative than what Safari does,

560 // while HTML5 has a different recommendation -- what should we do?	565 // while HTML5 has a different recommendation -- what should we do?

561 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset	566 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset

562 // of ASCII -- do we care?	567 // of ASCII -- do we care?

563 static bool SniffXML(const char* content,	568 static bool SniffXML(const char* content,

564 size_t size,	569 size_t size,

	570 const std::string& type_hint,

565 bool* have_enough_content,	571 bool* have_enough_content,

566 std::string* result) {	572 std::string* result) {

567 // We allow at most 300 bytes of content before we expect the opening tag.	573 // We allow at most 300 bytes of content before we expect the opening tag.

568 *have_enough_content &= TruncateSize(300, &size);	574 *have_enough_content &= TruncateSize(300, &size);

569 const char* pos = content;	575 const char* pos = content;

570 const char* const end = content + size;	576 const char* const end = content + size;

571	577

572 // This loop iterates through tag-looking offsets in the file.	578 // This loop iterates through tag-looking offsets in the file.

573 // We want to skip XML processing instructions (of the form "<?xml ...")	579 // We want to skip XML processing instructions (of the form "<?xml ...")

574 // and stop at the first "plain" tag, then make a decision on the mime-type	580 // and stop at the first "plain" tag, then make a decision on the mime-type

(...skipping 18 matching lines...) Expand all Loading...
593 continue;	599 continue;

594 } else if ((pos + kDocTypePrefixLength <= end) &&	600 } else if ((pos + kDocTypePrefixLength <= end) &&

595 base::EqualsCaseInsensitiveASCII(	601 base::EqualsCaseInsensitiveASCII(

596 base::StringPiece(pos, kDocTypePrefixLength),	602 base::StringPiece(pos, kDocTypePrefixLength),

597 base::StringPiece(kDocTypePrefix, kDocTypePrefixLength))) {	603 base::StringPiece(kDocTypePrefix, kDocTypePrefixLength))) {

598 // Skip DOCTYPE declarations.	604 // Skip DOCTYPE declarations.

599 ++pos;	605 ++pos;

600 continue;	606 continue;

601 }	607 }

602	608

603 if (CheckForMagicNumbers(pos, end - pos, kMagicXML, arraysize(kMagicXML),	609 if (type_hint == "application/xml") {

604 result))	610 if (CheckForMagicNumbers(pos, end - pos, kMagicXMLForApplicationXML,

605 return true;	611 arraysize(kMagicXMLForApplicationXML), result))

	612 return true;

	613 } else {

	614 if (CheckForMagicNumbers(pos, end - pos, kMagicXML, arraysize(kMagicXML),

	615 result))

	616 return true;

	617 }

606	618

607 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult	619 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult

608 // to identify.	620 // to identify.

609	621

610 // If we get here, we've hit an initial tag that hasn't matched one of the	622 // If we get here, we've hit an initial tag that hasn't matched one of the

611 // above tests. Abort.	623 // above tests. Abort.

612 return true;	624 return true;

613 }	625 }

614	626

615 // We iterated too far without finding a start tag.	627 // We iterated too far without finding a start tag.

(...skipping 199 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
815 return have_enough_content;	827 return have_enough_content;

816 }	828 }

817 }	829 }

818 }	830 }

819	831

820 // If we have plain XML, sniff XML subtypes.	832 // If we have plain XML, sniff XML subtypes.

821 if (type_hint == "text/xml" || type_hint == "application/xml") {	833 if (type_hint == "text/xml" || type_hint == "application/xml") {

822 // We're not interested in sniffing these types for images and the like.	834 // We're not interested in sniffing these types for images and the like.

823 // Instead, we're looking explicitly for a feed. If we don't find one	835 // Instead, we're looking explicitly for a feed. If we don't find one

824 // we're done and return early.	836 // we're done and return early.

825 if (SniffXML(content, content_size, &have_enough_content, result))	837 if (SniffXML(content, content_size, type_hint, &have_enough_content,

	838 result))

826 return true;	839 return true;

827 return have_enough_content;	840 return have_enough_content;

828 }	841 }

829	842

830 // CRX files (Chrome extensions) have a special sniffing algorithm. It is	843 // CRX files (Chrome extensions) have a special sniffing algorithm. It is

831 // tighter than the others because we don't have to match legacy behavior.	844 // tighter than the others because we don't have to match legacy behavior.

832 if (SniffCRX(content, content_size, url, type_hint,	845 if (SniffCRX(content, content_size, url, type_hint,

833 &have_enough_content, result))	846 &have_enough_content, result))

834 return true;	847 return true;

835	848

(...skipping 42 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
878 ~(1u << '\t' | 1u << '\n' | 1u << '\r' | 1u << '\f' | 1u << '\x1b');	891 ~(1u << '\t' | 1u << '\n' | 1u << '\r' | 1u << '\f' | 1u << '\x1b');

879 for (size_t i = 0; i < size; ++i) {	892 for (size_t i = 0; i < size; ++i) {

880 uint8_t byte = static_cast<uint8_t>(content[i]);	893 uint8_t byte = static_cast<uint8_t>(content[i]);

881 if (byte < 0x20 && (kBinaryBits & (1u << byte)))	894 if (byte < 0x20 && (kBinaryBits & (1u << byte)))

882 return true;	895 return true;

883 }	896 }

884 return false;	897 return false;

885 }	898 }

886	899

887 } // namespace net	900 } // namespace net

OLD	NEW

« no previous file with comments | « no previous file | net/base/mime_sniffer_unittest.cc » ('j') | no next file with comments »