OLD | NEW |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Detecting mime types is a tricky business because we need to balance | 5 // Detecting mime types is a tricky business because we need to balance |
6 // compatibility concerns with security issues. Here is a survey of how other | 6 // compatibility concerns with security issues. Here is a survey of how other |
7 // browsers behave and then a description of how we intend to behave. | 7 // browsers behave and then a description of how we intend to behave. |
8 // | 8 // |
9 // HTML payload, no Content-Type header: | 9 // HTML payload, no Content-Type header: |
10 // * IE 7: Render as HTML | 10 // * IE 7: Render as HTML |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
96 #include "base/histogram.h" | 96 #include "base/histogram.h" |
97 #include "base/logging.h" | 97 #include "base/logging.h" |
98 #include "base/string_util.h" | 98 #include "base/string_util.h" |
99 #include "googleurl/src/gurl.h" | 99 #include "googleurl/src/gurl.h" |
100 #include "net/base/mime_util.h" | 100 #include "net/base/mime_util.h" |
101 | 101 |
102 namespace { | 102 namespace { |
103 | 103 |
104 class SnifferHistogram : public LinearHistogram { | 104 class SnifferHistogram : public LinearHistogram { |
105 public: | 105 public: |
106 SnifferHistogram(const wchar_t* name, int array_size) | 106 SnifferHistogram(const char* name, int array_size) |
107 : LinearHistogram(name, 0, array_size - 1, array_size) { | 107 : LinearHistogram(name, 0, array_size - 1, array_size) { |
108 SetFlags(kUmaTargetedHistogramFlag); | 108 SetFlags(kUmaTargetedHistogramFlag); |
109 } | 109 } |
110 }; | 110 }; |
111 | 111 |
112 } // namespace | 112 } // namespace |
113 | 113 |
114 namespace net { | 114 namespace net { |
115 | 115 |
116 // We aren't interested in looking at more than 512 bytes of content | 116 // We aren't interested in looking at more than 512 bytes of content |
(...skipping 149 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
266 static bool SniffForHTML(const char* content, size_t size, | 266 static bool SniffForHTML(const char* content, size_t size, |
267 std::string* result) { | 267 std::string* result) { |
268 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, | 268 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, |
269 // but with some modifications to better match the HTML5 spec. | 269 // but with some modifications to better match the HTML5 spec. |
270 const char* const end = content + size; | 270 const char* const end = content + size; |
271 const char* pos; | 271 const char* pos; |
272 for (pos = content; pos < end; ++pos) { | 272 for (pos = content; pos < end; ++pos) { |
273 if (!IsAsciiWhitespace(*pos)) | 273 if (!IsAsciiWhitespace(*pos)) |
274 break; | 274 break; |
275 } | 275 } |
276 static SnifferHistogram counter(L"mime_sniffer.kSniffableTags2", | 276 static SnifferHistogram counter("mime_sniffer.kSniffableTags2", |
277 arraysize(kSniffableTags)); | 277 arraysize(kSniffableTags)); |
278 // |pos| now points to first non-whitespace character (or at end). | 278 // |pos| now points to first non-whitespace character (or at end). |
279 return CheckForMagicNumbers(pos, end - pos, | 279 return CheckForMagicNumbers(pos, end - pos, |
280 kSniffableTags, arraysize(kSniffableTags), | 280 kSniffableTags, arraysize(kSniffableTags), |
281 &counter, result); | 281 &counter, result); |
282 } | 282 } |
283 | 283 |
284 static bool SniffForMagicNumbers(const char* content, size_t size, | 284 static bool SniffForMagicNumbers(const char* content, size_t size, |
285 std::string* result) { | 285 std::string* result) { |
286 // Check our big table of Magic Numbers | 286 // Check our big table of Magic Numbers |
287 static SnifferHistogram counter(L"mime_sniffer.kMagicNumbers2", | 287 static SnifferHistogram counter("mime_sniffer.kMagicNumbers2", |
288 arraysize(kMagicNumbers)); | 288 arraysize(kMagicNumbers)); |
289 return CheckForMagicNumbers(content, size, | 289 return CheckForMagicNumbers(content, size, |
290 kMagicNumbers, arraysize(kMagicNumbers), | 290 kMagicNumbers, arraysize(kMagicNumbers), |
291 &counter, result); | 291 &counter, result); |
292 } | 292 } |
293 | 293 |
294 // Magic numbers used to sniff XML content | 294 // Magic numbers used to sniff XML content |
295 static const MagicNumber kMagicXML[] = { | 295 static const MagicNumber kMagicXML[] = { |
296 // We want to be very conservative in interpreting text/xml content as | 296 // We want to be very conservative in interpreting text/xml content as |
297 // XHTML -- we just want to sniff enough to make unit tests pass. | 297 // XHTML -- we just want to sniff enough to make unit tests pass. |
(...skipping 15 matching lines...) Expand all Loading... |
313 // We allow at most kFeedAllowedHeaderBytes bytes of content before we | 313 // We allow at most kFeedAllowedHeaderBytes bytes of content before we |
314 // expect the opening tag. | 314 // expect the opening tag. |
315 const size_t kFeedAllowedHeaderBytes = 300; | 315 const size_t kFeedAllowedHeaderBytes = 300; |
316 const char* const end = content + std::min(size, kFeedAllowedHeaderBytes); | 316 const char* const end = content + std::min(size, kFeedAllowedHeaderBytes); |
317 const char* pos = content; | 317 const char* pos = content; |
318 | 318 |
319 // This loop iterates through tag-looking offsets in the file. | 319 // This loop iterates through tag-looking offsets in the file. |
320 // We want to skip XML processing instructions (of the form "<?xml ...") | 320 // We want to skip XML processing instructions (of the form "<?xml ...") |
321 // and stop at the first "plain" tag, then make a decision on the mime-type | 321 // and stop at the first "plain" tag, then make a decision on the mime-type |
322 // based on the name (or possibly attributes) of that tag. | 322 // based on the name (or possibly attributes) of that tag. |
323 static SnifferHistogram counter(L"mime_sniffer.kMagicXML2", | 323 static SnifferHistogram counter("mime_sniffer.kMagicXML2", |
324 arraysize(kMagicXML)); | 324 arraysize(kMagicXML)); |
325 const int kMaxTagIterations = 5; | 325 const int kMaxTagIterations = 5; |
326 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { | 326 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { |
327 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); | 327 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); |
328 if (!pos) | 328 if (!pos) |
329 return false; | 329 return false; |
330 | 330 |
331 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) { | 331 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml")-1) == 0) { |
332 // Skip XML declarations. | 332 // Skip XML declarations. |
333 ++pos; | 333 ++pos; |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
380 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF | 380 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF |
381 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF | 381 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF |
382 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF | 382 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF |
383 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF | 383 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF |
384 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF | 384 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF |
385 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF | 385 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF |
386 }; | 386 }; |
387 | 387 |
388 static bool LooksBinary(const char* content, size_t size) { | 388 static bool LooksBinary(const char* content, size_t size) { |
389 // First, we look for a BOM. | 389 // First, we look for a BOM. |
390 static SnifferHistogram counter(L"mime_sniffer.kByteOrderMark2", | 390 static SnifferHistogram counter("mime_sniffer.kByteOrderMark2", |
391 arraysize(kByteOrderMark)); | 391 arraysize(kByteOrderMark)); |
392 std::string unused; | 392 std::string unused; |
393 if (CheckForMagicNumbers(content, size, | 393 if (CheckForMagicNumbers(content, size, |
394 kByteOrderMark, arraysize(kByteOrderMark), | 394 kByteOrderMark, arraysize(kByteOrderMark), |
395 &counter, &unused)) { | 395 &counter, &unused)) { |
396 // If there is a BOM, we think the buffer is not binary. | 396 // If there is a BOM, we think the buffer is not binary. |
397 return false; | 397 return false; |
398 } | 398 } |
399 | 399 |
400 // Next we look to see if any of the bytes "look binary." | 400 // Next we look to see if any of the bytes "look binary." |
(...skipping 13 matching lines...) Expand all Loading... |
414 static const char* kUnknownMimeTypes[] = { | 414 static const char* kUnknownMimeTypes[] = { |
415 // Empty mime types are as unknown as they get. | 415 // Empty mime types are as unknown as they get. |
416 "", | 416 "", |
417 // The unknown/unknown type is popular and uninformative | 417 // The unknown/unknown type is popular and uninformative |
418 "unknown/unknown", | 418 "unknown/unknown", |
419 // The second most popular unknown mime type is application/unknown | 419 // The second most popular unknown mime type is application/unknown |
420 "application/unknown", | 420 "application/unknown", |
421 // Firefox rejects a mime type if it is exactly */* | 421 // Firefox rejects a mime type if it is exactly */* |
422 "*/*", | 422 "*/*", |
423 }; | 423 }; |
424 static SnifferHistogram counter(L"mime_sniffer.kUnknownMimeTypes2", | 424 static SnifferHistogram counter("mime_sniffer.kUnknownMimeTypes2", |
425 arraysize(kUnknownMimeTypes) + 1); | 425 arraysize(kUnknownMimeTypes) + 1); |
426 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { | 426 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { |
427 if (mime_type == kUnknownMimeTypes[i]) { | 427 if (mime_type == kUnknownMimeTypes[i]) { |
428 counter.Add(i); | 428 counter.Add(i); |
429 return true; | 429 return true; |
430 } | 430 } |
431 } | 431 } |
432 if (mime_type.find('/') == std::string::npos) { | 432 if (mime_type.find('/') == std::string::npos) { |
433 // Firefox rejects a mime type if it does not contain a slash | 433 // Firefox rejects a mime type if it does not contain a slash |
434 counter.Add(arraysize(kUnknownMimeTypes)); | 434 counter.Add(arraysize(kUnknownMimeTypes)); |
435 return true; | 435 return true; |
436 } | 436 } |
437 return false; | 437 return false; |
438 } | 438 } |
439 | 439 |
440 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { | 440 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { |
441 static SnifferHistogram should_sniff_counter( | 441 static SnifferHistogram should_sniff_counter( |
442 L"mime_sniffer.ShouldSniffMimeType2", 3); | 442 "mime_sniffer.ShouldSniffMimeType2", 3); |
443 // We are willing to sniff the mime type for HTTP, HTTPS, FTP, and empty URLs | 443 // We are willing to sniff the mime type for HTTP, HTTPS, FTP, and empty URLs |
444 bool sniffable_scheme = url.is_empty() || | 444 bool sniffable_scheme = url.is_empty() || |
445 url.SchemeIs("http") || | 445 url.SchemeIs("http") || |
446 url.SchemeIs("https") || | 446 url.SchemeIs("https") || |
447 url.SchemeIs("ftp"); | 447 url.SchemeIs("ftp"); |
448 if (!sniffable_scheme) { | 448 if (!sniffable_scheme) { |
449 should_sniff_counter.Add(1); | 449 should_sniff_counter.Add(1); |
450 return false; | 450 return false; |
451 } | 451 } |
452 | 452 |
453 static const char* kSniffableTypes[] = { | 453 static const char* kSniffableTypes[] = { |
454 // Many web servers are misconfigured to send text/plain for many | 454 // Many web servers are misconfigured to send text/plain for many |
455 // different types of content. | 455 // different types of content. |
456 "text/plain", | 456 "text/plain", |
457 // IIS 4.0 and 5.0 send application/octet-stream when serving .xhtml | 457 // IIS 4.0 and 5.0 send application/octet-stream when serving .xhtml |
458 // files. Firefox 2.0 does not sniff xhtml here, but Safari 3, | 458 // files. Firefox 2.0 does not sniff xhtml here, but Safari 3, |
459 // Opera 9, and IE do. | 459 // Opera 9, and IE do. |
460 "application/octet-stream", | 460 "application/octet-stream", |
461 // XHTML and Atom/RSS feeds are often served as plain xml instead of | 461 // XHTML and Atom/RSS feeds are often served as plain xml instead of |
462 // their more specific mime types. | 462 // their more specific mime types. |
463 "text/xml", | 463 "text/xml", |
464 "application/xml", | 464 "application/xml", |
465 }; | 465 }; |
466 static SnifferHistogram counter(L"mime_sniffer.kSniffableTypes2", | 466 static SnifferHistogram counter("mime_sniffer.kSniffableTypes2", |
467 arraysize(kSniffableTypes) + 1); | 467 arraysize(kSniffableTypes) + 1); |
468 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { | 468 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { |
469 if (mime_type == kSniffableTypes[i]) { | 469 if (mime_type == kSniffableTypes[i]) { |
470 counter.Add(i); | 470 counter.Add(i); |
471 should_sniff_counter.Add(2); | 471 should_sniff_counter.Add(2); |
472 return true; | 472 return true; |
473 } | 473 } |
474 } | 474 } |
475 if (IsUnknownMimeType(mime_type)) { | 475 if (IsUnknownMimeType(mime_type)) { |
476 // The web server didn't specify a content type or specified a mime | 476 // The web server didn't specify a content type or specified a mime |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
548 result->assign("text/plain"); | 548 result->assign("text/plain"); |
549 // We could change our mind if a binary-looking byte appears later in | 549 // We could change our mind if a binary-looking byte appears later in |
550 // the content, so we only have enough content if we have the max. | 550 // the content, so we only have enough content if we have the max. |
551 return content_size >= kMaxBytesToSniff; | 551 return content_size >= kMaxBytesToSniff; |
552 } | 552 } |
553 | 553 |
554 return have_enough_content; | 554 return have_enough_content; |
555 } | 555 } |
556 | 556 |
557 } // namespace net | 557 } // namespace net |
558 | |
OLD | NEW |