| OLD | NEW |
| (Empty) |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 // Detecting mime types is a tricky business because we need to balance | |
| 6 // compatibility concerns with security issues. Here is a survey of how other | |
| 7 // browsers behave and then a description of how we intend to behave. | |
| 8 // | |
| 9 // HTML payload, no Content-Type header: | |
| 10 // * IE 7: Render as HTML | |
| 11 // * Firefox 2: Render as HTML | |
| 12 // * Safari 3: Render as HTML | |
| 13 // * Opera 9: Render as HTML | |
| 14 // | |
| 15 // Here the choice seems clear: | |
| 16 // => Chrome: Render as HTML | |
| 17 // | |
| 18 // HTML payload, Content-Type: "text/plain": | |
| 19 // * IE 7: Render as HTML | |
| 20 // * Firefox 2: Render as text | |
| 21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL | |
| 22 // has an HTML extension) | |
| 23 // * Opera 9: Render as text | |
| 24 // | |
| 25 // Here we choose to follow the majority (and break some compatibility with IE). | |
| 26 // Many folks dislike IE's behavior here. | |
| 27 // => Chrome: Render as text | |
| 28 // We generalize this as follows. If the Content-Type header is text/plain | |
| 29 // we won't detect dangerous mime types (those that can execute script). | |
| 30 // | |
| 31 // HTML payload, Content-Type: "application/octet-stream": | |
| 32 // * IE 7: Render as HTML | |
| 33 // * Firefox 2: Download as application/octet-stream | |
| 34 // * Safari 3: Render as HTML | |
| 35 // * Opera 9: Render as HTML | |
| 36 // | |
| 37 // We follow Firefox. | |
| 38 // => Chrome: Download as application/octet-stream | |
| 39 // One factor in this decision is that IIS 4 and 5 will send | |
| 40 // application/octet-stream for .xhtml files (because they don't recognize | |
| 41 // the extension). We did some experiments and it looks like this doesn't occur | |
| 42 // very often on the web. We choose the more secure option. | |
| 43 // | |
| 44 // GIF payload, no Content-Type header: | |
| 45 // * IE 7: Render as GIF | |
| 46 // * Firefox 2: Render as GIF | |
//  * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//              URL has a GIF extension)
| 49 // * Opera 9: Render as GIF | |
| 50 // | |
| 51 // The choice is clear. | |
| 52 // => Chrome: Render as GIF | |
| 53 // Once we decide to render HTML without a Content-Type header, there isn't much | |
| 54 // reason not to render GIFs. | |
| 55 // | |
| 56 // GIF payload, Content-Type: "text/plain": | |
| 57 // * IE 7: Render as GIF | |
//  * Firefox 2: Download as application/octet-stream (Note: Firefox will
//               Download as GIF if the URL has a GIF extension)
//  * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//              URL has a GIF extension)
| 62 // * Opera 9: Render as GIF | |
| 63 // | |
| 64 // Displaying as text/plain makes little sense as the content will look like | |
| 65 // gibberish. Here, we could change our minds and download. | |
| 66 // => Chrome: Render as GIF | |
| 67 // | |
| 68 // GIF payload, Content-Type: "application/octet-stream": | |
| 69 // * IE 7: Render as GIF | |
//  * Firefox 2: Download as application/octet-stream (Note: Firefox will
//               Download as GIF if the URL has a GIF extension)
//  * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
//              URL has a GIF extension)
| 74 // * Opera 9: Render as GIF | |
| 75 // | |
| 76 // We used to render as GIF here, but the problem is that some sites want to | |
| 77 // trigger downloads by sending application/octet-stream (even though they | |
| 78 // should be sending Content-Disposition: attachment). Although it is safe | |
| 79 // to render as GIF from a security perspective, we actually get better | |
// compatibility if we don't sniff from application/octet-stream at all.
| 81 // => Chrome: Download as application/octet-stream | |
| 82 // | |
| 83 // XHTML payload, Content-Type: "text/xml": | |
| 84 // * IE 7: Render as XML | |
| 85 // * Firefox 2: Render as HTML | |
| 86 // * Safari 3: Render as HTML | |
| 87 // * Opera 9: Render as HTML | |
| 88 // The layout tests rely on us rendering this as HTML. | |
| 89 // But we're conservative in XHTML detection, as this runs afoul of the | |
| 90 // "don't detect dangerous mime types" rule. | |
| 91 // | |
| 92 // Note that our definition of HTML payload is much stricter than IE's | |
| 93 // definition and roughly the same as Firefox's definition. | |
| 94 | |
| 95 #include <string> | |
| 96 | |
| 97 #include "net/base/mime_sniffer.h" | |
| 98 | |
| 99 #include "base/basictypes.h" | |
| 100 #include "base/logging.h" | |
| 101 #include "base/metrics/histogram.h" | |
| 102 #include "base/strings/string_util.h" | |
| 103 #include "net/base/mime_util.h" | |
| 104 #include "url/gurl.h" | |
| 105 | |
| 106 namespace net { | |
| 107 | |
// Upper bound, in bytes, on the length of any single magic pattern matched by
// this file. MatchMagicNumber() DCHECKs every pattern against it and
// SniffForMagicNumbers() truncates its input to it, so raise the value
// whenever a longer pattern is added. (The longest pattern at present is the
// 42-character XHTML string in kMagicXML.)
static const size_t kBytesRequiredForMagic = 42;
| 111 | |
// One row of a sniffing table: a byte/character pattern and the mime type to
// report when content starts with that pattern.
struct MagicNumber {
  // Mime type assigned to the result when the pattern matches.
  const char* const mime_type;

  // The pattern itself. For non-string entries a '.' byte matches any byte.
  const char* const magic;

  // Number of pattern bytes to compare (excludes the literal's terminator).
  size_t magic_len;

  // True when the pattern is compared case-insensitively as text.
  bool is_string;

  // Optional per-byte AND mask applied to the content before comparing.
  // If set, must have the same length as |magic|.
  const char* const mask;
};
| 119 | |
// Builds a MagicNumber row for an exact byte pattern (no mask, not a string).
// |magic| must be a string literal so that sizeof() yields its length; the
// trailing comma lets rows be listed back to back in a table initializer.
#define MAGIC_NUMBER(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic) - 1, false, NULL },
| 122 | |
// Compile-time guard used by verified_sizeof(): instantiating it with two
// different sizes fails to compile; otherwise SIZES holds the common size.
template <int MagicSize, int MaskSize>
class VerifySizes {
 public:
  enum { SIZES = MagicSize };

 private:
  static_assert(MagicSize == MaskSize, "sizes must be equal");
};
| 130 | |
// sizeof(magic), with a compile-time check that |mask| has the same size.
#define verified_sizeof(magic, mask) \
  VerifySizes<sizeof(magic), sizeof(mask)>::SIZES
| 133 | |
// Builds a MagicNumber row whose content bytes are ANDed with |mask| before
// being compared to |magic|. Both arguments must be string literals of equal
// length; this is enforced at compile time by verified_sizeof().
#define MAGIC_MASK(mime_type, magic, mask) \
  { (mime_type), (magic), verified_sizeof(magic, mask) - 1, false, (mask) },
| 136 | |
// Builds a MagicNumber row for a text pattern. Magic strings are compared
// case-insensitively and must not include '\0' characters.
#define MAGIC_STRING(mime_type, magic) \
  { (mime_type), (magic), sizeof(magic) - 1, true, NULL },
| 140 | |
| 141 static const MagicNumber kMagicNumbers[] = { | |
| 142 // Source: HTML 5 specification | |
| 143 MAGIC_NUMBER("application/pdf", "%PDF-") | |
| 144 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") | |
| 145 MAGIC_NUMBER("image/gif", "GIF87a") | |
| 146 MAGIC_NUMBER("image/gif", "GIF89a") | |
| 147 MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A") | |
| 148 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") | |
| 149 MAGIC_NUMBER("image/bmp", "BM") | |
| 150 // Source: Mozilla | |
| 151 MAGIC_NUMBER("text/plain", "#!") // Script | |
| 152 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS | |
| 153 MAGIC_NUMBER("text/plain", "From") | |
| 154 MAGIC_NUMBER("text/plain", ">From") | |
| 155 // Chrome specific | |
| 156 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") | |
| 157 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") | |
| 158 MAGIC_NUMBER("video/x-ms-asf", | |
| 159 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") | |
| 160 MAGIC_NUMBER("image/tiff", "I I") | |
| 161 MAGIC_NUMBER("image/tiff", "II*") | |
| 162 MAGIC_NUMBER("image/tiff", "MM\x00*") | |
| 163 MAGIC_NUMBER("audio/mpeg", "ID3") | |
| 164 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") | |
| 165 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") | |
| 166 // TODO(abarth): we don't handle partial byte matches yet | |
| 167 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") | |
| 168 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") | |
| 169 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") | |
| 170 MAGIC_NUMBER("application/zip", "PK\x03\x04") | |
| 171 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") | |
| 172 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") | |
| 173 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE | |
| 174 // Sniffing for Flash: | |
| 175 // | |
| 176 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") | |
| 177 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") | |
| 178 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") | |
| 179 // | |
| 180 // Including these magic number for Flash is a trade off. | |
| 181 // | |
| 182 // Pros: | |
| 183 // * Flash is an important and popular file format | |
| 184 // | |
| 185 // Cons: | |
| 186 // * These patterns are fairly weak | |
| 187 // * If we mistakenly decide something is Flash, we will execute it | |
| 188 // in the origin of an unsuspecting site. This could be a security | |
| 189 // vulnerability if the site allows users to upload content. | |
| 190 // | |
| 191 // On balance, we do not include these patterns. | |
| 192 }; | |
| 193 | |
// Number of leading content bytes needed to test every entry of
// kOfficeMagicNumbers (the longest entry, the CFB signature, is 8 bytes).
static const size_t kBytesRequiredForOfficeMagic = 8;
| 197 | |
| 198 static const MagicNumber kOfficeMagicNumbers[] = { | |
| 199 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") | |
| 200 MAGIC_NUMBER("OOXML", "PK\x03\x04") | |
| 201 }; | |
| 202 | |
// Kind of Office document implied by a URL's file extension.
enum OfficeDocType {
  DOC_TYPE_WORD,        // Word document
  DOC_TYPE_EXCEL,       // Excel spreadsheet
  DOC_TYPE_POWERPOINT,  // PowerPoint presentation
  DOC_TYPE_NONE         // No recognized Office extension
};
| 209 | |
| 210 struct OfficeExtensionType { | |
| 211 OfficeDocType doc_type; | |
| 212 const char* const extension; | |
| 213 size_t extension_len; | |
| 214 }; | |
| 215 | |
// Builds an OfficeExtensionType row. |extension| must be a string literal so
// that sizeof() yields its length; the trailing comma lets rows be listed
// back to back in a table initializer.
#define OFFICE_EXTENSION(type, extension) \
  { (type), (extension), sizeof(extension) - 1 },
| 218 | |
| 219 static const OfficeExtensionType kOfficeExtensionTypes[] = { | |
| 220 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc") | |
| 221 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls") | |
| 222 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt") | |
| 223 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx") | |
| 224 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx") | |
| 225 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx") | |
| 226 }; | |
| 227 | |
| 228 static const MagicNumber kExtraMagicNumbers[] = { | |
| 229 MAGIC_NUMBER("image/x-xbitmap", "#define") | |
| 230 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00") | |
| 231 MAGIC_NUMBER("image/svg+xml", "<?xml_version=") | |
| 232 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ") | |
| 233 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST") | |
| 234 MAGIC_NUMBER("audio/ogg", "OggS") | |
| 235 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0") | |
| 236 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0") | |
| 237 MAGIC_NUMBER("video/3gpp", "....ftyp3g") | |
| 238 MAGIC_NUMBER("video/3gpp", "....ftypavcl") | |
| 239 MAGIC_NUMBER("video/mp4", "....ftyp") | |
| 240 MAGIC_NUMBER("video/quicktime", "....moov") | |
| 241 MAGIC_NUMBER("application/x-shockwave-flash", "CWS") | |
| 242 MAGIC_NUMBER("application/x-shockwave-flash", "FWS") | |
| 243 MAGIC_NUMBER("video/x-flv", "FLV") | |
| 244 MAGIC_NUMBER("audio/x-flac", "fLaC") | |
| 245 | |
| 246 // RAW image types. | |
| 247 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR") | |
| 248 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR") | |
| 249 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM") | |
| 250 MAGIC_NUMBER("image/x-olympus-orf", "MMOR") // big-endian | |
| 251 MAGIC_NUMBER("image/x-olympus-orf", "IIRO") // little-endian | |
| 252 MAGIC_NUMBER("image/x-olympus-orf", "IIRS") // little-endian | |
| 253 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ") | |
| 254 MAGIC_NUMBER("image/x-panasonic-raw", | |
| 255 "IIU\x00\x08\x00\x00\x00") // Panasonic .raw | |
| 256 MAGIC_NUMBER("image/x-panasonic-raw", | |
| 257 "IIU\x00\x18\x00\x00\x00") // Panasonic .rw2 | |
| 258 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw") | |
| 259 MAGIC_NUMBER("image/x-x3f", "FOVb") | |
| 260 }; | |
| 261 | |
// Our HTML sniffer differs slightly from Mozilla's. For example, Mozilla will
// decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is
// HTML, but we will not.

// Builds a case-insensitive "text/html" row for an opening tag. |tag| must be
// a string literal (without the leading '<'), since it is pasted onto "<".
#define MAGIC_HTML_TAG(tag) \
  MAGIC_STRING("text/html", "<" tag)
| 268 | |
| 269 static const MagicNumber kSniffableTags[] = { | |
| 270 // XML processing directive. Although this is not an HTML mime type, we sniff | |
| 271 // for this in the HTML phase because text/xml is just as powerful as HTML and | |
| 272 // we want to leverage our white space skipping technology. | |
| 273 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla | |
| 274 // DOCTYPEs | |
| 275 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec | |
| 276 // Sniffable tags, ordered by how often they occur in sniffable documents. | |
| 277 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla | |
| 278 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla | |
| 279 MAGIC_HTML_TAG("!--") | |
| 280 MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla | |
| 281 MAGIC_HTML_TAG("iframe") // Mozilla | |
| 282 MAGIC_HTML_TAG("h1") // Mozilla | |
| 283 MAGIC_HTML_TAG("div") // Mozilla | |
| 284 MAGIC_HTML_TAG("font") // Mozilla | |
| 285 MAGIC_HTML_TAG("table") // Mozilla | |
| 286 MAGIC_HTML_TAG("a") // Mozilla | |
| 287 MAGIC_HTML_TAG("style") // Mozilla | |
| 288 MAGIC_HTML_TAG("title") // Mozilla | |
| 289 MAGIC_HTML_TAG("b") // Mozilla | |
| 290 MAGIC_HTML_TAG("body") // Mozilla | |
| 291 MAGIC_HTML_TAG("br") | |
| 292 MAGIC_HTML_TAG("p") // Mozilla | |
| 293 }; | |
| 294 | |
| 295 static base::HistogramBase* UMASnifferHistogramGet(const char* name, | |
| 296 int array_size) { | |
| 297 base::HistogramBase* counter = | |
| 298 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, | |
| 299 base::HistogramBase::kUmaTargetedHistogramFlag); | |
| 300 return counter; | |
| 301 } | |
| 302 | |
// Returns true if the first |len| bytes of |content| match |magic_entry|.
// A '.' in |magic_entry| is a wildcard that accepts any single content byte,
// which lets a pattern skip over bytes it does not care about. A |len| of 0
// always matches.
static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    if (magic_entry[i] == '.')
      continue;  // Wildcard: any byte is acceptable here.
    if (magic_entry[i] != content[i])
      return false;
  }
  return true;
}
| 315 | |
// Masked variant of MagicCmp(): each content byte is ANDed with the
// corresponding |mask| byte before being compared with |magic_entry|, so bits
// cleared in the mask are ignored. As in MagicCmp(), a '.' in |magic_entry|
// accepts any content byte. |mask| must hold at least |len| bytes.
static bool MagicMaskCmp(const char* magic_entry,
                         const char* content,
                         size_t len,
                         const char* mask) {
  for (size_t i = 0; i < len; ++i) {
    if (magic_entry[i] == '.')
      continue;  // Wildcard: any byte is acceptable here.
    if (magic_entry[i] != (mask[i] & content[i]))
      return false;
  }
  return true;
}
| 332 | |
| 333 static bool MatchMagicNumber(const char* content, | |
| 334 size_t size, | |
| 335 const MagicNumber& magic_entry, | |
| 336 std::string* result) { | |
| 337 const size_t len = magic_entry.magic_len; | |
| 338 | |
| 339 // Keep kBytesRequiredForMagic honest. | |
| 340 DCHECK_LE(len, kBytesRequiredForMagic); | |
| 341 | |
| 342 // To compare with magic strings, we need to compute strlen(content), but | |
| 343 // content might not actually have a null terminator. In that case, we | |
| 344 // pretend the length is content_size. | |
| 345 const char* end = static_cast<const char*>(memchr(content, '\0', size)); | |
| 346 const size_t content_strlen = | |
| 347 (end != NULL) ? static_cast<size_t>(end - content) : size; | |
| 348 | |
| 349 bool match = false; | |
| 350 if (magic_entry.is_string) { | |
| 351 if (content_strlen >= len) { | |
| 352 // String comparisons are case-insensitive | |
| 353 match = (base::strncasecmp(magic_entry.magic, content, len) == 0); | |
| 354 } | |
| 355 } else { | |
| 356 if (size >= len) { | |
| 357 if (!magic_entry.mask) { | |
| 358 match = MagicCmp(magic_entry.magic, content, len); | |
| 359 } else { | |
| 360 match = MagicMaskCmp(magic_entry.magic, content, len, magic_entry.mask); | |
| 361 } | |
| 362 } | |
| 363 } | |
| 364 | |
| 365 if (match) { | |
| 366 result->assign(magic_entry.mime_type); | |
| 367 return true; | |
| 368 } | |
| 369 return false; | |
| 370 } | |
| 371 | |
| 372 static bool CheckForMagicNumbers(const char* content, size_t size, | |
| 373 const MagicNumber* magic, size_t magic_len, | |
| 374 base::HistogramBase* counter, | |
| 375 std::string* result) { | |
| 376 for (size_t i = 0; i < magic_len; ++i) { | |
| 377 if (MatchMagicNumber(content, size, magic[i], result)) { | |
| 378 if (counter) counter->Add(static_cast<int>(i)); | |
| 379 return true; | |
| 380 } | |
| 381 } | |
| 382 return false; | |
| 383 } | |
| 384 | |
| 385 // Truncates |size| to |max_size| and returns true if |size| is at least | |
| 386 // |max_size|. | |
| 387 static bool TruncateSize(const size_t max_size, size_t* size) { | |
| 388 // Keep kMaxBytesToSniff honest. | |
| 389 DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff); | |
| 390 | |
| 391 if (*size >= max_size) { | |
| 392 *size = max_size; | |
| 393 return true; | |
| 394 } | |
| 395 return false; | |
| 396 } | |
| 397 | |
| 398 // Returns true and sets result if the content appears to be HTML. | |
| 399 // Clears have_enough_content if more data could possibly change the result. | |
| 400 static bool SniffForHTML(const char* content, | |
| 401 size_t size, | |
| 402 bool* have_enough_content, | |
| 403 std::string* result) { | |
| 404 // For HTML, we are willing to consider up to 512 bytes. This may be overly | |
| 405 // conservative as IE only considers 256. | |
| 406 *have_enough_content &= TruncateSize(512, &size); | |
| 407 | |
| 408 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, | |
| 409 // but with some modifications to better match the HTML5 spec. | |
| 410 const char* const end = content + size; | |
| 411 const char* pos; | |
| 412 for (pos = content; pos < end; ++pos) { | |
| 413 if (!IsAsciiWhitespace(*pos)) | |
| 414 break; | |
| 415 } | |
| 416 static base::HistogramBase* counter(NULL); | |
| 417 if (!counter) { | |
| 418 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", | |
| 419 arraysize(kSniffableTags)); | |
| 420 } | |
| 421 // |pos| now points to first non-whitespace character (or at end). | |
| 422 return CheckForMagicNumbers(pos, end - pos, | |
| 423 kSniffableTags, arraysize(kSniffableTags), | |
| 424 counter, result); | |
| 425 } | |
| 426 | |
| 427 // Returns true and sets result if the content matches any of kMagicNumbers. | |
| 428 // Clears have_enough_content if more data could possibly change the result. | |
| 429 static bool SniffForMagicNumbers(const char* content, | |
| 430 size_t size, | |
| 431 bool* have_enough_content, | |
| 432 std::string* result) { | |
| 433 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | |
| 434 | |
| 435 // Check our big table of Magic Numbers | |
| 436 static base::HistogramBase* counter(NULL); | |
| 437 if (!counter) { | |
| 438 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", | |
| 439 arraysize(kMagicNumbers)); | |
| 440 } | |
| 441 return CheckForMagicNumbers(content, size, | |
| 442 kMagicNumbers, arraysize(kMagicNumbers), | |
| 443 counter, result); | |
| 444 } | |
| 445 | |
| 446 // Returns true and sets result if the content matches any of | |
| 447 // kOfficeMagicNumbers, and the URL has the proper extension. | |
| 448 // Clears |have_enough_content| if more data could possibly change the result. | |
| 449 static bool SniffForOfficeDocs(const char* content, | |
| 450 size_t size, | |
| 451 const GURL& url, | |
| 452 bool* have_enough_content, | |
| 453 std::string* result) { | |
| 454 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size); | |
| 455 | |
| 456 // Check our table of magic numbers for Office file types. | |
| 457 std::string office_version; | |
| 458 if (!CheckForMagicNumbers(content, size, | |
| 459 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), | |
| 460 NULL, &office_version)) | |
| 461 return false; | |
| 462 | |
| 463 OfficeDocType type = DOC_TYPE_NONE; | |
| 464 for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) { | |
| 465 std::string url_path = url.path(); | |
| 466 | |
| 467 if (url_path.length() < kOfficeExtensionTypes[i].extension_len) | |
| 468 continue; | |
| 469 | |
| 470 const char* extension = | |
| 471 &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len]; | |
| 472 | |
| 473 if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension, | |
| 474 kOfficeExtensionTypes[i].extension_len)) { | |
| 475 type = kOfficeExtensionTypes[i].doc_type; | |
| 476 break; | |
| 477 } | |
| 478 } | |
| 479 | |
| 480 if (type == DOC_TYPE_NONE) | |
| 481 return false; | |
| 482 | |
| 483 if (office_version == "CFB") { | |
| 484 switch (type) { | |
| 485 case DOC_TYPE_WORD: | |
| 486 *result = "application/msword"; | |
| 487 return true; | |
| 488 case DOC_TYPE_EXCEL: | |
| 489 *result = "application/vnd.ms-excel"; | |
| 490 return true; | |
| 491 case DOC_TYPE_POWERPOINT: | |
| 492 *result = "application/vnd.ms-powerpoint"; | |
| 493 return true; | |
| 494 case DOC_TYPE_NONE: | |
| 495 NOTREACHED(); | |
| 496 return false; | |
| 497 } | |
| 498 } else if (office_version == "OOXML") { | |
| 499 switch (type) { | |
| 500 case DOC_TYPE_WORD: | |
| 501 *result = "application/vnd.openxmlformats-officedocument." | |
| 502 "wordprocessingml.document"; | |
| 503 return true; | |
| 504 case DOC_TYPE_EXCEL: | |
| 505 *result = "application/vnd.openxmlformats-officedocument." | |
| 506 "spreadsheetml.sheet"; | |
| 507 return true; | |
| 508 case DOC_TYPE_POWERPOINT: | |
| 509 *result = "application/vnd.openxmlformats-officedocument." | |
| 510 "presentationml.presentation"; | |
| 511 return true; | |
| 512 case DOC_TYPE_NONE: | |
| 513 NOTREACHED(); | |
| 514 return false; | |
| 515 } | |
| 516 } | |
| 517 | |
| 518 NOTREACHED(); | |
| 519 return false; | |
| 520 } | |
| 521 | |
// Returns true if |type_hint| is exactly (case-sensitively) one of the
// Microsoft Office mime types listed below.
static bool IsOfficeType(const std::string& type_hint) {
  static const char* const kOfficeMimeTypes[] = {
    "application/msword",
    "application/vnd.ms-excel",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument."
        "wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument."
        "spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument."
        "presentationml.presentation",
    "application/vnd.ms-excel.sheet.macroenabled.12",
    "application/vnd.ms-word.document.macroenabled.12",
    "application/vnd.ms-powerpoint.presentation."
        "macroenabled.12",
    "application/mspowerpoint",
    "application/msexcel",
    "application/vnd.ms-word",
    "application/vnd.ms-word.document.12",
    "application/vnd.msword",
  };

  const size_t count = sizeof(kOfficeMimeTypes) / sizeof(kOfficeMimeTypes[0]);
  for (size_t i = 0; i < count; ++i) {
    if (type_hint == kOfficeMimeTypes[i])
      return true;
  }
  return false;
}
| 542 | |
| 543 // This function checks for files that have a Microsoft Office MIME type | |
| 544 // set, but are not actually Office files. | |
| 545 // | |
| 546 // If this is not actually an Office file, |*result| is set to | |
| 547 // "application/octet-stream", otherwise it is not modified. | |
| 548 // | |
| 549 // Returns false if additional data is required to determine the file type, or | |
| 550 // true if there is enough data to make a decision. | |
| 551 static bool SniffForInvalidOfficeDocs(const char* content, | |
| 552 size_t size, | |
| 553 const GURL& url, | |
| 554 std::string* result) { | |
| 555 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size)) | |
| 556 return false; | |
| 557 | |
| 558 // Check our table of magic numbers for Office file types. If it does not | |
| 559 // match one, the MIME type was invalid. Set it instead to a safe value. | |
| 560 std::string office_version; | |
| 561 if (!CheckForMagicNumbers(content, size, | |
| 562 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), | |
| 563 NULL, &office_version)) { | |
| 564 *result = "application/octet-stream"; | |
| 565 } | |
| 566 | |
| 567 // We have enough information to determine if this was a Microsoft Office | |
| 568 // document or not, so sniffing is completed. | |
| 569 return true; | |
| 570 } | |
| 571 | |
| 572 // Byte order marks | |
| 573 static const MagicNumber kMagicXML[] = { | |
| 574 // We want to be very conservative in interpreting text/xml content as | |
| 575 // XHTML -- we just want to sniff enough to make unit tests pass. | |
| 576 // So we match explicitly on this, and don't match other ways of writing | |
| 577 // it in semantically-equivalent ways. | |
| 578 MAGIC_STRING("application/xhtml+xml", | |
| 579 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") | |
| 580 MAGIC_STRING("application/atom+xml", "<feed") | |
| 581 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 | |
| 582 }; | |
| 583 | |
| 584 // Returns true and sets result if the content appears to contain XHTML or a | |
| 585 // feed. | |
| 586 // Clears have_enough_content if more data could possibly change the result. | |
| 587 // | |
| 588 // TODO(evanm): this is similar but more conservative than what Safari does, | |
| 589 // while HTML5 has a different recommendation -- what should we do? | |
| 590 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset | |
| 591 // of ASCII -- do we care? | |
| 592 static bool SniffXML(const char* content, | |
| 593 size_t size, | |
| 594 bool* have_enough_content, | |
| 595 std::string* result) { | |
| 596 // We allow at most 300 bytes of content before we expect the opening tag. | |
| 597 *have_enough_content &= TruncateSize(300, &size); | |
| 598 const char* pos = content; | |
| 599 const char* const end = content + size; | |
| 600 | |
| 601 // This loop iterates through tag-looking offsets in the file. | |
| 602 // We want to skip XML processing instructions (of the form "<?xml ...") | |
| 603 // and stop at the first "plain" tag, then make a decision on the mime-type | |
| 604 // based on the name (or possibly attributes) of that tag. | |
| 605 static base::HistogramBase* counter(NULL); | |
| 606 if (!counter) { | |
| 607 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", | |
| 608 arraysize(kMagicXML)); | |
| 609 } | |
| 610 const int kMaxTagIterations = 5; | |
| 611 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { | |
| 612 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); | |
| 613 if (!pos) | |
| 614 return false; | |
| 615 | |
| 616 if ((pos + sizeof("<?xml") - 1 <= end) && | |
| 617 (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0)) { | |
| 618 // Skip XML declarations. | |
| 619 ++pos; | |
| 620 continue; | |
| 621 } else if ((pos + sizeof("<!DOCTYPE") - 1 <= end) && | |
| 622 (base::strncasecmp(pos, "<!DOCTYPE", sizeof("<!DOCTYPE") - 1) == | |
| 623 0)) { | |
| 624 // Skip DOCTYPE declarations. | |
| 625 ++pos; | |
| 626 continue; | |
| 627 } | |
| 628 | |
| 629 if (CheckForMagicNumbers(pos, end - pos, | |
| 630 kMagicXML, arraysize(kMagicXML), | |
| 631 counter, result)) | |
| 632 return true; | |
| 633 | |
| 634 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult | |
| 635 // to identify. | |
| 636 | |
| 637 // If we get here, we've hit an initial tag that hasn't matched one of the | |
| 638 // above tests. Abort. | |
| 639 return true; | |
| 640 } | |
| 641 | |
| 642 // We iterated too far without finding a start tag. | |
| 643 // If we have more content to look at, we aren't going to change our mind by | |
| 644 // seeing more bytes from the network. | |
| 645 return pos < end; | |
| 646 } | |
| 647 | |
| 648 // Byte order marks | |
| 649 static const MagicNumber kByteOrderMark[] = { | |
| 650 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE | |
| 651 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE | |
| 652 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 | |
| 653 }; | |
| 654 | |
// Whether a given byte looks like it might be part of binary content.
// Indexed by the byte's unsigned value: 1 means "looks binary", 0 means it
// can appear in text. The only flagged bytes are control characters other
// than TAB, LF, FF, CR and ESC. The table is read-only.
// Source: HTML5 spec
static const char kByteLooksBinary[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
};
| 675 | |
| 676 // Returns true and sets result to "application/octet-stream" if the content | |
| 677 // appears to be binary data. Otherwise, returns false and sets "text/plain". | |
| 678 // Clears have_enough_content if more data could possibly change the result. | |
| 679 static bool SniffBinary(const char* content, | |
| 680 size_t size, | |
| 681 bool* have_enough_content, | |
| 682 std::string* result) { | |
| 683 // There is no concensus about exactly how to sniff for binary content. | |
| 684 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. | |
| 685 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. | |
| 686 // Here, we side with FF, but with a smaller buffer. This size was chosen | |
| 687 // because it is small enough to comfortably fit into a single packet (after | |
| 688 // allowing for headers) and yet large enough to account for binary formats | |
| 689 // that have a significant amount of ASCII at the beginning (crbug.com/15314). | |
| 690 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); | |
| 691 | |
| 692 // First, we look for a BOM. | |
| 693 static base::HistogramBase* counter(NULL); | |
| 694 if (!counter) { | |
| 695 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", | |
| 696 arraysize(kByteOrderMark)); | |
| 697 } | |
| 698 std::string unused; | |
| 699 if (CheckForMagicNumbers(content, size, | |
| 700 kByteOrderMark, arraysize(kByteOrderMark), | |
| 701 counter, &unused)) { | |
| 702 // If there is BOM, we think the buffer is not binary. | |
| 703 result->assign("text/plain"); | |
| 704 return false; | |
| 705 } | |
| 706 | |
| 707 // Next we look to see if any of the bytes "look binary." | |
| 708 for (size_t i = 0; i < size; ++i) { | |
| 709 // If we a see a binary-looking byte, we think the content is binary. | |
| 710 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { | |
| 711 result->assign("application/octet-stream"); | |
| 712 return true; | |
| 713 } | |
| 714 } | |
| 715 | |
| 716 // No evidence either way. Default to non-binary and, if truncated, clear | |
| 717 // have_enough_content because there could be a binary looking byte in the | |
| 718 // truncated data. | |
| 719 *have_enough_content &= is_truncated; | |
| 720 result->assign("text/plain"); | |
| 721 return false; | |
| 722 } | |
| 723 | |
| 724 static bool IsUnknownMimeType(const std::string& mime_type) { | |
| 725 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. | |
| 726 // If we do, please be careful not to alter the semantics at all. | |
| 727 static const char* const kUnknownMimeTypes[] = { | |
| 728 // Empty mime types are as unknown as they get. | |
| 729 "", | |
| 730 // The unknown/unknown type is popular and uninformative | |
| 731 "unknown/unknown", | |
| 732 // The second most popular unknown mime type is application/unknown | |
| 733 "application/unknown", | |
| 734 // Firefox rejects a mime type if it is exactly */* | |
| 735 "*/*", | |
| 736 }; | |
| 737 static base::HistogramBase* counter(NULL); | |
| 738 if (!counter) { | |
| 739 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", | |
| 740 arraysize(kUnknownMimeTypes) + 1); | |
| 741 } | |
| 742 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { | |
| 743 if (mime_type == kUnknownMimeTypes[i]) { | |
| 744 counter->Add(i); | |
| 745 return true; | |
| 746 } | |
| 747 } | |
| 748 if (mime_type.find('/') == std::string::npos) { | |
| 749 // Firefox rejects a mime type if it does not contain a slash | |
| 750 counter->Add(arraysize(kUnknownMimeTypes)); | |
| 751 return true; | |
| 752 } | |
| 753 return false; | |
| 754 } | |
| 755 | |
| 756 // Returns true and sets result if the content appears to be a crx (Chrome | |
| 757 // extension) file. | |
| 758 // Clears have_enough_content if more data could possibly change the result. | |
| 759 static bool SniffCRX(const char* content, | |
| 760 size_t size, | |
| 761 const GURL& url, | |
| 762 const std::string& type_hint, | |
| 763 bool* have_enough_content, | |
| 764 std::string* result) { | |
| 765 static base::HistogramBase* counter(NULL); | |
| 766 if (!counter) | |
| 767 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); | |
| 768 | |
| 769 // Technically, the crx magic number is just Cr24, but the bytes after that | |
| 770 // are a version number which changes infrequently. Including it in the | |
| 771 // sniffing gives us less room for error. If the version number ever changes, | |
| 772 // we can just add an entry to this list. | |
| 773 // | |
| 774 // TODO(aa): If we ever have another magic number, we'll want to pass a | |
| 775 // histogram into CheckForMagicNumbers(), below, to see which one matched. | |
| 776 static const struct MagicNumber kCRXMagicNumbers[] = { | |
| 777 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") | |
| 778 }; | |
| 779 | |
| 780 // Only consider files that have the extension ".crx". | |
| 781 static const char kCRXExtension[] = ".crx"; | |
| 782 // Ignore null by subtracting 1. | |
| 783 static const int kExtensionLength = arraysize(kCRXExtension) - 1; | |
| 784 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == | |
| 785 url.path().size() - kExtensionLength) { | |
| 786 counter->Add(1); | |
| 787 } else { | |
| 788 return false; | |
| 789 } | |
| 790 | |
| 791 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | |
| 792 if (CheckForMagicNumbers(content, size, | |
| 793 kCRXMagicNumbers, arraysize(kCRXMagicNumbers), | |
| 794 NULL, result)) { | |
| 795 counter->Add(2); | |
| 796 } else { | |
| 797 return false; | |
| 798 } | |
| 799 | |
| 800 return true; | |
| 801 } | |
| 802 | |
| 803 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { | |
| 804 static base::HistogramBase* should_sniff_counter(NULL); | |
| 805 if (!should_sniff_counter) { | |
| 806 should_sniff_counter = | |
| 807 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); | |
| 808 } | |
| 809 bool sniffable_scheme = url.is_empty() || | |
| 810 url.SchemeIsHTTPOrHTTPS() || | |
| 811 url.SchemeIs("ftp") || | |
| 812 #if defined(OS_ANDROID) | |
| 813 url.SchemeIs("content") || | |
| 814 #endif | |
| 815 url.SchemeIsFile() || | |
| 816 url.SchemeIsFileSystem(); | |
| 817 if (!sniffable_scheme) { | |
| 818 should_sniff_counter->Add(1); | |
| 819 return false; | |
| 820 } | |
| 821 | |
| 822 static const char* const kSniffableTypes[] = { | |
| 823 // Many web servers are misconfigured to send text/plain for many | |
| 824 // different types of content. | |
| 825 "text/plain", | |
| 826 // We want to sniff application/octet-stream for | |
| 827 // application/x-chrome-extension, but nothing else. | |
| 828 "application/octet-stream", | |
| 829 // XHTML and Atom/RSS feeds are often served as plain xml instead of | |
| 830 // their more specific mime types. | |
| 831 "text/xml", | |
| 832 "application/xml", | |
| 833 // Check for false Microsoft Office MIME types. | |
| 834 "application/msword", | |
| 835 "application/vnd.ms-excel", | |
| 836 "application/vnd.ms-powerpoint", | |
| 837 "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | |
| 838 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | |
| 839 "application/vnd.openxmlformats-officedocument.presentationml.presentation", | |
| 840 "application/vnd.ms-excel.sheet.macroenabled.12", | |
| 841 "application/vnd.ms-word.document.macroenabled.12", | |
| 842 "application/vnd.ms-powerpoint.presentation.macroenabled.12", | |
| 843 "application/mspowerpoint", | |
| 844 "application/msexcel", | |
| 845 "application/vnd.ms-word", | |
| 846 "application/vnd.ms-word.document.12", | |
| 847 "application/vnd.msword", | |
| 848 }; | |
| 849 static base::HistogramBase* counter(NULL); | |
| 850 if (!counter) { | |
| 851 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", | |
| 852 arraysize(kSniffableTypes) + 1); | |
| 853 } | |
| 854 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { | |
| 855 if (mime_type == kSniffableTypes[i]) { | |
| 856 counter->Add(i); | |
| 857 should_sniff_counter->Add(2); | |
| 858 return true; | |
| 859 } | |
| 860 } | |
| 861 if (IsUnknownMimeType(mime_type)) { | |
| 862 // The web server didn't specify a content type or specified a mime | |
| 863 // type that we ignore. | |
| 864 counter->Add(arraysize(kSniffableTypes)); | |
| 865 should_sniff_counter->Add(2); | |
| 866 return true; | |
| 867 } | |
| 868 should_sniff_counter->Add(1); | |
| 869 return false; | |
| 870 } | |
| 871 | |
| 872 bool SniffMimeType(const char* content, | |
| 873 size_t content_size, | |
| 874 const GURL& url, | |
| 875 const std::string& type_hint, | |
| 876 std::string* result) { | |
| 877 DCHECK_LT(content_size, 1000000U); // sanity check | |
| 878 DCHECK(content); | |
| 879 DCHECK(result); | |
| 880 | |
| 881 // By default, we assume we have enough content. | |
| 882 // Each sniff routine may unset this if it wasn't provided enough content. | |
| 883 bool have_enough_content = true; | |
| 884 | |
| 885 // By default, we'll return the type hint. | |
| 886 // Each sniff routine may modify this if it has a better guess.. | |
| 887 result->assign(type_hint); | |
| 888 | |
| 889 // If the file has a Microsoft Office MIME type, we should only check that it | |
| 890 // is a valid Office file. Because this is the only reason we sniff files | |
| 891 // with a Microsoft Office MIME type, we can return early. | |
| 892 if (IsOfficeType(type_hint)) | |
| 893 return SniffForInvalidOfficeDocs(content, content_size, url, result); | |
| 894 | |
| 895 // Cache information about the type_hint | |
| 896 const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint); | |
| 897 | |
| 898 // First check for HTML | |
| 899 if (hint_is_unknown_mime_type) { | |
| 900 // We're only willing to sniff HTML if the server has not supplied a mime | |
| 901 // type, or if the type it did supply indicates that it doesn't know what | |
| 902 // the type should be. | |
| 903 if (SniffForHTML(content, content_size, &have_enough_content, result)) | |
| 904 return true; // We succeeded in sniffing HTML. No more content needed. | |
| 905 } | |
| 906 | |
| 907 // We're only willing to sniff for binary in 3 cases: | |
| 908 // 1. The server has not supplied a mime type. | |
| 909 // 2. The type it did supply indicates that it doesn't know what the type | |
| 910 // should be. | |
| 911 // 3. The type is "text/plain" which is the default on some web servers and | |
| 912 // could be indicative of a mis-configuration that we shield the user from. | |
| 913 const bool hint_is_text_plain = (type_hint == "text/plain"); | |
| 914 if (hint_is_unknown_mime_type || hint_is_text_plain) { | |
| 915 if (!SniffBinary(content, content_size, &have_enough_content, result)) { | |
| 916 // If the server said the content was text/plain and it doesn't appear | |
| 917 // to be binary, then we trust it. | |
| 918 if (hint_is_text_plain) { | |
| 919 return have_enough_content; | |
| 920 } | |
| 921 } | |
| 922 } | |
| 923 | |
| 924 // If we have plain XML, sniff XML subtypes. | |
| 925 if (type_hint == "text/xml" || type_hint == "application/xml") { | |
| 926 // We're not interested in sniffing these types for images and the like. | |
| 927 // Instead, we're looking explicitly for a feed. If we don't find one | |
| 928 // we're done and return early. | |
| 929 if (SniffXML(content, content_size, &have_enough_content, result)) | |
| 930 return true; | |
| 931 return have_enough_content; | |
| 932 } | |
| 933 | |
| 934 // CRX files (Chrome extensions) have a special sniffing algorithm. It is | |
| 935 // tighter than the others because we don't have to match legacy behavior. | |
| 936 if (SniffCRX(content, content_size, url, type_hint, | |
| 937 &have_enough_content, result)) | |
| 938 return true; | |
| 939 | |
| 940 // Check the file extension and magic numbers to see if this is an Office | |
| 941 // document. This needs to be checked before the general magic numbers | |
| 942 // because zip files and Office documents (OOXML) have the same magic number. | |
| 943 if (SniffForOfficeDocs(content, content_size, url, | |
| 944 &have_enough_content, result)) | |
| 945 return true; // We've matched a magic number. No more content needed. | |
| 946 | |
| 947 // We're not interested in sniffing for magic numbers when the type_hint | |
| 948 // is application/octet-stream. Time to bail out. | |
| 949 if (type_hint == "application/octet-stream") | |
| 950 return have_enough_content; | |
| 951 | |
| 952 // Now we look in our large table of magic numbers to see if we can find | |
| 953 // anything that matches the content. | |
| 954 if (SniffForMagicNumbers(content, content_size, | |
| 955 &have_enough_content, result)) | |
| 956 return true; // We've matched a magic number. No more content needed. | |
| 957 | |
| 958 return have_enough_content; | |
| 959 } | |
| 960 | |
| 961 bool SniffMimeTypeFromLocalData(const char* content, | |
| 962 size_t size, | |
| 963 std::string* result) { | |
| 964 // First check the extra table. | |
| 965 if (CheckForMagicNumbers(content, size, kExtraMagicNumbers, | |
| 966 arraysize(kExtraMagicNumbers), NULL, result)) | |
| 967 return true; | |
| 968 // Finally check the original table. | |
| 969 return CheckForMagicNumbers(content, size, kMagicNumbers, | |
| 970 arraysize(kMagicNumbers), NULL, result); | |
| 971 } | |
| 972 | |
| 973 } // namespace net | |
| OLD | NEW |