OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 // Detecting mime types is a tricky business because we need to balance | |
6 // compatibility concerns with security issues. Here is a survey of how other | |
7 // browsers behave and then a description of how we intend to behave. | |
8 // | |
9 // HTML payload, no Content-Type header: | |
10 // * IE 7: Render as HTML | |
11 // * Firefox 2: Render as HTML | |
12 // * Safari 3: Render as HTML | |
13 // * Opera 9: Render as HTML | |
14 // | |
15 // Here the choice seems clear: | |
16 // => Chrome: Render as HTML | |
17 // | |
18 // HTML payload, Content-Type: "text/plain": | |
19 // * IE 7: Render as HTML | |
20 // * Firefox 2: Render as text | |
21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL | |
22 // has an HTML extension) | |
23 // * Opera 9: Render as text | |
24 // | |
25 // Here we choose to follow the majority (and break some compatibility with IE). | |
26 // Many folks dislike IE's behavior here. | |
27 // => Chrome: Render as text | |
28 // We generalize this as follows. If the Content-Type header is text/plain | |
29 // we won't detect dangerous mime types (those that can execute script). | |
30 // | |
31 // HTML payload, Content-Type: "application/octet-stream": | |
32 // * IE 7: Render as HTML | |
33 // * Firefox 2: Download as application/octet-stream | |
34 // * Safari 3: Render as HTML | |
35 // * Opera 9: Render as HTML | |
36 // | |
37 // We follow Firefox. | |
38 // => Chrome: Download as application/octet-stream | |
39 // One factor in this decision is that IIS 4 and 5 will send | |
40 // application/octet-stream for .xhtml files (because they don't recognize | |
41 // the extension). We did some experiments and it looks like this doesn't occur | |
42 // very often on the web. We choose the more secure option. | |
43 // | |
44 // GIF payload, no Content-Type header: | |
45 // * IE 7: Render as GIF | |
46 // * Firefox 2: Render as GIF | |
47 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the | |
48 // URL has a GIF extension) | |
49 // * Opera 9: Render as GIF | |
50 // | |
51 // The choice is clear. | |
52 // => Chrome: Render as GIF | |
53 // Once we decide to render HTML without a Content-Type header, there isn't much | |
54 // reason not to render GIFs. | |
55 // | |
56 // GIF payload, Content-Type: "text/plain": | |
57 // * IE 7: Render as GIF | |
58 // * Firefox 2: Download as application/octet-stream (Note: Firefox will | |
59 // Download as GIF if the URL has a GIF extension) | |
60 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the | |
61 // URL has a GIF extension) | |
62 // * Opera 9: Render as GIF | |
63 // | |
64 // Displaying as text/plain makes little sense as the content will look like | |
65 // gibberish. Here, we could change our minds and download. | |
66 // => Chrome: Render as GIF | |
67 // | |
68 // GIF payload, Content-Type: "application/octet-stream": | |
69 // * IE 7: Render as GIF | |
70 // * Firefox 2: Download as application/octet-stream (Note: Firefox will | |
71 // Download as GIF if the URL has a GIF extension) | |
72 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the | |
73 // URL has a GIF extension) | |
74 // * Opera 9: Render as GIF | |
75 // | |
76 // We used to render as GIF here, but the problem is that some sites want to | |
77 // trigger downloads by sending application/octet-stream (even though they | |
78 // should be sending Content-Disposition: attachment). Although it is safe | |
79 // to render as GIF from a security perspective, we actually get better | |
80 // compatibility if we don't sniff from application/octet-stream at all. | |
81 // => Chrome: Download as application/octet-stream | |
82 // | |
83 // XHTML payload, Content-Type: "text/xml": | |
84 // * IE 7: Render as XML | |
85 // * Firefox 2: Render as HTML | |
86 // * Safari 3: Render as HTML | |
87 // * Opera 9: Render as HTML | |
88 // The layout tests rely on us rendering this as HTML. | |
89 // But we're conservative in XHTML detection, as this runs afoul of the | |
90 // "don't detect dangerous mime types" rule. | |
91 // | |
92 // Note that our definition of HTML payload is much stricter than IE's | |
93 // definition and roughly the same as Firefox's definition. | |
94 | |
95 #include <string> | |
96 | |
97 #include "net/base/mime_sniffer.h" | |
98 | |
99 #include "base/basictypes.h" | |
100 #include "base/logging.h" | |
101 #include "base/metrics/histogram.h" | |
102 #include "base/strings/string_util.h" | |
103 #include "net/base/mime_util.h" | |
104 #include "url/gurl.h" | |
105 | |
106 namespace net { | |
107 | |
108 // The number of content bytes we need to use all our magic numbers. Feel free | |
109 // to increase this number if you add a longer magic number. | |
110 static const size_t kBytesRequiredForMagic = 42; | |
111 | |
// One row of a sniffing table: a signature to look for at the start of the
// content and the value to report when it is found.
112 struct MagicNumber { | |
// Value copied into the sniffing result when this entry matches.
113 const char* const mime_type; | |
// Signature bytes. For non-string entries a '.' byte matches any content byte
// (see MagicCmp() and MagicMaskCmp()).
114 const char* const magic; | |
// Number of signature bytes to compare; the MAGIC_* macros compute it as
// sizeof(literal) - 1, i.e. without the literal's terminating NUL.
115 size_t magic_len; | |
// True if |magic| is text that is compared case-insensitively.
116 bool is_string; | |
117 const char* const mask; // if set, must have same length as |magic| | |
118 }; | |
119 | |
// Table entry for an exact byte signature. The length is sizeof(magic)-1, so
// embedded '\0' bytes in the literal are counted and only the terminating NUL
// is dropped. Expands to an initializer followed by a comma, so entries are
// listed without separators.
120 #define MAGIC_NUMBER(mime_type, magic) \ | |
121 { (mime_type), (magic), sizeof(magic)-1, false, NULL }, | |
122 | |
// Compile-time check that two sizes are equal; SIZES evaluates to that common
// size. Used to make sure a magic literal and its mask literal have the same
// length.
123 template <int MagicSize, int MaskSize> | |
124 class VerifySizes { | |
125 static_assert(MagicSize == MaskSize, "sizes must be equal"); | |
126 | |
127 public: | |
128 enum { SIZES = MagicSize }; | |
129 }; | |
130 | |
// sizeof(magic), with a build failure if sizeof(mask) differs.
131 #define verified_sizeof(magic, mask) \ | |
132 VerifySizes<sizeof(magic), sizeof(mask)>::SIZES | |
133 | |
// Table entry for a byte signature that is compared after ANDing each content
// byte with the corresponding |mask| byte (see MagicMaskCmp()).
134 #define MAGIC_MASK(mime_type, magic, mask) \ | |
135 { (mime_type), (magic), verified_sizeof(magic, mask)-1, false, (mask) }, | |
136 | |
137 // Magic strings are case insensitive and must not include '\0' characters | |
138 #define MAGIC_STRING(mime_type, magic) \ | |
139 { (mime_type), (magic), sizeof(magic)-1, true, NULL }, | |
140 | |
// Signatures checked by SniffForMagicNumbers(). Entries are tried in order and
// the first match wins (see CheckForMagicNumbers()), so a longer signature
// must come before any shorter one that is a prefix of it (e.g. "%!PS-Adobe-"
// before "%!"). The index of the matching entry is also recorded in the
// "mime_sniffer.kMagicNumbers2" histogram, so reordering entries changes the
// meaning of its buckets.
141 static const MagicNumber kMagicNumbers[] = { | |
142 // Source: HTML 5 specification | |
143 MAGIC_NUMBER("application/pdf", "%PDF-") | |
144 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") | |
145 MAGIC_NUMBER("image/gif", "GIF87a") | |
146 MAGIC_NUMBER("image/gif", "GIF89a") | |
147 MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A") | |
148 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") | |
149 MAGIC_NUMBER("image/bmp", "BM") | |
150 // Source: Mozilla | |
151 MAGIC_NUMBER("text/plain", "#!") // Script | |
152 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS | |
153 MAGIC_NUMBER("text/plain", "From") | |
154 MAGIC_NUMBER("text/plain", ">From") | |
155 // Chrome specific | |
156 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") | |
// NOTE: \x2E is '.', which MagicCmp() treats as "match any byte", so the first
// byte of this signature is effectively not checked.
157 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") | |
158 MAGIC_NUMBER("video/x-ms-asf", | |
159 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") | |
160 MAGIC_NUMBER("image/tiff", "I I") | |
161 MAGIC_NUMBER("image/tiff", "II*") | |
162 MAGIC_NUMBER("image/tiff", "MM\x00*") | |
163 MAGIC_NUMBER("audio/mpeg", "ID3") | |
164 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") | |
165 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") | |
166 // TODO(abarth): we don't handle partial byte matches yet | |
167 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") | |
168 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") | |
169 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") | |
170 MAGIC_NUMBER("application/zip", "PK\x03\x04") | |
171 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") | |
172 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") | |
173 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE | |
174 // Sniffing for Flash: | |
175 // | |
176 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") | |
177 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") | |
178 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") | |
179 // | |
180 // Including these magic number for Flash is a trade off. | |
181 // | |
182 // Pros: | |
183 // * Flash is an important and popular file format | |
184 // | |
185 // Cons: | |
186 // * These patterns are fairly weak | |
187 // * If we mistakenly decide something is Flash, we will execute it | |
188 // in the origin of an unsuspecting site. This could be a security | |
189 // vulnerability if the site allows users to upload content. | |
190 // | |
191 // On balance, we do not include these patterns. | |
192 }; | |
193 | |
194 // The number of content bytes we need to use all our Microsoft Office magic | |
195 // numbers. | |
196 static const size_t kBytesRequiredForOfficeMagic = 8; | |
197 | |
// Container signatures of Microsoft Office documents. The first column is not
// a MIME type here: it is a container tag ("CFB" or "OOXML") that
// SniffForOfficeDocs() compares against to pick the MIME type family.
198 static const MagicNumber kOfficeMagicNumbers[] = { | |
199 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") | |
200 MAGIC_NUMBER("OOXML", "PK\x03\x04") | |
201 }; | |
202 | |
// Office application a document belongs to, as derived from the URL's file
// extension. DOC_TYPE_NONE means no known extension matched.
203 enum OfficeDocType { | |
204 DOC_TYPE_WORD, | |
205 DOC_TYPE_EXCEL, | |
206 DOC_TYPE_POWERPOINT, | |
207 DOC_TYPE_NONE | |
208 }; | |
209 | |
// Maps a file extension (including the leading dot) to an OfficeDocType.
// |extension_len| is the extension's length without the terminating NUL.
210 struct OfficeExtensionType { | |
211 OfficeDocType doc_type; | |
212 const char* const extension; | |
213 size_t extension_len; | |
214 }; | |
215 | |
// Table entry helper; expands to an initializer followed by a comma.
216 #define OFFICE_EXTENSION(type, extension) \ | |
217 { (type), (extension), sizeof(extension) - 1 }, | |
218 | |
// Extensions recognized by SniffForOfficeDocs(), which compares each one
// case-insensitively against the tail of the URL path.
219 static const OfficeExtensionType kOfficeExtensionTypes[] = { | |
220 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc") | |
221 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls") | |
222 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt") | |
223 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx") | |
224 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx") | |
225 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx") | |
226 }; | |
227 | |
228 static const MagicNumber kExtraMagicNumbers[] = { | |
229 MAGIC_NUMBER("image/x-xbitmap", "#define") | |
230 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00") | |
231 MAGIC_NUMBER("image/svg+xml", "<?xml_version=") | |
232 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ") | |
233 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST") | |
234 MAGIC_NUMBER("audio/ogg", "OggS") | |
235 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0") | |
236 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0") | |
237 MAGIC_NUMBER("video/3gpp", "....ftyp3g") | |
238 MAGIC_NUMBER("video/3gpp", "....ftypavcl") | |
239 MAGIC_NUMBER("video/mp4", "....ftyp") | |
240 MAGIC_NUMBER("video/quicktime", "....moov") | |
241 MAGIC_NUMBER("application/x-shockwave-flash", "CWS") | |
242 MAGIC_NUMBER("application/x-shockwave-flash", "FWS") | |
243 MAGIC_NUMBER("video/x-flv", "FLV") | |
244 MAGIC_NUMBER("audio/x-flac", "fLaC") | |
245 | |
246 // RAW image types. | |
247 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR") | |
248 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR") | |
249 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM") | |
250 MAGIC_NUMBER("image/x-olympus-orf", "MMOR") // big-endian | |
251 MAGIC_NUMBER("image/x-olympus-orf", "IIRO") // little-endian | |
252 MAGIC_NUMBER("image/x-olympus-orf", "IIRS") // little-endian | |
253 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ") | |
254 MAGIC_NUMBER("image/x-panasonic-raw", | |
255 "IIU\x00\x08\x00\x00\x00") // Panasonic .raw | |
256 MAGIC_NUMBER("image/x-panasonic-raw", | |
257 "IIU\x00\x18\x00\x00\x00") // Panasonic .rw2 | |
258 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw") | |
259 MAGIC_NUMBER("image/x-x3f", "FOVb") | |
260 }; | |
261 | |
262 // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will | |
263 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is | |
264 // HTML, but we will not. | |
265 | |
// Case-insensitive "text/html" entry for content that starts with "<" + tag.
266 #define MAGIC_HTML_TAG(tag) \ | |
267 MAGIC_STRING("text/html", "<" tag) | |
268 | |
// Tags checked by SniffForHTML() after leading ASCII whitespace is skipped.
// Matching is a prefix comparison, so a short entry such as "<b" also matches
// content starting with "<br" or "<body"; the first matching entry wins. The
// index of the matching entry is recorded in the
// "mime_sniffer.kSniffableTags2" histogram, so reordering entries changes the
// meaning of its buckets.
269 static const MagicNumber kSniffableTags[] = { | |
270 // XML processing directive. Although this is not an HTML mime type, we sniff | |
271 // for this in the HTML phase because text/xml is just as powerful as HTML and | |
272 // we want to leverage our white space skipping technology. | |
273 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla | |
274 // DOCTYPEs | |
275 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec | |
276 // Sniffable tags, ordered by how often they occur in sniffable documents. | |
277 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla | |
278 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla | |
279 MAGIC_HTML_TAG("!--") | |
280 MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla | |
281 MAGIC_HTML_TAG("iframe") // Mozilla | |
282 MAGIC_HTML_TAG("h1") // Mozilla | |
283 MAGIC_HTML_TAG("div") // Mozilla | |
284 MAGIC_HTML_TAG("font") // Mozilla | |
285 MAGIC_HTML_TAG("table") // Mozilla | |
286 MAGIC_HTML_TAG("a") // Mozilla | |
287 MAGIC_HTML_TAG("style") // Mozilla | |
288 MAGIC_HTML_TAG("title") // Mozilla | |
289 MAGIC_HTML_TAG("b") // Mozilla | |
290 MAGIC_HTML_TAG("body") // Mozilla | |
291 MAGIC_HTML_TAG("br") | |
292 MAGIC_HTML_TAG("p") // Mozilla | |
293 }; | |
294 | |
295 static base::HistogramBase* UMASnifferHistogramGet(const char* name, | |
296 int array_size) { | |
297 base::HistogramBase* counter = | |
298 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, | |
299 base::HistogramBase::kUmaTargetedHistogramFlag); | |
300 return counter; | |
301 } | |
302 | |
// Returns true if the first |len| bytes of |content| match |magic_entry|.
// A '.' in |magic_entry| stands for "any single byte", which lets a signature
// skip over bytes it does not care about. Both buffers must hold at least
// |len| bytes; a |len| of zero always matches.
static bool MagicCmp(const char* magic_entry, const char* content, size_t len) {
  for (size_t i = 0; i < len; ++i) {
    const char expected = magic_entry[i];
    if (expected == '.')
      continue;  // Wildcard position.
    if (expected != content[i])
      return false;
  }
  return true;
}
315 | |
// Masked variant of MagicCmp(): each content byte is ANDed with the
// corresponding |mask| byte before it is compared with |magic_entry|, so bits
// cleared in the mask are ignored. A '.' in |magic_entry| still matches any
// byte. All three buffers must hold at least |len| bytes.
static bool MagicMaskCmp(const char* magic_entry,
                         const char* content,
                         size_t len,
                         const char* mask) {
  for (size_t i = 0; i < len; ++i) {
    const char expected = magic_entry[i];
    if (expected == '.')
      continue;  // Wildcard position.
    if (expected != (mask[i] & content[i]))
      return false;
  }
  return true;
}
332 | |
333 static bool MatchMagicNumber(const char* content, | |
334 size_t size, | |
335 const MagicNumber& magic_entry, | |
336 std::string* result) { | |
337 const size_t len = magic_entry.magic_len; | |
338 | |
339 // Keep kBytesRequiredForMagic honest. | |
340 DCHECK_LE(len, kBytesRequiredForMagic); | |
341 | |
342 // To compare with magic strings, we need to compute strlen(content), but | |
343 // content might not actually have a null terminator. In that case, we | |
344 // pretend the length is content_size. | |
345 const char* end = static_cast<const char*>(memchr(content, '\0', size)); | |
346 const size_t content_strlen = | |
347 (end != NULL) ? static_cast<size_t>(end - content) : size; | |
348 | |
349 bool match = false; | |
350 if (magic_entry.is_string) { | |
351 if (content_strlen >= len) { | |
352 // String comparisons are case-insensitive | |
353 match = (base::strncasecmp(magic_entry.magic, content, len) == 0); | |
354 } | |
355 } else { | |
356 if (size >= len) { | |
357 if (!magic_entry.mask) { | |
358 match = MagicCmp(magic_entry.magic, content, len); | |
359 } else { | |
360 match = MagicMaskCmp(magic_entry.magic, content, len, magic_entry.mask); | |
361 } | |
362 } | |
363 } | |
364 | |
365 if (match) { | |
366 result->assign(magic_entry.mime_type); | |
367 return true; | |
368 } | |
369 return false; | |
370 } | |
371 | |
372 static bool CheckForMagicNumbers(const char* content, size_t size, | |
373 const MagicNumber* magic, size_t magic_len, | |
374 base::HistogramBase* counter, | |
375 std::string* result) { | |
376 for (size_t i = 0; i < magic_len; ++i) { | |
377 if (MatchMagicNumber(content, size, magic[i], result)) { | |
378 if (counter) counter->Add(static_cast<int>(i)); | |
379 return true; | |
380 } | |
381 } | |
382 return false; | |
383 } | |
384 | |
385 // Truncates |size| to |max_size| and returns true if |size| is at least | |
386 // |max_size|. | |
387 static bool TruncateSize(const size_t max_size, size_t* size) { | |
388 // Keep kMaxBytesToSniff honest. | |
389 DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff); | |
390 | |
391 if (*size >= max_size) { | |
392 *size = max_size; | |
393 return true; | |
394 } | |
395 return false; | |
396 } | |
397 | |
398 // Returns true and sets result if the content appears to be HTML. | |
399 // Clears have_enough_content if more data could possibly change the result. | |
400 static bool SniffForHTML(const char* content, | |
401 size_t size, | |
402 bool* have_enough_content, | |
403 std::string* result) { | |
404 // For HTML, we are willing to consider up to 512 bytes. This may be overly | |
405 // conservative as IE only considers 256. | |
406 *have_enough_content &= TruncateSize(512, &size); | |
407 | |
408 // We adopt a strategy similar to that used by Mozilla to sniff HTML tags, | |
409 // but with some modifications to better match the HTML5 spec. | |
410 const char* const end = content + size; | |
411 const char* pos; | |
412 for (pos = content; pos < end; ++pos) { | |
413 if (!IsAsciiWhitespace(*pos)) | |
414 break; | |
415 } | |
416 static base::HistogramBase* counter(NULL); | |
417 if (!counter) { | |
418 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", | |
419 arraysize(kSniffableTags)); | |
420 } | |
421 // |pos| now points to first non-whitespace character (or at end). | |
422 return CheckForMagicNumbers(pos, end - pos, | |
423 kSniffableTags, arraysize(kSniffableTags), | |
424 counter, result); | |
425 } | |
426 | |
427 // Returns true and sets result if the content matches any of kMagicNumbers. | |
428 // Clears have_enough_content if more data could possibly change the result. | |
429 static bool SniffForMagicNumbers(const char* content, | |
430 size_t size, | |
431 bool* have_enough_content, | |
432 std::string* result) { | |
433 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | |
434 | |
435 // Check our big table of Magic Numbers | |
436 static base::HistogramBase* counter(NULL); | |
437 if (!counter) { | |
438 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", | |
439 arraysize(kMagicNumbers)); | |
440 } | |
441 return CheckForMagicNumbers(content, size, | |
442 kMagicNumbers, arraysize(kMagicNumbers), | |
443 counter, result); | |
444 } | |
445 | |
446 // Returns true and sets result if the content matches any of | |
447 // kOfficeMagicNumbers, and the URL has the proper extension. | |
448 // Clears |have_enough_content| if more data could possibly change the result. | |
449 static bool SniffForOfficeDocs(const char* content, | |
450 size_t size, | |
451 const GURL& url, | |
452 bool* have_enough_content, | |
453 std::string* result) { | |
454 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size); | |
455 | |
456 // Check our table of magic numbers for Office file types. | |
457 std::string office_version; | |
458 if (!CheckForMagicNumbers(content, size, | |
459 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), | |
460 NULL, &office_version)) | |
461 return false; | |
462 | |
463 OfficeDocType type = DOC_TYPE_NONE; | |
464 for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) { | |
465 std::string url_path = url.path(); | |
466 | |
467 if (url_path.length() < kOfficeExtensionTypes[i].extension_len) | |
468 continue; | |
469 | |
470 const char* extension = | |
471 &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len]; | |
472 | |
473 if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension, | |
474 kOfficeExtensionTypes[i].extension_len)) { | |
475 type = kOfficeExtensionTypes[i].doc_type; | |
476 break; | |
477 } | |
478 } | |
479 | |
480 if (type == DOC_TYPE_NONE) | |
481 return false; | |
482 | |
483 if (office_version == "CFB") { | |
484 switch (type) { | |
485 case DOC_TYPE_WORD: | |
486 *result = "application/msword"; | |
487 return true; | |
488 case DOC_TYPE_EXCEL: | |
489 *result = "application/vnd.ms-excel"; | |
490 return true; | |
491 case DOC_TYPE_POWERPOINT: | |
492 *result = "application/vnd.ms-powerpoint"; | |
493 return true; | |
494 case DOC_TYPE_NONE: | |
495 NOTREACHED(); | |
496 return false; | |
497 } | |
498 } else if (office_version == "OOXML") { | |
499 switch (type) { | |
500 case DOC_TYPE_WORD: | |
501 *result = "application/vnd.openxmlformats-officedocument." | |
502 "wordprocessingml.document"; | |
503 return true; | |
504 case DOC_TYPE_EXCEL: | |
505 *result = "application/vnd.openxmlformats-officedocument." | |
506 "spreadsheetml.sheet"; | |
507 return true; | |
508 case DOC_TYPE_POWERPOINT: | |
509 *result = "application/vnd.openxmlformats-officedocument." | |
510 "presentationml.presentation"; | |
511 return true; | |
512 case DOC_TYPE_NONE: | |
513 NOTREACHED(); | |
514 return false; | |
515 } | |
516 } | |
517 | |
518 NOTREACHED(); | |
519 return false; | |
520 } | |
521 | |
// Returns true if |type_hint| is exactly one of the Microsoft Office MIME
// types listed below. The comparison is case-sensitive.
static bool IsOfficeType(const std::string& type_hint) {
  static const char* const kOfficeMimeTypes[] = {
    "application/msword",
    "application/vnd.ms-excel",
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument."
        "wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument."
        "spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument."
        "presentationml.presentation",
    "application/vnd.ms-excel.sheet.macroenabled.12",
    "application/vnd.ms-word.document.macroenabled.12",
    "application/vnd.ms-powerpoint.presentation."
        "macroenabled.12",
    "application/mspowerpoint",
    "application/msexcel",
    "application/vnd.ms-word",
    "application/vnd.ms-word.document.12",
    "application/vnd.msword",
  };
  const size_t kNumTypes =
      sizeof(kOfficeMimeTypes) / sizeof(kOfficeMimeTypes[0]);

  for (size_t i = 0; i < kNumTypes; ++i) {
    if (type_hint == kOfficeMimeTypes[i])
      return true;
  }
  return false;
}
542 | |
543 // This function checks for files that have a Microsoft Office MIME type | |
544 // set, but are not actually Office files. | |
545 // | |
546 // If this is not actually an Office file, |*result| is set to | |
547 // "application/octet-stream", otherwise it is not modified. | |
548 // | |
549 // Returns false if additional data is required to determine the file type, or | |
550 // true if there is enough data to make a decision. | |
551 static bool SniffForInvalidOfficeDocs(const char* content, | |
552 size_t size, | |
553 const GURL& url, | |
554 std::string* result) { | |
555 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size)) | |
556 return false; | |
557 | |
558 // Check our table of magic numbers for Office file types. If it does not | |
559 // match one, the MIME type was invalid. Set it instead to a safe value. | |
560 std::string office_version; | |
561 if (!CheckForMagicNumbers(content, size, | |
562 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), | |
563 NULL, &office_version)) { | |
564 *result = "application/octet-stream"; | |
565 } | |
566 | |
567 // We have enough information to determine if this was a Microsoft Office | |
568 // document or not, so sniffing is completed. | |
569 return true; | |
570 } | |
571 | |
572 // Root-element signatures that SniffXML() uses to recognize XHTML and feeds. | |
573 static const MagicNumber kMagicXML[] = { | |
574 // We want to be very conservative in interpreting text/xml content as | |
575 // XHTML -- we just want to sniff enough to make unit tests pass. | |
576 // So we match explicitly on this, and don't match other ways of writing | |
577 // it in semantically-equivalent ways. | |
578 MAGIC_STRING("application/xhtml+xml", | |
579 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") | |
580 MAGIC_STRING("application/atom+xml", "<feed") | |
581 MAGIC_STRING("application/rss+xml", "<rss") // RSS feed root element | |
582 }; | |
583 | |
584 // Returns true and sets result if the content appears to contain XHTML or a | |
585 // feed. | |
586 // Clears have_enough_content if more data could possibly change the result. | |
587 // | |
588 // TODO(evanm): this is similar but more conservative than what Safari does, | |
589 // while HTML5 has a different recommendation -- what should we do? | |
590 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset | |
591 // of ASCII -- do we care? | |
592 static bool SniffXML(const char* content, | |
593 size_t size, | |
594 bool* have_enough_content, | |
595 std::string* result) { | |
596 // We allow at most 300 bytes of content before we expect the opening tag. | |
597 *have_enough_content &= TruncateSize(300, &size); | |
598 const char* pos = content; | |
599 const char* const end = content + size; | |
600 | |
601 // This loop iterates through tag-looking offsets in the file. | |
602 // We want to skip XML processing instructions (of the form "<?xml ...") | |
603 // and stop at the first "plain" tag, then make a decision on the mime-type | |
604 // based on the name (or possibly attributes) of that tag. | |
605 static base::HistogramBase* counter(NULL); | |
606 if (!counter) { | |
607 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", | |
608 arraysize(kMagicXML)); | |
609 } | |
610 const int kMaxTagIterations = 5; | |
611 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { | |
612 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); | |
613 if (!pos) | |
614 return false; | |
615 | |
616 if ((pos + sizeof("<?xml") - 1 <= end) && | |
617 (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0)) { | |
618 // Skip XML declarations. | |
619 ++pos; | |
620 continue; | |
621 } else if ((pos + sizeof("<!DOCTYPE") - 1 <= end) && | |
622 (base::strncasecmp(pos, "<!DOCTYPE", sizeof("<!DOCTYPE") - 1) == | |
623 0)) { | |
624 // Skip DOCTYPE declarations. | |
625 ++pos; | |
626 continue; | |
627 } | |
628 | |
629 if (CheckForMagicNumbers(pos, end - pos, | |
630 kMagicXML, arraysize(kMagicXML), | |
631 counter, result)) | |
632 return true; | |
633 | |
634 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult | |
635 // to identify. | |
636 | |
637 // If we get here, we've hit an initial tag that hasn't matched one of the | |
638 // above tests. Abort. | |
639 return true; | |
640 } | |
641 | |
642 // We iterated too far without finding a start tag. | |
643 // If we have more content to look at, we aren't going to change our mind by | |
644 // seeing more bytes from the network. | |
645 return pos < end; | |
646 } | |
647 | |
648 // Byte order marks | |
// Checked by SniffBinary(): content that starts with one of these is treated
// as text. SniffBinary() discards the matched mime_type and assigns
// "text/plain" itself.
649 static const MagicNumber kByteOrderMark[] = { | |
650 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE | |
651 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE | |
652 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 | |
653 }; | |
654 | |
// Whether a given byte looks like it might be part of binary content.
// Indexed by the byte's unsigned value; a non-zero entry means "binary".
// The table is read-only lookup data, hence const.
// Source: HTML5 spec
static const char kByteLooksBinary[] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,  // 0x00 - 0x0F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  // 0x10 - 0x1F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x20 - 0x2F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x30 - 0x3F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x40 - 0x4F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x50 - 0x5F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x60 - 0x6F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x70 - 0x7F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9F
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xA0 - 0xAF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xB0 - 0xBF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xC0 - 0xCF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xD0 - 0xDF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xE0 - 0xEF
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xF0 - 0xFF
};

// Every possible byte value must have an entry, since the table is indexed
// with arbitrary content bytes.
static_assert(sizeof(kByteLooksBinary) == 256,
              "kByteLooksBinary must have one entry per byte value");
675 | |
676 // Returns true and sets result to "application/octet-stream" if the content | |
677 // appears to be binary data. Otherwise, returns false and sets "text/plain". | |
678 // Clears have_enough_content if more data could possibly change the result. | |
679 static bool SniffBinary(const char* content, | |
680 size_t size, | |
681 bool* have_enough_content, | |
682 std::string* result) { | |
683 // There is no concensus about exactly how to sniff for binary content. | |
684 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. | |
685 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. | |
686 // Here, we side with FF, but with a smaller buffer. This size was chosen | |
687 // because it is small enough to comfortably fit into a single packet (after | |
688 // allowing for headers) and yet large enough to account for binary formats | |
689 // that have a significant amount of ASCII at the beginning (crbug.com/15314). | |
690 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); | |
691 | |
692 // First, we look for a BOM. | |
693 static base::HistogramBase* counter(NULL); | |
694 if (!counter) { | |
695 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", | |
696 arraysize(kByteOrderMark)); | |
697 } | |
698 std::string unused; | |
699 if (CheckForMagicNumbers(content, size, | |
700 kByteOrderMark, arraysize(kByteOrderMark), | |
701 counter, &unused)) { | |
702 // If there is BOM, we think the buffer is not binary. | |
703 result->assign("text/plain"); | |
704 return false; | |
705 } | |
706 | |
707 // Next we look to see if any of the bytes "look binary." | |
708 for (size_t i = 0; i < size; ++i) { | |
709 // If we a see a binary-looking byte, we think the content is binary. | |
710 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { | |
711 result->assign("application/octet-stream"); | |
712 return true; | |
713 } | |
714 } | |
715 | |
716 // No evidence either way. Default to non-binary and, if truncated, clear | |
717 // have_enough_content because there could be a binary looking byte in the | |
718 // truncated data. | |
719 *have_enough_content &= is_truncated; | |
720 result->assign("text/plain"); | |
721 return false; | |
722 } | |
723 | |
724 static bool IsUnknownMimeType(const std::string& mime_type) { | |
725 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. | |
726 // If we do, please be careful not to alter the semantics at all. | |
727 static const char* const kUnknownMimeTypes[] = { | |
728 // Empty mime types are as unknown as they get. | |
729 "", | |
730 // The unknown/unknown type is popular and uninformative | |
731 "unknown/unknown", | |
732 // The second most popular unknown mime type is application/unknown | |
733 "application/unknown", | |
734 // Firefox rejects a mime type if it is exactly */* | |
735 "*/*", | |
736 }; | |
737 static base::HistogramBase* counter(NULL); | |
738 if (!counter) { | |
739 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", | |
740 arraysize(kUnknownMimeTypes) + 1); | |
741 } | |
742 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { | |
743 if (mime_type == kUnknownMimeTypes[i]) { | |
744 counter->Add(i); | |
745 return true; | |
746 } | |
747 } | |
748 if (mime_type.find('/') == std::string::npos) { | |
749 // Firefox rejects a mime type if it does not contain a slash | |
750 counter->Add(arraysize(kUnknownMimeTypes)); | |
751 return true; | |
752 } | |
753 return false; | |
754 } | |
755 | |
756 // Returns true and sets result if the content appears to be a crx (Chrome | |
757 // extension) file. | |
758 // Clears have_enough_content if more data could possibly change the result. | |
759 static bool SniffCRX(const char* content, | |
760 size_t size, | |
761 const GURL& url, | |
762 const std::string& type_hint, | |
763 bool* have_enough_content, | |
764 std::string* result) { | |
765 static base::HistogramBase* counter(NULL); | |
766 if (!counter) | |
767 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); | |
768 | |
769 // Technically, the crx magic number is just Cr24, but the bytes after that | |
770 // are a version number which changes infrequently. Including it in the | |
771 // sniffing gives us less room for error. If the version number ever changes, | |
772 // we can just add an entry to this list. | |
773 // | |
774 // TODO(aa): If we ever have another magic number, we'll want to pass a | |
775 // histogram into CheckForMagicNumbers(), below, to see which one matched. | |
776 static const struct MagicNumber kCRXMagicNumbers[] = { | |
777 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") | |
778 }; | |
779 | |
780 // Only consider files that have the extension ".crx". | |
781 static const char kCRXExtension[] = ".crx"; | |
782 // Ignore null by subtracting 1. | |
783 static const int kExtensionLength = arraysize(kCRXExtension) - 1; | |
784 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == | |
785 url.path().size() - kExtensionLength) { | |
786 counter->Add(1); | |
787 } else { | |
788 return false; | |
789 } | |
790 | |
791 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | |
792 if (CheckForMagicNumbers(content, size, | |
793 kCRXMagicNumbers, arraysize(kCRXMagicNumbers), | |
794 NULL, result)) { | |
795 counter->Add(2); | |
796 } else { | |
797 return false; | |
798 } | |
799 | |
800 return true; | |
801 } | |
802 | |
803 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { | |
804 static base::HistogramBase* should_sniff_counter(NULL); | |
805 if (!should_sniff_counter) { | |
806 should_sniff_counter = | |
807 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); | |
808 } | |
809 bool sniffable_scheme = url.is_empty() || | |
810 url.SchemeIsHTTPOrHTTPS() || | |
811 url.SchemeIs("ftp") || | |
812 #if defined(OS_ANDROID) | |
813 url.SchemeIs("content") || | |
814 #endif | |
815 url.SchemeIsFile() || | |
816 url.SchemeIsFileSystem(); | |
817 if (!sniffable_scheme) { | |
818 should_sniff_counter->Add(1); | |
819 return false; | |
820 } | |
821 | |
822 static const char* const kSniffableTypes[] = { | |
823 // Many web servers are misconfigured to send text/plain for many | |
824 // different types of content. | |
825 "text/plain", | |
826 // We want to sniff application/octet-stream for | |
827 // application/x-chrome-extension, but nothing else. | |
828 "application/octet-stream", | |
829 // XHTML and Atom/RSS feeds are often served as plain xml instead of | |
830 // their more specific mime types. | |
831 "text/xml", | |
832 "application/xml", | |
833 // Check for false Microsoft Office MIME types. | |
834 "application/msword", | |
835 "application/vnd.ms-excel", | |
836 "application/vnd.ms-powerpoint", | |
837 "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | |
838 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | |
839 "application/vnd.openxmlformats-officedocument.presentationml.presentation", | |
840 "application/vnd.ms-excel.sheet.macroenabled.12", | |
841 "application/vnd.ms-word.document.macroenabled.12", | |
842 "application/vnd.ms-powerpoint.presentation.macroenabled.12", | |
843 "application/mspowerpoint", | |
844 "application/msexcel", | |
845 "application/vnd.ms-word", | |
846 "application/vnd.ms-word.document.12", | |
847 "application/vnd.msword", | |
848 }; | |
849 static base::HistogramBase* counter(NULL); | |
850 if (!counter) { | |
851 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", | |
852 arraysize(kSniffableTypes) + 1); | |
853 } | |
854 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { | |
855 if (mime_type == kSniffableTypes[i]) { | |
856 counter->Add(i); | |
857 should_sniff_counter->Add(2); | |
858 return true; | |
859 } | |
860 } | |
861 if (IsUnknownMimeType(mime_type)) { | |
862 // The web server didn't specify a content type or specified a mime | |
863 // type that we ignore. | |
864 counter->Add(arraysize(kSniffableTypes)); | |
865 should_sniff_counter->Add(2); | |
866 return true; | |
867 } | |
868 should_sniff_counter->Add(1); | |
869 return false; | |
870 } | |
871 | |
872 bool SniffMimeType(const char* content, | |
873 size_t content_size, | |
874 const GURL& url, | |
875 const std::string& type_hint, | |
876 std::string* result) { | |
877 DCHECK_LT(content_size, 1000000U); // sanity check | |
878 DCHECK(content); | |
879 DCHECK(result); | |
880 | |
881 // By default, we assume we have enough content. | |
882 // Each sniff routine may unset this if it wasn't provided enough content. | |
883 bool have_enough_content = true; | |
884 | |
885 // By default, we'll return the type hint. | |
886 // Each sniff routine may modify this if it has a better guess.. | |
887 result->assign(type_hint); | |
888 | |
889 // If the file has a Microsoft Office MIME type, we should only check that it | |
890 // is a valid Office file. Because this is the only reason we sniff files | |
891 // with a Microsoft Office MIME type, we can return early. | |
892 if (IsOfficeType(type_hint)) | |
893 return SniffForInvalidOfficeDocs(content, content_size, url, result); | |
894 | |
895 // Cache information about the type_hint | |
896 const bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint); | |
897 | |
898 // First check for HTML | |
899 if (hint_is_unknown_mime_type) { | |
900 // We're only willing to sniff HTML if the server has not supplied a mime | |
901 // type, or if the type it did supply indicates that it doesn't know what | |
902 // the type should be. | |
903 if (SniffForHTML(content, content_size, &have_enough_content, result)) | |
904 return true; // We succeeded in sniffing HTML. No more content needed. | |
905 } | |
906 | |
907 // We're only willing to sniff for binary in 3 cases: | |
908 // 1. The server has not supplied a mime type. | |
909 // 2. The type it did supply indicates that it doesn't know what the type | |
910 // should be. | |
911 // 3. The type is "text/plain" which is the default on some web servers and | |
912 // could be indicative of a mis-configuration that we shield the user from. | |
913 const bool hint_is_text_plain = (type_hint == "text/plain"); | |
914 if (hint_is_unknown_mime_type || hint_is_text_plain) { | |
915 if (!SniffBinary(content, content_size, &have_enough_content, result)) { | |
916 // If the server said the content was text/plain and it doesn't appear | |
917 // to be binary, then we trust it. | |
918 if (hint_is_text_plain) { | |
919 return have_enough_content; | |
920 } | |
921 } | |
922 } | |
923 | |
924 // If we have plain XML, sniff XML subtypes. | |
925 if (type_hint == "text/xml" || type_hint == "application/xml") { | |
926 // We're not interested in sniffing these types for images and the like. | |
927 // Instead, we're looking explicitly for a feed. If we don't find one | |
928 // we're done and return early. | |
929 if (SniffXML(content, content_size, &have_enough_content, result)) | |
930 return true; | |
931 return have_enough_content; | |
932 } | |
933 | |
934 // CRX files (Chrome extensions) have a special sniffing algorithm. It is | |
935 // tighter than the others because we don't have to match legacy behavior. | |
936 if (SniffCRX(content, content_size, url, type_hint, | |
937 &have_enough_content, result)) | |
938 return true; | |
939 | |
940 // Check the file extension and magic numbers to see if this is an Office | |
941 // document. This needs to be checked before the general magic numbers | |
942 // because zip files and Office documents (OOXML) have the same magic number. | |
943 if (SniffForOfficeDocs(content, content_size, url, | |
944 &have_enough_content, result)) | |
945 return true; // We've matched a magic number. No more content needed. | |
946 | |
947 // We're not interested in sniffing for magic numbers when the type_hint | |
948 // is application/octet-stream. Time to bail out. | |
949 if (type_hint == "application/octet-stream") | |
950 return have_enough_content; | |
951 | |
952 // Now we look in our large table of magic numbers to see if we can find | |
953 // anything that matches the content. | |
954 if (SniffForMagicNumbers(content, content_size, | |
955 &have_enough_content, result)) | |
956 return true; // We've matched a magic number. No more content needed. | |
957 | |
958 return have_enough_content; | |
959 } | |
960 | |
961 bool SniffMimeTypeFromLocalData(const char* content, | |
962 size_t size, | |
963 std::string* result) { | |
964 // First check the extra table. | |
965 if (CheckForMagicNumbers(content, size, kExtraMagicNumbers, | |
966 arraysize(kExtraMagicNumbers), NULL, result)) | |
967 return true; | |
968 // Finally check the original table. | |
969 return CheckForMagicNumbers(content, size, kMagicNumbers, | |
970 arraysize(kMagicNumbers), NULL, result); | |
971 } | |
972 | |
973 } // namespace net | |
OLD | NEW |