OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Detecting mime types is a tricky business because we need to balance | 5 // Detecting mime types is a tricky business because we need to balance |
6 // compatibility concerns with security issues. Here is a survey of how other | 6 // compatibility concerns with security issues. Here is a survey of how other |
7 // browsers behave and then a description of how we intend to behave. | 7 // browsers behave and then a description of how we intend to behave. |
8 // | 8 // |
9 // HTML payload, no Content-Type header: | 9 // HTML payload, no Content-Type header: |
10 // * IE 7: Render as HTML | 10 // * IE 7: Render as HTML |
(...skipping 99 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
110 static const size_t kBytesRequiredForMagic = 42; | 110 static const size_t kBytesRequiredForMagic = 42; |
111 | 111 |
112 struct MagicNumber { | 112 struct MagicNumber { |
113 const char* mime_type; | 113 const char* mime_type; |
114 const char* magic; | 114 const char* magic; |
115 size_t magic_len; | 115 size_t magic_len; |
116 bool is_string; | 116 bool is_string; |
117 const char* mask; // if set, must have same length as |magic| | 117 const char* mask; // if set, must have same length as |magic| |
118 }; | 118 }; |
119 | 119 |
120 #define MAGIC_NUMBER(mime_type, magic) \ | 120 #define MAGIC_NUMBER(mime_type, magic) \ |
121 { (mime_type), (magic), sizeof(magic)-1, false, NULL }, | 121 { (mime_type), (magic), sizeof(magic) - 1, false, NULL } \ |
122 , | |
mmenke
2014/10/10 18:12:39
Hrm... That comma change is really weird.
| |
122 | 123 |
123 template <int MagicSize, int MaskSize> | 124 template <int MagicSize, int MaskSize> |
124 class VerifySizes { | 125 class VerifySizes { |
125 COMPILE_ASSERT(MagicSize == MaskSize, sizes_must_be_equal); | 126 COMPILE_ASSERT(MagicSize == MaskSize, sizes_must_be_equal); |
127 | |
126 public: | 128 public: |
127 enum { SIZES = MagicSize }; | 129 enum { SIZES = MagicSize }; |
128 }; | 130 }; |
129 | 131 |
130 #define verified_sizeof(magic, mask) \ | 132 #define verified_sizeof(magic, mask) \ |
131 VerifySizes<sizeof(magic), sizeof(mask)>::SIZES | 133 VerifySizes<sizeof(magic), sizeof(mask)>::SIZES |
132 | 134 |
133 #define MAGIC_MASK(mime_type, magic, mask) \ | 135 #define MAGIC_MASK(mime_type, magic, mask) \ |
134 { (mime_type), (magic), verified_sizeof(magic, mask)-1, false, (mask) }, | 136 { (mime_type), (magic), verified_sizeof(magic, mask) - 1, false, (mask) } \ |
137 , | |
135 | 138 |
136 // Magic strings are case insensitive and must not include '\0' characters | 139 // Magic strings are case insensitive and must not include '\0' characters |
137 #define MAGIC_STRING(mime_type, magic) \ | 140 #define MAGIC_STRING(mime_type, magic) \ |
138 { (mime_type), (magic), sizeof(magic)-1, true, NULL }, | 141 { (mime_type), (magic), sizeof(magic) - 1, true, NULL } \ |
142 , | |
139 | 143 |
140 static const MagicNumber kMagicNumbers[] = { | 144 static const MagicNumber kMagicNumbers[] = { |
141 // Source: HTML 5 specification | 145 // Source: HTML 5 specification |
142 MAGIC_NUMBER("application/pdf", "%PDF-") | 146 MAGIC_NUMBER("application/pdf", "%PDF-") |
143 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") | 147 MAGIC_NUMBER("application/postscript", "%!PS-Adobe-") |
144 MAGIC_NUMBER("image/gif", "GIF87a") | 148 MAGIC_NUMBER("image/gif", "GIF87a") MAGIC_NUMBER("image/gif", "GIF89a") |
145 MAGIC_NUMBER("image/gif", "GIF89a") | 149 MAGIC_NUMBER("image/png", |
146 MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A") | 150 "\x89" |
147 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") | 151 "PNG\x0D\x0A\x1A\x0A") |
148 MAGIC_NUMBER("image/bmp", "BM") | 152 MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF") MAGIC_NUMBER("image/bmp", "BM") |
149 // Source: Mozilla | 153 // Source: Mozilla |
150 MAGIC_NUMBER("text/plain", "#!") // Script | 154 MAGIC_NUMBER("text/plain", "#!") // Script |
151 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS | 155 MAGIC_NUMBER("text/plain", "%!") // Script, similar to PS |
152 MAGIC_NUMBER("text/plain", "From") | 156 MAGIC_NUMBER("text/plain", "From") MAGIC_NUMBER("text/plain", ">From") |
153 MAGIC_NUMBER("text/plain", ">From") | 157 // Chrome specific |
154 // Chrome specific | 158 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") |
155 MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08") | 159 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") MAGIC_NUMBER( |
156 MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46") | 160 "video/x-ms-asf", |
157 MAGIC_NUMBER("video/x-ms-asf", | 161 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") |
158 "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C") | 162 MAGIC_NUMBER("image/tiff", "I I") MAGIC_NUMBER("image/tiff", "II*") |
159 MAGIC_NUMBER("image/tiff", "I I") | 163 MAGIC_NUMBER("image/tiff", "MM\x00*") MAGIC_NUMBER("audio/mpeg", "ID3") |
160 MAGIC_NUMBER("image/tiff", "II*") | 164 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") |
161 MAGIC_NUMBER("image/tiff", "MM\x00*") | 165 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") |
162 MAGIC_NUMBER("audio/mpeg", "ID3") | 166 // TODO(abarth): we don't handle partial byte matches yet |
163 MAGIC_NUMBER("image/webp", "RIFF....WEBPVP8 ") | 167 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") |
164 MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3") | 168 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") |
165 // TODO(abarth): we don't handle partial byte matches yet | 169 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") |
166 // MAGIC_NUMBER("video/mpeg", "\x00\x00\x01\xB") | 170 MAGIC_NUMBER("application/zip", "PK\x03\x04") |
167 // MAGIC_NUMBER("audio/mpeg", "\xFF\xE") | 171 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") |
168 // MAGIC_NUMBER("audio/mpeg", "\xFF\xF") | 172 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") |
169 MAGIC_NUMBER("application/zip", "PK\x03\x04") | 173 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE |
170 MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00") | 174 // Sniffing for Flash: |
171 MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A") | 175 // |
172 MAGIC_NUMBER("application/octet-stream", "MZ") // EXE | 176 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") |
173 // Sniffing for Flash: | 177 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") |
174 // | 178 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") |
175 // MAGIC_NUMBER("application/x-shockwave-flash", "CWS") | 179 // |
176 // MAGIC_NUMBER("application/x-shockwave-flash", "FLV") | 180 // Including these magic numbers for Flash is a trade-off. |
177 // MAGIC_NUMBER("application/x-shockwave-flash", "FWS") | 181 // |
178 // | 182 // Pros: |
179 // Including these magic numbers for Flash is a trade-off. | 183 // * Flash is an important and popular file format |
180 // | 184 // |
181 // Pros: | 185 // Cons: |
182 // * Flash is an important and popular file format | 186 // * These patterns are fairly weak |
183 // | 187 // * If we mistakenly decide something is Flash, we will execute it |
184 // Cons: | 188 // in the origin of an unsuspecting site. This could be a security |
185 // * These patterns are fairly weak | 189 // vulnerability if the site allows users to upload content. |
186 // * If we mistakenly decide something is Flash, we will execute it | 190 // |
187 // in the origin of an unsuspecting site. This could be a security | 191 // On balance, we do not include these patterns. |
188 // vulnerability if the site allows users to upload content. | |
189 // | |
190 // On balance, we do not include these patterns. | |
191 }; | 192 }; |
192 | 193 |
193 // The number of content bytes we need to use all our Microsoft Office magic | 194 // The number of content bytes we need to use all our Microsoft Office magic |
194 // numbers. | 195 // numbers. |
195 static const size_t kBytesRequiredForOfficeMagic = 8; | 196 static const size_t kBytesRequiredForOfficeMagic = 8; |
196 | 197 |
197 static const MagicNumber kOfficeMagicNumbers[] = { | 198 static const MagicNumber kOfficeMagicNumbers[] = { |
198 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") | 199 MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1") |
199 MAGIC_NUMBER("OOXML", "PK\x03\x04") | 200 MAGIC_NUMBER("OOXML", "PK\x03\x04")}; |
mmenke
2014/10/10 18:12:39
Not putting the close brace on its own line seems...
| |
200 }; | |
201 | 201 |
202 enum OfficeDocType { | 202 enum OfficeDocType { |
203 DOC_TYPE_WORD, | 203 DOC_TYPE_WORD, |
204 DOC_TYPE_EXCEL, | 204 DOC_TYPE_EXCEL, |
205 DOC_TYPE_POWERPOINT, | 205 DOC_TYPE_POWERPOINT, |
206 DOC_TYPE_NONE | 206 DOC_TYPE_NONE |
207 }; | 207 }; |
208 | 208 |
209 struct OfficeExtensionType { | 209 struct OfficeExtensionType { |
210 OfficeDocType doc_type; | 210 OfficeDocType doc_type; |
211 const char* extension; | 211 const char* extension; |
212 size_t extension_len; | 212 size_t extension_len; |
213 }; | 213 }; |
214 | 214 |
215 #define OFFICE_EXTENSION(type, extension) \ | 215 #define OFFICE_EXTENSION(type, extension) \ |
216 { (type), (extension), sizeof(extension) - 1 }, | 216 { (type), (extension), sizeof(extension) - 1 } \ |
217 , | |
217 | 218 |
218 static const OfficeExtensionType kOfficeExtensionTypes[] = { | 219 static const OfficeExtensionType kOfficeExtensionTypes[] = { |
219 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc") | 220 OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc") |
220 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls") | 221 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls") |
221 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt") | 222 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt") |
222 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx") | 223 OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx") |
223 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx") | 224 OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx") |
224 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx") | 225 OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx")}; |
225 }; | |
226 | 226 |
227 static const MagicNumber kExtraMagicNumbers[] = { | 227 static const MagicNumber kExtraMagicNumbers[] = { |
228 MAGIC_NUMBER("image/x-xbitmap", "#define") | 228 MAGIC_NUMBER("image/x-xbitmap", "#define") |
229 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00") | 229 MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00") |
230 MAGIC_NUMBER("image/svg+xml", "<?xml_version=") | 230 MAGIC_NUMBER("image/svg+xml", "<?xml_version=") |
231 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ") | 231 MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt ") |
232 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST") | 232 MAGIC_NUMBER("video/avi", "RIFF....AVI LIST") |
233 MAGIC_NUMBER("audio/ogg", "OggS") | 233 MAGIC_NUMBER("audio/ogg", "OggS") |
234 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0") | 234 MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0") |
235 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0") | 235 MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0") |
236 MAGIC_NUMBER("video/3gpp", "....ftyp3g") | 236 MAGIC_NUMBER("video/3gpp", "....ftyp3g") |
237 MAGIC_NUMBER("video/3gpp", "....ftypavcl") | 237 MAGIC_NUMBER("video/3gpp", "....ftypavcl") |
238 MAGIC_NUMBER("video/mp4", "....ftyp") | 238 MAGIC_NUMBER("video/mp4", "....ftyp") |
239 MAGIC_NUMBER("video/quicktime", "....moov") | 239 MAGIC_NUMBER("video/quicktime", "....moov") |
240 MAGIC_NUMBER("application/x-shockwave-flash", "CWS") | 240 MAGIC_NUMBER("application/x-shockwave-flash", "CWS") |
241 MAGIC_NUMBER("application/x-shockwave-flash", "FWS") | 241 MAGIC_NUMBER("application/x-shockwave-flash", "FWS") |
242 MAGIC_NUMBER("video/x-flv", "FLV") | 242 MAGIC_NUMBER("video/x-flv", "FLV") MAGIC_NUMBER("audio/x-flac", "fLaC") |
243 MAGIC_NUMBER("audio/x-flac", "fLaC") | |
244 | 243 |
245 // RAW image types. | 244 // RAW image types. |
246 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR") | 245 MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR") |
247 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR") | 246 MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR") |
248 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM") | 247 MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM") |
249 MAGIC_NUMBER("image/x-olympus-orf", "MMOR") // big-endian | 248 MAGIC_NUMBER("image/x-olympus-orf", "MMOR") // big-endian |
250 MAGIC_NUMBER("image/x-olympus-orf", "IIRO") // little-endian | 249 MAGIC_NUMBER("image/x-olympus-orf", "IIRO") // little-endian |
251 MAGIC_NUMBER("image/x-olympus-orf", "IIRS") // little-endian | 250 MAGIC_NUMBER("image/x-olympus-orf", "IIRS") // little-endian |
252 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ") | 251 MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW ") |
253 MAGIC_NUMBER("image/x-panasonic-raw", | 252 MAGIC_NUMBER("image/x-panasonic-raw", |
254 "IIU\x00\x08\x00\x00\x00") // Panasonic .raw | 253 "IIU\x00\x08\x00\x00\x00") // Panasonic .raw |
255 MAGIC_NUMBER("image/x-panasonic-raw", | 254 MAGIC_NUMBER("image/x-panasonic-raw", |
256 "IIU\x00\x18\x00\x00\x00") // Panasonic .rw2 | 255 "IIU\x00\x18\x00\x00\x00") // Panasonic .rw2 |
257 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw") | 256 MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw") |
258 MAGIC_NUMBER("image/x-x3f", "FOVb") | 257 MAGIC_NUMBER("image/x-x3f", "FOVb")}; |
259 }; | |
260 | 258 |
261 // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will | 259 // Our HTML sniffer differs slightly from Mozilla. For example, Mozilla will |
262 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is | 260 // decide that a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " is |
263 // HTML, but we will not. | 261 // HTML, but we will not. |
264 | 262 |
265 #define MAGIC_HTML_TAG(tag) \ | 263 #define MAGIC_HTML_TAG(tag) MAGIC_STRING("text/html", "<" tag) |
266 MAGIC_STRING("text/html", "<" tag) | |
267 | 264 |
268 static const MagicNumber kSniffableTags[] = { | 265 static const MagicNumber kSniffableTags[] = { |
269 // XML processing directive. Although this is not an HTML mime type, we sniff | 266 // XML processing directive. Although this is not an HTML mime type, we |
270 // for this in the HTML phase because text/xml is just as powerful as HTML and | 267 // sniff |
271 // we want to leverage our white space skipping technology. | 268 // for this in the HTML phase because text/xml is just as powerful as HTML |
272 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla | 269 // and |
273 // DOCTYPEs | 270 // we want to leverage our white space skipping technology. |
274 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec | 271 MAGIC_NUMBER("text/xml", "<?xml") // Mozilla |
275 // Sniffable tags, ordered by how often they occur in sniffable documents. | 272 // DOCTYPEs |
276 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla | 273 MAGIC_HTML_TAG("!DOCTYPE html") // HTML5 spec |
277 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla | 274 // Sniffable tags, ordered by how often they occur in sniffable documents. |
278 MAGIC_HTML_TAG("!--") | 275 MAGIC_HTML_TAG("script") // HTML5 spec, Mozilla |
279 MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla | 276 MAGIC_HTML_TAG("html") // HTML5 spec, Mozilla |
280 MAGIC_HTML_TAG("iframe") // Mozilla | 277 MAGIC_HTML_TAG("!--") MAGIC_HTML_TAG("head") // HTML5 spec, Mozilla |
281 MAGIC_HTML_TAG("h1") // Mozilla | 278 MAGIC_HTML_TAG("iframe") // Mozilla |
282 MAGIC_HTML_TAG("div") // Mozilla | 279 MAGIC_HTML_TAG("h1") // Mozilla |
283 MAGIC_HTML_TAG("font") // Mozilla | 280 MAGIC_HTML_TAG("div") // Mozilla |
284 MAGIC_HTML_TAG("table") // Mozilla | 281 MAGIC_HTML_TAG("font") // Mozilla |
285 MAGIC_HTML_TAG("a") // Mozilla | 282 MAGIC_HTML_TAG("table") // Mozilla |
286 MAGIC_HTML_TAG("style") // Mozilla | 283 MAGIC_HTML_TAG("a") // Mozilla |
287 MAGIC_HTML_TAG("title") // Mozilla | 284 MAGIC_HTML_TAG("style") // Mozilla |
288 MAGIC_HTML_TAG("b") // Mozilla | 285 MAGIC_HTML_TAG("title") // Mozilla |
289 MAGIC_HTML_TAG("body") // Mozilla | 286 MAGIC_HTML_TAG("b") // Mozilla |
290 MAGIC_HTML_TAG("br") | 287 MAGIC_HTML_TAG("body") // Mozilla |
291 MAGIC_HTML_TAG("p") // Mozilla | 288 MAGIC_HTML_TAG("br") MAGIC_HTML_TAG("p") // Mozilla |
292 }; | 289 }; |
293 | 290 |
294 static base::HistogramBase* UMASnifferHistogramGet(const char* name, | 291 static base::HistogramBase* UMASnifferHistogramGet(const char* name, |
295 int array_size) { | 292 int array_size) { |
296 base::HistogramBase* counter = | 293 base::HistogramBase* counter = base::LinearHistogram::FactoryGet( |
297 base::LinearHistogram::FactoryGet(name, 1, array_size - 1, array_size, | 294 name, |
298 base::HistogramBase::kUmaTargetedHistogramFlag); | 295 1, |
296 array_size - 1, | |
297 array_size, | |
298 base::HistogramBase::kUmaTargetedHistogramFlag); | |
299 return counter; | 299 return counter; |
300 } | 300 } |
301 | 301 |
302 // Compare content header to a magic number where magic_entry can contain '.' | 302 // Compare content header to a magic number where magic_entry can contain '.' |
303 // for single character of anything, allowing some bytes to be skipped. | 303 // for single character of anything, allowing some bytes to be skipped. |
304 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { | 304 static bool MagicCmp(const char* magic_entry, const char* content, size_t len) { |
305 while (len) { | 305 while (len) { |
306 if ((*magic_entry != '.') && (*magic_entry != *content)) | 306 if ((*magic_entry != '.') && (*magic_entry != *content)) |
307 return false; | 307 return false; |
308 ++magic_entry; | 308 ++magic_entry; |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
361 } | 361 } |
362 } | 362 } |
363 | 363 |
364 if (match) { | 364 if (match) { |
365 result->assign(magic_entry.mime_type); | 365 result->assign(magic_entry.mime_type); |
366 return true; | 366 return true; |
367 } | 367 } |
368 return false; | 368 return false; |
369 } | 369 } |
370 | 370 |
371 static bool CheckForMagicNumbers(const char* content, size_t size, | 371 static bool CheckForMagicNumbers(const char* content, |
372 const MagicNumber* magic, size_t magic_len, | 372 size_t size, |
373 const MagicNumber* magic, | |
374 size_t magic_len, | |
373 base::HistogramBase* counter, | 375 base::HistogramBase* counter, |
374 std::string* result) { | 376 std::string* result) { |
375 for (size_t i = 0; i < magic_len; ++i) { | 377 for (size_t i = 0; i < magic_len; ++i) { |
376 if (MatchMagicNumber(content, size, magic[i], result)) { | 378 if (MatchMagicNumber(content, size, magic[i], result)) { |
377 if (counter) counter->Add(static_cast<int>(i)); | 379 if (counter) |
380 counter->Add(static_cast<int>(i)); | |
378 return true; | 381 return true; |
379 } | 382 } |
380 } | 383 } |
381 return false; | 384 return false; |
382 } | 385 } |
383 | 386 |
384 // Truncates |size| to |max_size| and returns true if |size| is at least | 387 // Truncates |size| to |max_size| and returns true if |size| is at least |
385 // |max_size|. | 388 // |max_size|. |
386 static bool TruncateSize(const size_t max_size, size_t* size) { | 389 static bool TruncateSize(const size_t max_size, size_t* size) { |
387 // Keep kMaxBytesToSniff honest. | 390 // Keep kMaxBytesToSniff honest. |
(...skipping 23 matching lines...) Expand all Loading... | |
411 for (pos = content; pos < end; ++pos) { | 414 for (pos = content; pos < end; ++pos) { |
412 if (!IsAsciiWhitespace(*pos)) | 415 if (!IsAsciiWhitespace(*pos)) |
413 break; | 416 break; |
414 } | 417 } |
415 static base::HistogramBase* counter(NULL); | 418 static base::HistogramBase* counter(NULL); |
416 if (!counter) { | 419 if (!counter) { |
417 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", | 420 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTags2", |
418 arraysize(kSniffableTags)); | 421 arraysize(kSniffableTags)); |
419 } | 422 } |
420 // |pos| now points to first non-whitespace character (or at end). | 423 // |pos| now points to first non-whitespace character (or at end). |
421 return CheckForMagicNumbers(pos, end - pos, | 424 return CheckForMagicNumbers(pos, |
422 kSniffableTags, arraysize(kSniffableTags), | 425 end - pos, |
423 counter, result); | 426 kSniffableTags, |
427 arraysize(kSniffableTags), | |
428 counter, | |
429 result); | |
424 } | 430 } |
425 | 431 |
426 // Returns true and sets result if the content matches any of kMagicNumbers. | 432 // Returns true and sets result if the content matches any of kMagicNumbers. |
427 // Clears have_enough_content if more data could possibly change the result. | 433 // Clears have_enough_content if more data could possibly change the result. |
428 static bool SniffForMagicNumbers(const char* content, | 434 static bool SniffForMagicNumbers(const char* content, |
429 size_t size, | 435 size_t size, |
430 bool* have_enough_content, | 436 bool* have_enough_content, |
431 std::string* result) { | 437 std::string* result) { |
432 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | 438 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); |
433 | 439 |
434 // Check our big table of Magic Numbers | 440 // Check our big table of Magic Numbers |
435 static base::HistogramBase* counter(NULL); | 441 static base::HistogramBase* counter(NULL); |
436 if (!counter) { | 442 if (!counter) { |
437 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", | 443 counter = UMASnifferHistogramGet("mime_sniffer.kMagicNumbers2", |
438 arraysize(kMagicNumbers)); | 444 arraysize(kMagicNumbers)); |
439 } | 445 } |
440 return CheckForMagicNumbers(content, size, | 446 return CheckForMagicNumbers( |
441 kMagicNumbers, arraysize(kMagicNumbers), | 447 content, size, kMagicNumbers, arraysize(kMagicNumbers), counter, result); |
442 counter, result); | |
443 } | 448 } |
444 | 449 |
445 // Returns true and sets result if the content matches any of | 450 // Returns true and sets result if the content matches any of |
446 // kOfficeMagicNumbers, and the URL has the proper extension. | 451 // kOfficeMagicNumbers, and the URL has the proper extension. |
447 // Clears |have_enough_content| if more data could possibly change the result. | 452 // Clears |have_enough_content| if more data could possibly change the result. |
448 static bool SniffForOfficeDocs(const char* content, | 453 static bool SniffForOfficeDocs(const char* content, |
449 size_t size, | 454 size_t size, |
450 const GURL& url, | 455 const GURL& url, |
451 bool* have_enough_content, | 456 bool* have_enough_content, |
452 std::string* result) { | 457 std::string* result) { |
453 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size); | 458 *have_enough_content &= TruncateSize(kBytesRequiredForOfficeMagic, &size); |
454 | 459 |
455 // Check our table of magic numbers for Office file types. | 460 // Check our table of magic numbers for Office file types. |
456 std::string office_version; | 461 std::string office_version; |
457 if (!CheckForMagicNumbers(content, size, | 462 if (!CheckForMagicNumbers(content, |
458 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), | 463 size, |
459 NULL, &office_version)) | 464 kOfficeMagicNumbers, |
465 arraysize(kOfficeMagicNumbers), | |
466 NULL, | |
467 &office_version)) | |
460 return false; | 468 return false; |
461 | 469 |
462 OfficeDocType type = DOC_TYPE_NONE; | 470 OfficeDocType type = DOC_TYPE_NONE; |
463 for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) { | 471 for (size_t i = 0; i < arraysize(kOfficeExtensionTypes); ++i) { |
464 std::string url_path = url.path(); | 472 std::string url_path = url.path(); |
465 | 473 |
466 if (url_path.length() < kOfficeExtensionTypes[i].extension_len) | 474 if (url_path.length() < kOfficeExtensionTypes[i].extension_len) |
467 continue; | 475 continue; |
468 | 476 |
469 const char* extension = | 477 const char* extension = |
470 &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len]; | 478 &url_path[url_path.length() - kOfficeExtensionTypes[i].extension_len]; |
471 | 479 |
472 if (0 == base::strncasecmp(extension, kOfficeExtensionTypes[i].extension, | 480 if (0 == base::strncasecmp(extension, |
481 kOfficeExtensionTypes[i].extension, | |
473 kOfficeExtensionTypes[i].extension_len)) { | 482 kOfficeExtensionTypes[i].extension_len)) { |
474 type = kOfficeExtensionTypes[i].doc_type; | 483 type = kOfficeExtensionTypes[i].doc_type; |
475 break; | 484 break; |
476 } | 485 } |
477 } | 486 } |
478 | 487 |
479 if (type == DOC_TYPE_NONE) | 488 if (type == DOC_TYPE_NONE) |
480 return false; | 489 return false; |
481 | 490 |
482 if (office_version == "CFB") { | 491 if (office_version == "CFB") { |
483 switch (type) { | 492 switch (type) { |
484 case DOC_TYPE_WORD: | 493 case DOC_TYPE_WORD: |
485 *result = "application/msword"; | 494 *result = "application/msword"; |
486 return true; | 495 return true; |
487 case DOC_TYPE_EXCEL: | 496 case DOC_TYPE_EXCEL: |
488 *result = "application/vnd.ms-excel"; | 497 *result = "application/vnd.ms-excel"; |
489 return true; | 498 return true; |
490 case DOC_TYPE_POWERPOINT: | 499 case DOC_TYPE_POWERPOINT: |
491 *result = "application/vnd.ms-powerpoint"; | 500 *result = "application/vnd.ms-powerpoint"; |
492 return true; | 501 return true; |
493 case DOC_TYPE_NONE: | 502 case DOC_TYPE_NONE: |
494 NOTREACHED(); | 503 NOTREACHED(); |
495 return false; | 504 return false; |
496 } | 505 } |
497 } else if (office_version == "OOXML") { | 506 } else if (office_version == "OOXML") { |
498 switch (type) { | 507 switch (type) { |
499 case DOC_TYPE_WORD: | 508 case DOC_TYPE_WORD: |
500 *result = "application/vnd.openxmlformats-officedocument." | 509 *result = |
501 "wordprocessingml.document"; | 510 "application/vnd.openxmlformats-officedocument." |
511 "wordprocessingml.document"; | |
502 return true; | 512 return true; |
503 case DOC_TYPE_EXCEL: | 513 case DOC_TYPE_EXCEL: |
504 *result = "application/vnd.openxmlformats-officedocument." | 514 *result = |
505 "spreadsheetml.sheet"; | 515 "application/vnd.openxmlformats-officedocument." |
516 "spreadsheetml.sheet"; | |
506 return true; | 517 return true; |
507 case DOC_TYPE_POWERPOINT: | 518 case DOC_TYPE_POWERPOINT: |
508 *result = "application/vnd.openxmlformats-officedocument." | 519 *result = |
509 "presentationml.presentation"; | 520 "application/vnd.openxmlformats-officedocument." |
521 "presentationml.presentation"; | |
510 return true; | 522 return true; |
511 case DOC_TYPE_NONE: | 523 case DOC_TYPE_NONE: |
512 NOTREACHED(); | 524 NOTREACHED(); |
513 return false; | 525 return false; |
514 } | 526 } |
515 } | 527 } |
516 | 528 |
517 NOTREACHED(); | 529 NOTREACHED(); |
518 return false; | 530 return false; |
519 } | 531 } |
520 | 532 |
521 static bool IsOfficeType(const std::string& type_hint) { | 533 static bool IsOfficeType(const std::string& type_hint) { |
522 return (type_hint == "application/msword" || | 534 return (type_hint == "application/msword" || |
523 type_hint == "application/vnd.ms-excel" || | 535 type_hint == "application/vnd.ms-excel" || |
524 type_hint == "application/vnd.ms-powerpoint" || | 536 type_hint == "application/vnd.ms-powerpoint" || |
525 type_hint == "application/vnd.openxmlformats-officedocument." | 537 type_hint == |
526 "wordprocessingml.document" || | 538 "application/vnd.openxmlformats-officedocument." |
527 type_hint == "application/vnd.openxmlformats-officedocument." | 539 "wordprocessingml.document" || |
528 "spreadsheetml.sheet" || | 540 type_hint == |
529 type_hint == "application/vnd.openxmlformats-officedocument." | 541 "application/vnd.openxmlformats-officedocument." |
530 "presentationml.presentation" || | 542 "spreadsheetml.sheet" || |
543 type_hint == | |
544 "application/vnd.openxmlformats-officedocument." | |
545 "presentationml.presentation" || | |
531 type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" || | 546 type_hint == "application/vnd.ms-excel.sheet.macroenabled.12" || |
532 type_hint == "application/vnd.ms-word.document.macroenabled.12" || | 547 type_hint == "application/vnd.ms-word.document.macroenabled.12" || |
533 type_hint == "application/vnd.ms-powerpoint.presentation." | 548 type_hint == |
534 "macroenabled.12" || | 549 "application/vnd.ms-powerpoint.presentation." |
550 "macroenabled.12" || | |
535 type_hint == "application/mspowerpoint" || | 551 type_hint == "application/mspowerpoint" || |
536 type_hint == "application/msexcel" || | 552 type_hint == "application/msexcel" || |
537 type_hint == "application/vnd.ms-word" || | 553 type_hint == "application/vnd.ms-word" || |
538 type_hint == "application/vnd.ms-word.document.12" || | 554 type_hint == "application/vnd.ms-word.document.12" || |
539 type_hint == "application/vnd.msword"); | 555 type_hint == "application/vnd.msword"); |
540 } | 556 } |
541 | 557 |
542 // This function checks for files that have a Microsoft Office MIME type | 558 // This function checks for files that have a Microsoft Office MIME type |
543 // set, but are not actually Office files. | 559 // set, but are not actually Office files. |
544 // | 560 // |
545 // If this is not actually an Office file, |*result| is set to | 561 // If this is not actually an Office file, |*result| is set to |
546 // "application/octet-stream", otherwise it is not modified. | 562 // "application/octet-stream", otherwise it is not modified. |
547 // | 563 // |
548 // Returns false if additional data is required to determine the file type, or | 564 // Returns false if additional data is required to determine the file type, or |
549 // true if there is enough data to make a decision. | 565 // true if there is enough data to make a decision. |
550 static bool SniffForInvalidOfficeDocs(const char* content, | 566 static bool SniffForInvalidOfficeDocs(const char* content, |
551 size_t size, | 567 size_t size, |
552 const GURL& url, | 568 const GURL& url, |
553 std::string* result) { | 569 std::string* result) { |
554 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size)) | 570 if (!TruncateSize(kBytesRequiredForOfficeMagic, &size)) |
555 return false; | 571 return false; |
556 | 572 |
557 // Check our table of magic numbers for Office file types. If it does not | 573 // Check our table of magic numbers for Office file types. If it does not |
558 // match one, the MIME type was invalid. Set it instead to a safe value. | 574 // match one, the MIME type was invalid. Set it instead to a safe value. |
559 std::string office_version; | 575 std::string office_version; |
560 if (!CheckForMagicNumbers(content, size, | 576 if (!CheckForMagicNumbers(content, |
561 kOfficeMagicNumbers, arraysize(kOfficeMagicNumbers), | 577 size, |
562 NULL, &office_version)) { | 578 kOfficeMagicNumbers, |
579 arraysize(kOfficeMagicNumbers), | |
580 NULL, | |
581 &office_version)) { | |
563 *result = "application/octet-stream"; | 582 *result = "application/octet-stream"; |
564 } | 583 } |
565 | 584 |
566 // We have enough information to determine if this was a Microsoft Office | 585 // We have enough information to determine if this was a Microsoft Office |
567 // document or not, so sniffing is completed. | 586 // document or not, so sniffing is completed. |
568 return true; | 587 return true; |
569 } | 588 } |
570 | 589 |
571 // Tag prefixes that SniffXML matches at a candidate tag to recognize XHTML and Atom/RSS feeds. | 590 // Tag prefixes that SniffXML matches at a candidate tag to recognize XHTML and Atom/RSS feeds. |
572 static const MagicNumber kMagicXML[] = { | 591 static const MagicNumber kMagicXML[] = { |
573 // We want to be very conservative in interpreting text/xml content as | 592 // We want to be very conservative in interpreting text/xml content as |
574 // XHTML -- we just want to sniff enough to make unit tests pass. | 593 // XHTML -- we just want to sniff enough to make unit tests pass. |
575 // So we match explicitly on this, and don't match other ways of writing | 594 // So we match explicitly on this, and don't match other ways of writing |
576 // it in semantically-equivalent ways. | 595 // it in semantically-equivalent ways. |
577 MAGIC_STRING("application/xhtml+xml", | 596 MAGIC_STRING("application/xhtml+xml", |
578 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") | 597 "<html xmlns=\"http://www.w3.org/1999/xhtml\"") |
579 MAGIC_STRING("application/atom+xml", "<feed") | 598 MAGIC_STRING("application/atom+xml", "<feed") |
580 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 | 599 MAGIC_STRING("application/rss+xml", "<rss") // UTF-8 |
581 }; | 600 }; |
582 | 601 |
583 // Returns true and sets result if the content appears to contain XHTML or a | 602 // Returns true and sets result if the content appears to contain XHTML or a |
584 // feed. Also returns true, without assigning result in this function, when the first tag that is not an XML or DOCTYPE declaration matches nothing in kMagicXML, and when kMaxTagIterations declarations were skipped with bytes still left. Returns false when no '<' is found in the examined bytes or they run out after skipping declarations. | 603 // feed. Also returns true, without assigning result in this function, when the first tag that is not an XML or DOCTYPE declaration matches nothing in kMagicXML, and when kMaxTagIterations declarations were skipped with bytes still left. Returns false when no '<' is found in the examined bytes or they run out after skipping declarations. |
585 // Clears have_enough_content if more data could possibly change the result. | 604 // Clears have_enough_content if more data could possibly change the result. |
586 // | 605 // |
587 // TODO(evanm): this is similar but more conservative than what Safari does, | 606 // TODO(evanm): this is similar but more conservative than what Safari does, |
588 // while HTML5 has a different recommendation -- what should we do? | 607 // while HTML5 has a different recommendation -- what should we do? |
589 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset | 608 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset |
590 // of ASCII -- do we care? | 609 // of ASCII -- do we care? |
591 static bool SniffXML(const char* content, | 610 static bool SniffXML(const char* content, |
592 size_t size, | 611 size_t size, |
593 bool* have_enough_content, | 612 bool* have_enough_content, |
594 std::string* result) { | 613 std::string* result) { |
595 // We allow at most 300 bytes of content before we expect the opening tag. | 614 // We allow at most 300 bytes of content before we expect the opening tag. |
596 *have_enough_content &= TruncateSize(300, &size); | 615 *have_enough_content &= TruncateSize(300, &size); |
597 const char* pos = content; | 616 const char* pos = content; |
598 const char* const end = content + size; | 617 const char* const end = content + size; |
599 | 618 |
600 // This loop iterates through tag-looking offsets in the file. | 619 // This loop iterates through tag-looking offsets in the file. |
601 // We want to skip XML processing instructions (of the form "<?xml ...") | 620 // We want to skip XML processing instructions (of the form "<?xml ...") |
602 // and stop at the first "plain" tag, then make a decision on the mime-type | 621 // and stop at the first "plain" tag, then make a decision on the mime-type |
603 // based on the name (or possibly attributes) of that tag. | 622 // based on the name (or possibly attributes) of that tag. |
604 static base::HistogramBase* counter(NULL); | 623 static base::HistogramBase* counter(NULL); |
605 if (!counter) { | 624 if (!counter) { |
606 counter = UMASnifferHistogramGet("mime_sniffer.kMagicXML2", | 625 counter = |
607 arraysize(kMagicXML)); | 626 UMASnifferHistogramGet("mime_sniffer.kMagicXML2", arraysize(kMagicXML)); |
608 } | 627 } |
609 const int kMaxTagIterations = 5; | 628 const int kMaxTagIterations = 5; |
610 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { | 629 for (int i = 0; i < kMaxTagIterations && pos < end; ++i) { |
611 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); | 630 pos = reinterpret_cast<const char*>(memchr(pos, '<', end - pos)); |
612 if (!pos) | 631 if (!pos) |
613 return false; | 632 return false; |
614 | 633 |
615 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0) { | 634 if (base::strncasecmp(pos, "<?xml", sizeof("<?xml") - 1) == 0) { |
616 // Skip XML declarations. Advancing one byte past '<' lets the next memchr find the following tag. NOTE(review): neither prefix comparison here is bounded by |end|; confirm the bytes after pos are always readable. | 635 // Skip XML declarations. Advancing one byte past '<' lets the next memchr find the following tag. NOTE(review): neither prefix comparison here is bounded by |end|; confirm the bytes after pos are always readable. |
617 ++pos; | 636 ++pos; |
618 continue; | 637 continue; |
619 } else if (base::strncasecmp(pos, "<!DOCTYPE", | 638 } else if (base::strncasecmp(pos, "<!DOCTYPE", sizeof("<!DOCTYPE") - 1) == |
620 sizeof("<!DOCTYPE") - 1) == 0) { | 639 0) { |
mmenke
2014/10/10 18:12:39
Think this is pretty ugly - I find no extra indent
| |
621 // Skip DOCTYPE declarations. | 640 // Skip DOCTYPE declarations. |
622 ++pos; | 641 ++pos; |
623 continue; | 642 continue; |
624 } | 643 } |
625 | 644 |
626 if (CheckForMagicNumbers(pos, end - pos, | 645 if (CheckForMagicNumbers( |
627 kMagicXML, arraysize(kMagicXML), | 646 pos, end - pos, kMagicXML, arraysize(kMagicXML), counter, result)) |
628 counter, result)) | |
629 return true; | 647 return true; |
mmenke
2014/10/10 18:12:39
This is a style violation - when an if body takes
| |
630 | 648 |
631 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult | 649 // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult |
632 // to identify. | 650 // to identify. |
633 | 651 |
634 // If we get here, we've hit an initial tag that hasn't matched one of the | 652 // If we get here, we've hit an initial tag that hasn't matched one of the |
635 // above tests. Abort: returning true ends the sniff; result is not assigned on this path. | 653 // above tests. Abort: returning true ends the sniff; result is not assigned on this path. |
636 return true; | 654 return true; |
637 } | 655 } |
638 | 656 |
639 // We iterated too far without finding a start tag. | 657 // We iterated too far without finding a start tag. |
640 // If we have more content to look at, we aren't going to change our mind by | 658 // If we have more content to look at, we aren't going to change our mind by |
641 // seeing more bytes from the network. | 659 // seeing more bytes from the network. |
642 return pos < end; | 660 return pos < end; |
643 } | 661 } |
644 | 662 |
645 // Byte order marks. SniffBinary reports text/plain when the content matches one of these. | 663 // Byte order marks. SniffBinary reports text/plain when the content matches one of these. |
646 static const MagicNumber kByteOrderMark[] = { | 664 static const MagicNumber kByteOrderMark[] = { |
647 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE | 665 MAGIC_NUMBER("text/plain", "\xFE\xFF") // UTF-16BE |
648 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE | 666 MAGIC_NUMBER("text/plain", "\xFF\xFE") // UTF-16LE |
649 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 | 667 MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF") // UTF-8 |
650 }; | 668 }; |
651 | 669 |
652 // Whether a given byte looks like it might be part of binary content. Indexed by the unsigned byte value; a nonzero entry means the byte looks binary. | 670 // Whether a given byte looks like it might be part of binary content. Indexed by the unsigned byte value; a nonzero entry means the byte looks binary. |
653 // Source: HTML5 spec. Only bytes below 0x20 are flagged, and 0x09, 0x0A, 0x0C, 0x0D and 0x1B are exempt. | 671 // Source: HTML5 spec. Only bytes below 0x20 are flagged, and 0x09, 0x0A, 0x0C, 0x0D and 0x1B are exempt. |
654 static char kByteLooksBinary[] = { | 672 static char kByteLooksBinary[] = { |
655 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F | 673 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, // 0x00 - 0x0F |
656 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F | 674 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, // 0x10 - 0x1F |
657 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F | 675 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20 - 0x2F |
658 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F | 676 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30 - 0x3F |
659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F | 677 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40 - 0x4F |
660 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F | 678 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50 - 0x5F |
661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F | 679 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60 - 0x6F |
662 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F | 680 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70 - 0x7F |
663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F | 681 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 - 0x8F |
664 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F | 682 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 - 0x9F |
665 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF | 683 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 - 0xAF |
666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF | 684 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 - 0xBF |
667 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF | 685 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0 - 0xCF |
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF | 686 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0 - 0xDF |
669 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF | 687 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0 - 0xEF |
670 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF | 688 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 - 0xFF |
671 }; | 689 }; |
672 | 690 |
673 // Returns true and sets result to "application/octet-stream" if the content | 691 // Returns true and sets result to "application/octet-stream" if the content |
674 // appears to be binary data. Otherwise, returns false and sets "text/plain". | 692 // appears to be binary data. Otherwise, returns false and sets "text/plain". |
675 // Clears have_enough_content if more data could possibly change the result. | 693 // Clears have_enough_content if more data could possibly change the result. |
676 static bool SniffBinary(const char* content, | 694 static bool SniffBinary(const char* content, |
677 size_t size, | 695 size_t size, |
678 bool* have_enough_content, | 696 bool* have_enough_content, |
679 std::string* result) { | 697 std::string* result) { |
680 // There is no consensus about exactly how to sniff for binary content. | 698 // There is no consensus about exactly how to sniff for binary content. |
681 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. | 699 // * IE 7: Don't sniff for binary looking bytes, but trust the file extension. |
682 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. | 700 // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte. |
683 // Here, we side with FF, but with a smaller buffer. This size was chosen | 701 // Here, we side with FF, but with a smaller buffer. This size was chosen |
684 // because it is small enough to comfortably fit into a single packet (after | 702 // because it is small enough to comfortably fit into a single packet (after |
685 // allowing for headers) and yet large enough to account for binary formats | 703 // allowing for headers) and yet large enough to account for binary formats |
686 // that have a significant amount of ASCII at the beginning (crbug.com/15314). | 704 // that have a significant amount of ASCII at the beginning (crbug.com/15314). |
687 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); | 705 const bool is_truncated = TruncateSize(kMaxBytesToSniff, &size); |
688 | 706 |
689 // First, we look for a BOM. | 707 // First, we look for a BOM. |
690 static base::HistogramBase* counter(NULL); | 708 static base::HistogramBase* counter(NULL); |
691 if (!counter) { | 709 if (!counter) { |
692 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", | 710 counter = UMASnifferHistogramGet("mime_sniffer.kByteOrderMark2", |
693 arraysize(kByteOrderMark)); | 711 arraysize(kByteOrderMark)); |
694 } | 712 } |
695 std::string unused; | 713 std::string unused; |
696 if (CheckForMagicNumbers(content, size, | 714 if (CheckForMagicNumbers(content, |
697 kByteOrderMark, arraysize(kByteOrderMark), | 715 size, |
698 counter, &unused)) { | 716 kByteOrderMark, |
717 arraysize(kByteOrderMark), | |
718 counter, | |
719 &unused)) { | |
699 // If there is BOM, we think the buffer is not binary. | 720 // If there is BOM, we think the buffer is not binary. |
700 result->assign("text/plain"); | 721 result->assign("text/plain"); |
701 return false; | 722 return false; |
702 } | 723 } |
703 | 724 |
704 // Next we look to see if any of the bytes "look binary." | 725 // Next we look to see if any of the bytes "look binary." |
705 for (size_t i = 0; i < size; ++i) { | 726 for (size_t i = 0; i < size; ++i) { |
706 // If we see a binary-looking byte, we think the content is binary. | 727 // If we see a binary-looking byte, we think the content is binary. |
707 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { | 728 if (kByteLooksBinary[static_cast<unsigned char>(content[i])]) { |
708 result->assign("application/octet-stream"); | 729 result->assign("application/octet-stream"); |
709 return true; | 730 return true; |
710 } | 731 } |
711 } | 732 } |
712 | 733 |
713 // No evidence either way. Default to non-binary and, if truncated, clear | 734 // No evidence either way. Default to non-binary and, if truncated, clear |
714 // have_enough_content because there could be a binary looking byte in the | 735 // have_enough_content because there could be a binary looking byte in the |
715 // truncated data. NOTE(review): the &= below clears have_enough_content only when is_truncated is false; confirm TruncateSize's return convention against this wording. | 736 // truncated data. NOTE(review): the &= below clears have_enough_content only when is_truncated is false; confirm TruncateSize's return convention against this wording. |
716 *have_enough_content &= is_truncated; | 737 *have_enough_content &= is_truncated; |
717 result->assign("text/plain"); | 738 result->assign("text/plain"); |
718 return false; | 739 return false; |
719 } | 740 } |
720 | 741 |
721 static bool IsUnknownMimeType(const std::string& mime_type) { | 742 static bool IsUnknownMimeType(const std::string& mime_type) { |
722 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. | 743 // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here. |
723 // If we do, please be careful not to alter the semantics at all. | 744 // If we do, please be careful not to alter the semantics at all. |
724 static const char* kUnknownMimeTypes[] = { | 745 static const char* kUnknownMimeTypes[] = { |
725 // Empty mime types are as unknown as they get. | 746 // Empty mime types are as unknown as they get. |
726 "", | 747 "", |
727 // The unknown/unknown type is popular and uninformative | 748 // The unknown/unknown type is popular and uninformative |
728 "unknown/unknown", | 749 "unknown/unknown", |
729 // The second most popular unknown mime type is application/unknown | 750 // The second most popular unknown mime type is application/unknown |
730 "application/unknown", | 751 "application/unknown", |
731 // Firefox rejects a mime type if it is exactly */* | 752 // Firefox rejects a mime type if it is exactly */* |
732 "*/*", | 753 "*/*", |
733 }; | 754 }; |
734 static base::HistogramBase* counter(NULL); | 755 static base::HistogramBase* counter(NULL); |
735 if (!counter) { | 756 if (!counter) { |
736 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", | 757 counter = UMASnifferHistogramGet("mime_sniffer.kUnknownMimeTypes2", |
737 arraysize(kUnknownMimeTypes) + 1); | 758 arraysize(kUnknownMimeTypes) + 1); |
738 } | 759 } |
739 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { | 760 for (size_t i = 0; i < arraysize(kUnknownMimeTypes); ++i) { |
740 if (mime_type == kUnknownMimeTypes[i]) { | 761 if (mime_type == kUnknownMimeTypes[i]) { |
741 counter->Add(i); | 762 counter->Add(i); |
742 return true; | 763 return true; |
(...skipping 21 matching lines...) Expand all Loading... | |
764 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); | 785 counter = UMASnifferHistogramGet("mime_sniffer.kSniffCRX", 3); |
765 | 786 |
766 // Technically, the crx magic number is just Cr24, but the bytes after that | 787 // Technically, the crx magic number is just Cr24, but the bytes after that |
767 // are a version number which changes infrequently. Including it in the | 788 // are a version number which changes infrequently. Including it in the |
768 // sniffing gives us less room for error. If the version number ever changes, | 789 // sniffing gives us less room for error. If the version number ever changes, |
769 // we can just add an entry to this list. | 790 // we can just add an entry to this list. |
770 // | 791 // |
771 // TODO(aa): If we ever have another magic number, we'll want to pass a | 792 // TODO(aa): If we ever have another magic number, we'll want to pass a |
772 // histogram into CheckForMagicNumbers(), below, to see which one matched. | 793 // histogram into CheckForMagicNumbers(), below, to see which one matched. |
773 static const struct MagicNumber kCRXMagicNumbers[] = { | 794 static const struct MagicNumber kCRXMagicNumbers[] = { |
774 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00") | 795 MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00")}; |
775 }; | |
776 | 796 |
777 // Only consider files that have the extension ".crx". | 797 // Only consider files that have the extension ".crx". |
778 static const char kCRXExtension[] = ".crx"; | 798 static const char kCRXExtension[] = ".crx"; |
779 // Ignore null by subtracting 1. | 799 // Ignore null by subtracting 1. |
780 static const int kExtensionLength = arraysize(kCRXExtension) - 1; | 800 static const int kExtensionLength = arraysize(kCRXExtension) - 1; |
781 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == | 801 if (url.path().rfind(kCRXExtension, std::string::npos, kExtensionLength) == |
782 url.path().size() - kExtensionLength) { | 802 url.path().size() - kExtensionLength) { |
783 counter->Add(1); | 803 counter->Add(1); |
784 } else { | 804 } else { |
785 return false; | 805 return false; |
786 } | 806 } |
787 | 807 |
788 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); | 808 *have_enough_content &= TruncateSize(kBytesRequiredForMagic, &size); |
789 if (CheckForMagicNumbers(content, size, | 809 if (CheckForMagicNumbers(content, |
790 kCRXMagicNumbers, arraysize(kCRXMagicNumbers), | 810 size, |
791 NULL, result)) { | 811 kCRXMagicNumbers, |
812 arraysize(kCRXMagicNumbers), | |
813 NULL, | |
814 result)) { | |
792 counter->Add(2); | 815 counter->Add(2); |
793 } else { | 816 } else { |
794 return false; | 817 return false; |
795 } | 818 } |
796 | 819 |
797 return true; | 820 return true; |
798 } | 821 } |
799 | 822 |
800 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { | 823 bool ShouldSniffMimeType(const GURL& url, const std::string& mime_type) { |
801 static base::HistogramBase* should_sniff_counter(NULL); | 824 static base::HistogramBase* should_sniff_counter(NULL); |
802 if (!should_sniff_counter) { | 825 if (!should_sniff_counter) { |
803 should_sniff_counter = | 826 should_sniff_counter = |
804 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); | 827 UMASnifferHistogramGet("mime_sniffer.ShouldSniffMimeType2", 3); |
805 } | 828 } |
806 bool sniffable_scheme = url.is_empty() || | 829 bool sniffable_scheme = url.is_empty() || url.SchemeIsHTTPOrHTTPS() || |
807 url.SchemeIsHTTPOrHTTPS() || | |
808 url.SchemeIs("ftp") || | 830 url.SchemeIs("ftp") || |
809 #if defined(OS_ANDROID) | 831 #if defined(OS_ANDROID) |
810 url.SchemeIs("content") || | 832 url.SchemeIs("content") || |
811 #endif | 833 #endif |
812 url.SchemeIsFile() || | 834 url.SchemeIsFile() || url.SchemeIsFileSystem(); |
813 url.SchemeIsFileSystem(); | |
814 if (!sniffable_scheme) { | 835 if (!sniffable_scheme) { |
815 should_sniff_counter->Add(1); | 836 should_sniff_counter->Add(1); |
816 return false; | 837 return false; |
817 } | 838 } |
818 | 839 |
819 static const char* kSniffableTypes[] = { | 840 static const char* |
820 // Many web servers are misconfigured to send text/plain for many | 841 kSniffableTypes |
821 // different types of content. | 842 [] = {// Many web servers are misconfigured to send text/plain for |
mmenke
2014/10/10 18:12:39
Just no.
| |
822 "text/plain", | 843 // many |
823 // We want to sniff application/octet-stream for | 844 // different types of content. |
824 // application/x-chrome-extension, but nothing else. | 845 "text/plain", |
825 "application/octet-stream", | 846 // We want to sniff application/octet-stream for |
826 // XHTML and Atom/RSS feeds are often served as plain xml instead of | 847 // application/x-chrome-extension, but nothing else. |
827 // their more specific mime types. | 848 "application/octet-stream", |
828 "text/xml", | 849 // XHTML and Atom/RSS feeds are often served as plain xml |
829 "application/xml", | 850 // instead of |
830 // Check for false Microsoft Office MIME types. | 851 // their more specific mime types. |
831 "application/msword", | 852 "text/xml", |
832 "application/vnd.ms-excel", | 853 "application/xml", |
833 "application/vnd.ms-powerpoint", | 854 // Check for false Microsoft Office MIME types. |
834 "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | 855 "application/msword", |
835 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", | 856 "application/vnd.ms-excel", |
836 "application/vnd.openxmlformats-officedocument.presentationml.presentation", | 857 "application/vnd.ms-powerpoint", |
837 "application/vnd.ms-excel.sheet.macroenabled.12", | 858 "application/" |
838 "application/vnd.ms-word.document.macroenabled.12", | 859 "vnd.openxmlformats-officedocument.wordprocessingml.document", |
839 "application/vnd.ms-powerpoint.presentation.macroenabled.12", | 860 "application/" |
840 "application/mspowerpoint", | 861 "vnd.openxmlformats-officedocument.spreadsheetml.sheet", |
841 "application/msexcel", | 862 "application/" |
842 "application/vnd.ms-word", | 863 "vnd.openxmlformats-officedocument.presentationml.presentation", |
843 "application/vnd.ms-word.document.12", | 864 "application/vnd.ms-excel.sheet.macroenabled.12", |
844 "application/vnd.msword", | 865 "application/vnd.ms-word.document.macroenabled.12", |
845 }; | 866 "application/vnd.ms-powerpoint.presentation.macroenabled.12", |
867 "application/mspowerpoint", | |
868 "application/msexcel", | |
869 "application/vnd.ms-word", | |
870 "application/vnd.ms-word.document.12", | |
871 "application/vnd.msword", | |
872 }; | |
846 static base::HistogramBase* counter(NULL); | 873 static base::HistogramBase* counter(NULL); |
847 if (!counter) { | 874 if (!counter) { |
848 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", | 875 counter = UMASnifferHistogramGet("mime_sniffer.kSniffableTypes2", |
849 arraysize(kSniffableTypes) + 1); | 876 arraysize(kSniffableTypes) + 1); |
850 } | 877 } |
851 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { | 878 for (size_t i = 0; i < arraysize(kSniffableTypes); ++i) { |
852 if (mime_type == kSniffableTypes[i]) { | 879 if (mime_type == kSniffableTypes[i]) { |
853 counter->Add(i); | 880 counter->Add(i); |
854 should_sniff_counter->Add(2); | 881 should_sniff_counter->Add(2); |
855 return true; | 882 return true; |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
923 // We're not interested in sniffing these types for images and the like. | 950 // We're not interested in sniffing these types for images and the like. |
924 // Instead, we're looking explicitly for a feed. If we don't find one | 951 // Instead, we're looking explicitly for a feed. If we don't find one |
925 // we're done and return early. | 952 // we're done and return early. |
926 if (SniffXML(content, content_size, &have_enough_content, result)) | 953 if (SniffXML(content, content_size, &have_enough_content, result)) |
927 return true; | 954 return true; |
928 return have_enough_content; | 955 return have_enough_content; |
929 } | 956 } |
930 | 957 |
931 // CRX files (Chrome extensions) have a special sniffing algorithm. It is | 958 // CRX files (Chrome extensions) have a special sniffing algorithm. It is |
932 // tighter than the others because we don't have to match legacy behavior. | 959 // tighter than the others because we don't have to match legacy behavior. |
933 if (SniffCRX(content, content_size, url, type_hint, | 960 if (SniffCRX( |
934 &have_enough_content, result)) | 961 content, content_size, url, type_hint, &have_enough_content, result)) |
935 return true; | 962 return true; |
936 | 963 |
937 // Check the file extension and magic numbers to see if this is an Office | 964 // Check the file extension and magic numbers to see if this is an Office |
938 // document. This needs to be checked before the general magic numbers | 965 // document. This needs to be checked before the general magic numbers |
939 // because zip files and Office documents (OOXML) have the same magic number. | 966 // because zip files and Office documents (OOXML) have the same magic number. |
940 if (SniffForOfficeDocs(content, content_size, url, | 967 if (SniffForOfficeDocs( |
941 &have_enough_content, result)) | 968 content, content_size, url, &have_enough_content, result)) |
942 return true; // We've matched a magic number. No more content needed. | 969 return true; // We've matched a magic number. No more content needed. |
943 | 970 |
944 // We're not interested in sniffing for magic numbers when the type_hint | 971 // We're not interested in sniffing for magic numbers when the type_hint |
945 // is application/octet-stream. Time to bail out. | 972 // is application/octet-stream. Time to bail out. |
946 if (type_hint == "application/octet-stream") | 973 if (type_hint == "application/octet-stream") |
947 return have_enough_content; | 974 return have_enough_content; |
948 | 975 |
949 // Now we look in our large table of magic numbers to see if we can find | 976 // Now we look in our large table of magic numbers to see if we can find |
950 // anything that matches the content. | 977 // anything that matches the content. |
951 if (SniffForMagicNumbers(content, content_size, | 978 if (SniffForMagicNumbers(content, content_size, &have_enough_content, result)) |
952 &have_enough_content, result)) | |
953 return true; // We've matched a magic number. No more content needed. | 979 return true; // We've matched a magic number. No more content needed. |
954 | 980 |
955 return have_enough_content; | 981 return have_enough_content; |
956 } | 982 } |
957 | 983 |
958 bool SniffMimeTypeFromLocalData(const char* content, | 984 bool SniffMimeTypeFromLocalData(const char* content, |
959 size_t size, | 985 size_t size, |
960 std::string* result) { | 986 std::string* result) { |
961 // First check the extra table (kExtraMagicNumbers). NULL is passed where other callers in this file pass a histogram counter. | 987 // First check the extra table (kExtraMagicNumbers). NULL is passed where other callers in this file pass a histogram counter. |
962 if (CheckForMagicNumbers(content, size, kExtraMagicNumbers, | 988 if (CheckForMagicNumbers(content, |
963 arraysize(kExtraMagicNumbers), NULL, result)) | 989 size, |
990 kExtraMagicNumbers, | |
991 arraysize(kExtraMagicNumbers), | |
992 NULL, | |
993 result)) | |
964 return true; | 994 return true; |
965 // Finally check the original table (kMagicNumbers); its outcome is returned as-is. | 995 // Finally check the original table (kMagicNumbers); its outcome is returned as-is. |
966 return CheckForMagicNumbers(content, size, kMagicNumbers, | 996 return CheckForMagicNumbers( |
967 arraysize(kMagicNumbers), NULL, result); | 997 content, size, kMagicNumbers, arraysize(kMagicNumbers), NULL, result); |
968 } | 998 } |
969 | 999 |
970 } // namespace net | 1000 } // namespace net |
OLD | NEW |